App-Sets

 view release on metacpan or  search on metacpan

lib/App/Sets/Sort.pm  view on Meta::CPAN

   } or return;
   my @lines = <$fh>;
   return unless scalar(@lines) == 4;
   return unless defined $lines[3];
   $lines[3] =~ s{\s+}{}gmxs;
   return unless $lines[3] eq 'two';

   return 1;
}

sub sort_filehandle {
   my ($filename, $config) = @_;
   $config ||= {};
   state $has_sort = (!$config->{internal_sort}) && _test_external_sort();

   if ($has_sort) {
      my $fh;
      eval { open $fh, '-|', 'sort', '-u', $filename } and return $fh;
      WARN 'cannot use system sort, falling back to internal implementation';
      $has_sort = 0; # from now on, use internal sort
   }

   return internal_sort_filehandle($filename);
}

sub internal_sort_filehandle {
   my ($filename) = @_;

   # Open input stream
   open my $ifh, '<', $filename
      or LOGDIE "open('$filename'): $OS_ERROR";

   # Maximum values hints taken from Perl Power Tools' sort
   my $max_records = $ENV{SETS_MAX_RECORDS} || 200_000;
   my $max_files = $ENV{SETS_MAX_FILES} || 40;
   my (@records, @fhs);
   while (<$ifh>) {
      chomp;
      push @records, $_;
      if (@records >= $max_records) {
         push @fhs, _flush_to_temp(\@records);
         _compact(\@fhs) if @fhs >= $max_files - 1;
      }
   }

   push @fhs, _flush_to_temp(\@records) if @records;
   _compact(\@fhs);
   return $fhs[0] if @fhs;

   # seems like the file was empty... so it's sorted
   seek $ifh, 0, SEEK_SET;
   return $ifh;
}

sub _flush_to_temp {
   my ($records) = @_;
   my $tfh = tempfile(UNLINK => 1);
   my $previous;
   for my $item (sort @$records) {
      next if defined($previous) && $previous eq $item;
      print {$tfh} $item, $INPUT_RECORD_SEPARATOR;
   }
   @$records = ();
   seek $tfh, 0, SEEK_SET;
   return $tfh;
}

sub _compact {
   my ($fhs) = @_;
   return if @$fhs == 1;

   # where the output will end up
   my $ofh = tempfile(UNLINK => 1);

   # convenience hash for tracking all contributors
   my %its = map {
      my $fh = $fhs->[$_];
      my $head = <$fh>;
      if (defined $head) {
         chomp($head);
         $_ => [ $fh, $head ];
      }
      else { () }
   } 0 .. $#$fhs;

   # iterate until all contributors are exhausted
   while (scalar keys %its) {

      # select the best (i.e. "lower"), cleanup on the way
      my ($fk, @keys) = keys %its;
      my $best = $its{$fk}[1];
      for my $key (@keys) {
         my $head = $its{$key}[1];
         $best = $head if $best gt $head;
      }
      print {$ofh} $best, $INPUT_RECORD_SEPARATOR;

      # get rid of the best in all iterators, cleanup on the way
      KEY:
      for my $key ($fk, @keys) {
         my $head = $its{$key}[1];
         while ($head eq $best) {
            $head = readline $its{$key}[0];
            if (defined $head) {
               chomp($its{$key}[1] = $head);
            }
            else {
               delete $its{$key};
               next KEY;
            }
         }
      }
   }

   # rewind, finalize compacting, return
   seek $ofh, 0, SEEK_SET;
   @$fhs = ($ofh);
   return;
}

1;

__END__

=pod

=head1 NAME

App::Sets::Sort - sort handling

=head1 VERSION

version 0.978

=head1 AUTHOR

Flavio Poletti <polettix@cpan.org>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2011-2016 by Flavio Poletti polettix@cpan.org.

This module is free software.  You can redistribute it and/or
modify it under the terms of the Artistic License 2.0.

This program is distributed in the hope that it will be useful,
but without any warranty; without even the implied warranty of
merchantability or fitness for a particular purpose.

=cut



( run in 1.427 second using v1.01-cache-2.11-cpan-437f7b0c052 )