Lingua-TT

 view release on metacpan or  search on metacpan

TT/Document.pm  view on Meta::CPAN

  my ($doc,$file,%opts) = @_;
  my $ttio = Lingua::TT::IO->fromFile($file,%opts)
    or die((ref($doc)||$doc)."::fromFile(): open failed for '$file': $!");
  my $got = $ttio->getDocument;
  $ttio->close();
  return $got if (!ref($doc));
  @$doc = @$got;
  return $doc;
}

## $doc_or_undef = $CLASS_OR_OBJECT->toFile($filename_or_fh,%opts)
##  + writes $doc through a Lingua::TT::IO handle opened on $filename_or_fh
##  + %opts are passed through unchanged to Lingua::TT::IO->toFile()
##  + dies if the output handle cannot be opened
##  + returns $doc if putDocument() returned a true value, otherwise undef
##  + aliases: save(), saveNativeFile()
BEGIN { *save = *saveNativeFile = \&toFile; }
sub toFile {
  my ($doc,$file,%opts) = @_;

  ##-- open output handle; failure here is fatal
  my $ttio = Lingua::TT::IO->toFile($file,%opts);
  if (!$ttio) {
    die((ref($doc)||$doc)."::toFile(): open failed for '$file': $!");
  }

  ##-- write, then close the handle regardless of the write status
  my $ok = $ttio->putDocument($doc);
  $ttio->close();

  return undef if (!$ok);
  return $doc;
}


##==============================================================================
## Methods: Shuffle & Split

## $doc = $doc->shuffle(%opts)
##  + randomly re-orders the sentences of @$doc in-place and returns $doc
##  + one random sort key is drawn per sentence (in input order); sentences
##    are then sorted numerically by key
##  + %opts:
##    seed => $seed, ##-- calls srand($seed) if defined
sub shuffle {
  my ($doc,%opts) = @_;
  my $seed = $opts{seed};
  srand($seed) if (defined($seed));

  ##-- decorate each sentence with a random key, sort by key, undecorate
  my @keyed = map { [rand(), $_] } @$doc;
  @$doc = map { $_->[1] } sort { $a->[0] <=> $b->[0] } @keyed;

  return $doc;
}

##  @docs = $doc->splitN($n,%opts)  ##-- array context
## \@docs = $doc->splitN($n,%opts)  ##-- scalar context
##  + splits $doc deterministically into $n roughly equally-sized @docs
##  + each output doc is created with $doc->new(); a sentence's size is its
##    number of elements, scalar(@$sent)
##  + sentence data is shared (refs) between $doc and @docs
##  + never returns more than $n docs, and every returned doc comes from
##    $doc->new(); for $n < 1 the result is an empty list / empty array-ref
##  + for a random split, call $doc->shuffle(seed=>$seed)->splitN($n)
##  + %opts:
##     contiguous => $bool,  ##-- if true, output @docs will represent contiguous sections of input (alias: 'contig')
sub splitN {
  my ($doc,$n,%opts) = @_;
  my @odocs = map {$doc->new} (1..($n||0));

  ##-- no output docs requested: there is nothing to distribute sentences into
  ##   (indexing into an empty @odocs would autovivify an unblessed array)
  return (wantarray ? () : []) if (!@odocs);

  my @osizes = map {0} @odocs;
  if ($opts{contiguous} || $opts{contig}) {
    ##-- contiguous mode: fill output docs one after another
    my $oi = 0;
    ##-- target size per output doc; nTokens() is defined elsewhere
    ##   (presumably the total token count of $doc -- TODO confirm)
    my $osize = $doc->nTokens() / $n;
    foreach my $sent (@$doc) {
      push(@{$odocs[$oi]}, $sent);
      $osizes[$oi] += scalar(@$sent);
      ##-- advance once the target size is reached, but never past the last
      ##   output doc: leftover sentences (e.g. trailing empty ones) stay there
      ++$oi if ($oi < $#odocs && $osizes[$oi] >= $osize);
    }
  } else {
    ##-- best-split mode: each sentence goes to the currently smallest output doc
    foreach my $sent (@$doc) {
      ##-- find smallest @odoc (first one wins on ties)
      my $oi_min = 0;
      foreach my $oi (1..$#odocs) {
        $oi_min = $oi if ($osizes[$oi] < $osizes[$oi_min]);
      }
      push(@{$odocs[$oi_min]}, $sent);
      $osizes[$oi_min] += scalar(@$sent);
    }
  }
  return wantarray ? @odocs : \@odocs;
}


##==============================================================================
## Footer
1;

__END__



( run in 0.753 second using v1.01-cache-2.11-cpan-71847e10f99 )