Lingua-TT
view release on metacpan or search on metacpan
TT/Document.pm view on Meta::CPAN
my ($doc,$file,%opts) = @_;
my $ttio = Lingua::TT::IO->fromFile($file,%opts)
or die((ref($doc)||$doc)."::fromFile(): open failed for '$file': $!");
my $got = $ttio->getDocument;
$ttio->close();
return $got if (!ref($doc));
@$doc = @$got;
return $doc;
}
## $doc = $CLASS_OR_OBJECT->toFile($filename_or_fh,%opts)
##  + writes $doc to $filename_or_fh through a Lingua::TT::IO handle opened with %opts
##  + dies if the output handle cannot be opened
##  + returns $doc if putDocument() returned a true value, otherwise undef
##  + aliases: save(), saveNativeFile()
BEGIN { *save = *saveNativeFile = \&toFile; }
sub toFile {
  my ($doc,$file,%opts) = @_;

  ##-- open output handle
  my $ttio = Lingua::TT::IO->toFile($file,%opts);
  if (!$ttio) {
    my $class = ref($doc) || $doc;
    die($class."::toFile(): open failed for '$file': $!");
  }

  ##-- dump document, then close the handle regardless of the outcome
  my $ok = $ttio->putDocument($doc);
  $ttio->close();

  return $doc if ($ok);
  return undef; ##-- explicit undef (not a bare return): callers get a single value in any context
}
##==============================================================================
## Methods: Shuffle & Split
## $doc = $doc->shuffle(%opts)
##  + randomly permutes the sentences of @$doc in-place and returns $doc
##  + %opts:
##     seed => $seed,  ##-- if defined, srand($seed) is called before drawing random keys
sub shuffle {
  my ($doc,%opts) = @_;
  my $seed = $opts{seed};
  srand($seed) if (defined($seed));

  ##-- decorate: pair each sentence with a random sort key (one rand() call per sentence, in input order)
  my @tagged = map { [rand(), $_] } @$doc;

  ##-- sort by key & undecorate, overwriting the document's contents
  @$doc = map { $_->[1] } sort { $a->[0] <=> $b->[0] } @tagged;
  return $doc;
}
## @docs = $doc->splitN($n,%opts) ##-- array context
## \@docs = $doc->splitN($n,%opts) ##-- scalar context
##  + splits $doc deterministically into $n roughly equally-sized @docs
##  + output documents are created by calling $doc->new() once per output
##    (presumably an empty array-based document of the same class; confirm against new())
##  + sentence data is shared (refs) between $doc and @docs
##  + sizes are measured in tokens, i.e. scalar(@$sent) for each sentence
##  + for a random split, call $doc->shuffle(seed=>$seed)->splitN($n)
##  + %opts:
##     contiguous => $bool,  ##-- if true, output @docs will represent contiguous sections of input (alias: 'contig')
##  + default ("best-split") mode assigns each sentence to the currently smallest
##    output document; ties go to the lowest-indexed one
##  + contiguous mode never produces more than $n output documents: once the last
##    document is reached, all remaining sentences (e.g. trailing empty ones) go there
sub splitN {
  my ($doc,$n,%opts) = @_;
  my @odocs  = map {$doc->new} (1..$n);
  my @osizes = map {0} @odocs;
  if ($opts{contiguous} || $opts{contig}) {
    ##-- contiguous mode
    my $oi    = 0;
    my $osize = $doc->nTokens / ($n || 1); ##-- target size per output doc; guard against division by zero
    foreach my $sent (@$doc) {
      push(@{$odocs[$oi]}, $sent);
      $osizes[$oi] += scalar(@$sent);
      ##-- advance when the current doc is full, but never past the last doc:
      ##   an unbounded index would autovivify extra, unblessed outputs beyond $n
      ++$oi if ($osizes[$oi] >= $osize && $oi < $#odocs);
    }
  } else {
    ##-- best-split mode
    my ($oi_min);
    foreach my $sent (@$doc) {
      ##-- find smallest @odoc (strict '<' keeps the lowest index on ties)
      $oi_min = 0;
      foreach my $oi (1..$#odocs) {
        $oi_min = $oi if ($osizes[$oi] < $osizes[$oi_min]);
      }
      push(@{$odocs[$oi_min]}, $sent);
      $osizes[$oi_min] += scalar(@$sent);
    }
  }
  return wantarray ? @odocs : \@odocs;
}
##==============================================================================
## Footer
1;
__END__
( run in 0.753 second using v1.01-cache-2.11-cpan-71847e10f99 )