Freq
view release on metacpan or search on metacpan
print IDS "$_\n" for @{$self->{ids}};
close IDS;
@{ $self->{ids} } = ();
# segment conf
open CONF, ">$path/$nsegments/conf";
print CONF "seg_nwords:", $self->{seg_nwords}, "\n";
print CONF "seg_ndocs:", $self->{seg_ndocs}, "\n";
close CONF;
# top level conf
$self->{nwords} += $self->{seg_nwords};
$self->{ndocs} += $self->{seg_ndocs};
open CONF, ">$path/conf";
binmode CONF;
print CONF 'seg_max_words:', $self->{seg_max_words}, "\n";
print CONF "# DO NOT EDIT BELOW THIS LINE\n";
print CONF 'nwords:', $self->{nwords}, "\n";
print CONF 'nsegments:', $self->{nsegments}, "\n";
print CONF 'ndocs:', $self->{ndocs}, "\n";
close CONF;
$self->{seg_nwords} = 0;
$self->{seg_ndocs} = 0;
return "\nwrote segment with $count isrs.\n";
}
sub _serialize_isr {
my $isr = shift;
my $newisr = '';
$newisr .= pack "L", $isr->[NDOCS]; # num of docs
$newisr .= pack "L", $isr->[NWORDS]; # num of positions.
$newisr .= pack "L", $isr->[LASTDOC]; # last document with this word
$newisr .= pack "L", length $isr->[DX]; # runlength
$newisr .= $isr->[DX];
my $runlengths = pack "w*", map {length $_} @{ $isr->[PX] };
$newisr .= pack "L", length $runlengths;
$newisr .= $runlengths;
$newisr .= join '', @{ $isr->[PX] }; # BER delta lists
return $newisr;
}
sub _configure {
my $path = shift;
my $self = {};
# File "conf" contains seg_max_words, nwords.
open CONF, "<$path/conf" or die $!;
while(<CONF>){
next if m|^#|;
chomp;
my ($key, $value) = split m|:|;
$self->{$key} = $value;
}
close CONF;
return $self;
}
sub optimize_index {
my $path = shift;
my @dirs = sort { $a <=> $b }
grep { /^\d+$/ }
map { s/^.+?\/(\d+)$/$1/; $_ }
glob("$path/*");
print STDERR "Compacting segments ", join(" ", @dirs), "\n";
# gather necessary info for each segment
my (@segments, %words);
for my $segment (@dirs){
my $conf = _configure("$path/$segment");
my %cdb;
tie %cdb, 'CDB_File', "$path/$segment/CDB";
my %localwords = ();
$localwords{$_} = 1 for grep {length($_) < 26} keys %cdb;
$words{$_} = 1 for keys %localwords;
print STDERR "Gathered ", scalar keys %words, " words at segment $segment. \r";
push @segments, [ $conf, \%cdb, "$path/$segment", \%localwords ];
}
# new consolidated index segment
mkdir "$path/NEWSEGMENT";
# create new cdb
print STDERR "Creating new compacted segment. \n";
my $newidx = new CDB_File("$path/NEWSEGMENT/NEW",
"$path/NEWSEGMENT/CDB.tmp") or
die "$0: new CDB_File failed: $!\n";
my $ntokens = scalar keys %words;
my $t0 = time;
while(my($word, undef) = each %words){
$ntokens--;
if(time-$t0 > 2){
print STDERR "Compacting $ntokens th word: $word \r";
$t0 = time;
}
my $isr = new_isr();
my $ndocs = 0;
for my $segment (@segments){
my($conf, $cdb, undef, $localwords) = @$segment;
if(exists $localwords->{$word}){
$isr = _append_isr($isr, $ndocs, _read_isr($cdb, $word));
}
$ndocs += $conf->{seg_ndocs};
}
$newidx->insert($word, _serialize_isr($isr));
( run in 1.834 second using v1.01-cache-2.11-cpan-71847e10f99 )