Freq

 view release on metacpan or  search on metacpan

Freq.pm  view on Meta::CPAN

    print IDS "$_\n" for @{$self->{ids}};
    close IDS;
    @{ $self->{ids} } = ();

    # segment conf
    open CONF, ">$path/$nsegments/conf";
    print CONF "seg_nwords:", $self->{seg_nwords}, "\n";
    print CONF "seg_ndocs:", $self->{seg_ndocs}, "\n";
    close CONF;


    # top level conf
    $self->{nwords} += $self->{seg_nwords};
    $self->{ndocs} += $self->{seg_ndocs};

    open CONF, ">$path/conf";
    binmode CONF;
    print CONF 'seg_max_words:', $self->{seg_max_words}, "\n";
    print CONF "# DO NOT EDIT BELOW THIS LINE\n";
    print CONF 'nwords:', $self->{nwords}, "\n";
    print CONF 'nsegments:', $self->{nsegments}, "\n";
    print CONF 'ndocs:', $self->{ndocs}, "\n";
    close CONF;

    $self->{seg_nwords} = 0;
    $self->{seg_ndocs} = 0;

    return "\nwrote segment with $count isrs.\n";
}

sub _serialize_isr {
    my $isr = shift;

    my $newisr = '';
    $newisr .= pack "L", $isr->[NDOCS];   # num of docs
    $newisr .= pack "L", $isr->[NWORDS];  # num of positions.
    $newisr .= pack "L", $isr->[LASTDOC]; # last document with this word

    $newisr .= pack "L", length $isr->[DX]; # runlength
    $newisr .= $isr->[DX]; 

    my $runlengths = pack "w*", map {length $_} @{ $isr->[PX] };
    $newisr .= pack "L", length $runlengths;
    $newisr .= $runlengths;
    $newisr .= join '', @{ $isr->[PX] }; # BER delta lists

    return $newisr;
}


sub _configure {
    my $path = shift;
    my $self = {};    

    # File "conf" contains seg_max_words, nwords.

    open CONF, "<$path/conf" or die $!;
    while(<CONF>){
        next if m|^#|;
        chomp;
        my ($key, $value) = split m|:|;
        $self->{$key} = $value;
    }
    close CONF;

    return $self;
}



sub optimize_index {
    my $path = shift;

    my @dirs = sort { $a <=> $b }
               grep { /^\d+$/ }
               map { s/^.+?\/(\d+)$/$1/; $_ }
               glob("$path/*");

    print STDERR "Compacting segments ", join(" ", @dirs), "\n";

    # gather necessary info for each segment
    my (@segments, %words);
    for my $segment (@dirs){
        my $conf = _configure("$path/$segment");
        my %cdb;
        tie %cdb, 'CDB_File', "$path/$segment/CDB";
        my %localwords = ();
        $localwords{$_} = 1 for grep {length($_) < 26} keys %cdb;
        $words{$_} = 1 for keys %localwords;
        print STDERR "Gathered ", scalar keys %words, " words at segment $segment.           \r";
        push @segments, [ $conf, \%cdb, "$path/$segment", \%localwords ];
    }


    # new consolidated index segment
    mkdir "$path/NEWSEGMENT";

    # create new cdb
    print STDERR "Creating new compacted segment.                        \n";
    my $newidx = new CDB_File("$path/NEWSEGMENT/NEW", 
                            "$path/NEWSEGMENT/CDB.tmp") or 
        die "$0: new CDB_File failed: $!\n";

    my $ntokens = scalar keys %words;
    my $t0 = time;
    while(my($word, undef) = each %words){
        $ntokens--;
        if(time-$t0 > 2){
            print STDERR "Compacting $ntokens th word: $word                       \r";
            $t0 = time;
        }
        my $isr = new_isr();
        my $ndocs = 0;
        for my $segment (@segments){
            my($conf, $cdb, undef, $localwords) = @$segment;
            if(exists $localwords->{$word}){ 
                $isr = _append_isr($isr, $ndocs, _read_isr($cdb, $word));
            }
            $ndocs += $conf->{seg_ndocs};
        }
        $newidx->insert($word, _serialize_isr($isr));



( run in 1.834 second using v1.01-cache-2.11-cpan-71847e10f99 )