Search-ContextGraph
view release on metacpan or search on metacpan
ContextGraph.pm view on Meta::CPAN
=cut
# PURE PERL VERSION
#} else {
foreach my $e ( @edges ) {
$self->{neighbors}{$e->[0]}{$e->[1]} = join ',', $e->[2], $e->[3];
$self->{neighbors}{$e->[1]}{$e->[0]} = join ',', $e->[2], $e->[3];
}
#}
#print "Reweighting graph\n";
$self->reweight_graph() if $self->{auto_reweight};
return 1;
}
=item add_file PATH [, name => NAME, parse => CODE]
Adds a document from a file. By default, uses the PATH provided as the document
identifier, and parses the file by splitting on whitespace. If a fancier title,
or more elegant parsing behavior is desired, pass in named arguments as indicated.
NAME can be any string, CODE should be a reference to a subroutine that takes one
argument (the contents of the file) and returns an array of tokens, or a hash in the
form TOKEN => COUNT, or a reference to the same.
=cut
sub add_file {
my ( $self, $path, %params ) = @_;
croak "Invalid file '$path' provided to add_file method."
unless defined $path and -f $path;
my $title = ( exists $params{name} ? $params{name} : $path );
local $/;
open my $fh, $path or croak "Unable to open $path: $!";
my $content = <$fh>;
my $ref;
if ( exists $params{parse} ) {
croak "code provided is not a reference" unless
ref $params{parse};
croak "code provided is not a subroutine" unless
ref $params{parse} eq 'CODE';
$ref = $params{parse}->( $content );
croak "did not get an appropriate reference back after parsing"
unless ref $ref and ref $ref =~ /(HASH|ARRAY)/;
} else {
my $code = sub {
my $txt = shift;
$txt =~ s/\W/ /g;
my @toks = split m/\s+/, $txt;
\@toks;
};
$ref = $code->($content);
}
return unless $ref;
$self->add( $title, $ref );
}
=item bulk_add DOCS
Add documents to the graph in bulk. Takes as an argument a hash
whose keys are document identifiers, and values are references
to hashes in the form { WORD1 => COUNT, WORD2 => COUNT...}
This method is faster than adding in documents one by one if
you have auto_rebalance turned on.
=cut
sub bulk_add {
my ( $self, %incoming_docs ) = @_;
# Disable graph rebalancing until we've added everything
{
local $self->{auto_reweight} = 0;
foreach my $doc ( keys %incoming_docs ) {
$self->add( $doc, $incoming_docs{$doc});
}
}
$self->reweight_graph() if $self->{auto_reweight};
}
=item degree NODE
Given a raw node, returns the degree (raw node means the node must
be prefixed with 'D:' or 'T:' depending on type )
=cut
sub degree { scalar keys %{$_[0]->{neighbors}{$_[1]}} }
=item delete DOC
Remove a document from the graph. Takes a document identifier
as an argument. Returns 1 if successful, undef otherwise.
=cut
sub delete {
my ( $self, $type, $name ) = @_;
croak "Must provide a node type to delete() method" unless defined $type;
croak "Invalid type $type passed to delete method. Must be one of [TD]"
unless $type =~ /^[TD]$/io;
ContextGraph.pm view on Meta::CPAN
$self->reweight_graph if $self->{auto_reweight};
1;
}
=item has_doc DOC
Returns true if the document with identifier DOC is in the collection
=cut
sub has_doc {
my ( $self, $doc ) = @_;
carp "Received undefined value for has_doc" unless defined $doc;
my $node = _nodeify( 'D', $doc );
return exists $self->{neighbors}{$node} || undef;
}
=item has_term TERM
Returns true if the term TERM is in the collection
=cut
sub has_term {
my ( $self, $term ) = @_;
carp "Received undefined value for has_term" unless defined $term;
my $node = _nodeify( 'T', $term );
return exists $self->{neighbors}{$node} || undef;
}
=item distance NODE1, NODE2, TYPE
Calculates the distance between two nodes of the same type (D or T)
using the formula:
distance = ...
=cut
sub distance {
my ( $self, $n1, $n2, $type ) = @_;
croak unless $type;
$type = lc( $type );
croak unless $type =~ /^[dt]$/;
my $key = ( $type eq 't' ? 'terms' : 'documents' );
my @shared = $self->intersection( $key => [ $n1, $n2 ] );
return 0 unless @shared;
#warn "Found ", scalar @shared, " nodes shared between $n1 and $n2\n";
my $node1 = _nodeify( $type, $n1 );
my $node2 = _nodeify( $type, $n2 );
# formula is w(t1,d1)/deg(d1) + w(t1,d2)/deg(d2) ... ) /deg( t1 )
#warn "Calculating distance\n";
my $sum1 = 0;
my $sum2 = 0;
foreach my $next ( @shared ) {
my ( undef, $lcount1) = split m/,/, $self->{neighbors}{$node1}{$next};
my ( undef, $lcount2) = split m/,/, $self->{neighbors}{$node2}{$next};
my $degree = $self->degree( $next );
#warn "\t degree of $next is $degree\n";
my $elem1 = $lcount1 / $degree;
$sum1 += $elem1;
my $elem2 = $lcount2 / $degree;
$sum2 += $elem2;
}
#warn "sum is $sum1, $sum2\n";
my $final = ($sum1 / $self->degree( $node1 )) + ( $sum2 / $self->degree( $node2 ));
#warn "final is $final\n";
return $final;
}
=item distance_matrix TYPE LIMIT
Used for clustering using linear local embedding. Produces a similarity matrix
in a format I'm too tired to document right now. LIMIT is the maximum number
of neighbors to keep for each node.
=cut
sub distance_matrix {
my ( $self, $type, $limit ) = @_;
croak "Must provide type argument to distance_matrix()"
unless defined $type;
croak "must provide limit" unless $limit;
my @nodes;
if ( lc( $type ) eq 'd' ) {
@nodes = $self->doc_list();
} elsif ( lc( $type ) eq 't' ) {
@nodes = $self->term_list();
} else {
croak "Unsupported type $type";
}
my @ret;
my $count = 0;
foreach my $from ( @nodes ) {
warn $from, " - $count\n";
$count++;
my $index = -1;
my @found;
foreach my $to ( @nodes ) {
$index++;
next if $from eq $to;
my $dist = $self->distance( $from, $to, $type );
push @found, [ $index, $dist ] if $dist;
#print( $index++, ' ', $dist, " " ) if $dist;
}
my @sorted = sort { $b->[1] <=> $a->[1] } @found;
my @final = splice ( @sorted, 0, $limit );
push @ret, join " ", ( map { join ' ', $_->[0], substr($_->[1], 0, 7) }
sort { $a->[0] <=> $b->[0] }
@final), "\n";
#print "\n";
}
return join "\n", @ret;
ContextGraph.pm view on Meta::CPAN
__normalize( \@edges );
foreach my $e ( @edges ) {
my $pair = join ',', $e->[2], $e->[3];
$n->{$node}{$e->[1]} = $n->{$e->[1]}{$node} = $pair;
}
}
$self->{reweight_flag} = 0;
return 1;
}
=item update ID, WORDS
Given a document identifier and a word list, updates the information for
that document in the graph. Returns the number of changes made
=cut
sub update {
my ( $self, $id, $words ) = @_;
croak "update not implemented in XS" if $self->{xs};
croak "Must provide a document identifier to update_document" unless defined $id;
my $dnode = _nodeify( 'D', $id );
return unless exists $self->{neighbors}{$dnode};
croak "must provide a word list "
unless defined $words and
ref $words and
( ref $words eq 'HASH' or
ref $words eq 'ARRAY' );
my $n = $self->{neighbors}{$dnode};
# Get the current word list
my @terms = keys %{ $n };
if ( ref $words eq 'ARRAY' ) {
my %words;
$words{$_}++ foreach @$words;
$words = \%words;
}
local $self->{auto_reweight} = 0;
my $must_reweight = 0;
my %seen;
foreach my $term ( keys %{$words} ) {
my $t = _nodeify( 'T', $term );
if ( exists $n->{$t} ){
# Update the local count, if necessary
my $curr_val = $n->{$t};
my ( undef, $loc ) = split m/,/, $curr_val;
unless ( $loc == $words->{$term} ) {
$n->{$t} = join ',', 1, $words->{$term};
$must_reweight++;
}
}
else {
$n->{$t} =
$self->{neighbors}{$t}{$dnode} =
join ',', 1, $words->{$term};
$must_reweight++;
}
$seen{$t}++;
}
# Check for deleted words
foreach my $t ( @terms ) {
$must_reweight++
unless exists $seen{$t};
}
$self->reweight_graph() if
$must_reweight;
return $must_reweight;
}
=item doc_count [TERM]
Returns a count of all documents that TERM occurs in.
If no argument is provided, returns a document count
for the entire collection.
=cut
sub doc_count {
my ( $self, $term ) = @_;
if ( defined $term ) {
$term = _nodeify( 'T', $term ) unless $term =~ /^T:/;
my $node = $self->{neighbors}{$term};
return 0 unless defined $node;
return scalar keys %{$node};
} else {
return scalar grep /D:/,
keys %{ $self->{'neighbors'} };
}
}
=item doc_list [TERM]
Returns a sorted list of document identifiers that contain
TERM, in ASCII-betical order. If no argument is given,
returns a sorted document list for the whole collection.
ContextGraph.pm view on Meta::CPAN
=cut
sub dump_node {
my ( $self, $node ) = @_;
my @lines;
push @lines, join "\t", "COUNT", "WEIGHT", "NEIGHBOR";
foreach my $n ( keys %{ $self->{neighbors}{$node} } ) {
my $v = $self->{neighbors}{$node}{$n};
my ( $weight, $count ) = split /,/, $v;
push @lines, join "\t", $count, substr( $weight, 0, 8 ), $n;
}
return @lines;
}
=item dump_tdm [FILE]
Dumps internal state in term-document matrix (TDM) format, which looks
like this:
A B C B C B C
A B C B C B C
A B C B C B C
Where each row represents a document, A is the number of terms in the
document, B is the term node and C is the edge weight between the doc
node and B. Mostly used as a legacy format by the module author.
Doc and term nodes are printed in ASCII-betical sorted order, zero-based
indexing. Up to you to keep track of the ID => title mappings, neener-neener!
Use doc_list and term_list to get an equivalently sorted list
=cut
sub dump_tdm {
my ( $self, $file ) = @_;
my $counter = 0;
my %lookup;
$lookup{$_} = $counter++ foreach $self->term_list;
my @docs = $self->doc_list;
my $fh;
if ( defined $file ) {
open $fh, "> $file" or croak
"Could not open TDM output file: $!";
} else {
*fh = *STDOUT;
}
foreach my $doc ( @docs ) {
my $n = $self->{neighbors}{$doc};
my $row_count = scalar keys %{$n};
print $fh $row_count;
foreach my $t ( sort keys %{$doc} ) {
my $index = $lookup{$t};
my ( $weight, undef ) = split m/,/, $n->{$t};
print $fh ' ', $index, ' ', $weight;
}
print $fh "\n";
}
}
=item near_neighbors [NODE]
Returns a list of neighbor nodes of the same type (doc/doc, or term/term) two
hops away.
=cut
sub near_neighbors {
my ( $self, $name, $type ) = @_;
my $node = _nodeify( $type, $name );
my $n = $self->{neighbors}{$node};
my %found;
foreach my $next ( keys %{$n} ) {
foreach my $mynext ( keys %{ $self->{neighbors}{$next} }){
$found{$mynext}++;
}
}
delete $found{$node};
return keys %found;
}
=item term_count [DOC]
Returns the number of unique terms in a document or,
if no document is specified, in the entire collection.
=cut
sub term_count {
my ( $self, $doc ) = @_;
if ( defined $doc ) {
my $node = $self->{neighbors}{ _nodeify( 'D', $doc) };
return 0 unless defined $node;
return scalar keys %{$node};
} else {
return scalar grep /T:/,
keys %{ $self->{neighbors} };
}
}
=item term_list [DOC]
Returns a sorted list of unique terms appearing in the document
with identifier DOC, in ASCII-betical order. If no argument is
given, returns a sorted term list for the whole collection.
=cut
ContextGraph.pm view on Meta::CPAN
sub word_count {
my ( $self, $term ) = @_;
my $n = $self->{neighbors}; # shortcut
my $count = 0;
my @terms;
if ( defined $term ) {
push @terms, $term;
} else {
@terms = $self->term_list();
}
foreach my $term (@terms ) {
$term = _nodeify( 'T', $term) unless $term =~/^T:/o;
foreach my $doc ( keys %{ $n->{$term} } ) {
( undef, my $lcount ) = split /,/, $n->{$term}{$doc};
$count += $lcount;
}
}
return $count;
}
=item search @QUERY
Searches the graph for all of the words in @QUERY. Use find_similar if you
want to do a document similarity instead, or mixed_search if you want
to search on any combination of words and documents. Returns a pair of hashrefs:
the first a reference to a hash of docs and relevance values, the second to
a hash of words and relevance values.
=cut
sub search {
my ( $self, @query ) = @_;
my @nodes = _nodeify( 'T', @query );
my $results = $self->raw_search( @nodes );
my ($docs, $words) = _partition( $results );
return ( $docs, $words);
}
=item simple_search QUERY
This is the DWIM method - takes a query string as its argument, and returns an array
of documents, sorted by relevance.
=cut
sub simple_search {
my ( $self, $query ) = @_;
my @words = map { s/\W+//g; lc($_) }
split m/\s+/, $query;
my @nodes = _nodeify( 'T', @words );
my $results = $self->raw_search( @nodes );
my ($docs, $words) = _partition( $results );
my @sorted_docs = sort { $docs->{$b} <=> $docs->{$a} } keys %{$docs};
return @sorted_docs;
}
=item find_by_title @TITLES
Given a list of patterns, searches for documents with matching titles
=cut
sub find_by_title {
my ( $self, @titles ) = @_;
my @found;
my @docs = $self->doc_list();
my $pattern = join '|', @titles;
my $match_me = qr/$pattern/i;
#warn $match_me, "\n";
foreach my $d ( @docs ) {
# warn $d, "\n";
push @found, $d if $d =~ $match_me;
}
return @found;
}
=item find_similar @DOCS
Given an array of document identifiers, performs a similarity search
and returns a pair of hashrefs. First hashref is to a hash of docs and relevance
values, second is to a hash of words and relevance values.
=cut
sub find_similar {
my ( $self, @docs ) = @_;
my @nodes = _nodeify( 'D', @docs );
my $results = $self->raw_search( @nodes );
my ($docs, $words) = _partition( $results );
return ( $docs, $words);
}
=item merge TYPE, GOOD, @BAD
Combine all the nodes in @BAD into the node with identifier GOOD.
First argument must be one of 'T' or 'D' to indicate term or
document nodes. Used to combine synonyms in the graph.
=cut
sub merge {
my ( $self, $type, $good, @bad ) = @_;
croak "must provide a type argument to merge"
unless defined $type;
croak "Invalid type argument $type to merge [must be one of (D,T)]"
unless $type =~ /^[DT]/io;
my $target = _nodeify( $type, $good );
my @sources = _nodeify( $type, @bad );
my $tnode = $self->{neighbors}{$target};
foreach my $bad_node ( @sources ) {
#print "Examining $bad_node\n";
next if $bad_node eq $target;
my %neighbors = %{$self->{neighbors}{$bad_node}};
foreach my $n ( keys %neighbors ) {
#print "\t $target ($bad_node) neighbor $n\n";
if ( exists $self->{neighbors}{$target}{$n} ) {
#print "\t\t$n has link to $bad_node\n";
# combine the local counts for the term members of the edge
my $curr_val = $tnode->{$n};
my $aug_val = $self->{neighbors}{$bad_node}{$n};
my ($w1, $c1) = split m/,/, $curr_val;
my ($w2, $c2) = split m/,/, $aug_val;
my $new_count = $c1 + $c2;
$curr_val =~ s/,\d+$/,$new_count/;
$tnode->{$n} = $curr_val;
} else {
die "sanity check failed for existence test"
if exists $self->{neighbors}{$target}{$n};
my $val = $self->{neighbors}{$bad_node}{$n};
#print "\tno existing link -- reassigning $target -- $n\n";
# reassign the current value of this edge
$self->{neighbors}{$n}{$target} = $val;
$self->{neighbors}{$target}{$n} = $val;
}
delete $self->{neighbors}{$bad_node}{$n};
delete $self->{neighbors}{$n}{$bad_node};
}
delete $self->{neighbors}{$bad_node};
}
}
=item mixed_search @DOCS
Given a hashref in the form:
{ docs => [ 'Title 1', 'Title 2' ],
terms => ['buffalo', 'fox' ], }
}
Runs a combined search on the terms and documents provided, and
returns a pair of hashrefs. The first hashref is to a hash of docs
and relevance values, second is to a hash of words and relevance values.
=cut
sub mixed_search {
my ( $self, $incoming ) = @_;
croak "must provide hash ref to mixed_search method"
unless defined $incoming &&
ref( $incoming ) &&
ref( $incoming ) eq 'HASH';
my $tref = $incoming->{'terms'} || [];
my $dref = $incoming->{'docs'} || [];
my @dnodes = _nodeify( 'D', @{$dref} );
my @tnodes = _nodeify( 'T', @{$tref} );
my $results = $self->raw_search( @dnodes, @tnodes );
my ($docs, $words) = _partition( $results );
return ( $docs, $words);
}
=item store FILENAME
( run in 0.680 second using v1.01-cache-2.11-cpan-5511b514fd6 )