Search-ContextGraph

 view release on metacpan or  search on metacpan

ContextGraph.pm  view on Meta::CPAN

=cut

		
	# PURE PERL VERSION 
	#}  else 	{
		foreach my $e ( @edges ) {
			$self->{neighbors}{$e->[0]}{$e->[1]} = join ',', $e->[2], $e->[3];
			$self->{neighbors}{$e->[1]}{$e->[0]} = join ',', $e->[2], $e->[3];
		}
	#}
	
	
	#print "Reweighting graph\n";
	$self->reweight_graph() if $self->{auto_reweight};
	return 1;

}


=item add_file PATH [, name => NAME, parse => CODE]

Adds a document from a file.   By default, uses the PATH provided as the document
identifier, and parses the file by splitting on whitespace.  If a fancier title, 
or more elegant parsing behavior is desired, pass in named arguments as indicated.
NAME can be any string, CODE should be a reference to a subroutine that takes one
argument (the contents of the file) and returns an array of tokens, or a hash in the
form TOKEN => COUNT, or a reference to the same.

=cut

sub add_file {
	my ( $self, $path, %params ) = @_;
	
	croak "Invalid file '$path' provided to add_file method."
		unless defined $path and -f $path;
		
	my $title = ( exists $params{name} ? $params{name} : $path );

	local $/;
	open my $fh, $path or croak "Unable to open $path: $!";
	my $content = <$fh>;
	
	my $ref;
	
	if ( exists $params{parse} ) {
		croak "code provided is not a reference" unless
			ref $params{parse};
		croak "code provided is not a subroutine" unless
			ref $params{parse} eq 'CODE';
		
		$ref = $params{parse}->( $content );
		croak "did not get an appropriate reference back after parsing"
			unless ref $ref and ref $ref =~ /(HASH|ARRAY)/;
		
		
	} else {
	
		my $code = sub { 
			my $txt  = shift; 
			$txt =~ s/\W/ /g;
			my @toks = split m/\s+/, $txt;
			\@toks;
		};
		$ref = $code->($content);
	}
	
	return unless $ref;
	$self->add( $title, $ref );
	
}

=item bulk_add DOCS

Add documents to the graph in bulk.  Takes as an argument a hash
whose keys are document identifiers, and values are references
to hashes in the form { WORD1 => COUNT, WORD2 => COUNT...}
This method is faster than adding in documents one by one if
you have auto_rebalance turned on.

=cut

sub bulk_add {

	my ( $self, %incoming_docs ) = @_;

	# Disable graph rebalancing until we've added everything
	{
		local $self->{auto_reweight} = 0;

		foreach my $doc ( keys %incoming_docs ) {
			$self->add( $doc, $incoming_docs{$doc});
		} 
	}
	$self->reweight_graph() if $self->{auto_reweight};
}


=item degree NODE

Given a raw node, returns the degree (raw node means the node must
be prefixed with 'D:' or 'T:' depending on type )

=cut

sub degree { scalar keys %{$_[0]->{neighbors}{$_[1]}} }


=item delete DOC

Remove a document from the graph.  Takes a document identifier
as an argument.  Returns 1 if successful, undef otherwise.

=cut

sub delete {

	my ( $self, $type, $name ) = @_;
	
	croak "Must provide a node type to delete() method" unless defined $type;
	croak "Invalid type $type passed to delete method.  Must be one of [TD]"
		unless $type =~ /^[TD]$/io;

ContextGraph.pm  view on Meta::CPAN

	$self->reweight_graph if $self->{auto_reweight};
	1;
}



=item has_doc DOC

Returns true if the document with identifier DOC is in the collection

=cut

sub has_doc { 
	my ( $self, $doc ) = @_;
	carp "Received undefined value for has_doc" unless defined $doc;
	my $node = _nodeify( 'D', $doc );
	return exists $self->{neighbors}{$node} ||  undef;
}

=item has_term TERM

Returns true if the term TERM is in the collection

=cut

sub has_term { 
	my ( $self, $term ) = @_;
	carp "Received undefined value for has_term" unless defined $term;
	my $node = _nodeify( 'T', $term );
	return exists $self->{neighbors}{$node} || undef;
}	



=item distance NODE1, NODE2, TYPE

Calculates the distance between two nodes of the same type (D or T)
using the formula:

	distance = ...
=cut

sub distance {
	my ( $self, $n1, $n2, $type ) = @_;
	croak unless $type;
	$type = lc( $type );
	croak unless $type =~ /^[dt]$/;
	my $key = ( $type eq 't' ? 'terms' : 'documents' );
	my @shared = $self->intersection( $key => [ $n1, $n2 ] );
	return 0 unless @shared;
	#warn "Found ", scalar @shared, " nodes shared between $n1 and $n2\n";
	
	my $node1 = _nodeify( $type, $n1 );
	my $node2 = _nodeify( $type, $n2 );
	# formula is w(t1,d1)/deg(d1) + w(t1,d2)/deg(d2) ... ) /deg( t1 )
	
	#warn "Calculating distance\n";
	my $sum1 = 0;
	my $sum2 = 0;
	foreach my $next ( @shared ) {
		my ( undef, $lcount1) =  split m/,/, $self->{neighbors}{$node1}{$next};
		my ( undef, $lcount2) =  split m/,/, $self->{neighbors}{$node2}{$next};

		my $degree = $self->degree( $next );
		#warn "\t degree of $next is $degree\n";
		my $elem1 = $lcount1 / $degree;
		$sum1 += $elem1;
		my $elem2 = $lcount2 / $degree;
		$sum2 += $elem2;
	}
	#warn "sum is $sum1, $sum2\n";
	my $final = ($sum1 / $self->degree( $node1 )) + ( $sum2 / $self->degree( $node2 ));
	#warn "final is $final\n";
	return $final;
	
	
}

=item distance_matrix TYPE LIMIT

Used for clustering using linear local embedding.  Produces a similarity matrix
in a format I'm too tired to document right now.  LIMIT is the maximum number
of neighbors to keep for each node.

=cut

sub distance_matrix {
	my ( $self, $type, $limit ) = @_;
	croak "Must provide type argument to distance_matrix()" 
		unless defined $type;
	croak "must provide limit" unless $limit;
	my @nodes;
	if ( lc( $type ) eq 'd' ) {
		@nodes = $self->doc_list();
	} elsif ( lc( $type ) eq 't' ) {
		@nodes = $self->term_list();
	} else {
		croak "Unsupported type $type";
	}
	
	my @ret;
	my $count = 0;
	foreach my $from ( @nodes ) {
		warn $from, " - $count\n";
		$count++;
		my $index = -1;
		my @found;
		foreach my $to ( @nodes ) {
			$index++;
			next if $from eq $to;
			my $dist = $self->distance( $from, $to, $type );
			push @found, [ $index, $dist ] if $dist;
			#print( $index++, ' ', $dist, " " ) if $dist;
		}
		my @sorted = sort { $b->[1] <=> $a->[1] } @found;
		my @final = splice ( @sorted, 0, $limit );
		push @ret, join " ", ( map { join ' ', $_->[0],  substr($_->[1], 0, 7)  } 
						  sort { $a->[0] <=> $b->[0] } 
						  @final), "\n";
		#print "\n";
	}
	return join "\n", @ret;

ContextGraph.pm  view on Meta::CPAN

		__normalize( \@edges );

		foreach my $e ( @edges ) {
			my $pair = join ',', $e->[2], $e->[3];
			$n->{$node}{$e->[1]} = $n->{$e->[1]}{$node} = $pair;
		}
	}
	$self->{reweight_flag} = 0;
	return 1;
}




=item update ID, WORDS

Given a document identifier and a word list, updates the information for
that document in the graph.  Returns the number of changes made

=cut

sub update {

	my ( $self, $id, $words ) = @_;

	croak "update not implemented in XS" if $self->{xs};
	croak "Must provide a document identifier to update_document" unless defined $id;
	my $dnode = _nodeify( 'D', $id );

	return unless exists $self->{neighbors}{$dnode};
	croak "must provide a word list " 
		unless defined $words and 
						ref $words and
					  ( ref $words eq 'HASH' or
						ref $words eq 'ARRAY' );

	my $n = $self->{neighbors}{$dnode};
	
	# Get the current word list
	my @terms = keys %{ $n };

	if ( ref $words eq 'ARRAY' ) {
		my %words;
		$words{$_}++ foreach @$words;
		$words = \%words;
	}

	local $self->{auto_reweight} = 0;

	my $must_reweight = 0;
	my %seen;

	foreach my $term ( keys %{$words} ) {

		my $t = _nodeify( 'T', $term );

		if ( exists $n->{$t} ){

			# Update the local count, if necessary
			my $curr_val = $n->{$t};
			my ( undef, $loc ) = split m/,/, $curr_val;

			unless ( $loc == $words->{$term} ) {
				$n->{$t} = join ',', 1, $words->{$term};
				$must_reweight++;
			}	
			}

		else {

			$n->{$t} = 
				$self->{neighbors}{$t}{$dnode} = 
				join ',', 1, $words->{$term};
			$must_reweight++;
		}

		$seen{$t}++;
	}

	# Check for deleted words
	foreach my $t ( @terms ) {
		$must_reweight++ 
			unless exists $seen{$t};
	}

	$self->reweight_graph() if 
		$must_reweight;

	return $must_reweight;

}


=item doc_count [TERM]

Returns a count of all documents that TERM occurs in.
If no argument is provided, returns a document count
for the entire collection.

=cut

sub doc_count {
	my ( $self, $term ) = @_;
	if ( defined $term ) {
		$term = _nodeify( 'T', $term ) unless $term =~ /^T:/;
		my $node = $self->{neighbors}{$term};
		return 0 unless defined $node;
		return scalar keys %{$node};
	} else {
		return scalar grep /D:/, 
			keys %{ $self->{'neighbors'} };
	}
}


=item doc_list [TERM]

Returns a sorted list of document identifiers that contain
TERM, in ASCII-betical order.  If no argument is given,
returns a sorted document list for the whole collection.

ContextGraph.pm  view on Meta::CPAN

=cut

sub dump_node {
	my ( $self, $node ) = @_;

	my @lines;
	push @lines, join "\t", "COUNT", "WEIGHT", "NEIGHBOR";

	foreach my $n ( keys %{ $self->{neighbors}{$node} } ) {
		my $v = $self->{neighbors}{$node}{$n};
		my ( $weight, $count ) = split /,/, $v;
		push @lines, join "\t", $count, substr( $weight, 0, 8 ), $n;
	}
	return @lines;
}



=item dump_tdm [FILE]

Dumps internal state in term-document matrix (TDM) format, which looks 
like this:

	A B C B C B C
	A B C B C B C
	A B C B C B C

Where each row represents a document, A is the number of terms in the
document, B is the term node and C is the edge weight between the doc
node and B.   Mostly used as a legacy format by the module author. 
Doc and term nodes are printed in ASCII-betical sorted order, zero-based 
indexing.  Up to you to keep track of the ID => title mappings, neener-neener!
Use doc_list and term_list to get an equivalently sorted list

=cut

sub dump_tdm {
	my ( $self, $file ) = @_;

	my $counter = 0;
	my %lookup;
	$lookup{$_} = $counter++ foreach $self->term_list;

	my @docs = $self->doc_list;

	my $fh;
	if ( defined $file ) {
		open $fh, "> $file" or croak
			"Could not open TDM output file: $!";
	} else {
		*fh = *STDOUT;
	}
	foreach my $doc ( @docs ) {
		my $n = $self->{neighbors}{$doc};

		my $row_count = scalar keys %{$n};
		print $fh $row_count;

		foreach my $t ( sort keys %{$doc} ) {
			my $index = $lookup{$t};
			my ( $weight, undef ) = split m/,/, $n->{$t};
			print $fh ' ', $index, ' ', $weight;
		}
		print $fh "\n";
	}
}



=item near_neighbors [NODE] 

Returns a list of neighbor nodes of the same type (doc/doc, or term/term) two
hops away.

=cut

sub near_neighbors {
	my ( $self, $name, $type ) = @_;
	
	my $node = _nodeify( $type, $name );
	
	my $n = $self->{neighbors}{$node};
	
	my %found;
	foreach my $next ( keys %{$n} ) {
		foreach my $mynext ( keys %{ $self->{neighbors}{$next} }){
			$found{$mynext}++;
		}
	}
	delete $found{$node};
	return keys %found;
}


=item term_count [DOC]

Returns the number of unique terms in a document or,
if no document is specified, in the entire collection.

=cut

sub term_count {
	my ( $self, $doc ) = @_;
	if ( defined $doc ) {
		my $node = $self->{neighbors}{ _nodeify( 'D', $doc) };
		return 0 unless defined $node;
		return scalar keys %{$node};
	} else {
		return scalar grep /T:/, 
		keys %{ $self->{neighbors} };
	}
}


=item term_list [DOC]

Returns a sorted list of unique terms appearing in the document
with identifier DOC, in ASCII-betical order.  If no argument is
given, returns a sorted term list for the whole collection.

=cut

ContextGraph.pm  view on Meta::CPAN


sub word_count {

	my ( $self, $term ) = @_;

	my $n = $self->{neighbors}; # shortcut

	my $count = 0;
	my @terms;
	if ( defined $term ) {
		push @terms, $term;
	}	else {
		@terms = $self->term_list();
	}

	foreach my $term (@terms ) {
		$term = _nodeify( 'T', $term) unless $term =~/^T:/o;
		foreach my $doc ( keys %{ $n->{$term} } ) {
			( undef, my $lcount ) = split /,/, $n->{$term}{$doc};
			$count += $lcount;
		}
	}

	return $count;
}





=item search @QUERY

Searches the graph for all of the words in @QUERY.  Use find_similar if you
want to do a document similarity instead, or mixed_search if you want
to search on any combination of words and documents.  Returns a pair of hashrefs:
the first a reference to a hash of docs and relevance values, the second to 
a hash of words and relevance values.

=cut

sub search {
	my ( $self, @query ) = @_;	
	my @nodes = _nodeify( 'T', @query );
	my $results = $self->raw_search( @nodes );	
	my ($docs, $words) = _partition( $results );
	return ( $docs, $words);
}



=item simple_search QUERY

This is the DWIM method - takes a query string as its argument, and returns an array
of documents, sorted by relevance.

=cut

sub simple_search {
	my ( $self, $query ) = @_;
	my @words = map { s/\W+//g; lc($_) }
				split m/\s+/, $query;	
	my @nodes = _nodeify( 'T', @words );
	my $results = $self->raw_search( @nodes );
	my ($docs, $words) = _partition( $results );
	my @sorted_docs = sort { $docs->{$b} <=> $docs->{$a} } keys %{$docs};
	return @sorted_docs;
}

=item find_by_title @TITLES

Given a list of patterns, searches for documents with matching titles

=cut

sub find_by_title {	
	my ( $self, @titles ) = @_;
	my @found;
	my @docs = $self->doc_list();
	my $pattern = join '|', @titles;
	my $match_me = qr/$pattern/i;
	#warn $match_me, "\n";
	foreach my $d ( @docs ) {
	#	warn $d, "\n";
		push @found, $d if $d =~ $match_me;
	}
	return @found;
}


=item find_similar @DOCS

Given an array of document identifiers, performs a similarity search 
and  returns a pair of hashrefs. First hashref is to a hash of docs and relevance
 values, second is to a hash of words and relevance values.

=cut

sub find_similar {
	my ( $self, @docs ) = @_;
	my @nodes = _nodeify( 'D', @docs );
	my $results = $self->raw_search( @nodes );
	my ($docs, $words) = _partition( $results );
	return ( $docs, $words);
}


=item merge TYPE, GOOD, @BAD

Combine all the nodes in @BAD into the node with identifier GOOD.
First argument must be one of 'T' or 'D' to indicate term or
document nodes.    Used to combine synonyms in the graph.

=cut

sub merge {
	my ( $self, $type, $good, @bad ) = @_;
	croak "must provide a type argument to merge"
		unless defined $type;
	croak "Invalid type argument $type to merge [must be one of (D,T)]" 
		unless $type =~ /^[DT]/io;
	
	my $target  = _nodeify( $type, $good );
	my @sources = _nodeify( $type, @bad );
	
	my $tnode = $self->{neighbors}{$target};
	

	foreach my $bad_node ( @sources ) {
		#print "Examining $bad_node\n";
		next if $bad_node eq $target;
		my %neighbors = %{$self->{neighbors}{$bad_node}};
		
		foreach my $n ( keys %neighbors ) {
			
			#print "\t $target ($bad_node) neighbor $n\n";
			if ( exists  $self->{neighbors}{$target}{$n} ) {
				#print "\t\t$n has link to $bad_node\n";
				# combine the local counts for the term members of the edge
				my $curr_val = $tnode->{$n};
				my $aug_val  = $self->{neighbors}{$bad_node}{$n};
				my ($w1, $c1) = split m/,/, $curr_val;
				my ($w2, $c2) = split m/,/, $aug_val;
				my $new_count = $c1 + $c2;
				$curr_val =~ s/,\d+$/,$new_count/;
				$tnode->{$n} = $curr_val;
				
				
			} else {
				
				die "sanity check failed for existence test"
					if exists $self->{neighbors}{$target}{$n};
				
				my $val = $self->{neighbors}{$bad_node}{$n};
				
				#print "\tno existing link -- reassigning $target -- $n\n";
				# reassign the current value of this edge
			    
				$self->{neighbors}{$n}{$target} = $val;
				$self->{neighbors}{$target}{$n} = $val;
			}
			
			delete $self->{neighbors}{$bad_node}{$n};
			delete $self->{neighbors}{$n}{$bad_node};
		}
		delete $self->{neighbors}{$bad_node};
	}
}

=item mixed_search @DOCS

Given a hashref in the form:
    { docs => [ 'Title 1', 'Title 2' ],
      terms => ['buffalo', 'fox' ], }
     }
Runs a combined search on the terms and documents provided, and
returns a pair of hashrefs.  The first hashref is to a hash of docs
and relevance values, second is to a hash of words and relevance values.

=cut

sub mixed_search {
	my ( $self, $incoming ) = @_;

	croak "must provide hash ref to mixed_search method"
		unless defined $incoming &&
		ref( $incoming ) &&
		ref( $incoming ) eq 'HASH';

	my $tref = $incoming->{'terms'} || [];
	my $dref = $incoming->{'docs'}  || [];

	my @dnodes = _nodeify( 'D', @{$dref} );
	my @tnodes = _nodeify( 'T', @{$tref} );

	my $results = $self->raw_search( @dnodes, @tnodes );
	my ($docs, $words) = _partition( $results );
	return ( $docs, $words);
}


=item store FILENAME



( run in 0.680 second using v1.01-cache-2.11-cpan-5511b514fd6 )