KinoSearch1
view release on metacpan or search on metacpan
lib/KinoSearch1/Search/Similarity.pm view on Meta::CPAN
for my $term (@$terms) {
my $doc_freq = $searcher->doc_freq($term);
$idf += 1 + log( $max_doc / ( 1 + $searcher->doc_freq($term) ) );
}
return $idf;
}
# Normalize a Query's weight so that it is comparable to other Queries.
sub query_norm {
my ( $self, $sum_of_squared_weights ) = @_;
return 0 if ( $sum_of_squared_weights == 0 ); # guard against div by zero
return ( 1 / sqrt($sum_of_squared_weights) );
}
# KLUDGE -- see comment at STORABLE_thaw.
sub STORABLE_freeze {
my ( $self, $cloning ) = @_;
return if $cloning;
return "1";
}
package KinoSearch1::Search::TitleSimilarity;
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;
use base qw( KinoSearch1::Search::Similarity );
sub new {
my $self = shift->SUPER::new(@_);
$self->_use_title_tf;
return $self;
}
sub lengthnorm {
return 0 unless $_[1];
return 1 / sqrt( $_[1] );
}
1;
__END__
__XS__
MODULE = KinoSearch1 PACKAGE = KinoSearch1::Search::Similarity
=begin comment
KLUDGE!!
Rather than attempt to serialize a Similarity, we just create a new one.
=end comment
=cut
void
STORABLE_thaw(blank_obj, cloning, serialized)
SV *blank_obj;
SV *cloning;
SV *serialized;
PPCODE:
{
Similarity *sim = Kino1_Sim_new();
SV *deep_obj = SvRV(blank_obj);
sv_setiv(deep_obj, PTR2IV(sim));
}
void
new(either_sv)
SV *either_sv;
PREINIT:
const char *class;
Similarity *sim;
PPCODE:
/* determine the class */
class = sv_isobject(either_sv)
? sv_reftype(either_sv, 0)
: SvPV_nolen(either_sv);
/* build object */
sim = Kino1_Sim_new();
ST(0) = sv_newmortal();
sv_setref_pv(ST(0), class, (void*)sim);
XSRETURN(1);
=for comment
Provide a normalization factor for a field based on the square-root of the
number of terms in it.
=cut
float
lengthnorm(sim, num_terms)
Similarity *sim;
U32 num_terms;
CODE:
num_terms = num_terms < 100 ? 100 : num_terms;
RETVAL = (float)1 / sqrt(num_terms);
OUTPUT: RETVAL
=for comment
Return a score factor based on the frequency of a term in a given document.
The default implementation is sqrt(freq). Other implementations typically
produce ascending scores with ascending freqs, since the more times a doc
matches, the more relevant it is likely to be.
=cut
float
tf(sim, freq)
Similarity *sim;
U32 freq;
CODE:
RETVAL = sim->tf(sim, freq);
OUTPUT: RETVAL
=for comment
_float_to_byte and _byte_to_float encode and decode between 32-bit IEEE
floating point numbers and a 5-bit exponent, 3-bit mantissa float. The range
covered by the single-byte encoding is 7x10^9 to 2x10^-9. The accuracy is
about one significant decimal digit.
=cut
SV*
_float_to_byte(sim, f)
Similarity *sim;
float f;
PREINIT:
char b;
CODE:
b = Kino1_Sim_float2byte(sim, f);
RETVAL = newSVpv(&b, 1);
OUTPUT: RETVAL
float
_byte_to_float(sim, b)
Similarity *sim;
char b;
CODE:
RETVAL = Kino1_Sim_byte2float(sim, b);
OUTPUT: RETVAL
=for comment
The norm_decoder caches the 256 possible byte => float pairs, obviating the
need to call decode_norm over and over for a scoring implementation that
knows how to use it.
=cut
SV*
get_norm_decoder(sim)
Similarity *sim;
CODE:
RETVAL = newSVpv( (char*)sim->norm_decoder, (256 * sizeof(float)) );
OUTPUT: RETVAL
float
coord(sim, overlap, max_overlap)
Similarity *sim;
U32 overlap;
U32 max_overlap;
CODE:
RETVAL = sim->coord(sim, overlap, max_overlap);
OUTPUT: RETVAL
void
_use_title_tf(sim)
Similarity *sim;
PPCODE:
sim->tf = Kino1_Sim_title_tf;
void
DESTROY(sim)
Similarity *sim;
PPCODE:
Kino1_Sim_destroy(sim);
__H__
#ifndef H_KINO_SIMILARITY
#define H_KINO_SIMILARITY 1
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearch1UtilMemManager.h"
typedef struct similarity {
float (*tf)(struct similarity*, float);
float (*coord)(struct similarity*, U32, U32);
float *norm_decoder;
} Similarity;
Similarity* Kino1_Sim_new();
float Kino1_Sim_default_tf(Similarity*, float);
float Kino1_Sim_title_tf(Similarity*, float);
char Kino1_Sim_float2byte(Similarity*, float);
float Kino1_Sim_byte2float(Similarity*, char);
float Kino1_Sim_coord(Similarity*, U32, U32);
void Kino1_Sim_destroy(Similarity*);
#endif /* include guard */
__C__
#include "KinoSearch1SearchSimilarity.h"
Similarity*
Kino1_Sim_new() {
int i;
unsigned char aUChar;
Similarity *sim;
Kino1_New(0, sim, 1, Similarity);
/* cache decoded norms */
Kino1_New(0, sim->norm_decoder, 256, float);
for (i = 0; i < 256; i++) {
aUChar = i;
sim->norm_decoder[i] = Kino1_Sim_byte2float(sim, (char)aUChar);
}
sim->tf = Kino1_Sim_default_tf;
sim->coord = Kino1_Sim_coord;
return sim;
}
float
Kino1_Sim_default_tf(Similarity *sim, float freq) {
return( sqrt(freq) );
}
float
Kino1_Sim_title_tf(Similarity *sim, float freq) {
( run in 1.202 second using v1.01-cache-2.11-cpan-5511b514fd6 )