DBIx-TextIndex

 view release on metacpan or  search on metacpan

TextIndex.xs  view on Meta::CPAN

    {
        value &= 0x7f;
        do
        {
	        temp = *(tp + tp_pos); tp_pos++;
	        value = (value << 7) + (temp & 0x7f);
	} while (temp & 0x80);
    }
    *cur_tp_delta = value;
    return tp_pos;
}


/*
unpack_vint_delta(wordptr addr, charptr buffer, N_int length)
{
    ErrCode error = ErrCode_Ok;
    N_word  bits = bits_(addr);
    N_word  offset;
    N_word  index;
    N_word  last_index = 0;
    N_word  temp;

    if (bits > 0)
    {
        BitVector_Empty(addr);
	while ((not error) && (length > 0)) {
	    offset = (N_word) *buffer++; length--;
	    if (offset AND 0x0080)
            {
	        offset &= 0x007F;
		do
		{
		    temp = (N_word) *buffer++; length--;
		    offset = (offset << 7) + (temp & 0x007F);
		} while (temp AND 0x0080);
	    }
	    index = last_index + offset;
	    if (index >= bits) error = ErrCode_Indx;
	    BIT_VECTOR_SET_BIT(addr,index);
	    last_index = index;
	}
    }
    return(error);
}
*/

MODULE = DBIx::TextIndex		PACKAGE = DBIx::TextIndex

PROTOTYPES: DISABLE

BOOT:
{
    /*
     * XS BOOT section: this code is added to the module's bootstrap
     * function and therefore runs once, when the extension is loaded.
     *
     * NOTE(review): bitvec_boot() is defined outside this section;
     * presumably it initialises the bit-vector support used by the
     * Bit::Vector-based routines in this file -- confirm.
     *
     * FIXME: error check -- whatever bitvec_boot() reports is not
     * inspected here, so a failed initialisation would go unnoticed.
     */
    bitvec_boot();
}

void
term_docs_hashref(packed)
  SV *packed
PPCODE:
{
    /*
     * Decode a packed term-docs string and return a reference to a hash
     * mapping document id => term frequency.
     *
     * Stream format (as read by the loop below): a sequence of
     * variable-length integers, 7 payload bits per byte, most significant
     * group first; every byte except the last of an integer has its high
     * bit set.  Each document entry is a doc-id delta shifted left by one;
     * a low bit of 1 means "frequency is 1", a low bit of 0 means the
     * frequency follows as the next integer.
     *
     * Raises (via TEXTINDEX_ERROR) "unterminated compressed integer" when
     * the final byte still has its continuation bit set.
     */
    HV *freqs;
    SV *key;
    char *string;
    STRLEN len;
    int length;
    unsigned int value;
    int freq_is_next = 0;
    unsigned int doc = 0;
    char temp;

    string = SvPV(packed, len);
    length = len;

    /*
     * The last byte cannot have its high bit set.  Check the real final
     * byte (index length - 1), not the byte one past the data, so that the
     * inner decode loop is guaranteed to stop inside the buffer.  Validate
     * before allocating so an error does not leak the hash.
     */
    if (length > 0 && (string[length - 1] & 0x80))
        TEXTINDEX_ERROR("unterminated compressed integer");

    freqs = newHV();
    /* hv_store_ent() does not take ownership of the key SV, so one mortal
       scratch key is reused for every entry instead of leaking a new SV
       per document. */
    key = sv_2mortal(newSViv(0));

    while (length > 0) {
        value = (unsigned char) *string++; length--;
        if (value & 0x80) {
            /* multi-byte integer: accumulate 7 bits per byte */
            value &= 0x7f;
            do {
                temp = *string++; length--;
                value = (value << 7) + (temp & 0x7f);
            } while (temp & 0x80);
        }

        if (freq_is_next) {
            /* this integer is the explicit frequency of the current doc */
            sv_setiv(key, doc);
            hv_store_ent(freqs, key, newSViv(value), 0);
            freq_is_next = 0;
            continue;
        }

        /* doc ids are delta-encoded; the low bit is the "freq == 1" flag */
        doc += value >> 1;
        if (value & 1) {
            sv_setiv(key, doc);
            hv_store_ent(freqs, key, newSViv(1), 0);
        } else {
            freq_is_next = 1;
        }
    }
    XPUSHs(sv_2mortal(newRV_noinc((SV *)freqs)));
}


void
term_docs_arrayref(packed)
  SV *packed
PPCODE:
{
    /*
     * Decode a packed term-docs string and return a reference to a flat
     * array of (doc_id, frequency, doc_id, frequency, ...).
     *
     * The input is a sequence of variable-length integers (7 payload bits
     * per byte, high bit = "more bytes follow").  Each document entry is a
     * doc-id delta shifted left by one; low bit 1 means the frequency is 1,
     * low bit 0 means the frequency follows as the next integer.
     *
     * Raises (via TEXTINDEX_ERROR) "unterminated compressed integer" when
     * the final byte still has its continuation bit set.
     */
    AV *results;
    char *string;
    STRLEN len;
    int length;
    unsigned int value;
    int freq_is_next = 0;
    unsigned int doc = 0;
    char temp;

    string = SvPV(packed, len);
    length = len;

    /*
     * The last byte cannot have its high bit set.  Inspect the actual last
     * byte (index length - 1) rather than the byte after the data, so the
     * inner loop cannot run past the end of a truncated string.  Done
     * before newAV() so an error does not leak the array.
     */
    if (length > 0 && (string[length - 1] & 0x80))
        TEXTINDEX_ERROR("unterminated compressed integer");

    results = newAV();

    while (length > 0) {
        value = (unsigned char) *string++; length--;
        if (value & 0x80) {
            /* multi-byte integer: accumulate 7 bits per byte */
            value &= 0x7f;
            do {
                temp = *string++; length--;
                value = (value << 7) + (temp & 0x7f);
            } while (temp & 0x80);
        }

        if (freq_is_next) {
            /* explicit frequency for the doc id pushed on the previous pass */
            av_push(results, newSViv(value));
            freq_is_next = 0;
            continue;
        }

        /* delta-decode the doc id; low bit is the "freq == 1" flag */
        doc += value >> 1;
        av_push(results, newSViv(doc));
        if (value & 1) {
            av_push(results, newSViv(1));
        } else {
            freq_is_next = 1;
        }
    }
    XPUSHs(sv_2mortal(newRV_noinc((SV *)results)));
}

void
term_doc_ids_arrayref(packed)
  SV *packed
PPCODE:
{
    /*
     * Decode a packed term-docs string and return a reference to an array
     * holding only the document ids; frequencies are decoded and skipped.
     *
     * The input is a sequence of variable-length integers (7 payload bits
     * per byte, high bit = "more bytes follow").  Each document entry is a
     * doc-id delta shifted left by one; when its low bit is 0 the next
     * integer is that document's frequency.
     *
     * Raises (via TEXTINDEX_ERROR) "unterminated compressed integer" when
     * the final byte still has its continuation bit set.
     */
    AV *results;
    char *string;
    STRLEN len;
    int length;
    unsigned int value;
    int freq_is_next = 0;
    unsigned int doc = 0;
    char temp;

    string = SvPV(packed, len);
    length = len;

    /*
     * The last byte cannot have its high bit set.  The byte to test is the
     * final data byte (index length - 1); testing index `length` looks past
     * the data and lets truncated input overrun the buffer below.  Done
     * before newAV() so an error does not leak the array.
     */
    if (length > 0 && (string[length - 1] & 0x80))
        TEXTINDEX_ERROR("unterminated compressed integer");

    results = newAV();

    while (length > 0) {
        value = (unsigned char) *string++; length--;
        if (value & 0x80) {
            /* multi-byte integer: accumulate 7 bits per byte */
            value &= 0x7f;
            do {
                temp = *string++; length--;
                value = (value << 7) + (temp & 0x7f);
            } while (temp & 0x80);
        }

        if (freq_is_next) {
            /* frequency value: not wanted here, just consume it */
            freq_is_next = 0;
            continue;
        }

        /* delta-decode the doc id */
        doc += value >> 1;
        av_push(results, newSViv(doc));

        /* low bit clear => an explicit frequency follows */
        if (!(value & 1)) {
            freq_is_next = 1;
        }
    }
    XPUSHs(sv_2mortal(newRV_noinc((SV *)results)));
}


void
term_docs_array(packed)
  SV *packed
PPCODE:
{
    /*
     * Decode a packed term-docs string and return it as a flat list
     * (doc_id, frequency, doc_id, frequency, ...) pushed directly onto the
     * Perl stack.
     *
     * The input is a sequence of variable-length integers (7 payload bits
     * per byte, high bit = "more bytes follow").  Each document entry is a
     * doc-id delta shifted left by one; low bit 1 means the frequency is 1,
     * low bit 0 means the frequency follows as the next integer.
     *
     * Raises (via TEXTINDEX_ERROR) "unterminated compressed integer" when
     * the final byte still has its continuation bit set.
     */
    char *string;
    STRLEN len;
    int length;
    unsigned int value;
    int freq_is_next = 0;
    unsigned int doc = 0;
    char temp;

    string = SvPV(packed, len);
    length = len;

    /*
     * The last byte cannot have its high bit set.  Test the real last byte
     * (index length - 1); the byte at index `length` is past the data, so
     * checking it never catches truncated input and the inner loop could
     * then read beyond the buffer.
     */
    if (length > 0 && (string[length - 1] & 0x80))
        TEXTINDEX_ERROR("unterminated compressed integer");

    while (length > 0) {
        value = (unsigned char) *string++; length--;
        if (value & 0x80) {
            /* multi-byte integer: accumulate 7 bits per byte */
            value &= 0x7f;
            do {
                temp = *string++; length--;
                value = (value << 7) + (temp & 0x7f);
            } while (temp & 0x80);
        }

        if (freq_is_next) {
            /* explicit frequency for the doc id pushed on the previous pass */
            XPUSHs(sv_2mortal(newSViv(value)));
            freq_is_next = 0;
            continue;
        }

        /* delta-decode the doc id; low bit is the "freq == 1" flag */
        doc += value >> 1;
        XPUSHs(sv_2mortal(newSViv(doc)));
        if (value & 1) {
            XPUSHs(sv_2mortal(newSViv(1)));
        } else {
            freq_is_next = 1;
        }
    }
}


void
term_docs_and_freqs(packed)
  SV *packed
PPCODE:
{
    /*
     * Decode a packed term-docs string and return two array references:
     * the first holds the document ids, the second the matching term
     * frequencies (same order).
     *
     * The input is a sequence of variable-length integers (7 payload bits
     * per byte, high bit = "more bytes follow").  Each document entry is a
     * doc-id delta shifted left by one; low bit 1 means the frequency is 1,
     * low bit 0 means the frequency follows as the next integer.
     *
     * Raises (via TEXTINDEX_ERROR) "unterminated compressed integer" when
     * the final byte still has its continuation bit set.
     */
    AV *docs;
    AV *freqs;
    char *string;
    STRLEN len;
    int length;
    unsigned int value;
    int freq_is_next = 0;
    unsigned int doc = 0;
    char temp;

    string = SvPV(packed, len);
    length = len;
    /* both arrays are mortal, so they are released if an error is raised */
    docs = (AV *)sv_2mortal((SV *)newAV());
    freqs = (AV *)sv_2mortal((SV *)newAV());

    /*
     * The last byte cannot have its high bit set.  Test the real last byte
     * (index length - 1), not the byte after the data, so truncated input
     * is rejected before the decode loop can read past the buffer.
     */
    if (length > 0 && (string[length - 1] & 0x80))
        TEXTINDEX_ERROR("unterminated compressed integer");

    while (length > 0) {
        value = (unsigned char) *string++; length--;
        if (value & 0x80) {
            /* multi-byte integer: accumulate 7 bits per byte */
            value &= 0x7f;
            do {
                temp = *string++; length--;
                value = (value << 7) + (temp & 0x7f);
            } while (temp & 0x80);
        }

        if (freq_is_next) {
            /* explicit frequency for the doc id pushed on the previous pass */
            av_push(freqs, newSViv(value));
            freq_is_next = 0;
            continue;
        }

        /* delta-decode the doc id; low bit is the "freq == 1" flag */
        doc += value >> 1;
        av_push(docs, newSViv(doc));
        if (value & 1) {
            av_push(freqs, newSViv(1));
        } else {
            freq_is_next = 1;
        }
    }

    /*
     * The references themselves must be mortal: values left on the return
     * stack are not owned by it, so a non-mortal RV (and the array it
     * keeps alive) would never be freed.  newRV_inc() gives the RV its own
     * reference to the array, which survives the array's mortal cleanup.
     */
    XPUSHs(sv_2mortal(newRV_inc((SV *)docs)));
    XPUSHs(sv_2mortal(newRV_inc((SV *)freqs)));
}


void
pack_vint(ints_arrayref)
  SV *ints_arrayref
PPCODE:
{
    /*
     * Pack an array of non-negative integers into a string of
     * variable-length integers: 7 payload bits per byte, most significant
     * group first, high bit set on every byte except the last of each
     * integer.
     *
     * Argument: reference to an array of integers.
     * Returns:  the packed string, or undef for an empty array.
     * Raises (via TEXTINDEX_ERROR) "args must be arrayref" otherwise.
     */
    char *packed;
    AV *term_freqs;
    SV **elem;
    I32 length = 0;
    I32 i;
    unsigned int j, n, value;
    /* 7-bit groups of one integer, least significant first; sized for the
       worst case (5 groups for a 32-bit unsigned int) */
    unsigned char group[(sizeof(unsigned int) * 8 + 6) / 7];

    if (! TEXTINDEX_DEREF_AV(ints_arrayref, term_freqs )) {
        TEXTINDEX_ERROR("args must be arrayref");
    }
    length = av_len(term_freqs);
    if (length < 0)
        XSRETURN_UNDEF;

    /* Worst case is sizeof(group) bytes per integer; the former 4 bytes
       per integer was too small for values >= 2^28. */
    New(1, packed, (sizeof(group) * ((STRLEN)length + 1)), char);
    j = 0;
    for (i = 0 ; i <= length ; i++) {
        /* av_fetch() returns NULL for a hole in the array: treat as 0 */
        elem = av_fetch(term_freqs, i, 0);
        value = elem ? SvIV(*elem) : 0;

        /* split into 7-bit groups; only the lowest group (emitted last)
           has its continuation bit clear */
        n = 0;
        group[n++] = value & 0x7f;
        while ((value >>= 7)) {
            group[n++] = (value & 0x7f) | 0x80;
        }
        /* emit most significant group first */
        while (n > 0) {
            *(packed + j) = group[--n];
            j++;
        }
    }
    XPUSHs(sv_2mortal(newSVpv(packed, j)));
    Safefree(packed);
}


void
pack_vint_delta(ints_arrayref)
  SV *ints_arrayref
PPCODE:
{
    /*
     * Pack an array of integers as delta-encoded variable-length integers:
     * each element is stored as its difference from the previous element
     * (the first from 0), 7 payload bits per byte, most significant group
     * first, high bit set on every byte except the last of each integer.
     *
     * Argument: reference to an array of integers.  The difference is
     * computed in unsigned arithmetic, so a descending step wraps around;
     * NOTE(review): callers presumably pass ascending values -- confirm.
     * Returns:  the packed string, or undef for an empty array.
     * Raises (via TEXTINDEX_ERROR) "args must be arrayref" otherwise.
     */
    char *packed;
    AV *ints_array;
    SV **elem;
    I32 length = 0;
    I32 i;
    unsigned int j, n, value, last_value, delta_value;
    /* 7-bit groups of one integer, least significant first; sized for the
       worst case (5 groups for a 32-bit unsigned int) */
    unsigned char group[(sizeof(unsigned int) * 8 + 6) / 7];

    if (! TEXTINDEX_DEREF_AV(ints_arrayref, ints_array )) {
        TEXTINDEX_ERROR("args must be arrayref");
    }
    length = av_len(ints_array);
    if (length < 0)
        XSRETURN_UNDEF;

    /* Worst case is sizeof(group) bytes per integer; the former 4 bytes
       per integer was too small for deltas >= 2^28. */
    New(1, packed, (sizeof(group) * ((STRLEN)length + 1)), char);
    j = 0;
    last_value = 0;
    for (i = 0 ; i <= length ; i++) {
        /* av_fetch() returns NULL for a hole in the array: treat as 0 */
        elem = av_fetch(ints_array, i, 0);
        value = elem ? SvIV(*elem) : 0;
        delta_value = value - last_value;
        last_value = value;

        /* split into 7-bit groups; only the lowest group (emitted last)
           has its continuation bit clear */
        n = 0;
        group[n++] = delta_value & 0x7f;
        while ((delta_value >>= 7)) {
            group[n++] = (delta_value & 0x7f) | 0x80;
        }
        /* emit most significant group first */
        while (n > 0) {
            *(packed + j) = group[--n];
            j++;
        }
    }
    XPUSHs(sv_2mortal(newSVpv(packed, j)));
    Safefree(packed);
}

void
pack_term_docs(term_docs_arrayref)
  SV *term_docs_arrayref
PPCODE:
{
    /*
     * Pack a flat array of (doc_id, frequency) pairs into the term-docs
     * string format read by the term_docs_* decoders in this file.
     *
     * For each pair the doc id is stored as its delta from the previous
     * doc id, shifted left by one; the low bit is set when the frequency
     * is exactly 1, otherwise it is clear and the frequency follows as a
     * separate integer.  Integers use 7 payload bits per byte, most
     * significant group first, high bit set on all but the last byte.
     *
     * Argument: reference to an array with an even number of elements.
     * Returns:  the packed string, or undef when the array has fewer than
     *           two elements.
     * Raises (via TEXTINDEX_ERROR) "args must be arrayref" or
     *           "array must contain even number of elements".
     */
    char *packed;
    AV *pairs;
    SV **elem;
    I32 length = 0;
    unsigned int i, j, n, last_doc, value, ufreq;
    /* 7-bit groups of one integer, least significant first; sized for the
       worst case (5 groups for a 32-bit unsigned int) */
    unsigned char group[(sizeof(unsigned int) * 8 + 6) / 7];

    if (( !SvROK(term_docs_arrayref)
           || (SvTYPE(SvRV(term_docs_arrayref)) != SVt_PVAV) ))
    {
        TEXTINDEX_ERROR("args must be arrayref");
    }
    pairs = (AV *)SvRV(term_docs_arrayref);
    length = av_len(pairs);
    if (length < 1)
        XSRETURN_UNDEF;
    if ((length + 1) % 2 != 0)
        TEXTINDEX_ERROR("array must contain even number of elements");

    /* Every element (doc delta or frequency) may need sizeof(group) bytes;
       the former 4 bytes per element could be overrun by large values. */
    New(1, packed, (sizeof(group) * ((STRLEN)length + 1)), char);
    if (packed == NULL)
        TEXTINDEX_ERROR("unable to allocate memory");
    j = 0;
    last_doc = 0;
    for (i = 0 ; i <= (unsigned int)length ; i += 2) {
        int doc;
        int freq;

        /* av_fetch() returns NULL for a hole in the array: treat as 0 */
        elem = av_fetch(pairs, i, 0);
        doc  = elem ? SvIV(*elem) : 0;
        elem = av_fetch(pairs, i + 1, 0);
        freq = elem ? SvIV(*elem) : 0;

        /* doc-id delta in the upper bits, "freq == 1" flag in bit 0 */
        value = (doc - last_doc) << 1;
        if (freq == 1)
            value += 1;

        n = 0;
        group[n++] = value & 0x7f;
        while ((value >>= 7)) {
            group[n++] = (value & 0x7f) | 0x80;
        }
        while (n > 0) {
            *(packed + j) = group[--n];
            j++;
        }

        /*
         * Whenever the flag bit is clear the decoders expect a frequency
         * integer next, so one must be written for every freq != 1 (not
         * only freq > 1) or the stream goes out of step.  The frequency is
         * encoded as unsigned so the shift loop always terminates.
         */
        if (freq != 1) {
            ufreq = (unsigned int) freq;
            n = 0;
            group[n++] = ufreq & 0x7f;
            while ((ufreq >>= 7)) {
                group[n++] = (ufreq & 0x7f) | 0x80;
            }
            while (n > 0) {
                *(packed + j) = group[--n];
                j++;
            }
        }
        last_doc = doc;
    }
    XPUSHs(sv_2mortal(newSVpv((char *)packed, j)));
    Safefree(packed);
}

void
pack_term_docs_append_vint(packed, vint)
  SV *packed
  SV *vint
PPCODE:
{
    char *str_a, *str_b, *newpack;
    STRLEN len_a, len_b;
    I32 length_a = 0;
    I32 length_b = 0;
    int length = 0;
    int freq_is_next = 0;
    unsigned int value, val, i, j, freq;
    unsigned int doc = 0;
    unsigned int max_doc = 0;
    unsigned int last_doc = 0;
    register unsigned long buff;
    char temp;

    str_a = SvPV(packed, len_a);
    length_a = len_a;

    str_b = SvPV(vint, len_b);
    length_b = len_b;

    if (length_b < 1) {
        XPUSHs(sv_2mortal(newSVpv((char *)str_a, length_a)));
        return;
    }	

    New(2, newpack, ( length_a + (4 * (length_b + 1)) ), char);
    if (newpack == NULL)
        TEXTINDEX_ERROR("unable to allocate memory");

    Copy(str_a, newpack, length_a, char);

    /* Step 1: get max_doc (highest doc id) from 1st arg (packed) */

    length = length_a;
    /* last byte cannot have high bit set */
    if (*(str_a + length) & 0x80)
        TEXTINDEX_ERROR("unterminated compressed integer");
    while (length > 0) {
	value = *str_a++; length--;
	if (value & 0x80)
	{
	    value &= 0x7f;
	    do
	    {
		temp = *str_a++; length--;
		value = (value << 7) + (temp & 0x7f);
	    } while (temp & 0x80);
	}
	if ( freq_is_next ) {
            freq_is_next = 0;
	    continue;
        } 

	doc += value >> 1;
            max_doc = doc;

	if (! (value & 1)) {
	    freq_is_next = 1;
	}
    }

TextIndex.xs  view on Meta::CPAN

	        if (length < 0)
	            TEXTINDEX_ERROR("unterminated compressed integer"); 
		value = (value << 7) + (temp & 0x7f);
	    } while (temp & 0x80);
	}
 	if (i % 2 == 0) {
            doc = value;
        } else {
            freq = value;

	    val = (doc - last_doc) << 1;
            if (freq == 1)
                val += 1;

            buff = val & 0x7f;
            while ((val >>= 7)) {
	        buff <<= 8;
                buff |= ((val & 0x7f) | 0x80);
            }

            while (1) {
                *(newpack + j) = buff;
                j++;
                if (buff & 0x80)
                    buff >>= 8;
                else
                    break;
            }
            if (freq > 1) {
                buff = freq & 0x7f;
                while ((freq >>= 7)) {
	            buff <<= 8;
                    buff |= ((freq & 0x7f) | 0x80);
                }
                while (1) {
                    *(newpack + j) = buff;
                    j++;
                    if (buff & 0x80)
                        buff >>= 8;
                    else
                        break;
                }
            }
            last_doc = doc;
        }
        i++;
    }
    XPUSHs(sv_2mortal(newSVpv((char *)newpack, j)));
    Safefree(newpack);
}

void
pos_search(and_vec_ref, term_docs_arrayref, term_pos_arrayref, prox_SV, \
	   and_vec_min_SV, and_vec_max_SV)
  SV *and_vec_ref
  SV *term_docs_arrayref
  SV *term_pos_arrayref
  SV *prox_SV
  SV *and_vec_min_SV
  SV *and_vec_max_SV
PPCODE:
{
    I32 *length_td,
        *length_tp;
    unsigned int term_count,
                 prox        = SvIV(prox_SV),
	         and_vec_min = SvIV(and_vec_min_SV),
		 and_vec_max = SvIV(and_vec_max_SV),
		 doc,
		 doc_n,
		 *last_doc,
		 freq,
                 freq_n,
		 *freqs,
		 *td_pos,
		 **positions,
		 *tp_idx,
		 *tp_pos,
                 cur_tp_delta,
		 cur_tp_delta_n,
                 *cur_tp_idx,
		 seq_count,
		 last_pos,
		 next_pos,
		 a,
                 i,
		 j,
		 k;
    unsigned int *and_vec;
    SV *and_vec_obj;
    AV *term_docs;
    AV *term_pos;
    AV *results;
    char **tp;
    char **td;
    STRLEN len;

    if (! TEXTINDEX_DEREF_BITVEC(and_vec_ref, and_vec_obj, and_vec)) {
        TEXTINDEX_ERROR("arg1 must be Bit::Vector object");
    }
    if (! TEXTINDEX_DEREF_AV(term_docs_arrayref, term_docs)) {
        TEXTINDEX_ERROR("arg2 must be arrayref");
    }
    if (! TEXTINDEX_DEREF_AV(term_pos_arrayref, term_pos)) {
        TEXTINDEX_ERROR("arg3 must be arrayref");
    }

    results = newAV();

    if (prox < 1) prox = 1;

    term_count = av_len(term_docs) + 1;

    if (term_count <= 0)
        XSRETURN_UNDEF;

    /* Allocate memory for arrays */
    New(1, td, term_count, char *);
    New(2, length_td, term_count, I32);
    New(3, tp, term_count, char *);
    New(4, length_tp, term_count, I32);

TextIndex.xs  view on Meta::CPAN

	    }
	    freqs[j] = freq_n;
	    for (a = 1; a < freq_n; a++) {
		positions[j][a] = positions[j][a] + positions[j][a-1];
	    }
	}
	/* Loop through the accumulated position arrays */
	for (a = 0; a < freq; a++) {
	    seq_count = 1;
	    last_pos = positions[0][a];
	    for (j = 1; j <= term_count - 1; j++) {
		for (k = 0; k < freqs[j]; k++) {
		    next_pos = positions[j][k];
		    if (next_pos > last_pos && next_pos <= last_pos + prox) {
			seq_count++;
			last_pos = next_pos;
		    } /* FIXME: we can break out early by testing for skipped positions */
		}
	    }
	    if (seq_count == term_count) {
		av_push(results, newSViv(doc));
		break;
	    }
	}
    }
    Safefree(td);
    Safefree(length_td);
    Safefree(tp);
    Safefree(length_tp);
    Safefree(td_pos);
    Safefree(last_doc);
    Safefree(tp_idx);
    Safefree(cur_tp_idx);
    Safefree(tp_pos);
    Safefree(freqs);
    for (j = 0; j <= term_count - 1; j++) {
	Safefree(positions[j]);
    }
    Safefree(positions);
    XPUSHs(sv_2mortal(newRV_noinc((SV *)results)));
}


void
score_term_docs_okapi(term_docs, score_hashref, bitvec_ref, acc_lim_SV, \
                      res_min_SV, res_max_SV, idf_SV, f_t_SV, W_D_arrayref, \
                      avg_W_d_SV, w_qt_SV, k1_SV, b_SV)
  SV *term_docs
  SV *score_hashref
  SV *bitvec_ref
  SV *acc_lim_SV
  SV *res_min_SV
  SV *res_max_SV
  SV *f_t_SV
  SV *idf_SV
  SV *W_D_arrayref
  SV *avg_W_d_SV
  SV *w_qt_SV
  SV *k1_SV
  SV *b_SV
PPCODE:
{
    int          acc_size,
                 length;
    unsigned int acc_lim    =  SvIV(acc_lim_SV),
                 f_t        =  SvIV(f_t_SV),
                 res_min    =  SvIV(res_min_SV),
                 res_max    =  SvIV(res_max_SV),
                 doc,
                 last_doc,
                 f_dt,
                 old_score,
                 i,
                 pos,
                 *bitvec;

    double       idf        =  SvNV(idf_SV),
                 avg_W_d    =  SvNV(avg_W_d_SV),
                 w_qt       =  SvNV(w_qt_SV),
                 k1         =  SvNV(k1_SV),
                 b          =  SvNV(b_SV),
                 W_d,
                 TF,
                 doc_score;
    char *string;
    SV *bitvec_obj;
    SV *doc_id;
    AV *W_D;
    HV *score;
    HE *score_he;
    STRLEN len;

    string  = SvPV(term_docs, len);
    length  = len;

    if (! TEXTINDEX_DEREF_AV(W_D_arrayref, W_D)) {
        TEXTINDEX_ERROR("arg9 must be arrayref");
    }
    if (! TEXTINDEX_DEREF_HV(score_hashref, score)) {
        TEXTINDEX_ERROR("arg2 must be arrayref");
    }
    if (! TEXTINDEX_DEREF_BITVEC(bitvec_ref, bitvec_obj, bitvec)) {
        TEXTINDEX_ERROR("arg3 must be Bit::Vector object");
    }
    if (av_len(W_D) + 1 < res_max + 1) {
        TEXTINDEX_ERROR("bad W_D data was passed or res_max less than zero");
    }
    pos = 0;
    last_doc = 0;
    acc_size = 0;
    for (i = 0; (i < f_t) && (acc_size < acc_lim); i++) {
	pos = get_doc_freq_pair(string, pos, last_doc, &doc, &f_dt);
	last_doc = doc;
	if (doc > res_max) break;
        if (doc < res_min) continue;
        if ( ! bitvec_test_bit(bitvec, doc) ) continue;
        W_d = SvNV(*av_fetch(W_D, doc, 0));
        TF = (((k1 + 1) * f_dt) / (k1 * ((1 - b)+((b * W_d)/avg_W_d)) + f_dt));
        doc_score = idf * TF * w_qt;
        doc_id = newSViv(doc);
        score_he = hv_fetch_ent(score, doc_id, TRUE, 0);



( run in 0.489 second using v1.01-cache-2.11-cpan-71847e10f99 )