Algorithm-MinPerfHashTwoLevel
view release on metacpan or search on metacpan
MinPerfHashTwoLevel.xs view on Meta::CPAN
idx2_ptr++;
}
break;
}
set_xor_val_in_buckets(aTHX_ xor_val, buckets_av, idx1, idx2_start, is_used, keys_in_bucket_av);
}
return 0;
}
U32
place_singletons(pTHX_ U32 bucket_count, SV *idx1_packed_sv, AV *keybuckets_av, char *is_used, U32 *idx2_start, AV *buckets_av) {
STRLEN idx1_packed_sv_len;
U32 *idx1_start= (U32 *)SvPV(idx1_packed_sv,idx1_packed_sv_len);
U32 *idx1_ptr;
U32 *idx1_end= idx1_start + (idx1_packed_sv_len / sizeof(U32));
U32 singleton_pos= 0;
for (idx1_ptr= idx1_start; idx1_ptr < idx1_end; idx1_ptr++) {
U32 idx1= *idx1_ptr;
AV *keys_in_bucket_av;
U32 xor_val;
SV **got;
while (singleton_pos < bucket_count && is_used[singleton_pos]) {
singleton_pos++;
}
if (singleton_pos == bucket_count) {
warn("failed to place singleton! idx: %d",idx1);
return idx1 + 1;
}
xor_val= (U32)(-singleton_pos-1);
got= av_fetch(keybuckets_av, idx1, 0);
if (!got)
croak("panic: no keybuckets_av for idx %u",idx1);
keys_in_bucket_av= (AV *)SvRV(*got);
*idx2_start= singleton_pos;
set_xor_val_in_buckets(aTHX_ xor_val, buckets_av, idx1, idx2_start, is_used, keys_in_bucket_av);
}
return 0;
}
U32
solve_collisions_by_length(pTHX_ U32 bucket_count, U32 max_xor_val, AV *by_length_av, AV *h2_packed_av, AV *keybuckets_av, U32 variant, AV *buckets_av) {
U32 bad_idx= 0;
I32 singleton_pos= 0;
IV len_idx;
char *is_used;
U32 *idx2_start;
/* this is used to quickly tell if we have used a particular bucket yet */
Newxz(is_used,bucket_count,char);
SAVEFREEPV(is_used);
/* used to keep track the indexes that a set of keys map into
* stored in an SV just because - we actually treat it as an array of U32 */
Newxz(idx2_start, av_top_index(by_length_av)+1, U32);
SAVEFREEPV(idx2_start);
/* now loop through and process the keysets from most collisions to least */
for (len_idx= av_top_index(by_length_av); len_idx > 0 && !bad_idx; len_idx--) {
SV **idx1_packed_sv= av_fetch(by_length_av, len_idx, 0);
/* deal with the possibility that there are gaps in the length grouping,
* for instance we might have some 13 way collisions and some 11 way collisions
* without any 12-way collisions. (this should be rare - but is possible) */
if (!idx1_packed_sv || !SvPOK(*idx1_packed_sv))
continue;
if (len_idx == 1) {
bad_idx= place_singletons(aTHX_ bucket_count, *idx1_packed_sv, keybuckets_av,
is_used, idx2_start, buckets_av);
} else {
bad_idx= solve_collisions(aTHX_ bucket_count, max_xor_val, *idx1_packed_sv, h2_packed_av, keybuckets_av,
variant, is_used, idx2_start, buckets_av);
}
}
return bad_idx;
}
#define MY_CXT_KEY "Algorithm::MinPerfHashTwoLevel::_stash" XS_VERSION
#define SETOFS(i,he,table,key_ofs,key_len,str_buf_start,str_buf_pos,str_buf_end,str_ofs_hv) \
STMT_START { \
if (he) { \
SV *sv= HeVAL(he); \
if (SvOK(sv)) { \
STRLEN pv_len; \
char *pv; \
SV *ofs_sv; \
if (flags & MPH_F_NO_DEDUPE) { \
ofs_sv= NULL; \
} else { \
HE *ofs= hv_fetch_ent(str_ofs_hv,sv,1,0); \
ofs_sv= ofs ? HeVAL(ofs) : NULL; \
if (!ofs_sv) \
croak("panic: out of memory getting str ofs for " #he "for %u",i); \
} \
if (ofs_sv && SvOK(ofs_sv)){ \
table[i].key_ofs= SvUV(ofs_sv); \
table[i].key_len= sv_len(sv); \
} else { \
pv= SvPV(sv,pv_len); \
table[i].key_len= pv_len; \
if (pv_len) { \
table[i].key_ofs= str_buf_pos - str_buf_start; \
if (str_buf_pos + pv_len > str_buf_end) \
croak("panic: string buffer too small in SETOFS, something went horribly wrong."); \
Copy(pv,str_buf_pos,pv_len,char); \
str_buf_pos += pv_len; \
} else { \
table[i].key_ofs= 1; \
} \
if (ofs_sv) \
sv_setuv(ofs_sv,table[i].key_ofs); \
} \
} else { \
table[i].key_ofs= 0; \
table[i].key_len= 0; \
} \
} else { \
MinPerfHashTwoLevel.xs view on Meta::CPAN
if (he) {
variant= SvUV(HeVAL(he));
} else {
croak("panic: no variant in self?");
}
he= hv_fetch_ent_with_keysv(self_hv,MPH_KEYSV_COMPUTE_FLAGS,0);
if (he) {
compute_flags= SvUV(HeVAL(he));
} else {
croak("panic: no compute_flags in self?");
}
he= hv_fetch_ent_with_keysv(self_hv,MPH_KEYSV_STATE,0);
if (he) {
SV *state_sv= HeVAL(he);
state_pv= (U8 *)SvPV(state_sv,state_len);
if (state_len != MPH_STATE_BYTES) {
croak("Error: state vector must be at exactly %d bytes",(int)MPH_SEED_BYTES);
}
} else {
croak("panic: no state in self?");
}
he= hv_fetch_ent_with_keysv(self_hv,MPH_KEYSV_BUF_LENGTH,1);
if (he) {
buf_length_sv= HeVAL(he);
} else {
croak("panic: out of memory in lvalue fetch for 'buf_length' in self");
}
he= hv_fetch_ent_with_keysv(self_hv,MPH_KEYSV_SOURCE_HASH,0);
if (he) {
source_hv= (HV*)SvRV(HeVAL(he));
} else {
croak("panic: no source_hash in self");
}
he= hv_fetch_ent_with_keysv(self_hv,MPH_KEYSV_BUCKETS,1);
if (he) {
SV *rv= HeVAL(he);
if (SvROK(rv)) {
AV *old_buckets_av= (AV*)SvRV(rv);
SvREFCNT_dec(old_buckets_av);
} else {
sv_upgrade(rv, SVt_RV);
}
buckets_av= newAV();
SvRV_set(rv,(SV*)buckets_av);
SvROK_on(rv);
} else {
croak("panic: out of memory in lvalue fetch for 'buckets' in self");
}
/**** build an array of hashes in keys_av based on the normalized contents of source_hv */
keys_av= (AV *)sv_2mortal((SV*)newAV());
bucket_count= normalize_source_hash(aTHX_ source_hv, keys_av, compute_flags, buf_length_sv, state_pv);
max_xor_val= INT32_MAX;
/* if the caller wants deterministic results we sort the keys_av
* before we compute collisions - depending on the order we process
* the keys we might resolve the collisions differently */
if (compute_flags & MPH_F_DETERMINISTIC)
sortsv(AvARRAY(keys_av),bucket_count,_compare);
/**** find the collisions from the data we just computed, build an AoAoH and AoS of the
**** collision data */
keybuckets_av= (AV*)sv_2mortal((SV*)newAV()); /* AoAoH - hashes from keys_av */
h2_packed_av= (AV*)sv_2mortal((SV*)newAV()); /* AoS - packed h1 */
find_first_level_collisions(aTHX_ bucket_count, keys_av, keybuckets_av, h2_packed_av);
/* Sort the buckets by size by constructing an AoS, with the outer array indexed by length,
* and the inner string being the list of items of that length. (Thus the contents of index
* 0 is empty/undef).
* The end result is we can process the collisions from the most keys to a bucket to the
* least in O(N) and not O(N log2 N).
*
* the length of the array (av_top_index+1) reflect the number of items in the bucket
* with the most collisions - we use this later to size some of our data structures.
*/
by_length_av= idx_by_length(aTHX_ keybuckets_av);
RETVAL= solve_collisions_by_length(aTHX_ bucket_count, max_xor_val, by_length_av, h2_packed_av, keybuckets_av,
variant, buckets_av);
}
OUTPUT:
RETVAL
MODULE = Algorithm::MinPerfHashTwoLevel PACKAGE = Tie::Hash::MinPerfHashTwoLevel::OnDisk
SV *
packed_xs(variant,buf_length_sv,state_sv,comment_sv,flags,buckets_av)
U32 variant
SV* buf_length_sv
SV* state_sv
SV* comment_sv
AV *buckets_av
U32 flags
PREINIT:
dMY_CXT;
PROTOTYPE: $$$$$\@
CODE:
{
U32 buf_length= SvUV(buf_length_sv);
U32 bucket_count= av_top_index(buckets_av) + 1;
U32 header_rlen= _roundup(sizeof(struct mph_header),16);
STRLEN state_len;
char *state_pv= SvPV(state_sv, state_len);
U32 alignment= sizeof(U64);
U32 state_rlen= _roundup(state_len,alignment);
U32 table_rlen= _roundup(sizeof(struct mph_bucket) * bucket_count,alignment);
U32 key_flags_rlen= _roundup((bucket_count * 2 + 7 ) / 8,alignment);
U32 val_flags_rlen= _roundup((bucket_count + 7) / 8,alignment);
U32 str_rlen= _roundup( buf_length
+ 2
+ ( SvOK(comment_sv) ? sv_len(comment_sv) + 1 : 1 )
+ ( 2 + 8 ),
alignment );
U32 total_size;
HV *str_ofs_hv= (HV *)sv_2mortal((SV*)newHV());
SV *sv_buf;
char *start;
struct mph_header *head;
char *state;
struct mph_bucket *table;
char *key_flags;
char *val_flags;
char *str_buf_start;
char *str_buf_end;
char *str_buf_pos;
U32 i;
( run in 0.797 second using v1.01-cache-2.11-cpan-62a16548d74 )