Algorithm-MinPerfHashTwoLevel
view release on metacpan or search on metacpan
MinPerfHashTwoLevel.xs view on Meta::CPAN
#define PERL_NO_GET_CONTEXT
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#define NEED_newRV_noinc
#define NEED_sv_2pv_flags
#include "ppport.h"
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <assert.h>
#include "mph2l.h"
#include "mph_hv_macro.h"
#include "mph_siphash.h"
#define MAX_VARIANT 5
#define MIN_VARIANT 5
MPH_STATIC_INLINE void
sv_set_from_bucket(pTHX_ SV *sv, U8 *strs, const U32 ofs, const U32 len, const U32 idx, const U8 *flags, const U32 bits, const U8 utf8_default, const U8 utf8_default_shift) {
U8 *ptr;
U8 is_utf8;
if (ofs) {
ptr= (strs) + (ofs);
if (utf8_default) {
is_utf8= utf8_default >> utf8_default_shift;
} else {
GETBITS(is_utf8,flags,idx,bits);
}
} else {
ptr= 0;
is_utf8= 0;
}
/* note that sv_setpvn() will cause the sv to
* become undef if ptr is 0 */
sv_setpvn_mg((sv),ptr,len);
if (is_utf8 > 1) {
sv_utf8_upgrade(sv);
}
else
if (is_utf8) {
SvUTF8_on(sv);
}
else
if (ptr) {
SvUTF8_off(sv);
}
}
MPH_STATIC_INLINE int
lookup_bucket(pTHX_ struct mph_header *mph, U32 index, SV *key_sv, SV *val_sv)
{
struct mph_bucket *bucket;
U8 *strs;
U8 *mph_u8= (U8*)mph;
U64 gf= mph->general_flags;
if (index >= mph->num_buckets) {
return 0;
}
bucket= (struct mph_bucket *)((char *)mph + mph->table_ofs) + index;
strs= (U8 *)mph + mph->str_buf_ofs;
if (val_sv) {
sv_set_from_bucket(aTHX_ val_sv,strs,bucket->val_ofs,bucket->val_len,index,mph_u8 + mph->val_flags_ofs,1,
gf & MPH_VALS_ARE_SAME_UTF8NESS_MASK, MPH_VALS_ARE_SAME_UTF8NESS_SHIFT);
}
if (key_sv) {
sv_set_from_bucket(aTHX_ key_sv,strs,bucket->key_ofs,bucket->key_len,index,mph_u8 + mph->key_flags_ofs,2,
gf & MPH_KEYS_ARE_SAME_UTF8NESS_MASK, MPH_KEYS_ARE_SAME_UTF8NESS_SHIFT);
}
return 1;
}
MPH_STATIC_INLINE int
lookup_key(pTHX_ struct mph_header *mph, SV *key_sv, SV *val_sv)
{
U8 *strs= (U8 *)mph + mph->str_buf_ofs;
struct mph_bucket *buckets= (struct mph_bucket *) ((char *)mph + mph->table_ofs);
struct mph_bucket *bucket;
U8 *state= (char *)mph + mph->state_ofs;
STRLEN key_len;
U8 *key_pv;
U64 h0;
U32 h1;
U32 h2;
U32 index;
U8 *got_key_pv;
STRLEN got_key_len;
if (SvUTF8(key_sv)) {
SV *tmp= sv_2mortal(newSVsv(key_sv));
sv_utf8_downgrade(tmp,1);
key_sv= tmp;
}
key_pv= SvPV(key_sv,key_len);
h0= mph_hash_with_state(state,key_pv,key_len);
h1= h0 >> 32;
index= h1 % mph->num_buckets;
bucket= buckets + index;
if (!bucket->xor_val)
return 0;
h2= h0 & 0xFFFFFFFF;
if ( bucket->index < 0 ) {
index = -bucket->index-1;
} else {
HASH2INDEX(index,h2,bucket->xor_val,mph->num_buckets);
}
bucket= buckets + index;
got_key_pv= strs + bucket->key_ofs;
if (bucket->key_len == key_len && memEQ(key_pv,got_key_pv,key_len)) {
if (val_sv) {
U64 gf= mph->general_flags;
sv_set_from_bucket(aTHX_ val_sv,strs,bucket->val_ofs,bucket->val_len,index,((U8*)mph)+mph->val_flags_ofs,1,
gf & MPH_VALS_ARE_SAME_UTF8NESS_MASK, MPH_VALS_ARE_SAME_UTF8NESS_SHIFT);
}
return 1;
}
return 0;
}
IV
mph_mmap(pTHX_ char *file, struct mph_obj *obj, SV *error, U32 flags) {
struct stat st;
struct mph_header *head;
int fd = open(file, O_RDONLY, 0);
void *ptr;
U32 alignment;
if (error)
sv_setpvs(error,"");
if (fd < 0) {
if (error)
sv_setpvf(error,"file '%s' could not be opened for read", file);
return MPH_MOUNT_ERROR_OPEN_FAILED;
}
if (fstat(fd,&st)==-1) {
if (error)
sv_setpvf(error,"file '%s' could not be fstat()ed", file);
return MPH_MOUNT_ERROR_FSTAT_FAILED;
}
if (st.st_size < sizeof(struct mph_header)) {
if (error)
sv_setpvf(error,"file '%s' is too small to be a valid PH2L file", file);
return MPH_MOUNT_ERROR_TOO_SMALL;
}
ptr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED | MPH_MAP_POPULATE, fd, 0);
close(fd); /* kernel holds its own refcount on the file, we do not need to keep it open */
if (ptr == MAP_FAILED) {
if (error)
sv_setpvf(error,"failed to create mapping to file '%s'", file);
MinPerfHashTwoLevel.xs view on Meta::CPAN
if (head->magic_num == MAGIC_BIG_ENDIAN_DECIMAL) {
if (error)
sv_setpvf(error,"this is a big-endian machine, cant handle PH2L files here");
}
if (error)
sv_setpvf(error,"file '%s' is not a PH2L file", file);
return MPH_MOUNT_ERROR_BAD_MAGIC;
}
if (head->variant < MIN_VARIANT) {
if (error)
sv_setpvf(error,"unsupported old version '%d' in '%s'", head->variant, file);
return MPH_MOUNT_ERROR_BAD_VERSION;
}
if (head->variant > MAX_VARIANT) {
if (error)
sv_setpvf(error,"unknown version '%d' in '%s'", head->variant, file);
return MPH_MOUNT_ERROR_BAD_VERSION;
}
alignment = sizeof(U64);
if (st.st_size % alignment) {
if (error)
sv_setpvf(error,"file '%s' does not have a size which is a multiple of 16 bytes", file);
return MPH_MOUNT_ERROR_BAD_SIZE;
}
if (
head->table_ofs < head->state_ofs ||
head->key_flags_ofs < head->table_ofs ||
head->val_flags_ofs < head->key_flags_ofs ||
head->str_buf_ofs < head->val_flags_ofs ||
st.st_size < head->str_buf_ofs
) {
if (error)
sv_setpvf(error,"corrupt header offsets in '%s'", file);
return MPH_MOUNT_ERROR_BAD_OFFSETS;
}
if (flags & MPH_F_VALIDATE) {
char *start= ptr;
char *state_pv= start + head->state_ofs;
char *str_buf_start= start + head->str_buf_ofs;
char *str_buf_end= start + st.st_size;
U64 have_file_checksum= mph_hash_with_state(state_pv, start, st.st_size - sizeof(U64));
U64 want_file_checksum= *((U64 *)(str_buf_end - sizeof(U64)));
if (have_file_checksum != want_file_checksum) {
if (error)
sv_setpvf(error,"file checksum '%016lx' != '%016lx' in file '%s'",
have_file_checksum,want_file_checksum,file);
return MPH_MOUNT_ERROR_CORRUPT_FILE;
}
}
return head->variant;
}
void
mph_munmap(struct mph_obj *obj) {
munmap(obj->header,obj->bytes);
}
STRLEN
normalize_with_flags(pTHX_ SV *sv, SV *normalized_sv, SV *is_utf8_sv, int downgrade) {
STRLEN len;
if (SvROK(sv)) {
croak("Error: Not expecting a reference value in source hash");
}
sv_setsv(normalized_sv,sv);
if (SvOK(sv)) {
STRLEN pv_len;
char *pv= SvPV(sv,pv_len);
if (pv_len > 0xFFFF)
croak("Error: String in source hash is too long to store, max length is %u got length %lu", 0xFFFF, pv_len);
if (SvUTF8(sv)) {
if (downgrade)
sv_utf8_downgrade(normalized_sv,1);
if (SvUTF8(normalized_sv)) {
SvUTF8_off(normalized_sv);
sv_setiv(is_utf8_sv,1);
} else {
sv_setiv(is_utf8_sv,2);
}
}
return pv_len;
} else {
sv_setiv(is_utf8_sv, 0);
return 0;
}
}
U32
_roundup(const U32 n, const U32 s) {
const U32 r= n % s;
if (r) {
return n + s - r;
} else {
return n;
}
}
START_MY_CXT
I32
_compare(pTHX_ SV *a, SV *b) {
dMY_CXT;
HE *a_he= hv_fetch_ent_with_keysv((HV*)SvRV(a),MPH_KEYSV_KEY_NORMALIZED,0);
HE *b_he= hv_fetch_ent_with_keysv((HV*)SvRV(b),MPH_KEYSV_KEY_NORMALIZED,0);
return sv_cmp(HeVAL(a_he),HeVAL(b_he));
}
U32
normalize_source_hash(pTHX_ HV *source_hv, AV *keys_av, U32 compute_flags, SV *buf_length_sv, char *state_pv) {
dMY_CXT;
HE *he;
U32 buf_length= 0;
U32 ctr;
hv_iterinit(source_hv);
while (he= hv_iternext(source_hv)) {
SV *val_sv= HeVAL(he);
SV *val_normalized_sv;
SV *val_is_utf8_sv;
SV *key_sv;
SV *key_normalized_sv;
SV *key_is_utf8_sv;
HV *hv;
U8 *key_pv;
STRLEN key_len;
U64 h0;
if (!val_sv) croak("panic: no sv for value?");
if (!SvOK(val_sv) && (compute_flags & MPH_F_FILTER_UNDEF)) continue;
hv= newHV();
val_normalized_sv= newSV(0);
val_is_utf8_sv= newSVuv(0);
key_sv= newSVhek(HeKEY_hek(he));
key_normalized_sv= newSV(0);
key_is_utf8_sv= newSVuv(0);
hv_ksplit(hv,15);
hv_store_ent_with_keysv(hv,MPH_KEYSV_KEY, key_sv);
hv_store_ent_with_keysv(hv,MPH_KEYSV_KEY_NORMALIZED, key_normalized_sv);
hv_store_ent_with_keysv(hv,MPH_KEYSV_KEY_IS_UTF8, key_is_utf8_sv);
hv_store_ent_with_keysv(hv,MPH_KEYSV_VAL, SvREFCNT_inc_simple_NN(val_sv));
hv_store_ent_with_keysv(hv,MPH_KEYSV_VAL_NORMALIZED, val_normalized_sv);
hv_store_ent_with_keysv(hv,MPH_KEYSV_VAL_IS_UTF8, val_is_utf8_sv);
/* install everything into the keys_av just in case normalize_with_flags() dies */
av_push(keys_av,newRV_noinc((SV*)hv));
buf_length += normalize_with_flags(aTHX_ key_sv, key_normalized_sv, key_is_utf8_sv, 1);
buf_length += normalize_with_flags(aTHX_ val_sv, val_normalized_sv, val_is_utf8_sv, 0);
key_pv= (U8 *)SvPV(key_normalized_sv,key_len);
h0= mph_hash_with_state(state_pv,key_pv,key_len);
hv_store_ent_with_keysv(hv,MPH_KEYSV_H0, newSVuv(h0));
}
if (buf_length_sv)
sv_setuv(buf_length_sv, buf_length);
/* we now know how many keys there are, and what the max_xor_val should be */
return av_top_index(keys_av)+1;
}
void
find_first_level_collisions(pTHX_ U32 bucket_count, AV *keys_av, AV *keybuckets_av, AV *h2_packed_av) {
dMY_CXT;
U32 i;
for (i=0; i<bucket_count;i++) {
U64 h0;
U32 h1;
U32 h2;
U32 idx1;
SV **got_psv;
SV* h0_sv;
HE* h0_he;
HV *hv;
AV *av;
got_psv= av_fetch(keys_av,i,0);
if (!got_psv || !SvROK(*got_psv)) croak("panic: bad item in keys_av");
hv= (HV *)SvRV(*got_psv);
h0_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_H0,0);
if (!h0_he) croak("panic: no h0_he?");
h0_sv= HeVAL(h0_he);
h0= SvUV(h0_sv);
h1= h0 >> 32;
h2= h0 & 0xFFFFFFFF;
idx1= h1 % bucket_count;
got_psv= av_fetch(h2_packed_av,idx1,1);
if (!got_psv)
croak("panic: out of memory creating new h2_packed_av element");
if (!SvPOK(*got_psv))
sv_setpvs(*got_psv,"");
sv_catpvn(*got_psv, (char *)&h2, 4);
got_psv= av_fetch(keybuckets_av,idx1,1);
if (!got_psv)
croak("panic: out of memory creating new keybuckets_av element");
if (!SvROK(*got_psv)) {
av= newAV();
sv_upgrade(*got_psv,SVt_RV);
SvRV_set(*got_psv,(SV *)av);
SvROK_on(*got_psv);
} else {
av= (AV *)SvRV(*got_psv);
}
av_push(av,newRV_inc((SV*)hv));
}
MinPerfHashTwoLevel.xs view on Meta::CPAN
* the length of the array (av_top_index+1) reflect the number of items in the bucket
* with the most collisions - we use this later to size some of our data structures.
*/
by_length_av= idx_by_length(aTHX_ keybuckets_av);
RETVAL= solve_collisions_by_length(aTHX_ bucket_count, max_xor_val, by_length_av, h2_packed_av, keybuckets_av,
variant, buckets_av);
}
OUTPUT:
RETVAL
MODULE = Algorithm::MinPerfHashTwoLevel PACKAGE = Tie::Hash::MinPerfHashTwoLevel::OnDisk
SV *
packed_xs(variant,buf_length_sv,state_sv,comment_sv,flags,buckets_av)
U32 variant
SV* buf_length_sv
SV* state_sv
SV* comment_sv
AV *buckets_av
U32 flags
PREINIT:
dMY_CXT;
PROTOTYPE: $$$$$\@
CODE:
{
U32 buf_length= SvUV(buf_length_sv);
U32 bucket_count= av_top_index(buckets_av) + 1;
U32 header_rlen= _roundup(sizeof(struct mph_header),16);
STRLEN state_len;
char *state_pv= SvPV(state_sv, state_len);
U32 alignment= sizeof(U64);
U32 state_rlen= _roundup(state_len,alignment);
U32 table_rlen= _roundup(sizeof(struct mph_bucket) * bucket_count,alignment);
U32 key_flags_rlen= _roundup((bucket_count * 2 + 7 ) / 8,alignment);
U32 val_flags_rlen= _roundup((bucket_count + 7) / 8,alignment);
U32 str_rlen= _roundup( buf_length
+ 2
+ ( SvOK(comment_sv) ? sv_len(comment_sv) + 1 : 1 )
+ ( 2 + 8 ),
alignment );
U32 total_size;
HV *str_ofs_hv= (HV *)sv_2mortal((SV*)newHV());
SV *sv_buf;
char *start;
struct mph_header *head;
char *state;
struct mph_bucket *table;
char *key_flags;
char *val_flags;
char *str_buf_start;
char *str_buf_end;
char *str_buf_pos;
U32 i;
STRLEN pv_len;
char *pv;
U32 key_is_utf8_count[3]={0,0,0};
U32 val_is_utf8_count[2]={0,0};
U32 used_flags;
U32 the_flag;
IV key_is_utf8_generic=-1;
IV val_is_utf8_generic=-1;
for (i= 0; i < bucket_count; i++) {
SV **got= av_fetch(buckets_av,i,0);
HV *hv= (HV *)SvRV(*got);
HE *key_is_utf8_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_KEY_IS_UTF8,0);
HE *val_is_utf8_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_VAL_IS_UTF8,0);
key_is_utf8_count[SvUV(HeVAL(key_is_utf8_he))]++;
val_is_utf8_count[SvUV(HeVAL(val_is_utf8_he))]++;
}
used_flags= 0;
if (key_is_utf8_count[0]) { the_flag= 0; used_flags++; }
if (key_is_utf8_count[1]) { the_flag= 1; used_flags++; }
if (key_is_utf8_count[2]) { the_flag= 2; used_flags++; }
if (used_flags == 1) {
key_is_utf8_generic= the_flag;
key_flags_rlen= 0;
}
used_flags= 0;
if (val_is_utf8_count[0]) { the_flag= 0; used_flags++; }
if (val_is_utf8_count[1]) { the_flag= 1; used_flags++; }
if (used_flags == 1) {
val_is_utf8_generic= the_flag;
val_flags_rlen= 0;
}
total_size=
+ header_rlen
+ state_rlen
+ table_rlen
+ key_flags_rlen
+ val_flags_rlen
+ str_rlen
;
sv_buf= newSV(total_size);
SvPOK_on(sv_buf);
SvCUR_set(sv_buf,total_size);
start= SvPVX(sv_buf);
Zero(start,total_size,char);
head= (struct mph_header *)start;
head->magic_num= 1278363728;
head->variant= variant;
head->num_buckets= bucket_count;
head->state_ofs= header_rlen;
head->table_ofs= head->state_ofs + state_rlen;
head->key_flags_ofs= head->table_ofs + table_rlen;
head->val_flags_ofs= head->key_flags_ofs + key_flags_rlen;
head->str_buf_ofs= head->val_flags_ofs + val_flags_rlen;
if (val_is_utf8_generic >= 0)
head->general_flags |= (MPH_VALS_ARE_SAME_UTF8NESS_FLAG_BIT | (val_is_utf8_generic << MPH_VALS_ARE_SAME_UTF8NESS_SHIFT));
if (key_is_utf8_generic >= 0)
head->general_flags |= (MPH_KEYS_ARE_SAME_UTF8NESS_FLAG_BIT | (key_is_utf8_generic << MPH_KEYS_ARE_SAME_UTF8NESS_SHIFT));
state= start + head->state_ofs;
table= (struct mph_bucket *)(start + head->table_ofs);
key_flags= start + head->key_flags_ofs;
val_flags= start + head->val_flags_ofs;
str_buf_start= start + head->str_buf_ofs;
str_buf_end= start + total_size;
str_buf_pos= str_buf_start + 2;
Copy(state_pv,state,state_len,char);
pv= SvPV(comment_sv,pv_len);
Copy(pv,str_buf_pos,pv_len,char);
str_buf_pos += pv_len + 1; /* +1 to add a trailing null */
for (i= 0; i < bucket_count; i++) {
SV **got= av_fetch(buckets_av,i,0);
HV *hv= (HV *)SvRV(*got);
HE *key_normalized_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_KEY_NORMALIZED,0);
HE *val_normalized_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_VAL_NORMALIZED,0);
HE *xor_val_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_XOR_VAL,0);
if (xor_val_he) {
table[i].xor_val= SvUV(HeVAL(xor_val_he));
} else {
table[i].xor_val= 0;
}
SETOFS(i,key_normalized_he,table,key_ofs,key_len,str_buf_start,str_buf_pos,str_buf_end,str_ofs_hv);
SETOFS(i,val_normalized_he,table,val_ofs,val_len,str_buf_start,str_buf_pos,str_buf_end,str_ofs_hv);
if ( key_is_utf8_generic < 0) {
HE *key_is_utf8_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_KEY_IS_UTF8,0);
if (key_is_utf8_he) {
UV u= SvUV(HeVAL(key_is_utf8_he));
SETBITS(u,key_flags,i,2);
} else {
croak("panic: out of memory? no key_is_utf8_he for %u",i);
}
}
if ( val_is_utf8_generic < 0 ) {
HE *val_is_utf8_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_VAL_IS_UTF8,0);
if (val_is_utf8_he) {
UV u= SvUV(HeVAL(val_is_utf8_he));
SETBITS(u,val_flags,i,1);
} else {
croak("panic: out of memory? no val_is_utf8_he for %u",i);
}
}
}
*str_buf_pos = 0; str_buf_pos++;
*str_buf_pos = 128; str_buf_pos++;
{
U32 r= (str_buf_pos - start) % alignment;
if (r) {
str_buf_pos += (alignment - r);
}
}
*((U64 *)str_buf_pos)= mph_hash_with_state(state, start, str_buf_pos - start);
str_buf_pos += sizeof(U64);
SvCUR_set(sv_buf, str_buf_pos - start);
SvPOK_on(sv_buf);
RETVAL= sv_buf;
}
OUTPUT:
RETVAL
SV*
mount_file(file_sv,error_sv,flags)
SV* file_sv
SV* error_sv
U32 flags
PROTOTYPE: $$$
CODE:
{
struct mph_obj obj;
STRLEN file_len;
char *file_pv= SvPV(file_sv,file_len);
IV mmap_status= mph_mmap(aTHX_ file_pv, &obj, error_sv, flags);
if (mmap_status < 0) {
XSRETURN_UNDEF;
}
/* copy obj into a new SV which we can return */
RETVAL= newSVpvn((char *)&obj,sizeof(struct mph_obj));
SvPOK_on(RETVAL);
SvREADONLY_on(RETVAL);
}
OUTPUT:
RETVAL
void
unmount_file(mount_sv)
SV* mount_sv
PROTOTYPE: $
CODE:
{
struct mph_obj *obj= (struct mph_obj *)SvPV_nolen(mount_sv);
mph_munmap(obj);
SvOK_off(mount_sv);
}
int
fetch_by_index(mount_sv,index,...)
SV* mount_sv
U32 index
( run in 0.732 second using v1.01-cache-2.11-cpan-97f6503c9c8 )