Algorithm-MinPerfHashTwoLevel

 view release on metacpan or  search on metacpan

MinPerfHashTwoLevel.xs  view on Meta::CPAN

        sv_set_from_bucket(aTHX_ val_sv,strs,bucket->val_ofs,bucket->val_len,index,mph_u8 + mph->val_flags_ofs,1,
                                 gf & MPH_VALS_ARE_SAME_UTF8NESS_MASK, MPH_VALS_ARE_SAME_UTF8NESS_SHIFT);
    }
    if (key_sv) {
        sv_set_from_bucket(aTHX_ key_sv,strs,bucket->key_ofs,bucket->key_len,index,mph_u8 + mph->key_flags_ofs,2,
                                 gf & MPH_KEYS_ARE_SAME_UTF8NESS_MASK, MPH_KEYS_ARE_SAME_UTF8NESS_SHIFT);
    }
    return 1;
}

/*
 * lookup_key - look up key_sv in the image that starts at mph.
 *
 * mph     start of the image; the string buffer, bucket table and hash
 *         state are located via its str_buf_ofs, table_ofs and state_ofs
 *         fields.
 * key_sv  key to search for. If it carries the UTF8 flag a mortal copy is
 *         downgraded (failure tolerated) and the copy is used instead, so
 *         the caller's SV is not modified by the downgrade.
 * val_sv  optional; when non-NULL and the key is found it is filled in by
 *         sv_set_from_bucket().
 *
 * Returns 1 when a bucket with a byte-identical key is found, 0 otherwise.
 */
MPH_STATIC_INLINE int
lookup_key(pTHX_ struct mph_header *mph, SV *key_sv, SV *val_sv)
{
    U8 *strs= (U8 *)mph + mph->str_buf_ofs;
    struct mph_bucket *buckets= (struct mph_bucket *) ((char *)mph + mph->table_ofs);
    struct mph_bucket *bucket;
    U8 *state= (U8 *)mph + mph->state_ofs;
    STRLEN key_len;
    U8 *key_pv;
    U64 h0;
    U32 h1;
    U32 h2;
    U32 index;
    U8 *got_key_pv;

    if (SvUTF8(key_sv)) {
        SV *tmp= sv_2mortal(newSVsv(key_sv));
        sv_utf8_downgrade(tmp,1);
        key_sv= tmp;
    }
    key_pv= (U8 *)SvPV(key_sv,key_len);
    h0= mph_hash_with_state(state,key_pv,key_len);

    /* An image without buckets cannot hold any key; return before the
     * modulo below would divide by zero. */
    if (!mph->num_buckets)
        return 0;

    /* The high 32 bits of the hash select the first-level bucket. */
    h1= h0 >> 32;
    index= h1 % mph->num_buckets;

    bucket= buckets + index;
    if (!bucket->xor_val)
        return 0;

    /* The low 32 bits feed the second-level step. */
    h2= h0 & 0xFFFFFFFF;
    if ( bucket->index < 0 ) {
        /* a negative value stores the final slot directly as -(slot+1) */
        index = -bucket->index-1;
    } else {
        /* HASH2INDEX is defined elsewhere; it assigns index from h2,
         * xor_val and num_buckets. */
        HASH2INDEX(index,h2,bucket->xor_val,mph->num_buckets);
    }
    bucket= buckets + index;
    got_key_pv= strs + bucket->key_ofs;
    /* The slot only matches if the stored key is byte-identical. */
    if (bucket->key_len == key_len && memEQ(key_pv,got_key_pv,key_len)) {
        if (val_sv) {
            U64 gf= mph->general_flags;
            sv_set_from_bucket(aTHX_ val_sv,strs,bucket->val_ofs,bucket->val_len,index,((U8*)mph)+mph->val_flags_ofs,1,
                                 gf & MPH_VALS_ARE_SAME_UTF8NESS_MASK, MPH_VALS_ARE_SAME_UTF8NESS_SHIFT);
        }
        return 1;
    }
    return 0;
}

/*
 * mph_mmap - map a PH2L image file read-only and sanity check its header.
 *
 * file   path of the file to map.
 * obj    on success receives the mapping address (header) and its length
 *        in bytes (bytes).
 * error  optional SV; reset to "" on entry and set to a description of the
 *        problem when a check fails. May be NULL.
 * flags  when MPH_F_VALIDATE is set, the U64 stored in the last 8 bytes of
 *        the file is compared with a hash of everything that precedes it.
 *
 * Returns head->variant on success, otherwise one of the MPH_MOUNT_ERROR_*
 * codes. No file descriptor or mapping is left behind on failure; when the
 * failure is detected after the file was mapped, obj->header is set to NULL
 * and obj->bytes to 0.
 */
IV
mph_mmap(pTHX_ char *file, struct mph_obj *obj, SV *error, U32 flags) {
    struct stat st;
    struct mph_header *head;
    void *ptr;
    const U32 alignment= sizeof(U64);
    IV status;
    int fd= open(file, O_RDONLY, 0);

    if (error)
        sv_setpvs(error,"");
    if (fd < 0) {
        if (error)
            sv_setpvf(error,"file '%s' could not be opened for read", file);
        return MPH_MOUNT_ERROR_OPEN_FAILED;
    }
    if (fstat(fd,&st)==-1) {
        close(fd); /* do not leak the descriptor on early exits */
        if (error)
            sv_setpvf(error,"file '%s' could not be fstat()ed", file);
        return MPH_MOUNT_ERROR_FSTAT_FAILED;
    }
    if (st.st_size < sizeof(struct mph_header)) {
        close(fd);
        if (error)
            sv_setpvf(error,"file '%s' is too small to be a valid PH2L file", file);
        return MPH_MOUNT_ERROR_TOO_SMALL;
    }
    ptr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED | MPH_MAP_POPULATE, fd, 0);
    close(fd); /* kernel holds its own refcount on the file, we do not need to keep it open */
    if (ptr == MAP_FAILED) {
        if (error)
            sv_setpvf(error,"failed to create mapping to file '%s'", file);
        return MPH_MOUNT_ERROR_MAP_FAILED;
    }

    /* From here on every failure must release the mapping again. */
    head= (struct mph_header*)ptr;
    if (head->magic_num != MAGIC_DECIMAL) {
        if (error) {
            /* report exactly one reason; the specific one wins */
            if (head->magic_num == MAGIC_BIG_ENDIAN_DECIMAL)
                sv_setpvs(error,"this is a big-endian machine, cant handle PH2L files here");
            else
                sv_setpvf(error,"file '%s' is not a PH2L file", file);
        }
        status= MPH_MOUNT_ERROR_BAD_MAGIC;
        goto fail_unmap;
    }
    if (head->variant < MIN_VARIANT) {
        if (error)
            sv_setpvf(error,"unsupported old version '%d' in '%s'", head->variant, file);
        status= MPH_MOUNT_ERROR_BAD_VERSION;
        goto fail_unmap;
    }
    if (head->variant > MAX_VARIANT) {
        if (error)
            sv_setpvf(error,"unknown version '%d' in '%s'", head->variant, file);
        status= MPH_MOUNT_ERROR_BAD_VERSION;
        goto fail_unmap;
    }

    if (st.st_size % alignment) {
        if (error)
            sv_setpvf(error,"file '%s' does not have a size which is a multiple of %u bytes",
                file, (unsigned)alignment);
        status= MPH_MOUNT_ERROR_BAD_SIZE;
        goto fail_unmap;
    }
    /* The sections must appear in this order and lie within the file. */
    if (
        head->table_ofs < head->state_ofs           ||
        head->key_flags_ofs < head->table_ofs       ||
        head->val_flags_ofs < head->key_flags_ofs   ||
        head->str_buf_ofs < head->val_flags_ofs     ||
        st.st_size < head->str_buf_ofs
    ) {
        if (error)
            sv_setpvf(error,"corrupt header offsets in '%s'", file);
        status= MPH_MOUNT_ERROR_BAD_OFFSETS;
        goto fail_unmap;
    }
    if (flags & MPH_F_VALIDATE) {
        char *start= ptr;
        char *state_pv= start + head->state_ofs;
        char *checksum_pv= start + st.st_size - sizeof(U64);

        U64 have_file_checksum= mph_hash_with_state(state_pv, start, st.st_size - sizeof(U64));
        U64 want_file_checksum= *((U64 *)checksum_pv);
        if (have_file_checksum != want_file_checksum) {
            if (error)
                sv_setpvf(error,"file checksum '%016" UVxf "' != '%016" UVxf "' in file '%s'",
                    (UV)have_file_checksum,(UV)want_file_checksum,file);
            status= MPH_MOUNT_ERROR_CORRUPT_FILE;
            goto fail_unmap;
        }
    }

    obj->bytes= st.st_size;
    obj->header= head;
    return head->variant;

fail_unmap:
    munmap(ptr, st.st_size);
    obj->header= NULL;
    obj->bytes= 0;
    return status;
}

/*
 * mph_munmap - release the mapping described by obj.
 *
 * Passes obj->header and obj->bytes to munmap(); the result of munmap() is
 * not inspected and obj itself is left unchanged.
 */
void
mph_munmap(struct mph_obj *obj) {
    void *mapping= obj->header;

    (void)munmap(mapping, obj->bytes);
}

/*
 * normalize_with_flags - copy sv into normalized_sv and record its UTF8 state.
 *
 * Croaks if sv is a reference, or if its string form is longer than 0xFFFF
 * bytes.
 *
 * is_utf8_sv is set as follows:
 *   0  sv is not defined
 *   1  sv has the UTF8 flag and the copy still had it after the optional
 *      downgrade (the flag is then switched off on the copy)
 *   2  sv has the UTF8 flag but the copy no longer has it
 * It is not touched when sv is defined and does not have the UTF8 flag.
 *
 * Returns the string length of sv (not of the copy), or 0 when sv is not
 * defined.
 */
STRLEN
normalize_with_flags(pTHX_ SV *sv, SV *normalized_sv, SV *is_utf8_sv, int downgrade) {
    STRLEN src_len;

    if (SvROK(sv))
        croak("Error: Not expecting a reference value in source hash");

    sv_setsv(normalized_sv,sv);

    if (!SvOK(sv)) {
        sv_setiv(is_utf8_sv, 0);
        return 0;
    }

    /* only the length is needed here, the buffer pointer is not used */
    (void)SvPV(sv,src_len);
    if (src_len > 0xFFFF)
        croak("Error: String in source hash is too long to store, max length is %u got length %lu", 0xFFFF, src_len);

    if (SvUTF8(sv)) {
        if (downgrade)
            sv_utf8_downgrade(normalized_sv,1);

        if (!SvUTF8(normalized_sv)) {
            sv_setiv(is_utf8_sv,2);
        } else {
            SvUTF8_off(normalized_sv);
            sv_setiv(is_utf8_sv,1);
        }
    }
    return src_len;
}

/*
 * _roundup - round n up to the next multiple of s.
 *
 * Returns n unchanged when it already is a multiple of s. s is used as a
 * divisor, so it must not be zero.
 */
U32
_roundup(const U32 n, const U32 s) {
    const U32 excess= n % s;

    return excess ? n + s - excess : n;
}

START_MY_CXT

I32
_compare(pTHX_ SV *a, SV *b) {
    dMY_CXT;
    HE *a_he= hv_fetch_ent_with_keysv((HV*)SvRV(a),MPH_KEYSV_KEY_NORMALIZED,0);
    HE *b_he= hv_fetch_ent_with_keysv((HV*)SvRV(b),MPH_KEYSV_KEY_NORMALIZED,0);

    return sv_cmp(HeVAL(a_he),HeVAL(b_he));

MinPerfHashTwoLevel.xs  view on Meta::CPAN

    str_buf_end= start + total_size;
    str_buf_pos= str_buf_start + 2;

    Copy(state_pv,state,state_len,char);
    pv= SvPV(comment_sv,pv_len);
    Copy(pv,str_buf_pos,pv_len,char);
    str_buf_pos += pv_len + 1; /* +1 to add a trailing null */

    for (i= 0; i < bucket_count; i++) {
        SV **got= av_fetch(buckets_av,i,0);
        HV *hv= (HV *)SvRV(*got);
        HE *key_normalized_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_KEY_NORMALIZED,0);
        HE *val_normalized_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_VAL_NORMALIZED,0);
        HE *xor_val_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_XOR_VAL,0);

        if (xor_val_he) {
            table[i].xor_val= SvUV(HeVAL(xor_val_he));
        } else {
            table[i].xor_val= 0;
        }
        SETOFS(i,key_normalized_he,table,key_ofs,key_len,str_buf_start,str_buf_pos,str_buf_end,str_ofs_hv);
        SETOFS(i,val_normalized_he,table,val_ofs,val_len,str_buf_start,str_buf_pos,str_buf_end,str_ofs_hv);
        if ( key_is_utf8_generic < 0) {
            HE *key_is_utf8_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_KEY_IS_UTF8,0);
            if (key_is_utf8_he) {
                UV u= SvUV(HeVAL(key_is_utf8_he));
                SETBITS(u,key_flags,i,2);
            } else {
                croak("panic: out of memory? no key_is_utf8_he for %u",i);
            }
        }
        if ( val_is_utf8_generic < 0 ) {
            HE *val_is_utf8_he= hv_fetch_ent_with_keysv(hv,MPH_KEYSV_VAL_IS_UTF8,0);
            if (val_is_utf8_he) {
                UV u= SvUV(HeVAL(val_is_utf8_he));
                SETBITS(u,val_flags,i,1);
            } else {
                croak("panic: out of memory? no val_is_utf8_he for %u",i);
            }
        }
    }
    *str_buf_pos =   0; str_buf_pos++;
    *str_buf_pos = 128; str_buf_pos++;
    {
        U32 r= (str_buf_pos - start) % alignment;
        if (r) {
            str_buf_pos += (alignment - r);
        }
    }
    *((U64 *)str_buf_pos)= mph_hash_with_state(state, start, str_buf_pos - start);
    str_buf_pos += sizeof(U64);

    SvCUR_set(sv_buf, str_buf_pos - start);
    SvPOK_on(sv_buf);
    RETVAL= sv_buf;
}
    OUTPUT:
        RETVAL

SV*
mount_file(file_sv,error_sv,flags)
        SV* file_sv
        SV* error_sv
        U32 flags
    PROTOTYPE: $$$
    CODE:
{
    /* Map the file named by file_sv via mph_mmap(), passing error_sv and
     * flags straight through. Returns undef when mph_mmap() reports a
     * negative status; otherwise returns a read-only SV whose string buffer
     * is a byte copy of the resulting struct mph_obj. */
    struct mph_obj mounted;
    STRLEN path_len;
    char *path= SvPV(file_sv,path_len);

    if (mph_mmap(aTHX_ path, &mounted, error_sv, flags) < 0) {
        XSRETURN_UNDEF;
    }
    RETVAL= newSVpvn((char *)&mounted,sizeof(mounted));
    SvPOK_on(RETVAL);
    SvREADONLY_on(RETVAL);
}
    OUTPUT:
        RETVAL

void
unmount_file(mount_sv)
        SV* mount_sv
    PROTOTYPE: $
    CODE:
{
    /* Treat the string buffer of mount_sv as a struct mph_obj, unmap it via
     * mph_munmap(), then mark mount_sv as no longer holding a value. */
    char *raw= SvPV_nolen(mount_sv);

    mph_munmap((struct mph_obj *)raw);
    SvOK_off(mount_sv);
}


int
fetch_by_index(mount_sv,index,...)
        SV* mount_sv
        U32 index
    PROTOTYPE: $$;$$
    CODE:
{
    /* Forward to lookup_bucket() for the given index. The optional third
     * and fourth arguments are passed on as the key and value SVs; NULL is
     * passed for any that were not supplied. Croaks when more than four
     * arguments are given. */
    struct mph_obj *mount= (struct mph_obj *)SvPV_nolen(mount_sv);
    SV *key_out= NULL;
    SV *val_out= NULL;

    if (items > 2)
        key_out= ST(2);
    if (items > 3)
        val_out= ST(3);
    if (items > 4)
       croak("Error: passed too many arguments to "
             "Tie::Hash::MinPerfHashTwoLevel::OnDisk::fetch_by_index(mount_sv, index, key_sv, val_sv)");
    RETVAL= lookup_bucket(aTHX_ mount->header,index,key_out,val_out);
}
    OUTPUT:
        RETVAL

int
fetch_by_key(mount_sv,key_sv,...)
        SV* mount_sv
        SV* key_sv
    PROTOTYPE: $$;$
    CODE:
{
    /* Forward to lookup_key() for key_sv. The optional third argument is
     * passed on as the value SV, or NULL when it was not supplied. Croaks
     * when more than three arguments are given. */
    SV* val_sv= items > 2 ? ST(2) : NULL;
    struct mph_obj *obj= (struct mph_obj *)SvPV_nolen(mount_sv);
    if (items > 3)
       croak("Error: passed too many arguments to "
             "Tie::Hash::MinPerfHashTwoLevel::OnDisk::fetch_by_key(mount_sv, key_sv, val_sv)");
    RETVAL= lookup_key(aTHX_ obj->header,key_sv,val_sv);
}
    OUTPUT:
        RETVAL


SV *



( run in 1.025 second using v1.01-cache-2.11-cpan-385001e3568 )