flag results from the CPAN

Alien-Judy

//   statement is retained.
//   email - doug@sourcejudy.com -or- dougbaskins@yahoo.com
//=======================================================================

#include <string.h>                     // for memcmp(), memcpy()

#include <Judy.h>                       // for JudyL* routines/macros

/*
   This routine is a very fast "string" version of an ADT that stores
   (JudyHSIns()), retrieves (JudyHSGet()), deletes (JudyHSDel()) and
   frees the entire ADT (JudyHSFreeArray()) strings.  It uses the "Judy
   arrays" JudyL() API as the main workhorse.  The length of the string
   is included in the calling parameters so that strings with embedded
   \0s can be used.  The string lengths can be from 0 bytes to whatever
   malloc() can handle (~2GB).

   Compile:
  
      cc -O JudyHS.c -c         needs to link with -lJudy (libJudy.a)
  
      Note: in gcc version 3.3.1, -O2 generates faster code than -O
      Note: in gcc version 3.3.2, -O3 generates faster code than -O2

   NOTES:

1) There may be some performance issues with 64 bit machines, because I
   have not characterized that it yet.

2) It appears that a modern CPU (>2Ghz) that the instruction times are
   much faster that a RAM access, so building up a word from bytes takes
   no longer that a whole word access.  I am taking advantage of this to
   make this code endian neutral.  A side effect of this is strings do
   not need to be aligned, nor tested to be on to a word boundry.  In
   older and in slow (RISC) machines, this may be a performance issue.
   I have given up trying to optimize for machines that have very slow
   mpy, mod, variable shifts and call returns.

3) JudyHS is very scalable from 1 string to billions (with enough RAM).
   The memory usage is also scales with population.  I have attempted to
   combine the best characteristics of JudyL arrays with Hashing methods
   and well designed modern processors (such as the 1.3Ghz Intel
   Centrino this is being written on).

   HOW JudyHS WORKS: ( 4[8] means 4 bytes in 32 bit machine and 8 in 64)

   A) A JudyL array is used to separate strings of equal lengths into
   their own structures (a different hash table is used for each length
   of string).  The additional time overhead is very near zero because
   of the CPU cache.  The space efficiency is improved because the
   length need not be stored with the string (ls_t).  The "JLHash" ADT
   in the test program "StringCompare" is verification of both these
   assumptions.

   B) A 32 bit hash value is produced from the string.  Many thanks to
   the Internet and the author (Bob Jenkins) for coming up with a very
   good and fast universal string hash.  Next the 32 bit hash number is
   used as an Index to another JudyL array.  Notice that one (1) JudyL
   array is used as a hash table per each string length.  If there are
   no hash collisions (normally) then the string is copied to a
   structure (ls_t) along with room for storing a Value.  A flag is
   added to the pointer to note it is pointing to a ls_t structure.
   Since the lengths of the strings are the same, there is no need to
   stored length of string in the ls_t structure.  This saves about a
   word per string of memory.

   C) When there is a hashing collision (very rare), a JudyL array is
   used to decode the next 4[8] bytes of the string.  That is, the next
   4[8] bytes of the string are used as the Index.  This process is
   repeated until the remaining string is unique.  The remaining string
   (if any) is stored in a (now smaller) ls_t structure.  If the
   remaining string is less or equal to 4[8] bytes, then the ls_t
   structure is not needed and the Value area in the JudyL array is
   used.  A compile option -DDONOTUSEHASH is available to test this
   structure without using hashing (only the JudyL tree is used).  This
   is equivalent to having all strings hashed to the same bucket.  The
   speed is still better than all other tree based ADTs I have tested.
   An added benefit of this is a very fast "hash collision" resolving.
   It could foil hackers that exploit the slow synonym (linked-list)
   collision handling property used with most hashing algorithms.  If
   this is not a necessary property, then a simpler ADT "JLHash" that is
   documented the the test program "StringCompare.c" may be used with a
   little loss of memory efficiency (because it includes the string
   length with the ls_t structure).  JudyHS was written to be the
   fastest, very scalable, memory efficient, general purpose string ADT
   possible.  (However, I would like to eat those words someday). (dlb)

*/

#ifdef EXAMPLE_CODE
#include <stdio.h>
#include <unistd.h>
#include <string.h>

#include <Judy.h>

//#include "JudyHS.h"                   // for Judy.h without JudyHS*()

// By Doug Baskins Apr 2004 - for JudyHS man page

#define MAXLINE 1000000                 /* max length of line */
char      Index[MAXLINE];               // string to check

int     // Usage:  CheckDupLines < file
main()
{
    Pvoid_t   PJArray = (PWord_t)NULL;  // Judy array.
    PWord_t   PValue;                   // ^ Judy array element.
    Word_t    Bytes;                    // size of JudyHS array.
    Word_t    LineNumb = 0;             // current line number
    Word_t    Dups = 0;                 // number of duplicate lines

    while (fgets(Index, MAXLINE, stdin) != (char *)NULL)
    {
        LineNumb++;                     // line number

//      store string into array
        JHSI(PValue, PJArray, Index, strlen(Index)); 
        if (*PValue)                    // check if duplicate
        {
            Dups++;                     // count duplicates

src/judy-1.0.5/src/JudyHS/JudyHS.c view on Meta::CPAN


#define JUDYHASHSTR(HVALUE,STRING,LENGTH)       \
{                                               \
    uint8_t *p_ = (uint8_t *)(STRING);          \
    uint8_t *q_ = p_ + (LENGTH);                \
    uint32_t c_ = 0;                            \
    for (; p_ != q_; ++p_)                      \
    {                                           \
        c_ = (c_ * 31) + *p_;                   \
    }                                           \
/*  c_ &= gHmask;   see above   */              \
    (HVALUE) = c_;                              \
}

// Find String of Len in JudyHS structure, return pointer to associated Value

PPvoid_t
JudyHSGet(Pcvoid_t PArray,              // pointer (^) to structure
           void * Str,                  // pointer to string
           Word_t Len                   // length of string
    )
{
    uint8_t  *String = (uint8_t *)Str;
    PPvoid_t  PPValue;                  // pointer to Value
    Word_t    Index;                    // 4[8] bytes of String

    JLG(PPValue, PArray, Len);          // find hash table for strings of Len
    if (PPValue == (PPvoid_t) NULL)
        return ((PPvoid_t) NULL);       // no strings of this Len

//  check for caller error (null pointer)
//
    if ((String == (void *) NULL) && (Len != 0))
        return ((PPvoid_t) NULL);       // avoid null-pointer dereference

#ifndef DONOTUSEHASH
    if (Len > WORDSIZE)                 // Hash table not necessary with short
    {
        uint32_t  HValue;               // hash of input string
        JUDYHASHSTR(HValue, String, Len);       // hash to no more than 32 bits
        JLG(PPValue, *PPValue, (Word_t)HValue); // get ^ to hash bucket
        if (PPValue == (PPvoid_t) NULL)
            return ((PPvoid_t) NULL);   // no entry in Hash table
    }
#endif // DONOTUSEHASH

/*
  Each JudyL array decodes 4[8] bytes of the string.  Since the hash
  collisions occur very infrequently, the performance is not important.
  However, even if the Hash code is not used this method still is
  significantly faster than common tree methods (AVL, Red-Black, Splay,
  b-tree, etc..).  You can compare it yourself with #define DONOTUSEHASH
  1 or putting -DDONOTUSEHASH in the cc line.  Use the "StringCompare.c"
  code to compare (9Dec2003 dlb).
*/
    while (Len > WORDSIZE)              // traverse tree of JudyL arrays
    {
        if (IS_PLS(*PPValue))           // ^ to JudyL array or ls_t struct?
        {
            Pls_t     Pls;              // ls_t struct, termination of tree
            Pls = (Pls_t) CLEAR_PLS(*PPValue);  // remove flag from ^

//          if remaining string matches, return ^ to Value, else NULL

            if (memcmp(String, Pls->ls_String, Len) == 0)
                return ((PPvoid_t) (&(Pls->ls_Value)));
            else
                return ((PPvoid_t) NULL);       // string does not match
        }
        else
        {
            COPYSTRINGtoWORD(Index, String, WORDSIZE);

            JLG(PPValue, *PPValue, Index);      // decode next 4[8] bytes
            if (PPValue == (PPvoid_t) NULL)     // if NULL array, bail out
                return ((PPvoid_t) NULL);       // string does not match

            String += WORDSIZE;                 // advance
            Len -= WORDSIZE;
        }
    }

//  Get remaining 1..4[8] bytes left in string

    COPYSTRINGtoWORD(Index, String, Len);
    JLG(PPValue, *PPValue, Index);      // decode last 1-4[8] bytes
    return (PPValue);
}

// Add string to a tree of JudyL arrays (all lengths must be same)

static PPvoid_t
insStrJudyLTree(uint8_t * String,      // string to add to tree of JudyL arrays
                 Word_t Len,            // length of string
                 PPvoid_t PPValue,      // pointer to root pointer
                 PJError_t PJError      // for returning error info
    )
{
    Word_t    Index;                    // next 4[8] bytes of String

    while (Len > WORDSIZE)              // add to JudyL tree
    {
//      CASE 1, pointer is to a NULL, make a new ls_t leaf

        if (*PPValue == (Pvoid_t)NULL)
        {
            Pls_t     Pls;              // memory for a ls_t
            Pls = (Pls_t) JudyMalloc(LS_WORDLEN(Len));
            if (Pls == NULL)
            {
                JU_SET_ERRNO(PJError, JU_ERRNO_NOMEM);
                return (PPJERR);
            }
            Pls->ls_Value = 0;                          // clear Value word
            memcpy(Pls->ls_String, String, Len);        // copy to new struct
            *PPValue = (Pvoid_t)SET_PLS(Pls);           // mark pointer
            return ((PPvoid_t) (&Pls->ls_Value));       // return ^ to Value
        }                                               // no exit here
//      CASE 2: is a ls_t, free (and shorten), then decode into JudyL tree

        if (IS_PLS(*PPValue))                   // pointer to a ls_t? (leaf)

( run in 0.626 second using v1.01-cache-2.11-cpan-140bd7fdf52 )