Alien-Judy

 view release on metacpan or  search on metacpan

src/judy-1.0.5/test/StringCompare.c  view on Meta::CPAN

        very scaleable because the hash table size is ~4 billion.
   4) JudySL - ordered and requires null terminated strings.
   5) Ternary - code borrowed from Mr Dobbs, perhaps 2000
   6) Redblack - code borrowed from J. Zobel, April 2001.
   7) Splay  - code borrowed from J. Zobel, April 2001.

   Note: Splay, Redblack and Ternary methods are not very fast, so they
   have not been completed and made bug free.  I.E. no Delete and Free
   Array routines.  Ternary is fastest retrieve of these three, but uses
   an extraordinary amount of memory.
*/
//=======================================================================
//
// Compile: 
//
//   cc -O StringCompare.c -lm -lJudy -o StringCompare 
//           -or-
//   cc -O -DCPUMHZ=1299 StringCompare.c -lm -lJudy -o StringCompare

/* Notes:  
   1) Use '-DCPUMHZ=1299' in cc line if better clock resolution is desired
      and it compiles successfully.  The 1299 is the cpu MHz : 1298.916 from
      cat /proc/cpuinfo in a Linux system.

   2) -static will generally get better performance because memcmp(),
    memcpy() routines are usually slower with shared librarys.
*/

// Usage:
//
//   StringCompare -A Hash     <options>  textfile
//   StringCompare -A JudyHS   <options>  textfile
//   StringCompare -A JLHash   <options>  textfile
//   StringCompare -A JudySL   <options>  textfile
//   StringCompare -A Ternary  <options>  textfile
//   StringCompare -A Redblack <options>  textfile
//   StringCompare -A Splay    <options>  textfile

#include <stdlib.h>                     // malloc(3)
#include <unistd.h>                     // getopt(3)
#include <stdio.h>                      // printf(3)
#include <fcntl.h>                      // open(2)
#include <string.h>                     // memcmp(3), memcpy(3)
#include <errno.h>                      // errno(3)
#include <sys/mman.h>                   // mmap(2)
#include <sys/time.h>                   // gettimeofday(2)
#include <math.h>                       // pow(3)
#include <sys/utsname.h>                // uname(2)
#include <Judy.h>                       // Judy arrays

//=======================================================================
//   D e f i n e:    T O   T U R N   O F F   A S S E R T I O N S !!!!!!!!     
//=======================================================================
//#define NDEBUG 1
#include <assert.h>                     // assert(3)

//=======================================================================
//      G L O B A L  D A T A
//=======================================================================

int       foolflag = 0;                 // fool compiler from optimizing
static Word_t gStored = 0;              // number of strings inserted
static Word_t gChainln = 0;             // links traversed during RETRIVE

static Word_t PtsPdec = 40;             // default measurement points per decade

#define INFSTRGS 1000000000             // 1 billion strings is infinity

static Word_t nStrg = INFSTRGS;         // infinity -- measure all strings
static Word_t TValues = 100000;         // max measure points for RETRIVE tests
static int pFlag = 0;                   // pre-fault hash table pages into RAM
static int rFlag = 0;                   // do not randomize input file
static int aFlag = 0;                   // word align string buffers
static int DFlag = 0;                   // do the delete measurement
static int CFlag = 0;                   // build sequential Get buffers
static Word_t aCount = 0;               // Count of missaligned string buffers

//  define the maximum length of a string allowed
#define MAXSTRLEN       (100000)
static int MLength = MAXSTRLEN;
static Word_t HTblsz;                   // 1M default hash table size
static int fileidx;                     // argv[fileidx] == file string

// for saving input string data
typedef struct STRING_
{
    int       dt_strlen;
    uint8_t  *dt_string;

} dt_t   , *Pdt_t;

static Pdt_t PdtS_ = NULL;              // memory for Cache access Gets
static uint8_t *Strbuf_ = NULL;
static Word_t Strsiz_ = 0;

// Roundup BYTES to an even number of words

/*
 On Linux 2.6.3-4mdkenterprise (Mandrake 10.0 Community) printing (even
 to a file) makes the timings inaccurate.  So, use -L2 or greater to
 average (actually save min times) and print results after all tests are
 completed.
*/
#define Printf if (Pass == 0) printf

#define ROUNDUPWORD(BYTES) (((BYTES) + sizeof(Word_t) - 1) & (-sizeof(Word_t)))
#define BYTES2WORDS(BYTES) (((BYTES) + sizeof(Word_t) - 1) / (sizeof(Word_t)))

//=======================================================================
//      T I M I N G   M A C R O S
//=======================================================================

static double DeltaUSec;                // Global for remembering delta times

// Some operating systems have get_cycles() in /usr/include/asm/timex.h

#ifdef  CPUMHZ

//  For a 1.34 nS clock cycle processor (750Mhz)

#define CPUSPEED      (1.0 / (CPUMHZ))

src/judy-1.0.5/test/StringCompare.c  view on Meta::CPAN


}                                       // JudyFree()

// ****************************************************************************
// J U D Y   M A L L O C
//
// Higher-level "wrapper" for allocating objects that need not be in RAM,
// although at this time they are in fact only in RAM.  Later we hope that some
// entire subtrees (at a JPM or branch) can be "virtual", so their allocations
// and frees should go through this level.

Word_t
JudyMallocVirtual(Word_t Words)
{
    return (JudyMalloc(Words));

}                                       // JudyMallocVirtual()

// ****************************************************************************
// J U D Y   F R E E

void
JudyFreeVirtual(void *PWord, Word_t Words)
{
    JudyFree(PWord, Words);

}                                       // JudyFreeVirtual()

//=======================================================================
// Routine to get next size of Indexes
//=======================================================================

static int
NextNumb(Word_t *PNumber,               // pointer to returned next number
         double *PDNumb,                // Temp double of above
         double DMult,                  // Multiplier
         Word_t MaxNumb)                // Max number to return
{
//  Save prev number
    double    PrevPDNumb = *PDNumb;
    double    DDiff;

//  Calc next number >= 1.0 beyond previous
    do
    {
        *PDNumb *= DMult;
        DDiff = *PDNumb - PrevPDNumb;
    }
    while (DDiff < 0.5);

//  Return it in integer format
    if (DDiff < 100.0)
        *PNumber += (Word_t)(DDiff + 0.5);
    else
        *PNumber = (Word_t)(*PDNumb + 0.5);

//  Verify it did not exceed max number
    if (*PNumber >= MaxNumb)
    {
        *PNumber = MaxNumb;             // it did, so return max
        return (1);                     // flag it
    }
    return (0);                         // more available
}

//=======================================================================
//      M E A S U R E M E N T  S T R U C T U R E
//=======================================================================

typedef struct _MEASUREMENTS_STRUCT *Pms_t;
typedef struct _MEASUREMENTS_STRUCT
{
    Word_t    ms_delta;                 // number of points in current group
    double    ms_Bytes;                 // average allocated memory/per string
    double    ms_mininsert;             // Min Retrive number
    double    ms_minretrive;            // Min Retrive number
} ms_t;

static Pms_t Pms;                       // array of MEASUREMENTS_STRUCT

// Method type

typedef enum
{
    M_invalid,
    M_Print,
    M_Hash,
    M_JLHash,
    M_JudySL,
    M_JudyHS,
    M_Splay,
    M_Redblack,
    M_Ternary
} Method_t;

//=======================================================================
//      R a n d o m i z e  i n p u t  s t r i n g s
//=======================================================================

static void
Randomize(Pdt_t Pstrstr, Word_t Len)
{
    Word_t    ii;

// swap the "random" index with the sequential one
    for (ii = 1; ii < Len; ii++)
    {
        dt_t      dttemp;
        Word_t    swapii;

//      get "random" index
        swapii = (Word_t)rand() % Len;

//      and swap
        dttemp = Pstrstr[ii];
        Pstrstr[ii] = Pstrstr[swapii];
        Pstrstr[swapii] = dttemp;
    }
}

//=======================================================================

src/judy-1.0.5/test/StringCompare.c  view on Meta::CPAN

            }
            if (strcmp(optarg, "JLHash") == 0)
            {
                Method = M_JLHash;
                HTblsz = 0;             // max 2^32
            }
            if (strcmp(optarg, "JudySL") == 0)
                Method = M_JudySL;
            if (strcmp(optarg, "Splay") == 0)
                Method = M_Splay;
            if (strcmp(optarg, "Redblack") == 0)
                Method = M_Redblack;
            if (strcmp(optarg, "JudyHS") == 0)
                Method = M_JudyHS;
            if (strcmp(optarg, "Ternary") == 0)
                Method = M_Ternary;
            break;

        case 'H':                      // Size of Hash table
            HTblsz = strtoul(optarg, NULL, 0);
            break;

        case 'L':                      // Number of Loops
            Passes = atoi(optarg);
            if (Passes <= 0)
            {
                printf("\n !! OOps - Number of Loops must be > 0\n");
                ErrorFlag++;
            }
            break;

        case 'n':                      // Max population of arrays
            nStrg = strtoul(optarg, NULL, 0);   // Size of Linear Array
            if (nStrg == 0)
            {
                printf("\n !! OOps - Number of strings must be > 0\n");
                ErrorFlag++;
            }
            break;

        case 'T':                      // Maximum retrieve tests for timing 
            TValues = strtoul(optarg, NULL, 0);
            break;

        case 'P':                      // measurement points per decade
            PtsPdec = strtoul(optarg, NULL, 0);
            break;

        case 'M':                      // maximum length of input string
            MLength = atoi(optarg);
            break;

        case 'p':                      // pre-initialize Hash table
            pFlag = 1;
            break;

        case 'r':                      // do not randomize input
            if (CFlag)
            {
                printf
                    ("\n !! OOps '-r' and '-C' flag are mutually exclusive\n");
                ErrorFlag++;
                break;
            }
            rFlag = 1;
            break;

        case 'a':                      // word align string buffers
            aFlag = 1;
            break;

        case 'D':                      // do a delete at end
            DFlag = 1;
            break;

        case 'C':                      // build sequential Get string buffers
            if (rFlag)
            {
                printf
                    ("\n !! OOps '-C' and '-r' flag are mutually exclusive\n");
                ErrorFlag++;
                break;
            }
            CFlag = 1;
            break;

        default:
            ErrorFlag++;
            break;
        }
    }
    if (Method == -1)
    {
        printf
            ("\n !! OOps -- '-A <ADT>' I.E. '-AHash' or '-AJudyHS' is a required option\n");
        ErrorFlag++;
    }

    fileidx = optind;
    if (optind >= argc)
    {
        printf("\n !! OOps -- No input file specified\n");
        ErrorFlag++;
    }

    if (ErrorFlag)
    {
        printf("\n");
        printf("$ %s -A<ADT> -n# -H# -P# -T# -p -r -a InputStringFile\n\n",
               argv[0]);
        printf("Where: ");
        printf("'InputStringFile' is text file of strings to use in test\n\n");
        printf
            ("-A <ADT> is Hash|JLHash|JudySL|JudyHS|Splay|Redblack|Ternary|Print\n");
        printf("\n");
        printf("-n <#> max number of strings to use in tests (all)\n");
        printf("-H <#> is number elements in Hash table\n");
        printf("-P <#> number of measurement points per decade (40)\n");
        printf
            ("-T <#> Change the 'Get' number_of_strings to measure per data point\n");
        printf("-D     Use 'Delete' routine instead of 'FreeArray' routine\n");
        printf("-p     pre-zero hash table to fault in all pages\n");
        printf("-r     Do not randomize Insert and Get order of strings\n");
        printf("-C     Build contigious string buffers for 'Get' tests\n");
        printf("-a     Word_t align the start address of input strings\n");
        printf("-M <#> Change the maximum 'strlen(String)' of input Strings\n");
        printf("\n\n");

        exit(1);
    }

//  calculate max number mask used in hash routine

//============================================================
//  PRINT COMMAND NAME + RUN ARGUMENTS
//============================================================

    printf("# %s", argv[0]);
    if (nStrg != INFSTRGS)
        printf(" -n%lu", nStrg);

src/judy-1.0.5/test/StringCompare.c  view on Meta::CPAN

    if (Method == M_JLHash)
    {
        if (HTblsz)
            printf("# JLHash table virtual size = %lu\n", HTblsz);
        else
            printf("# JLHash table virtual size = 4294967296\n");
    }

//=======================================================================
// Read text input file into RAM
//=======================================================================

    if ((fid = fopen(argv[fileidx], "r")) == NULL)
        FILERROR;

    for (Strlen = LineCnt = 0; LineCnt < nStrg;)
    {
        Chr = fgetc(fid);
        if (Chr == '\n')
        {
            if (Strlen)                 // eat zero length lines
            {
                if (Strlen > MLength)
                    Strlen = MLength;
                Pdt[LineCnt].dt_string = PCurStr - Strlen;
                Pdt[LineCnt].dt_strlen = Strlen;
                LineCnt++;

                Strlen = 0;
                *PCurStr++ = '\0';      // for JudySL
                if (aFlag)              // for word alignment
                    PCurStr = (uint8_t *) ROUNDUPWORD((Word_t)PCurStr);

                if ((Word_t)PCurStr % sizeof(Word_t))
                    aCount++;
            }
        }
        else
        {
            if (Strlen < MLength)
            {
                Strlen++;
                if (Chr == '\0')
                    Chr = ' ';          // for JudySL
                *PCurStr++ = (uint8_t) Chr;
            }
        }
    }
    fclose(fid);
    fid = NULL;
    assert(nStrg == LineCnt);

    printf("# %lu (%.1f%%) non-Word_t aligned string buffers\n",
           aCount, (double)aCount / (double)LineCnt * 100.0);

    printf("# Ram used for input data = %lu bytes\n", StringMemory);

    printf("# Average string length = %.1f bytes\n",
           (double)(StrTot - LineCnt) / LineCnt);

// Allocate memory for Cached assess to 'Get' (largest delta). This flag
// will put the 'randomized' 'Get' order strings in a sequential buffer.
// Modern processors will 'read ahead' with an access to RAM is sequential
// -- thus saving the 'Get' having to bring the string into cache.
    if (CFlag)
    {
        PdtS_ = (Pdt_t) malloc(TValues * sizeof(dt_t));
        if (PdtS_ == NULL)
            MALLOCERROR;

//      now guess how much memory will be needed for the strings
        Strsiz_ = ((StrTot / nStrg) * TValues);
        Strsiz_ += Strsiz_;             // bump %20

        Strbuf_ = (uint8_t *) malloc(Strsiz_);
        if (Strbuf_ == NULL)
            MALLOCERROR;

        printf
            ("# %lu bytes malloc() for 'cached' strings for Get measurement\n",
             Strsiz_);
    }

//=======================================================================
//  TIME GETSTRING() from Cache (most of the time)
//=======================================================================

    STARTTm(tm);                        // start timer
    for (LineCnt = 0; LineCnt < nStrg; LineCnt++)
    {
        GETSTRING(PCurStr, Strlen);
        Strlen = Pdt[LineCnt].dt_strlen;
        PCurStr = Pdt[LineCnt].dt_string;

        if (strlen(PCurStr) != Strlen)  // bring string into Cache
        {
//          necessary to prevent cc from optimizing out
            printf(" !! OOps Bug, wrong string length\n");
            exit(1);
        }
    }
    ENDTm(DeltaUSec, tm);               // end timer

    printf
        ("# Access Time    = %6.3f uS average per string (mostly from Cache)\n",
         DeltaUSec / nStrg);

//=======================================================================
//  TIME GETSTRING() + HASHSTR() from Cache (most of the time)
//=======================================================================

    STARTTm(tm);                        // start timer
    for (LineCnt = 0; LineCnt < nStrg; LineCnt++)
    {
        uint32_t  hval;
        GETSTRING(PCurStr, Strlen);
        PCurStr = Pdt[LineCnt].dt_string;
        Strlen = Pdt[LineCnt].dt_strlen;
        hval = HASHSTR(PCurStr, Strlen, HTblsz);
        if (foolflag)
            printf("OOps foolflag is set, hval = %d\n", hval);
    }
    ENDTm(DeltaUSec, tm);               // end timer

    printf
        ("# HashStr() Time = %6.3f uS average per string (mostly from Cache)\n",
         DeltaUSec / nStrg);

//  randomize the input strings (adjacent strings will not be on same page)

    if (rFlag == 0)
    {
        Randomize(Pdt, nStrg);          // Randomize ALL to be stored

//=======================================================================
//  TIME GETSTRING() from RAM (most of the time)
//=======================================================================

        STARTTm(tm);                    // start timer
        for (LineCnt = 0; LineCnt < nStrg; LineCnt++)
        {
            GETSTRING(PCurStr, Strlen);
            Strlen = Pdt[LineCnt].dt_strlen;
            PCurStr = Pdt[LineCnt].dt_string;

            if (strlen(PCurStr) != Strlen)      // bring string into Cache
            {
//              necessary to prevent cc from optimizing out
                printf(" !! OOps Bug, wrong string length\n");
                exit(1);
            }
        }
        ENDTm(DeltaUSec, tm);           // end timer

        printf
            ("# Access Time    = %6.3f uS average per string (mostly from RAM)\n",
             DeltaUSec / nStrg);

//=======================================================================
//  TIME GETSTRING() + HASHSTR() from RAM (most of the time)
//=======================================================================

        STARTTm(tm);                    // start timer
        for (LineCnt = 0; LineCnt < nStrg; LineCnt++)
        {
            uint32_t  hval;
            GETSTRING(PCurStr, Strlen);
            Strlen = Pdt[LineCnt].dt_strlen;
            PCurStr = Pdt[LineCnt].dt_string;
            hval = HASHSTR(PCurStr, Strlen, HTblsz);
            if (foolflag)
                printf("OOps foolflag is set, hval = %u\n", hval);
        }
        ENDTm(DeltaUSec, tm);           // end timer

        printf
            ("# HashStr() Time = %6.3f uS average per string (mostly from RAM)\n",
             DeltaUSec / nStrg);
    }

//=======================================================================
//  Insert, Get and Delete loops
//=======================================================================

    for (Pass = 0; Pass < Passes; Pass++)
    {
        printf("# Pass %d\n", Pass);

//      heading of table 
        Printf
            ("# TotInserts  DeltaGets  DupStrs InsTime GetTime HChainLen Ram/String\n");
        gStored = 0;                    // number of strings inserted
        StrNumb = 0;                    // number of attempted strings inserted

        STARTmem;                       // current malloc() mem usage
        for (grp = 0; grp < Groups; grp++)
        {
            PWord_t   PValue;
            Word_t    Begin = gStored;  // remember current STOREed
            Word_t    Delta = Pms[grp].ms_delta;

            switch (Method)
            {
            case M_Print:
            {
                STARTTm(tm);            // start timer
                for (lines = 0; lines < Delta; lines++, StrNumb++)
                {
                    GETSTRING(PCurStr, Strlen);
                    PCurStr = Pdt[StrNumb].dt_string;
                    Printf("%s\n", (char *)PCurStr);
                }
                ENDTm(DeltaUSec, tm);   // end timer
                break;
            }
            case M_Hash:
            {
                STARTTm(tm);            // start timer
                for (lines = 0; lines < Delta; lines++, StrNumb++)
                {
                    GETSTRING(PCurStr, Strlen);
                    Strlen = Pdt[StrNumb].dt_strlen;
                    PCurStr = Pdt[StrNumb].dt_string;

                    PValue = HashIns(&HRoot, PCurStr, Strlen, HTblsz);
                    if ((*PValue)++ == 0)
                        gStored++;      // number of strings stored
                }
                ENDTm(DeltaUSec, tm);   // end timer
                break;
            }
            case M_JLHash:

src/judy-1.0.5/test/StringCompare.c  view on Meta::CPAN

            case M_Redblack:
            {
                STARTTm(tm);            // start timer
                for (lines = 0; lines < ReadLin; lines++)
                {
                    GETSTRING(PCurStr, Strlen);
                    PCurStr = Pdts[lines].dt_string;

                    redblacksearch(&rbans, (char *)PCurStr);
                }
                ENDTm(DeltaUSec, tm);   // end timer
                break;
            }
            case M_Ternary:
            {
                STARTTm(tm);            // start timer
                for (lines = 0; lines < ReadLin; lines++)
                {
                    GETSTRING(PCurStr, Strlen);
                    Strlen = Pdts[lines].dt_strlen;
                    PCurStr = Pdts[lines].dt_string;

                    if (TernaryGet(Ternary, PCurStr, Strlen) == 0)
                    {
                        printf("\n OOps - Ternary Bug at Line = %d\n",
                               __LINE__);
                        exit(1);
                    }
                }
                ENDTm(DeltaUSec, tm);   // end timer
                break;
            }
            default:
                assert(0);              // cant happen
                break;
            }
            Mult = DeltaUSec / (double)ReadLin;

//          save least value
            if (Mult < Pms[grp].ms_minretrive)
                Pms[grp].ms_minretrive = Mult;

            Printf(" %7.3f", Mult);     // RETRIVE per string
            Printf(" %9.6f", (double)gChainln / (double)ReadLin);

//      RAM USED PER STRING TO STORE DATA

            Printf(" %13.1f", (double)Pms[grp].ms_Bytes);
            Printf("\n");
            fflush(stdout);
        }
        if (Method == M_Print)
            exit(0);

        Printf("# Total Duplicate strings = %lu\n", nStrg - gStored);

//=======================================================================
//  Delete loop
//=======================================================================

        DeltaUSec = -1.0;               // set deleted flag

        if (rFlag == 0)
        {
            Randomize(Pdt, StrNumb);    // Randomize ONLY those stored
        }
        switch (Method)
        {
        case M_JudySL:
        {
            if (DFlag)
            {
                Printf("# Begin JudySLDel() loop...\n");
                STARTTm(tm);            // start timer
                for (lines = 0; lines < nStrg; lines++)
                {
                    int       Rc;
                    GETSTRING(PCurStr, Strlen);
                    PCurStr = Pdt[lines].dt_string;
                    JSLD(Rc, JudySL, PCurStr);  // delete string
                    assert(Rc != JERR);
                }
                ENDTm(DeltaUSec, tm);   // end timer
            }
            else
            {
                Printf("# Begin JudySLFreeArray()...\n");
                STARTTm(tm);            // start timer
                JSLFA(Bytes, JudySL);
                ENDTm(DeltaUSec, tm);   // end timer
            }
            break;
        }
        case M_JudyHS:
        {
            if (DFlag)
            {
                int       Rc;
                Printf("# Begin JudyHSDel() loop...");
                STARTTm(tm);            // start timer
                for (lines = 0; lines < nStrg; lines++)
                {
                    GETSTRING(PCurStr, Strlen);
                    Strlen = Pdt[lines].dt_strlen;
                    PCurStr = Pdt[lines].dt_string;
                    JHSD(Rc, JudyHS, PCurStr, Strlen);  // Delete string
                    assert(Rc != JERR);
                }
                ENDTm(DeltaUSec, tm);   // end timer
            }
            else
            {
                Printf("# Begin JudyHSFreeArray()...\n");
                STARTTm(tm);            // start timer
                JHSFA(Bytes, JudyHS);
                ENDTm(DeltaUSec, tm);   // end timer
            }
            break;
        }
        case M_Hash:
        {



( run in 0.866 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )