Alien-Judy

 view release on metacpan or  search on metacpan

src/judy-1.0.5/test/SLcompare.c  view on Meta::CPAN

performance of JudySL with short (a text word) strings.  I have a data 
base of all (28.8 million) domain names on the internet as a test case.

wc /data/domnames.txt
28826508 28826508 488227095 /data/domnames.txt

For the curious, JudySL (I.E JSLI/G macros in this program) is simply an
application subroutine that uses JudyL to the work.  JudyL is a very
scaleable word to word (or pointer) mapper.  JudySL is implemented as a
digital tree using JudyL arrays as 2^32 (or 2^64 in 64 bit machines)
wide (virtual) nodes.  Each level in the digital tree decodes the next 4
(or 8) bytes of the line/string until only one entry would be in the
node.  Then the remaining undecoded portion of the line is stored as a
string from the parent node.  A digital tree does not need rotation or
balancing.  In practice, digital trees are rarely use because of their
poor memory efficiency in the nodes and a tendency to have large depths.
These problems are solved in the Judy algorithm.  For more details see
application notes on:  www.sourcejudy.com.  And for the really curious,
a technical paper is available in the application section.  If you would
like to be a reviewer, contact Doug Baskins at <doug@sourcejudy.com>

*/

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>              // printf
#include <fcntl.h>
#include <string.h>
#include <errno.h>              // errno
#include <sys/mman.h>           // mmap()
#include <sys/stat.h>           // stat()
#include <sys/time.h>           // gettimeofday()

// remove all uses of this in production code.
int       gDupCnt = 0;          // counter for duplicate strings

//=======================================================================
//      T I M I N G   M A C R O S
//=======================================================================
// if your machine is one of the supported types in the following header
// file then uncomment this corresponding to what the header file says.
// This will give very high timing resolution.
//
// #define JU_xxxxxxx 1         // read timeit.h
// #define JU_LINUX_IA32 1      // I.E. IA32 Linux
//
//
// optional for high resolution times and compile with timeit.c
//   cc -static -O -o Judy -DJUDYMETHOD SLcompare.c timeit.c -lJudy
//
//#include "timeit.h"

double    DeltaUSec;            // Global for remembering delta times

#ifndef _TIMEIT_H

// Note: I have found some Linux systems (2.4.18-6mdk) to have bugs in the 
// gettimeofday() routine.  Sometimes it returns a negative ~2840 microseconds
// instead of 0 or 1.  If you use the above #include "timeit.h" and compile with
// timeit.c and use -DJU_LINUX_IA32, that problem will be eliminated.  This is
// because for delta times less than .1 sec, the hardware free running timer
// is used instead of gettimeofday().  I have found the negative time problem
// appears about 40-50 times per second with consecutive gettimeofday() calls.

#define TIMER_vars(T) struct timeval __TVBeg_##T, __TVEnd_##T

#define STARTTm(T) gettimeofday(&__TVBeg_##T, NULL)

#define ENDTm(D,T)                                                      \
{                                                                       \
    gettimeofday(&__TVEnd_##T, NULL);                                   \
    (D) = (double)(__TVEnd_##T.tv_sec  - __TVBeg_##T.tv_sec) * 1E6 +    \
         ((double)(__TVEnd_##T.tv_usec - __TVBeg_##T.tv_usec));         \
}

#endif // _TIMEIT_H

//=======================================================================
//      M E M O R Y   S I Z E   M A C R O S
//=======================================================================
//      Most mallocs have mallinfo()

// define this if your malloc does not have mallinfo();
#define NOMALLINFO 1

double    DeltaMem;             // for remembering

#ifndef NOMALLINFO

#include <malloc.h>             // mallinfo()

struct mallinfo malStart;

#define STARTmem malStart = mallinfo() /* works with some mallocs */
#define ENDmem                                                  \
{                                                               \
    struct mallinfo malEnd = mallinfo();                        \
/* strange little dance from signed to unsigned to double */    \
    unsigned int _un_int = malEnd.arena - malStart.arena;       \
    DeltaMem = _un_int;      /* to double */                    \
}

#else // MALLINFO

// this usually works for machines with less than 1-2Gb RAM.
// (it does not include memory ACQUIRED by mmap())

char     *malStart;

#define STARTmem (malStart = (char *)sbrk(0))
#define ENDmem                                                  \
{                                                               \
    char  *malEnd =  (char *)sbrk(0);                           \
    DeltaMem = malEnd - malStart;                               \
}

#endif // MALLINFO


//=======================================================================
//      G E T S T R I N G   F R O M   mmap()   F I L E   M A C R O

src/judy-1.0.5/test/SLcompare.c  view on Meta::CPAN

        }
        if (curr->par == NULL)
            ans->root = curr;
        ans->root->colour = BLACK;
    }
    else
        gDupCnt++;              /* count duplicate strings */

    return;
}

/* Create a node to hold a word */
TREEREC  *
wcreate(char *word, TREEREC * par)
{
    TREEREC  *tmp;

    tmp = (TREEREC *) malloc(sizeof(TREEREC));
    tmp->word = (char *)malloc(strlen(word) + 1);
    strcpy(tmp->word, word);
    tmp->left = tmp->right = NULL;

    tmp->par = par;

    return (tmp);
}

#define INITARRAY ANSREC      ans = {0}

#define STORESTRING(STRING)   redblackinsert(&ans, STRING)

#define GETSTRING(STRING)     redblacksearch(&ans, STRING)

#endif // REDBLACKMETHOD

//=======================================================================
//      M A I N   P R O G R A M  -by-  Doug Baskins
//=======================================================================

// error routine for system routines for accessing file

#define FILERROR                                                        \
{                                                                       \
    printf("%s: Cannot open file \"%s\": %s "                           \
		"(errno = %d)\n", argv[0], argv[1], strerror(errno),    \
		errno);                                                 \
    fprintf(stderr, "%s: Cannot open file \"%s\": %s "                  \
		"(errno = %d)\n", argv[0], argv[1], strerror(errno),    \
		errno);                                                 \
    exit(1);                                                            \
}


//=======================================================================
//      M E A S U R E  A D T   S P E E D  and  M E M O R Y  U S A G E
//=======================================================================

int
main(int argc, char *argv[])
{
    TIMER_vars(tm);             // declare timer variables
    int       fd;               // to read file.
    struct stat statbuf;        // to get size of file
    char     *Pfile;            // ram address of file
    size_t    fsize;            // file size in bytes
    char     *FSmap;            // start address of mapped file
    char     *FEmap;            // end address+1 of mapped file
    char      String[MAXLINE];  // input buffer
    int       StrCnt;           // line counter
    int       ii;               // temp

    INITARRAY;

// CHECK FOR REQUIRED INPUT FILE PARAMETER:

    if (argc != 2)
    {
        printf("Usage: %s <text file>\n", argv[0]);
        exit(1);
    }

// GET FILE SIZE

    if (stat(argv[1], &statbuf) == -1)
        FILERROR;

    fsize = statbuf.st_size;

// OPEN INPUT FILE:

    if ((fd = open(argv[1], O_RDONLY)) == -1)
        FILERROR;

// MEMORY MAP FILE

    Pfile =
        (char *)mmap(NULL, fsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    if (Pfile == (char *)-1)
        FILERROR;

    FEmap = Pfile + fsize;      // set end+1 address

// print command name + run arguments

    printf("# ");
    for (ii = 0; ii < argc; ii++)
        printf("%s ", argv[ii]);
    printf("\n");
    fflush(stdout);

// file is in RAM, read again to measure time

    STARTTm(tm);                // start timer

    for (StrCnt = 0, FSmap = Pfile; FSmap < FEmap; )
    {
        GETLINE(String, FSmap); // 'read' next string

        StrCnt++;
    }
    ENDTm(DeltaUSec, tm);       // end timer
//
//  print header - 6 fields

    printf("#  lines avg_linelen  getline   stored");
    printf(" RAMused/line  store/ln lookup/ln ADT\n");

//  print numb lines  filesize/line

    printf("%8u", StrCnt);      // Number of lines
    printf(" %6.1f byts", (double)fsize / (double)StrCnt); // filesize/line
    fflush(stdout);

//  print time to read a line

    printf(" %5.3f uS", DeltaUSec / (double)StrCnt);
    fflush(stdout);

// INPUT/STORE LOOP:

// read each input line into String and store into ADT

    STARTmem;                   // current malloc() mem usage
    STARTTm(tm);                // start timer

    for (FSmap = Pfile; FSmap < FEmap; )
    {
        GETLINE(String, FSmap);  // 'read' next string

        STORESTRING(String);
    }
    ENDTm(DeltaUSec, tm);       // end timer
    ENDmem;                     // current malloc() mem usage

//  print number of non-duplicate lines

    printf(" %8u", StrCnt - gDupCnt); // duplicate lines

//  print RAM used by malloc() by ADT to store data

    printf(" %7.1f byts", (double)(DeltaMem + MEMOVD) /
            (double)(StrCnt - gDupCnt));

//  print time per line to store the data

    printf(" %6.3f uS", DeltaUSec / (double)StrCnt);
    fflush(stdout);

// READ BACK LOOP

    STARTTm(tm);                // start timer

    for (FSmap = Pfile; FSmap < FEmap; )
    {
        GETLINE(String, FSmap);  // 'read' next string

        GETSTRING(String);
    }
    ENDTm(DeltaUSec, tm);       // end timer

//  print time per line to lookup the data from the ADT

    printf(" %6.3f uS", DeltaUSec / (double)StrCnt); // store time/line
    printf(" %s\n", ADTMETHOD);

    return (0);                 // done

}                               // main()



( run in 0.586 second using v1.01-cache-2.11-cpan-39bf76dae61 )