Alien-Judy
view release on metacpan or search on metacpan
src/judy-1.0.5/test/StringCompare.c view on Meta::CPAN
very scaleable because the hash table size is ~4 billion.
4) JudySL - ordered and requires null terminated strings.
5) Ternary - code borrowed from Mr Dobbs, perhaps 2000
6) Redblack - code borrowed from J. Zobel, April 2001.
7) Splay - code borrowed from J. Zobel, April 2001.
Note: Splay, Redblack and Ternary methods are not very fast, so they
have not been completed and made bug free. I.E. no Delete and Free
Array routines. Ternary is fastest retrieve of these three, but uses
an extraordinary amount of memory.
*/
//=======================================================================
//
// Compile:
//
// cc -O StringCompare.c -lm -lJudy -o StringCompare
// -or-
// cc -O -DCPUMHZ=1299 StringCompare.c -lm -lJudy -o StringCompare
/* Notes:
1) Use '-DCPUMHZ=1299' in cc line if better clock resolution is desired
and it compiles successfully. The 1299 is the cpu MHz : 1298.916 from
cat /proc/cpuinfo in a Linux system.
2) -static will generally get better performance because memcmp(),
memcpy() routines are usually slower with shared librarys.
*/
// Usage:
//
// StringCompare -A Hash <options> textfile
// StringCompare -A JudyHS <options> textfile
// StringCompare -A JLHash <options> textfile
// StringCompare -A JudySL <options> textfile
// StringCompare -A Ternary <options> textfile
// StringCompare -A Redblack <options> textfile
// StringCompare -A Splay <options> textfile
#include <stdlib.h> // malloc(3)
#include <unistd.h> // getopt(3)
#include <stdio.h> // printf(3)
#include <fcntl.h> // open(2)
#include <string.h> // memcmp(3), memcpy(3)
#include <errno.h> // errno(3)
#include <sys/mman.h> // mmap(2)
#include <sys/time.h> // gettimeofday(2)
#include <math.h> // pow(3)
#include <sys/utsname.h> // uname(2)
#include <Judy.h> // Judy arrays
//=======================================================================
// D e f i n e: T O T U R N O F F A S S E R T I O N S !!!!!!!!
//=======================================================================
//#define NDEBUG 1
#include <assert.h> // assert(3)
//=======================================================================
// G L O B A L D A T A
//=======================================================================
int foolflag = 0; // fool compiler from optimizing
static Word_t gStored = 0; // number of strings inserted
static Word_t gChainln = 0; // links traversed during RETRIVE
static Word_t PtsPdec = 40; // default measurement points per decade
#define INFSTRGS 1000000000 // 1 billion strings is infinity
static Word_t nStrg = INFSTRGS; // infinity -- measure all strings
static Word_t TValues = 100000; // max measure points for RETRIVE tests
static int pFlag = 0; // pre-fault hash table pages into RAM
static int rFlag = 0; // do not randomize input file
static int aFlag = 0; // word align string buffers
static int DFlag = 0; // do the delete measurement
static int CFlag = 0; // build sequential Get buffers
static Word_t aCount = 0; // Count of missaligned string buffers
// define the maximum length of a string allowed
#define MAXSTRLEN (100000)
static int MLength = MAXSTRLEN;
static Word_t HTblsz; // 1M default hash table size
static int fileidx; // argv[fileidx] == file string
// for saving input string data
typedef struct STRING_
{
int dt_strlen;
uint8_t *dt_string;
} dt_t , *Pdt_t;
static Pdt_t PdtS_ = NULL; // memory for Cache access Gets
static uint8_t *Strbuf_ = NULL;
static Word_t Strsiz_ = 0;
// Roundup BYTES to an even number of words
/*
On Linux 2.6.3-4mdkenterprise (Mandrake 10.0 Community) printing (even
to a file) makes the timings inaccurate. So, use -L2 or greater to
average (actually save min times) and print results after all tests are
completed.
*/
#define Printf if (Pass == 0) printf
#define ROUNDUPWORD(BYTES) (((BYTES) + sizeof(Word_t) - 1) & (-sizeof(Word_t)))
#define BYTES2WORDS(BYTES) (((BYTES) + sizeof(Word_t) - 1) / (sizeof(Word_t)))
//=======================================================================
// T I M I N G M A C R O S
//=======================================================================
static double DeltaUSec; // Global for remembering delta times
// Some operating systems have get_cycles() in /usr/include/asm/timex.h
#ifdef CPUMHZ
// For a 1.34 nS clock cycle processor (750Mhz)
#define CPUSPEED (1.0 / (CPUMHZ))
src/judy-1.0.5/test/StringCompare.c view on Meta::CPAN
} // JudyFree()
// ****************************************************************************
// J U D Y M A L L O C
//
// Higher-level "wrapper" for allocating objects that need not be in RAM,
// although at this time they are in fact only in RAM. Later we hope that some
// entire subtrees (at a JPM or branch) can be "virtual", so their allocations
// and frees should go through this level.
Word_t
JudyMallocVirtual(Word_t Words)
{
return (JudyMalloc(Words));
} // JudyMallocVirtual()
// ****************************************************************************
// J U D Y F R E E
void
JudyFreeVirtual(void *PWord, Word_t Words)
{
JudyFree(PWord, Words);
} // JudyFreeVirtual()
//=======================================================================
// Routine to get next size of Indexes
//=======================================================================
static int
NextNumb(Word_t *PNumber, // pointer to returned next number
double *PDNumb, // Temp double of above
double DMult, // Multiplier
Word_t MaxNumb) // Max number to return
{
// Save prev number
double PrevPDNumb = *PDNumb;
double DDiff;
// Calc next number >= 1.0 beyond previous
do
{
*PDNumb *= DMult;
DDiff = *PDNumb - PrevPDNumb;
}
while (DDiff < 0.5);
// Return it in integer format
if (DDiff < 100.0)
*PNumber += (Word_t)(DDiff + 0.5);
else
*PNumber = (Word_t)(*PDNumb + 0.5);
// Verify it did not exceed max number
if (*PNumber >= MaxNumb)
{
*PNumber = MaxNumb; // it did, so return max
return (1); // flag it
}
return (0); // more available
}
//=======================================================================
// M E A S U R E M E N T S T R U C T U R E
//=======================================================================
typedef struct _MEASUREMENTS_STRUCT *Pms_t;
typedef struct _MEASUREMENTS_STRUCT
{
Word_t ms_delta; // number of points in current group
double ms_Bytes; // average allocated memory/per string
double ms_mininsert; // Min Retrive number
double ms_minretrive; // Min Retrive number
} ms_t;
static Pms_t Pms; // array of MEASUREMENTS_STRUCT
// Method type
typedef enum
{
M_invalid,
M_Print,
M_Hash,
M_JLHash,
M_JudySL,
M_JudyHS,
M_Splay,
M_Redblack,
M_Ternary
} Method_t;
//=======================================================================
// R a n d o m i z e i n p u t s t r i n g s
//=======================================================================
static void
Randomize(Pdt_t Pstrstr, Word_t Len)
{
Word_t ii;
// swap the "random" index with the sequential one
for (ii = 1; ii < Len; ii++)
{
dt_t dttemp;
Word_t swapii;
// get "random" index
swapii = (Word_t)rand() % Len;
// and swap
dttemp = Pstrstr[ii];
Pstrstr[ii] = Pstrstr[swapii];
Pstrstr[swapii] = dttemp;
}
}
//=======================================================================
src/judy-1.0.5/test/StringCompare.c view on Meta::CPAN
}
if (strcmp(optarg, "JLHash") == 0)
{
Method = M_JLHash;
HTblsz = 0; // max 2^32
}
if (strcmp(optarg, "JudySL") == 0)
Method = M_JudySL;
if (strcmp(optarg, "Splay") == 0)
Method = M_Splay;
if (strcmp(optarg, "Redblack") == 0)
Method = M_Redblack;
if (strcmp(optarg, "JudyHS") == 0)
Method = M_JudyHS;
if (strcmp(optarg, "Ternary") == 0)
Method = M_Ternary;
break;
case 'H': // Size of Hash table
HTblsz = strtoul(optarg, NULL, 0);
break;
case 'L': // Number of Loops
Passes = atoi(optarg);
if (Passes <= 0)
{
printf("\n !! OOps - Number of Loops must be > 0\n");
ErrorFlag++;
}
break;
case 'n': // Max population of arrays
nStrg = strtoul(optarg, NULL, 0); // Size of Linear Array
if (nStrg == 0)
{
printf("\n !! OOps - Number of strings must be > 0\n");
ErrorFlag++;
}
break;
case 'T': // Maximum retrieve tests for timing
TValues = strtoul(optarg, NULL, 0);
break;
case 'P': // measurement points per decade
PtsPdec = strtoul(optarg, NULL, 0);
break;
case 'M': // maximum length of input string
MLength = atoi(optarg);
break;
case 'p': // pre-initialize Hash table
pFlag = 1;
break;
case 'r': // do not randomize input
if (CFlag)
{
printf
("\n !! OOps '-r' and '-C' flag are mutually exclusive\n");
ErrorFlag++;
break;
}
rFlag = 1;
break;
case 'a': // word align string buffers
aFlag = 1;
break;
case 'D': // do a delete at end
DFlag = 1;
break;
case 'C': // build sequential Get string buffers
if (rFlag)
{
printf
("\n !! OOps '-C' and '-r' flag are mutually exclusive\n");
ErrorFlag++;
break;
}
CFlag = 1;
break;
default:
ErrorFlag++;
break;
}
}
if (Method == -1)
{
printf
("\n !! OOps -- '-A <ADT>' I.E. '-AHash' or '-AJudyHS' is a required option\n");
ErrorFlag++;
}
fileidx = optind;
if (optind >= argc)
{
printf("\n !! OOps -- No input file specified\n");
ErrorFlag++;
}
if (ErrorFlag)
{
printf("\n");
printf("$ %s -A<ADT> -n# -H# -P# -T# -p -r -a InputStringFile\n\n",
argv[0]);
printf("Where: ");
printf("'InputStringFile' is text file of strings to use in test\n\n");
printf
("-A <ADT> is Hash|JLHash|JudySL|JudyHS|Splay|Redblack|Ternary|Print\n");
printf("\n");
printf("-n <#> max number of strings to use in tests (all)\n");
printf("-H <#> is number elements in Hash table\n");
printf("-P <#> number of measurement points per decade (40)\n");
printf
("-T <#> Change the 'Get' number_of_strings to measure per data point\n");
printf("-D Use 'Delete' routine instead of 'FreeArray' routine\n");
printf("-p pre-zero hash table to fault in all pages\n");
printf("-r Do not randomize Insert and Get order of strings\n");
printf("-C Build contigious string buffers for 'Get' tests\n");
printf("-a Word_t align the start address of input strings\n");
printf("-M <#> Change the maximum 'strlen(String)' of input Strings\n");
printf("\n\n");
exit(1);
}
// calculate max number mask used in hash routine
//============================================================
// PRINT COMMAND NAME + RUN ARGUMENTS
//============================================================
printf("# %s", argv[0]);
if (nStrg != INFSTRGS)
printf(" -n%lu", nStrg);
src/judy-1.0.5/test/StringCompare.c view on Meta::CPAN
if (Method == M_JLHash)
{
if (HTblsz)
printf("# JLHash table virtual size = %lu\n", HTblsz);
else
printf("# JLHash table virtual size = 4294967296\n");
}
//=======================================================================
// Read text input file into RAM
//=======================================================================
if ((fid = fopen(argv[fileidx], "r")) == NULL)
FILERROR;
for (Strlen = LineCnt = 0; LineCnt < nStrg;)
{
Chr = fgetc(fid);
if (Chr == '\n')
{
if (Strlen) // eat zero length lines
{
if (Strlen > MLength)
Strlen = MLength;
Pdt[LineCnt].dt_string = PCurStr - Strlen;
Pdt[LineCnt].dt_strlen = Strlen;
LineCnt++;
Strlen = 0;
*PCurStr++ = '\0'; // for JudySL
if (aFlag) // for word alignment
PCurStr = (uint8_t *) ROUNDUPWORD((Word_t)PCurStr);
if ((Word_t)PCurStr % sizeof(Word_t))
aCount++;
}
}
else
{
if (Strlen < MLength)
{
Strlen++;
if (Chr == '\0')
Chr = ' '; // for JudySL
*PCurStr++ = (uint8_t) Chr;
}
}
}
fclose(fid);
fid = NULL;
assert(nStrg == LineCnt);
printf("# %lu (%.1f%%) non-Word_t aligned string buffers\n",
aCount, (double)aCount / (double)LineCnt * 100.0);
printf("# Ram used for input data = %lu bytes\n", StringMemory);
printf("# Average string length = %.1f bytes\n",
(double)(StrTot - LineCnt) / LineCnt);
// Allocate memory for Cached assess to 'Get' (largest delta). This flag
// will put the 'randomized' 'Get' order strings in a sequential buffer.
// Modern processors will 'read ahead' with an access to RAM is sequential
// -- thus saving the 'Get' having to bring the string into cache.
if (CFlag)
{
PdtS_ = (Pdt_t) malloc(TValues * sizeof(dt_t));
if (PdtS_ == NULL)
MALLOCERROR;
// now guess how much memory will be needed for the strings
Strsiz_ = ((StrTot / nStrg) * TValues);
Strsiz_ += Strsiz_; // bump %20
Strbuf_ = (uint8_t *) malloc(Strsiz_);
if (Strbuf_ == NULL)
MALLOCERROR;
printf
("# %lu bytes malloc() for 'cached' strings for Get measurement\n",
Strsiz_);
}
//=======================================================================
// TIME GETSTRING() from Cache (most of the time)
//=======================================================================
STARTTm(tm); // start timer
for (LineCnt = 0; LineCnt < nStrg; LineCnt++)
{
GETSTRING(PCurStr, Strlen);
Strlen = Pdt[LineCnt].dt_strlen;
PCurStr = Pdt[LineCnt].dt_string;
if (strlen(PCurStr) != Strlen) // bring string into Cache
{
// necessary to prevent cc from optimizing out
printf(" !! OOps Bug, wrong string length\n");
exit(1);
}
}
ENDTm(DeltaUSec, tm); // end timer
printf
("# Access Time = %6.3f uS average per string (mostly from Cache)\n",
DeltaUSec / nStrg);
//=======================================================================
// TIME GETSTRING() + HASHSTR() from Cache (most of the time)
//=======================================================================
STARTTm(tm); // start timer
for (LineCnt = 0; LineCnt < nStrg; LineCnt++)
{
uint32_t hval;
GETSTRING(PCurStr, Strlen);
PCurStr = Pdt[LineCnt].dt_string;
Strlen = Pdt[LineCnt].dt_strlen;
hval = HASHSTR(PCurStr, Strlen, HTblsz);
if (foolflag)
printf("OOps foolflag is set, hval = %d\n", hval);
}
ENDTm(DeltaUSec, tm); // end timer
printf
("# HashStr() Time = %6.3f uS average per string (mostly from Cache)\n",
DeltaUSec / nStrg);
// randomize the input strings (adjacent strings will not be on same page)
if (rFlag == 0)
{
Randomize(Pdt, nStrg); // Randomize ALL to be stored
//=======================================================================
// TIME GETSTRING() from RAM (most of the time)
//=======================================================================
STARTTm(tm); // start timer
for (LineCnt = 0; LineCnt < nStrg; LineCnt++)
{
GETSTRING(PCurStr, Strlen);
Strlen = Pdt[LineCnt].dt_strlen;
PCurStr = Pdt[LineCnt].dt_string;
if (strlen(PCurStr) != Strlen) // bring string into Cache
{
// necessary to prevent cc from optimizing out
printf(" !! OOps Bug, wrong string length\n");
exit(1);
}
}
ENDTm(DeltaUSec, tm); // end timer
printf
("# Access Time = %6.3f uS average per string (mostly from RAM)\n",
DeltaUSec / nStrg);
//=======================================================================
// TIME GETSTRING() + HASHSTR() from RAM (most of the time)
//=======================================================================
STARTTm(tm); // start timer
for (LineCnt = 0; LineCnt < nStrg; LineCnt++)
{
uint32_t hval;
GETSTRING(PCurStr, Strlen);
Strlen = Pdt[LineCnt].dt_strlen;
PCurStr = Pdt[LineCnt].dt_string;
hval = HASHSTR(PCurStr, Strlen, HTblsz);
if (foolflag)
printf("OOps foolflag is set, hval = %u\n", hval);
}
ENDTm(DeltaUSec, tm); // end timer
printf
("# HashStr() Time = %6.3f uS average per string (mostly from RAM)\n",
DeltaUSec / nStrg);
}
//=======================================================================
// Insert, Get and Delete loops
//=======================================================================
for (Pass = 0; Pass < Passes; Pass++)
{
printf("# Pass %d\n", Pass);
// heading of table
Printf
("# TotInserts DeltaGets DupStrs InsTime GetTime HChainLen Ram/String\n");
gStored = 0; // number of strings inserted
StrNumb = 0; // number of attempted strings inserted
STARTmem; // current malloc() mem usage
for (grp = 0; grp < Groups; grp++)
{
PWord_t PValue;
Word_t Begin = gStored; // remember current STOREed
Word_t Delta = Pms[grp].ms_delta;
switch (Method)
{
case M_Print:
{
STARTTm(tm); // start timer
for (lines = 0; lines < Delta; lines++, StrNumb++)
{
GETSTRING(PCurStr, Strlen);
PCurStr = Pdt[StrNumb].dt_string;
Printf("%s\n", (char *)PCurStr);
}
ENDTm(DeltaUSec, tm); // end timer
break;
}
case M_Hash:
{
STARTTm(tm); // start timer
for (lines = 0; lines < Delta; lines++, StrNumb++)
{
GETSTRING(PCurStr, Strlen);
Strlen = Pdt[StrNumb].dt_strlen;
PCurStr = Pdt[StrNumb].dt_string;
PValue = HashIns(&HRoot, PCurStr, Strlen, HTblsz);
if ((*PValue)++ == 0)
gStored++; // number of strings stored
}
ENDTm(DeltaUSec, tm); // end timer
break;
}
case M_JLHash:
src/judy-1.0.5/test/StringCompare.c view on Meta::CPAN
case M_Redblack:
{
STARTTm(tm); // start timer
for (lines = 0; lines < ReadLin; lines++)
{
GETSTRING(PCurStr, Strlen);
PCurStr = Pdts[lines].dt_string;
redblacksearch(&rbans, (char *)PCurStr);
}
ENDTm(DeltaUSec, tm); // end timer
break;
}
case M_Ternary:
{
STARTTm(tm); // start timer
for (lines = 0; lines < ReadLin; lines++)
{
GETSTRING(PCurStr, Strlen);
Strlen = Pdts[lines].dt_strlen;
PCurStr = Pdts[lines].dt_string;
if (TernaryGet(Ternary, PCurStr, Strlen) == 0)
{
printf("\n OOps - Ternary Bug at Line = %d\n",
__LINE__);
exit(1);
}
}
ENDTm(DeltaUSec, tm); // end timer
break;
}
default:
assert(0); // cant happen
break;
}
Mult = DeltaUSec / (double)ReadLin;
// save least value
if (Mult < Pms[grp].ms_minretrive)
Pms[grp].ms_minretrive = Mult;
Printf(" %7.3f", Mult); // RETRIVE per string
Printf(" %9.6f", (double)gChainln / (double)ReadLin);
// RAM USED PER STRING TO STORE DATA
Printf(" %13.1f", (double)Pms[grp].ms_Bytes);
Printf("\n");
fflush(stdout);
}
if (Method == M_Print)
exit(0);
Printf("# Total Duplicate strings = %lu\n", nStrg - gStored);
//=======================================================================
// Delete loop
//=======================================================================
DeltaUSec = -1.0; // set deleted flag
if (rFlag == 0)
{
Randomize(Pdt, StrNumb); // Randomize ONLY those stored
}
switch (Method)
{
case M_JudySL:
{
if (DFlag)
{
Printf("# Begin JudySLDel() loop...\n");
STARTTm(tm); // start timer
for (lines = 0; lines < nStrg; lines++)
{
int Rc;
GETSTRING(PCurStr, Strlen);
PCurStr = Pdt[lines].dt_string;
JSLD(Rc, JudySL, PCurStr); // delete string
assert(Rc != JERR);
}
ENDTm(DeltaUSec, tm); // end timer
}
else
{
Printf("# Begin JudySLFreeArray()...\n");
STARTTm(tm); // start timer
JSLFA(Bytes, JudySL);
ENDTm(DeltaUSec, tm); // end timer
}
break;
}
case M_JudyHS:
{
if (DFlag)
{
int Rc;
Printf("# Begin JudyHSDel() loop...");
STARTTm(tm); // start timer
for (lines = 0; lines < nStrg; lines++)
{
GETSTRING(PCurStr, Strlen);
Strlen = Pdt[lines].dt_strlen;
PCurStr = Pdt[lines].dt_string;
JHSD(Rc, JudyHS, PCurStr, Strlen); // Delete string
assert(Rc != JERR);
}
ENDTm(DeltaUSec, tm); // end timer
}
else
{
Printf("# Begin JudyHSFreeArray()...\n");
STARTTm(tm); // start timer
JHSFA(Bytes, JudyHS);
ENDTm(DeltaUSec, tm); // end timer
}
break;
}
case M_Hash:
{
( run in 0.866 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )