#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <string.h>
#include "judy.h"

typedef unsigned int uint32;

int s = (10 << 5), e = (10 << 20), c = 24;
int probes = 1 << 22;

//
//  forward declarations
//

static double get_time_in_seconds(void);

#define ran_arr_next() (*ran_arr_ptr>=0? *ran_arr_ptr++: ran_arr_cycle())
void ran_start(long seed);
long ran_arr_cycle();
extern long *ran_arr_ptr;

void *hashCreate(int size);
void hashFree(void *hash);
uint32 *hashFind(void *hash, uint32 key);
uint32 *hashInsert(void *hash, uint32 key);
int hashDelete(void *hash, uint32 key);
int hashCount(void *hash);
int hashMem(void *hash);

void *bhashCreate(int size);
uint32 *bhashFind(void *hash, uint32 key);
uint32 *bhashInsert(void *hash, uint32 key);
int bhashDelete(void *hash, uint32 key);

//
//  global state
//

Pvoid_t judy;
void *hash, *bhash;

int result;  // validate, and prevent optimizer from throwing away unused results
int deletes;
int get_count;
int rehashes;

void reset(void)
{
   Word_t r;

   // clear and init all implementations (just to simplify life,
   // since this code isn't timed anyway)

   JLFA(r, judy);
   if (hash != NULL)
      hashFree(hash);
   hash = hashCreate(1);
   if (bhash != NULL)
      hashFree(bhash);
   bhash = bhashCreate(1);

   ran_start(314159L);   // reset the random number generator outside the timing
}

void resetcounts(void)
{
   result = 0;
   deletes = 0;
   get_count = 0;
   rehashes = 0;
}

#define SEQ_BASE   0x1b7F3e40     // multiple of 64

#define TABLE_SIZE    (1 << 24)   // 64M    == 16 million unique entries, ~= 160M of Judy

static unsigned int table[TABLE_SIZE];

static int uint_comp(const void *p, const void *q)
{
   unsigned int a = * (unsigned int *) p;
   unsigned int b = * (unsigned int *) q;

   return a < b ? -1 : a > b;
}

int sort = 0, swap = 0, seq_scale = 1, jitter = 0, shuffle = 0;

static void do_sort(void)
{
   qsort(table, TABLE_SIZE, sizeof(table[0]), uint_comp);
}

static void do_jitter(void)
{
   int i;
   for (i=0; i < TABLE_SIZE; ++i) {
      table[i] += ran_arr_next() % jitter;
   }
}

static void do_shuffle(int len)
{
   unsigned int i,j,t;
   for (i=len-1; i > 0; --i) {
      j = ran_arr_next() % i;
      t = table[j];
      table[j] = table[i];
      table[i] = t;
   }
}

static void do_swap(void)
{
   int i;
   for (i=0; i < TABLE_SIZE; ++i) {
      uint32 x = table[i];
      table[i] = (((x) & 0xff) << 24) | (((x >> 8) & 0xff) << 16) | (((x >> 16) & 0xff) << 8) | (x >> 24);
   }
}

typedef struct
{
   char *option;
   char *type;
   char *longdesc;
   void (*init)(int value);
} Data;

void initRandomTable(int size)
{
   int i;
   for (i=0; i < TABLE_SIZE; ++i)
      table[i] = ran_arr_next();
} 

void initRandom22Table(int size)
{
   int i;
   for (i=0; i < TABLE_SIZE; ++i)
      table[i] = (ran_arr_next() & 0x3fffff);
} 

void initRandom5678Table(int size)
{
   int i;
   for (i=0; i < TABLE_SIZE; ++i)
      table[i] = (ran_arr_next() & 0x1f3f7fff);
}

// this was a bit slow with a ran_arr_next() per char and a mod per char, so
// i've now over-optimized it
void initUpperTable(int size)
{
   int i;
   for (i=0; i < TABLE_SIZE; ++i) {
      unsigned char *c = (unsigned char *) &table[i];
      int j;
      uint32 data = ran_arr_next();
      for (j=0; j < 4; ) {
         data >>= 5;
         c[j] = (data & 31) + 'A';
         if (c[j] <= 'Z') ++j;  // if we consume all 32 bits = 6+ characters in the attempt, we'll end with all A's
      }
   }
}

char *symbol_data = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_e";

void initSymbolTable(int size)
{
   int i,j;
   for (i=0; i < TABLE_SIZE; ++i) {
      unsigned char *c = (unsigned char *) & table[i];
      uint32 data = ran_arr_next();
      for (j=0; j < 4; ++j) {
         c[j] = symbol_data[data & 63];
         data >>= 6;
      }
   }
}

void initSeqTable(int size)
{
   int i;
   for (i=0; i < TABLE_SIZE; ++i)
      table[i] = SEQ_BASE + i * seq_scale;
}

Data datasets[] =
{
   { "rand", "random", "random 32-bit numbers", initRandomTable, },
   { "rand0688", "random22", "random 22-bit numbers", initRandom22Table, },
   { "rand5678", "random5678", "random numbers with 5,6,7, and 8 potentially non-zero bits per byte", initRandom5678Table, },
   { "seq", "sequence", "sequential integers", initSeqTable, },
   { "upper", "uppercase", "4-character uppercase-only symbols", initUpperTable, },
   { "symbol", "symbol", "4-character mixed lower+upper+digits", initSymbolTable, },
};

void (*insert)(unsigned int key);
void (*delfunc)(unsigned int key);  // inconsistently named due to C++
void (*get)(unsigned int key);

int del,ins;

void buildTable(int start, int end)
{
   int i;
   for (i=start; i < end; ++i) {
      insert(table[i]);
   }
   del = start;
   ins = end;
}

int nongreater_power_of_two(int k)
{
   int z = 1;
   while (z <= k) z <<= 1;
   assert((z >> 1) <= k && z > k);
   return z >> 1;
}

void probeTable(int size, int probes, int insdel_choose, int ins_choose, int probe_offset, int probe_addend)
{
   int i;
   int mask = nongreater_power_of_two(size)-1;

   for (i=1; i <= probes; ++i) {
      if ((i & insdel_choose) == 0) {
         if ((i & ins_choose) == 0) {
            insert(table[ins]);
            ins = (ins + 1) & (TABLE_SIZE-1);
         } else {
            delfunc(table[del]);
            del = (del + 1) & (TABLE_SIZE-1);
         }
      } else { 
         get(table[(del + (i&mask) + probe_offset) & (TABLE_SIZE-1)]+probe_addend);
      }
   }
}

typedef struct
{
   char *array_name;
   void (*ins)(unsigned int key);
   void (*del)(unsigned int key);
   void (*get)(unsigned int key);
   int (*count)(void);
   int (*mem)(void);
} AArray;

void jInsert(unsigned int key) { Word_t *p; JLI(p, judy, key); *p = ~key; }

void jGet(unsigned int key)
{
   Word_t *p;
   JLG(p, judy, key);
   if (p) {
      result += *p;
      ++get_count;
   }
}

void jDelete(unsigned int key)
{
   int r;
   JLD(r, judy, key);
   if (r) ++deletes;
}

int jCount(void)
{
   Word_t r;
   JLC(r, judy, 0, -1);
   return r;
}

int jMem(void) { return JudyLMemUsed(judy); }

AArray judy_desc = { "Judy", jInsert, jDelete, jGet, jCount, jMem };

int  hCount(void)               { return hashCount(hash); }
int  hMem(void)                 { return hashMem(hash);   }
void hInsert(unsigned int key)  { *hashInsert(hash, key) = ~key; }

void hGet(unsigned int key)
{
   uint32 *p;
   p = hashFind(hash, key);
   if (p) {
      result += *p;
      ++get_count;
   }
}

void hDelete(unsigned int key)
{
   if (hashDelete(hash, key))
      ++deletes;
}

AArray hash_desc = { "hash", hInsert, hDelete, hGet, hCount, hMem };

void bhInsert(unsigned int key)
{
   *bhashInsert(bhash, key) = ~key;
}

void bhGet(unsigned int key)
{
   uint32 *p = bhashFind(bhash, key);
   if (p) { result += *p; ++get_count; }
}

void bhDelete(unsigned int key)
{
   if (bhashDelete(bhash, key))
      ++deletes;
}

int bhCount(void)
{
   return hashCount(bhash);
}

int bhMem(void)
{
   return hashMem(bhash);
}

AArray bhash_desc = { "bhash", bhInsert, bhDelete, bhGet, bhCount, bhMem };

int validate, time_build;

static double time_probe(int c0, int probes, int div, int c2, int c3, int c4, int c5)
{
   double start_time = get_time_in_seconds();
   probeTable(c0,probes/div,c2,c3,c4,c5);
   return div * (get_time_in_seconds() - start_time);
}

static void test(AArray *aa,
                 Data *d,
                 int start, int end, int steps)
{
   double total_time = 0;
   int i, last = 0;

   if (start == end) steps = 1;

   d->init(start);
   if (swap) do_swap();
   if (jitter) do_jitter();
   if (sort) do_sort();

   fprintf(stderr, "%s\n", d->longdesc);

   insert = aa->ins;
   delfunc = aa->del;
   get = aa->get;

   // prime it
   {
      reset();
      buildTable(0,start);
      probeTable(start, probes, ~0, 0, 0, 0);
   }
   reset();

   for (i=0; i < steps; ++i) {
      double time0, time1;
      int v, real_v;
      if (steps == 1) {
         v = start;
      } else {
         double f = (log(end) - log(start)) / (steps-1) * i + log(start);
         v = (int) (floor(exp(f) + 0.5));
      }

      resetcounts();
      if (shuffle) do_shuffle(v);

      if (!time_build) reset();

      time0 = get_time_in_seconds();
      if (time_build)
         buildTable(last, v);
      else
         buildTable(0,v);
      time1 = get_time_in_seconds();
      last = v;

      real_v = aa->count();

      if (validate) {
         time_probe(v,probes, 4, 3<<2, 0x80, 0,0);
         printf("%s %s %06x: %08x %06x %06x %06x\n", aa->array_name, d->type, real_v, result, get_count, deletes, rehashes);
      } else if (time_build) {
         total_time += time1 - time0;
         printf("%s, size %8d ( %8d ), %s: buildtime= %7.1lfms, memory= %d bytes\n",
             d->type, real_v, v, aa->array_name, 1000*total_time, aa->mem());
      } else {
         double hit_time  = time_probe(v, probes, 1, ~0, 0, 0,0);
         double miss_time = time_probe(v, probes, 1, ~0, 0, 
             d->init == initSeqTable && seq_scale != 1 ? 0 : v*2+1,
             1);
         double mix_time  = time_probe(v, probes, 4,  0, 0x80, 0,0); // 256 ins, then 256, del, etc.
         double post_hit_time = time_probe(v, probes, 1, ~0, 0, 0,0);

         printf("%s(4Mprobes), size %8d, %s: hits= %7.1lfms, miss= %7.1lfms, ins/del= %7.1lfms, rehits: %7.1lfms\n",
             d->type, real_v, aa->array_name, 1000*hit_time, 1000*miss_time, 1000*mix_time, 1000*post_hit_time);
      }
   }
}

float hash_table_max_full = 0.65f;
float hash_table_empty = 0.030f;
float hash_delete_rehash = 0.8f;

int usage(void)
{
   fprintf(stderr, "Usage: aatest [options] <aaimpl> <dataset>\n"
          "   <aaimpl>:   judy, hash, bhash\n"
          "   <dataset>:  rand, rand0688, rand5678, seq, upper, symbol\n"
          "   e.g. 'aatest judy rand' or 'aatest hash upper'\n"
          " Options:\n"
          "   -hashfull <0-100>   percent full hash table must be before doubling\n"
          "   -hashempty <0-50>   percent empty hash table must be before halving\n"
          "   -validate           computes a result that shouldn't vary with array type\n"
          "   -sort               forces insertions and deletions to be in dictionary order\n"
          "   -swap               swap bytes of all keys as if little to big-endian\n"
          "   -scale <integer>    for 'seq' dataset, uses multiples of this\n"
          "   -build              efficiently measure build times\n"
          "   -samples <integer>  number of sizes to measure\n"
          "   -jitter <integer>   randomly offset each sample by this much\n"
          "   -end <integer>      maximum size to probe\n"
   );
   return 1;
}

int main(int argc, char **argv)
{
   int i;
   AArray *a;
   Data *d = NULL;

   for (i=1; i < argc; ++i) {
      if (argv[i][0] == '-') {
         if (!stricmp("sort", argv[i]+1)) {
            sort = 1;
         } else if (!stricmp("swap", argv[i]+1)) {
            swap = 1;
         } else if (!stricmp("build", argv[i]+1)) {
            time_build = 1;
            c <<= 2;
         } else if (!stricmp("validate", argv[i]+1)) {
            validate = 1;
            c >>= 1;
         } else if (!stricmp("shuffle", argv[i]+1)) {
            shuffle = 1;
         } else {
            if (!stricmp("hashfull", argv[i]+1))
               hash_table_max_full = (float) atof(argv[i+1]) / 100;
            else if (!stricmp("hashempty", argv[i]+1)) 
               hash_table_empty = (float) atof(argv[i+1]) / 100;
            else if (!stricmp("scale", argv[i]+1)) 
               seq_scale = atoi(argv[i+1]);
            else if (!stricmp("samples", argv[i]+1))
               c = atoi(argv[i+1]);
            else if (!stricmp("end", argv[i]+1))
               e = atoi(argv[i+1]);
            else if (!stricmp("jitter", argv[i]+1))
               jitter = atoi(argv[i+1]);
            else {
               fprintf(stderr, "Unknown option '%s'.\n", argv[i]);
               return usage();
            }
            ++i;  // add an extra one for the argument above
         }
      } else {
         if (!stricmp("judy", argv[i])) a = &judy_desc;
         else if (!stricmp("hash", argv[i])) a = &hash_desc; 
         else if (!stricmp("bhash", argv[i])) a = &bhash_desc;
         else {
            int j;
            for (j=0; j < sizeof(datasets)/sizeof(datasets[0]); ++j)
               if (!stricmp(datasets[j].option, argv[i])) {
                  d = &datasets[j];
                  break;
               }
            if (d == NULL) return usage();
         }
      }
   }

   if (d == NULL || a == NULL) return usage();

   if (seq_scale < 1) seq_scale = 1;
   if (c < 1) c = 1;

   if (hash_table_max_full < 0.01 || hash_table_max_full > 0.999) {
      hash_table_max_full = (hash_table_max_full < 0.5) ? 0.01f : 0.999f;
      fprintf(stderr, "hashfull must range from 1 to 99.9. Set to %g.\n", hash_table_max_full * 100);
   }
   
   if (hash_table_empty < 0.005 || hash_table_empty > 0.499 || hash_table_empty >= hash_table_max_full/2.1f) {
      hash_table_max_full = (hash_table_max_full < 0.25) ? 0.005f : 0.499f;
      if (hash_table_empty >= hash_table_max_full) hash_table_empty = hash_table_max_full / 2.5f;
      fprintf(stderr, "hashempty must range from 0.5 to 49.9, and be smaller than half of hashfull. Set to %g.\n", hash_table_empty);
   }

   // @TODO: tune this value!
   hash_delete_rehash = 1 - (1 - hash_table_max_full)/2;

   test(a, d, s, e, c);

   return 0;
}


////////////////////////////////////////////////////////
//
//   hash table implementation
//

static int hashword(uint32 c)
{
   // Bob Jenkin's mix function, possibly overkill for only 32 bits?
   // but a simpler one was no faster, so what the heck
   uint32 a,b;
   a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
   a -= b; a -= c; a ^= (c>>13);
   b -= c; b -= a; b ^= (a<<8);
   c -= a; c -= b; c ^= (b>>13);
   a -= b; a -= c; a ^= (c>>12);
   b -= c; b -= a; b ^= (a<<16);
   c -= a; c -= b; c ^= (b>>5);
   a -= b; a -= c; a ^= (c>>3);
   b -= c; b -= a; b ^= (a<<10);
   c -= a; c -= b; c ^= (b>>15);
   return c;
}

#define KEY_NULL 0
#define KEY_DELETED 1

typedef struct
{
   uint32 key, value;
} Item;

typedef struct
{
   int mask;
   Item *data;
   int has_key[2];
   uint32 value[2];  // handle out of bound values
   int data_len;
   int population, deletes;
   int grow_size, shrink_size, delete_size;  // cache for speed
   void *free_ptr;
} Hash;

#define OVERAGE 1

void *hashCreate(int size)
{
   Hash *h = malloc(sizeof(*h));
   if (size < 8) size = 8;
   assert(nongreater_power_of_two(size) == size);
   h->data_len = size;
   h->mask = size-1;
   h->population = h->deletes = 0;
   h->has_key[0] = h->has_key[1] = 0;
   h->grow_size = (int) (size * hash_table_max_full) - 1;
   if (h->grow_size < 2) h->grow_size = 2;
   h->shrink_size = (int) (size * hash_table_empty);
   if (h->data_len == 8) h->shrink_size = 0;
   h->delete_size = (int) (size * hash_delete_rehash) - 1;
   h->data = calloc(sizeof(h->data[0]), size+OVERAGE*2);
   h->free_ptr = h->data;
   h->data += OVERAGE;
   return h;
}

void hashFree(void *h)
{
   free(((Hash *) h)->free_ptr);
   free(h);
}

uint32 *hashFind(void *hash, uint32 key)
{
   Hash *d = (Hash *) hash;
   Item *z = d->data;
   uint32 mask = d->mask;

   uint32 h = hashword(key);
   uint32 p = h & mask;
   uint32 s;

   if (key <= KEY_DELETED) {
      if (d->has_key[key]) return &d->value[key];
      return NULL;
   }

   if (z[p].key == key) return &z[p].value;
   if (z[p].key == KEY_NULL) return NULL;

   s = (((h >> 16) | (h << 16)) & mask) | 1;
   assert(d->population + d->deletes != d->data_len);
   do {
      p = (p + s) & mask;
      if (z[p].key == key) return &z[p].value;
   } while (z[p].key != KEY_NULL);

   return NULL;
}

static void rehash(Hash *h, int len)
{
   static int no_recurse=0;
   Hash *d = hashCreate(len);
   int i;
   
   assert(no_recurse == 0);
   ++no_recurse;

   ++rehashes;

   for (i=0; i < h->data_len; ++i) {
      if (h->data[i].key > KEY_DELETED) {
         uint32 *p = hashInsert(d, h->data[i].key);
         assert(p != NULL);
         *p = h->data[i].value;
      }
   }
   assert(h->population == d->population);

   free(h->free_ptr);
   for (i=0; i < 2; ++i)
      d->has_key[i] = h->has_key[i],
      d->value[i] = h->value[i];
   memcpy(h, d, sizeof(*h));
   free(d);

   --no_recurse;
}

uint32 *hashInsert(void *hash, uint32 key)
{
   Hash *d = (Hash *) hash;
   Item *z = d->data;
   uint32 mask = d->mask;
   int f = -1;

   uint32 h = hashword(key);
   uint32 p = h & mask;
   uint32 s;

   if (key <= KEY_DELETED) {
      d->has_key[key] = 1;
      return &d->value[key];
   }

   if (z[p].key == key) return &z[p].value;
   if (z[p].key == KEY_DELETED) f = p;
   if (z[p].key > KEY_NULL) {
      s = (((h >> 16) | (h << 16)) & mask) | 1;
      assert(d->population + d->deletes != d->data_len);
      do {
         p = (p + s) & mask;
         if (z[p].key == key) return &z[p].value;
         if (z[p].key == KEY_DELETED && f == -1) f = p;
      } while (z[p].key > KEY_NULL);
   }

   if (z[p].key == KEY_NULL && d->population >= d->grow_size) {
      rehash(d, d->data_len * 2);
      return hashInsert(hash, key);
   } else if (d->population + d->deletes > d->delete_size) {
      rehash(d, d->data_len);
      return hashInsert(hash, key);
   } else {
      if (f >= 0) p = f;
      if (z[p].key == KEY_DELETED) --d->deletes;
      z[p].key = key;
      ++d->population;
      return &z[p].value;
   }
}

int hashDelete(void *hash, uint32 key)
{
   Hash *h = (Hash *) hash;
   Item *p;
   uint32 *value = hashFind(hash, key);
   if (value == NULL) return 0;
   if (key <= KEY_DELETED) {
      h->has_key[key] = 0;
      return 1;
   }
   p = (Item *) (value-1);
   p->key = KEY_DELETED;
   --h->population;
   ++h->deletes;
   if (h->population < h->shrink_size)
      rehash(h, h->data_len >> 1);
   else if (h->population + h->deletes >= h->delete_size)
      rehash(h, h->data_len);
   return 1;
}

int hashCount(void *hash)
{
   Hash *h = hash;
   return h->population + h->has_key[KEY_NULL] + h->has_key[KEY_DELETED];
}

int hashMem(void *hash)
{ 
    return sizeof(Hash) + sizeof(Item) * ((Hash *) hash)->data_len;
}

//
//   end hash table implementation
//
////////////////////////////////////////////////////////
//
//   bucketed to cache-line hash table
//

#define LINESIZE      32
#define BUCKETSIZE    (LINESIZE / sizeof(Item))
#define BUCKETMASK    (BUCKETSIZE-1)

void *bhashCreate(int size)
{
   Hash *h = malloc(sizeof(*h));
   assert(h != NULL);
   if (size < 8) size = 8;
   assert(nongreater_power_of_two(size) == size);
   h->data_len = size;
   h->mask = (size-1) & ~BUCKETMASK;
   h->population = h->deletes = 0;
   h->has_key[0] = h->has_key[1] = 0;
   h->grow_size = (int) (size * hash_table_max_full) - 1;
   if (h->grow_size < 2) h->grow_size = 2;
   h->shrink_size = (int) (size * hash_table_empty);
   if (h->data_len == 8) h->shrink_size = 0;
   h->delete_size = (int) (size * hash_delete_rehash) - 1;
   h->free_ptr = calloc(sizeof(h->data[0]), size+LINESIZE + 16);
   assert(h->free_ptr != NULL);
   h->data = (Item *) ((((int) h->free_ptr) + 8 + LINESIZE) & ~(LINESIZE-1));
   return h;
}

uint32 *bhashFind(void *hash, uint32 key)
{
   Hash *d = (Hash *) hash;
   Item *z = d->data;
   uint32 mask = d->mask;

   uint32 h = hashword(key);
   uint32 p = h & mask;
   uint32 s,q;

   if (key <= KEY_DELETED) {
      if (d->has_key[key]) return &d->value[key];
      return NULL;
   }

   assert(BUCKETSIZE == 4);

   if (z[p+0].key == key) return &z[p+0].value;
   if (z[p+1].key == key) return &z[p+1].value;
   if (z[p+2].key == key) return &z[p+2].value;
   if (z[p+3].key == key) return &z[p+3].value;
   if (z[p+0].key == KEY_NULL) return NULL;
   if (z[p+1].key == KEY_NULL) return NULL;
   if (z[p+2].key == KEY_NULL) return NULL;
   if (z[p+3].key == KEY_NULL) return NULL;

   s = (((h >> 16) | (h << 16)) & mask) | BUCKETSIZE;
   q = p;
   do {
      int i;
      p = (p + s) & mask;
      for (i=0; i < BUCKETSIZE; ++i)
         if (z[p+i].key == key) return &z[p+i].value;
      for (i=0; i < BUCKETSIZE; ++i)
         if (z[p+i].key == KEY_NULL) return NULL;
   } while (p != q);

   return NULL;
}

static void brehash(Hash *h, int len)
{
   static int no_recurse=0;
   Hash *d = bhashCreate(len);
   int i;
   
   assert(no_recurse == 0);
   ++no_recurse;

   ++rehashes;

   for (i=0; i < h->data_len; ++i) {
      if (h->data[i].key > KEY_DELETED) {
         uint32 *p = bhashInsert(d, h->data[i].key);
         assert(p != NULL);
         *p = h->data[i].value;
      }
   }
   assert(h->population == d->population);

   free(h->free_ptr);
   for (i=0; i < 2; ++i)
      d->has_key[i] = h->has_key[i],
      d->value[i] = h->value[i];
   memcpy(h, d, sizeof(*h));
   free(d);

   --no_recurse;
}

uint32 *bhashInsert(void *hash, uint32 key)
{
   Hash *d = (Hash *) hash;
   Item *z = d->data;
   uint32 mask = d->mask;
   int i,f = -1,done=0;

   uint32 h = hashword(key);
   uint32 p = h & mask;
   uint32 s;

   if (key <= KEY_DELETED) {
      d->has_key[key] = 1;
      return &d->value[key];
   }

   for (i=0; i < BUCKETSIZE; ++i) {
      if (z[p+i].key == key) return &z[p+i].value;
      else if (z[p+i].key == KEY_DELETED) f = p+i;
      else if (z[p+i].key == KEY_NULL) f = p+i, done = 1;
   }
   
   if (!done) {
      uint32 q = p;
      s = (((h >> 16) | (h << 16)) & mask) | BUCKETSIZE;
      do {
         p = (p + s) & mask;
         for (i=0; i < BUCKETSIZE; ++i) {
            if (z[p+i].key == key) return &z[p+i].value;
            else if (z[p+i].key == KEY_DELETED && f == -1) f = p+i;
            else if (z[p+i].key == KEY_NULL) { done = 1; if (f == -1) f = p+i; }
         }
      } while (!done && p != q);
   }

   assert(f != -1);
   assert(z[f].key == KEY_NULL || z[f].key == KEY_DELETED);

   if (z[f].key == KEY_NULL && d->population >= d->grow_size) {
      brehash(d, d->data_len * 2);
      return bhashInsert(hash, key);
   } else if (z[f].key == KEY_NULL && d->population + d->deletes > d->delete_size) {
      brehash(d, d->data_len);
      return bhashInsert(hash, key);
   } else {
      if (z[f].key == KEY_DELETED) --d->deletes;
      z[f].key = key;
      ++d->population;
      return &z[f].value;
   }
}

int bhashDelete(void *hash, uint32 key)
{
   Hash *h = (Hash *) hash;
   Item *p;
   uint32 *value = bhashFind(hash, key);
   if (value == NULL) return 0;
   if (key <= KEY_DELETED) {
      h->has_key[key] = 0;
      return 1;
   }
   p = (Item *) (value-1);
   p->key = KEY_DELETED;
   --h->population;
   ++h->deletes;
   if (h->population < h->shrink_size)
      brehash(h, h->data_len >> 1);
   else if (h->population + h->deletes >= h->delete_size)
      brehash(h, h->data_len);
   return 1;
}

//
//   end bucketed hash table implementation
//
////////////////////////////////////////////////////////
//
//   ran_array, Knuth's random number generator
//

/*    This program by D E Knuth is in the public domain and freely copyable
 *    AS LONG AS YOU MAKE ABSOLUTELY NO CHANGES!
 *    It is explained in Seminumerical Algorithms, 3rd edition, Section 3.6
 *    (or in the errata to the 2nd edition --- see
 *        http://www-cs-faculty.stanford.edu/~knuth/taocp.html
 *    in the changes to Volume 2 on pages 171 and following).              */

/*    N.B. The MODIFICATIONS introduced in the 9th printing (2002) are
      included here; there's no backwards compatibility with the original. */

/*    This version also adopts Brendan McKay's suggestion to
      accommodate naive users who forget to call ran_start(seed).          */

/*    If you find any bugs, please report them immediately to
 *                 taocp@cs.stanford.edu
 *    (and you will be rewarded if the bug is genuine). Thanks!            */

/************ see the book for explanations and caveats! *******************/
/************ in particular, you need two's complement arithmetic **********/

#define KK 100                     /* the long lag */
#define LL  37                     /* the short lag */
#define MM (1L<<30)                 /* the modulus */
#define mod_diff(x,y) (((x)-(y))&(MM-1)) /* subtraction mod MM */

long ran_x[KK];                    /* the generator state */

#ifdef __STDC__
void ran_array(long aa[],int n)
#else
void ran_array(aa,n)    /* put n new random numbers in aa */
  long *aa;   /* destination */
  int n;      /* array length (must be at least KK) */
#endif
{
  register int i,j;
  for (j=0;j<KK;j++) aa[j]=ran_x[j];
  for (;j<n;j++) aa[j]=mod_diff(aa[j-KK],aa[j-LL]);
  for (i=0;i<LL;i++,j++) ran_x[i]=mod_diff(aa[j-KK],aa[j-LL]);
  for (;i<KK;i++,j++) ran_x[i]=mod_diff(aa[j-KK],ran_x[i-LL]);
}

/* the following routines are from exercise 3.6--15 */
/* after calling ran_start, get new randoms by, e.g., "x=ran_arr_next()" */

#define QUALITY 1009 /* recommended quality level for high-res use */
long ran_arr_buf[QUALITY];
long ran_arr_dummy=-1, ran_arr_started=-1;
long *ran_arr_ptr=&ran_arr_dummy; /* the next random number, or -1 */

#define TT  70   /* guaranteed separation between streams */
#define is_odd(x)  ((x)&1)          /* units bit of x */

#ifdef __STDC__
void ran_start(long seed)
#else
void ran_start(seed)    /* do this before using ran_array */
  long seed;            /* selector for different streams */
#endif
{
  register int t,j;
  long x[KK+KK-1];              /* the preparation buffer */
  register long ss=(seed+2)&(MM-2);
  for (j=0;j<KK;j++) {
    x[j]=ss;                      /* bootstrap the buffer */
    ss<<=1; if (ss>=MM) ss-=MM-2; /* cyclic shift 29 bits */
  }
  x[1]++;              /* make x[1] (and only x[1]) odd */
  for (ss=seed&(MM-1),t=TT-1; t; ) {       
    for (j=KK-1;j>0;j--) x[j+j]=x[j], x[j+j-1]=0; /* "square" */
    for (j=KK+KK-2;j>=KK;j--)
      x[j-(KK-LL)]=mod_diff(x[j-(KK-LL)],x[j]),
      x[j-KK]=mod_diff(x[j-KK],x[j]);
    if (is_odd(ss)) {              /* "multiply by z" */
      for (j=KK;j>0;j--)  x[j]=x[j-1];
      x[0]=x[KK];            /* shift the buffer cyclically */
      x[LL]=mod_diff(x[LL],x[KK]);
    }
    if (ss) ss>>=1; else t--;
  }
  for (j=0;j<LL;j++) ran_x[j+KK-LL]=x[j];
  for (;j<KK;j++) ran_x[j-LL]=x[j];
  for (j=0;j<10;j++) ran_array(x,KK+KK-1); /* warm things up */
  ran_arr_ptr=&ran_arr_started;
}

#define ran_arr_next() (*ran_arr_ptr>=0? *ran_arr_ptr++: ran_arr_cycle())
long ran_arr_cycle()
{
  if (ran_arr_ptr==&ran_arr_dummy)
    ran_start(314159L); /* the user forgot to initialize */
  ran_array(ran_arr_buf,QUALITY);
  ran_arr_buf[100]=-1;
  ran_arr_ptr=ran_arr_buf+1;
  return ran_arr_buf[0];
}

//
//   end Knuth ran_array code
//
////////////////////////////////////////////////////////
//
//   win32 specific timing code
//

#define WIN32_LEAN_AND_MEAN
#include <windows.h>

static double get_time_in_seconds(void)
{
   LARGE_INTEGER freq;
   LARGE_INTEGER time;

   BOOL ok = QueryPerformanceFrequency(&freq);
   assert(ok == TRUE);

   freq.QuadPart = freq.QuadPart;

   ok = QueryPerformanceCounter(&time);
   assert(ok == TRUE);

   return time.QuadPart / (double) freq.QuadPart;
}