# find a "hash ratio", based on the number of iterations requried to
# reach each entry.

from ctypes import c_uint32 as u32

HASH_SIZE=2048

def sum_to(n):
    return n * (n + 1) / 2

def hash_it_DJB(s):
    h = 5381
    
    for c in s:
        h = u32(((h << 5) + h) ^ ord(c)).value

    return (h % HASH_SIZE)

def hash_it_sum(s):
    h = 0
    for c in s:
        h = u32(h + ord(c)).value
    return (h % HASH_SIZE)

def hash_it_xor(s):
    h = 0
    for c in s:
        h = u32(h ^ ord(c)).value
    return (h % HASH_SIZE)

def hash_it(s):
    return hash_it_DJB(s)

ht = {}

f = open('/usr/share/dict/words')

entries = 0
for l in f.readlines():
    entries += 1
    h = hash_it(l)
    if (ht.has_key(h)):
        ht[h] += 1
    else:
        ht[h] = 1

total_lookups = 0
for k, v in ht.iteritems():
    total_lookups += sum_to(v)

# if entries < buckets, then the ideal is just to visit each once,
# i.e. the number of entries
if (entries < HASH_SIZE):
    ideal_length = 1
    ideal_lookups = entries
else:
    ideal_length = entries / HASH_SIZE
    ideal_lookups = sum_to(ideal_length) * HASH_SIZE
    left_overs = entries % HASH_SIZE
    # accessing each of the left over elements requires one more
    # iteration than the ideal length
    ideal_lookups += (ideal_length + 1) * left_overs

ratio = float(ideal_lookups)/total_lookups

print "Ideal %d\nActual %d\n----\nRatio %f\n" % \
      (ideal_lookups, total_lookups, ratio)
