import hashlib import random import string from scipy import integrate from math import log import numpy as np from tqdm import tqdm from sympy import nextprime from statistics import median def HLLsketch(seq): M = [0]*(2**p) for x in tqdm(seq): bits = bin(int.from_bytes(hashlib.md5(x.encode()).digest(), "little"))[2:].zfill(128) idx = int(bits[:p], 2) bits = bits[p:] val = 1+bits.index("1") M[idx] = max(M[idx], val) return M # update M[i] with the right number of leading zeros def HLL(seq, p): m = 2**p f = lambda u: log((2+u)/(1+u), 2)**m alpha = (m*integrate.quad(f, 0, np.inf)[0])**(-1) print("alpha", alpha) E = alpha * m**2 * (sum(2**(-M[j]) for j in range(0, m)))**(-1) if E < (5/2)* m : print("Estimate needs correcting (see associated paper). Too small") if E > (1/30)*2**(32): print("Estimate needs correcting (see associated paper). Too large") return E rangeofvals = 20000 seq = random.choices([*map(str, range(rangeofvals))], k = 5000) truecount = len(set(seq)) print("true cardinality is", truecount) p = 5 M = HLLsketch(seq) print("HLL estimate", HLL(seq, p))