# -*- perl -*- # # Perfect Minimal Hash Generator written in Perl, which produces # C output. # require 'random_sv_vectors.ph'; require 'crc64.ph'; # # Compute the prehash for a key # # prehash(key, sv, N) # sub prehash(\$\$\$) { my(\$key, \$n, \$sv) = @_; my @c = crc64(\$sv, \$key); my \$nmask = (\$n << 1) - 2; # Create a bipartite graph... \$k1 = (\$c[1] & \$nmask) + 0; # low word \$k2 = (\$c[0] & \$nmask) + 1; # high word return (\$k1, \$k2); } # # Walk the assignment graph, return true on success # sub walk_graph(\$\$\$\$) { my(\$nodeval,\$nodeneigh,\$n,\$v) = @_; my \$nx; # print STDERR "Vertex \$n value \$v\n"; \$\$nodeval[\$n] = \$v; foreach \$nx (@{\$\$nodeneigh[\$n]}) { # \$nx -> [neigh, hash] my (\$o, \$e) = @\$nx; # print STDERR "Edge \$n,\$o value \$e: "; my \$ov; if (defined(\$ov = \$\$nodeval[\$o])) { if (\$v+\$ov != \$e) { # Cyclic graph with collision # print STDERR "error, should be ", \$v+\$ov, "\n"; return 0; } else { # print STDERR "ok\n"; } } else { return 0 unless (walk_graph(\$nodeval, \$nodeneigh, \$o, \$e-\$v)); } } return 1; } # # Generate the function assuming a given N. # # gen_hash_n(N, sv, \%data, run) # sub gen_hash_n(\$\$\$\$) { my(\$n, \$sv, \$href, \$run) = @_; my @keys = keys(%{\$href}); my \$i; my \$gr; my (\$k, \$v); my \$gsize = 2*\$n; my @nodeval; my @nodeneigh; my %edges; for (\$i = 0; \$i < \$gsize; \$i++) { \$nodeneigh[\$i] = []; } %edges = (); foreach \$k (@keys) { my (\$pf1, \$pf2) = prehash(\$k, \$n, \$sv); (\$pf1,\$pf2) = (\$pf2,\$pf1) if (\$pf1 > \$pf2); # Canonicalize order my \$pf = "\$pf1,\$pf2"; my \$e = \${\$href}{\$k}; my \$xkey; if (defined(\$xkey = \$edges{\$pf})) { next if (\$e == \${\$href}{\$xkey}); # Duplicate hash, safe to ignore if (defined(\$run)) { print STDERR "\$run: Collision: \$pf: \$k with \$xkey\n"; } return; } # print STDERR "Edge \$pf value \$e from \$k\n"; \$edges{\$pf} = \$k; push(@{\$nodeneigh[\$pf1]}, [\$pf2, \$e]); push(@{\$nodeneigh[\$pf2]}, [\$pf1, \$e]); } # Now we need to assign values to each vertex, so that for each # edge, the sum of the values for the two vertices give the value # for the edge (which is our hash index.) If we find an impossible # sitation, the graph was cyclic. @nodeval = (undef) x \$gsize; for (\$i = 0; \$i < \$gsize; \$i++) { if (scalar(@{\$nodeneigh[\$i]})) { # This vertex has neighbors (is used) if (!defined(\$nodeval[\$i])) { # First vertex in a cluster unless (walk_graph(\@nodeval, \@nodeneigh, \$i, 0)) { if (defined(\$run)) { print STDERR "\$run: Graph is cyclic\n"; } return; } } } } # for (\$i = 0; \$i < \$n; \$i++) { # print STDERR "Vertex ", \$i, ": ", \$g[\$i], "\n"; # } if (defined(\$run)) { printf STDERR "\$run: Done: n = \$n, sv = [0x%08x, 0x%08x]\n", \$\$sv[0], \$\$sv[1]; } return (\$n, \$sv, \@nodeval); } # # Driver for generating the function # # gen_perfect_hash(\%data) # sub gen_perfect_hash(\$) { my(\$href) = @_; my @keys = keys(%{\$href}); my @hashinfo; my (\$n, \$i, \$j, \$sv, \$maxj); my \$run = 1; # Minimal power of 2 value for N with enough wiggle room. # The scaling constant must be larger than 0.5 in order for the # algorithm to ever terminate. The higher the scaling constant, # the more space does the hash take up, but the less likely is it # that an invalid token will require a string comparison. my \$room = int(scalar(@keys)*1.6); \$n = 1; while (\$n < \$room) { \$n <<= 1; } # Number of times to try... \$maxj = scalar @random_sv_vectors; for (\$i = 0; \$i < 4; \$i++) { printf STDERR "%d vectors, trying n = %d...\n", scalar @keys, \$n; for (\$j = 0; \$j < \$maxj; \$j++) { \$sv = \$random_sv_vectors[\$j]; @hashinfo = gen_hash_n(\$n, \$sv, \$href, \$run++); return @hashinfo if (@hashinfo); } \$n <<= 1; } return; } # # Verify that the hash table is actually correct... # sub verify_hash_table(\$\$) { my (\$href, \$hashinfo) = @_; my (\$n, \$sv, \$g) = @{\$hashinfo}; my \$k; my \$err = 0; foreach \$k (keys(%\$href)) { my (\$pf1, \$pf2) = prehash(\$k, \$n, \$sv); my \$g1 = \${\$g}[\$pf1]; my \$g2 = \${\$g}[\$pf2]; if (\$g1+\$g2 != \${\$href}{\$k}) { printf STDERR "%s(%d,%d): %d+%d = %d != %d\n", \$k, \$pf1, \$pf2, \$g1, \$g2, \$g1+\$g2, \${\$href}{\$k}; \$err = 1; } else { # printf STDERR "%s: %d+%d = %d ok\n", # \$k, \$g1, \$g2, \$g1+\$g2; } } die "\$0: hash validation error\n" if (\$err); } 1;