Adjust the DOH string hash function

The one we're currently using only considers the last five characters plus the least significant bit of the last-but-sixth character, which unsurprisingly generates a lot of many-way collisions. This change seems to give about a 4% reduction in wallclock time for processing li_std_list_wrap.i from the testsuite for Python. The hash collision rate for this example drops from 39% to 0! Closes #2303
2022-07-07 11:50:00 +12:00 · 2022-07-07 11:50:00 +12:00 · 5a96a39aa4
commit 5a96a39aa4
parent 8987259959
1 changed files with 14 additions and 6 deletions
--- a/Source/DOH/string.c
+++ b/Source/DOH/string.c
@@ -180,19 +180,27 @@ static int String_hash(DOH *so) {
  if (s->hashkey >= 0) {
    return s->hashkey;
  } else {
-    char *c = s->str;
+    /* We use the djb2 hash function: https://theartincode.stanis.me/008-djb2/
+     *
+     * One difference is we use initial seed 0.  It seems the usual seed value
+     * is intended to help spread out hash values, which is beneficial if
+     * linear probing is used but DOH Hash uses a chain of buckets instead, and
+     * grouped hash values are probably more cache friendly.  In tests using
+     * 0 seems slightly faster anyway.
+     */
+    const char *c = s->str;
    unsigned int len = s->len > 50 ? 50 : s->len;
    unsigned int h = 0;
    unsigned int mlen = len >> 2;
    unsigned int i = mlen;
    for (; i; --i) {
-      h = (h << 5) + *(c++);
-      h = (h << 5) + *(c++);
-      h = (h << 5) + *(c++);
-      h = (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
    }
    for (i = len - (mlen << 2); i; --i) {
-      h = (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
    }
    h &= 0x7fffffff;
    s->hashkey = (int)h;