From eaaf89360548af67f3c53137e9817a21cd9575aa Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Thu, 7 Jul 2022 11:50:00 +1200
Subject: [PATCH] Adjust the DOH string hash function

The one we're currently using only considers the last five characters
plus the least significant bit of the last-but-sixth character, which
unsurprisingly generates a lot of many-way collisions.

This change seems to give about a 4% reduction in wallclock time for
processing li_std_list_wrap.i from the testsuite for Python.  The
hash collision rate for this example drops from 39% to 0!

Closes #2303
---
 CHANGES.current     | 11 +++++++++++
 Source/DOH/string.c | 20 ++++++++++++++------
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/CHANGES.current b/CHANGES.current
index ac2c2d099..bfe3f8b0a 100644
--- a/CHANGES.current
+++ b/CHANGES.current
@@ -7,6 +7,17 @@ the issue number to the end of the URL: https://github.com/swig/swig/issues/
 Version 4.1.0 (in progress)
 ===========================
 
+2022-09-29: olly
+	    #2303 SWIG's internal hash tables now use a better hash function.
+
+	    The old hash function only considered the last five characters
+	    plus the least significant bit of the last-but-sixth character,
+	    which as you might guess generated a lot of many-way collisions.
+
+	    This change seems to give about a 4% reduction in wallclock time
+	    for processing li_std_list_wrap.i from the testsuite for Python.
+	    The hash collision rate for this example drops from 39% to 0!
+
 2022-09-29: wsfulton
             #2303 Type tables are now output in a fixed order whereas previously
             the order may change with any minor input code change. This shouldn't
diff --git a/Source/DOH/string.c b/Source/DOH/string.c
index 543c3e3f8..8f321508e 100644
--- a/Source/DOH/string.c
+++ b/Source/DOH/string.c
@@ -180,19 +180,27 @@ static int String_hash(DOH *so) {
   if (s->hashkey >= 0) {
     return s->hashkey;
   } else {
-    char *c = s->str;
+    /* We use the djb2 hash function: https://theartincode.stanis.me/008-djb2/
+     *
+     * One difference is we use initial seed 0.  It seems the usual seed value
+     * is intended to help spread out hash values, which is beneficial if
+     * linear probing is used but DOH Hash uses a chain of buckets instead, and
+     * grouped hash values are probably more cache friendly.  In tests using
+     * 0 seems slightly faster anyway.
+     */
+    const char *c = s->str;
     unsigned int len = s->len > 50 ? 50 : s->len;
     unsigned int h = 0;
     unsigned int mlen = len >> 2;
     unsigned int i = mlen;
     for (; i; --i) {
-      h = (h << 5) + *(c++);
-      h = (h << 5) + *(c++);
-      h = (h << 5) + *(c++);
-      h = (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
     }
     for (i = len - (mlen << 2); i; --i) {
-      h = (h << 5) + *(c++);
+      h = h + (h << 5) + *(c++);
     }
     h &= 0x7fffffff;
     s->hashkey = (int)h;