From eaaf89360548af67f3c53137e9817a21cd9575aa Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 7 Jul 2022 11:50:00 +1200 Subject: [PATCH] Adjust the DOH string hash function The one we're currently using only considers the last five characters plus the least significant bit of the last-but-sixth character, which unsurprisingly generates a lot of many-way collisions. This change seems to give about a 4% reduction in wallclock time for processing li_std_list_wrap.i from the testsuite for Python. The hash collision rate for this example drops from 39% to 0! Closes #2303 --- CHANGES.current | 11 +++++++++++ Source/DOH/string.c | 20 ++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/CHANGES.current b/CHANGES.current index ac2c2d099..bfe3f8b0a 100644 --- a/CHANGES.current +++ b/CHANGES.current @@ -7,6 +7,17 @@ the issue number to the end of the URL: https://github.com/swig/swig/issues/ Version 4.1.0 (in progress) =========================== +2022-09-29: olly + #2303 SWIG's internal hash tables now use a better hash function. + + The old hash function only considered the last five characters + plus the least significant bit of the last-but-sixth character, + which as you might guess generated a lot of many-way collisions. + + This change seems to give about a 4% reduction in wallclock time + for processing li_std_list_wrap.i from the testsuite for Python. + The hash collision rate for this example drops from 39% to 0! + 2022-09-29: wsfulton #2303 Type tables are now output in a fixed order whereas previously the order may change with any minor input code change. 
This shouldn't diff --git a/Source/DOH/string.c b/Source/DOH/string.c index 543c3e3f8..8f321508e 100644 --- a/Source/DOH/string.c +++ b/Source/DOH/string.c @@ -180,19 +180,27 @@ static int String_hash(DOH *so) { if (s->hashkey >= 0) { return s->hashkey; } else { - char *c = s->str; + /* We use the djb2 hash function: https://theartincode.stanis.me/008-djb2/ + * + * One difference is we use initial seed 0. It seems the usual seed value + * is intended to help spread out hash values, which is beneficial if + * linear probing is used but DOH Hash uses a chain of buckets instead, and + * grouped hash values are probably more cache friendly. In tests using + * 0 seems slightly faster anyway. + */ + const char *c = s->str; unsigned int len = s->len > 50 ? 50 : s->len; unsigned int h = 0; unsigned int mlen = len >> 2; unsigned int i = mlen; for (; i; --i) { - h = (h << 5) + *(c++); - h = (h << 5) + *(c++); - h = (h << 5) + *(c++); - h = (h << 5) + *(c++); + h = h + (h << 5) + *(c++); + h = h + (h << 5) + *(c++); + h = h + (h << 5) + *(c++); + h = h + (h << 5) + *(c++); } for (i = len - (mlen << 2); i; --i) { - h = (h << 5) + *(c++); + h = h + (h << 5) + *(c++); } h &= 0x7fffffff; s->hashkey = (int)h;