Adjust the DOH string hash function
The one we're currently using only considers the last five characters plus the least significant bit of the last-but-sixth character, which unsurprisingly generates a lot of many-way collisions. This change seems to give about a 4% reduction in wall-clock time for processing li_std_list_wrap.i from the test suite for Python. The hash collision rate for this example drops from 39% to 0%! Closes #2303
This commit is contained in:
parent
8987259959
commit
5a96a39aa4
1 changed files with 14 additions and 6 deletions
|
|
@@ -180,19 +180,27 @@ static int String_hash(DOH *so) {
|
|||
if (s->hashkey >= 0) {
|
||||
return s->hashkey;
|
||||
} else {
|
||||
char *c = s->str;
|
||||
/* We use the djb2 hash function: https://theartincode.stanis.me/008-djb2/
|
||||
*
|
||||
* One difference is we use initial seed 0. It seems the usual seed value
|
||||
* is intended to help spread out hash values, which is beneficial if
|
||||
* linear probing is used but DOH Hash uses a chain of buckets instead, and
|
||||
* grouped hash values are probably more cache-friendly. In tests, using
|
||||
* 0 seems slightly faster anyway.
|
||||
*/
|
||||
const char *c = s->str;
|
||||
unsigned int len = s->len > 50 ? 50 : s->len;
|
||||
unsigned int h = 0;
|
||||
unsigned int mlen = len >> 2;
|
||||
unsigned int i = mlen;
|
||||
for (; i; --i) {
|
||||
h = (h << 5) + *(c++);
|
||||
h = (h << 5) + *(c++);
|
||||
h = (h << 5) + *(c++);
|
||||
h = (h << 5) + *(c++);
|
||||
h = h + (h << 5) + *(c++);
|
||||
h = h + (h << 5) + *(c++);
|
||||
h = h + (h << 5) + *(c++);
|
||||
h = h + (h << 5) + *(c++);
|
||||
}
|
||||
for (i = len - (mlen << 2); i; --i) {
|
||||
h = (h << 5) + *(c++);
|
||||
h = h + (h << 5) + *(c++);
|
||||
}
|
||||
h &= 0x7fffffff;
|
||||
s->hashkey = (int)h;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue