[PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c

Wed May 18 17:35:20 GMT 2022

On Wed, May 18, 2022 at 12:34 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, May 17, 2022 at 12:11 AM Siddhesh Poyarekar <siddhesh@gotplt.org> wrote:
> >
> > On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> > > The prior unrolling didn't really do much as it left the dependency
> > > chain between iterations. Unrolled the loop for 4 so 4x multiplies
> > > could be pipelined in out-of-order machines.
> > >
> > > Results for __nss_hash
> > > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> > >
> > > Time as Geometric Mean of N=25 runs
> > > Geometric of all benchmark New / Old: 0.845
> > >    type, length, New Time, Old Time, New Time / Old Time
> > >   fixed,      0,    4.019,    3.729,               1.078
> > >   fixed,      1,     4.95,    5.707,               0.867
> > >   fixed,      2,    5.152,    5.657,               0.911
> > >   fixed,      3,    4.641,    5.721,               0.811
> > >   fixed,      4,    5.551,     5.81,               0.955
> > >   fixed,      5,    6.525,    6.552,               0.996
> > >   fixed,      6,    6.711,    6.561,               1.023
> > >   fixed,      7,    6.715,    6.767,               0.992
> > >   fixed,      8,    7.874,    7.915,               0.995
> > >   fixed,      9,    8.888,    9.767,                0.91
> > >   fixed,     10,    8.959,    9.762,               0.918
> > >   fixed,     11,    9.188,    9.987,                0.92
> > >   fixed,     12,    9.708,   10.618,               0.914
> > >   fixed,     13,   10.393,    11.14,               0.933
> > >   fixed,     14,   10.628,   12.097,               0.879
> > >   fixed,     15,   10.982,   12.965,               0.847
> > >   fixed,     16,   11.851,   14.429,               0.821
> > >   fixed,     32,   24.334,   34.414,               0.707
> > >   fixed,     64,   55.618,   86.688,               0.642
> > >   fixed,    128,  118.261,   224.36,               0.527
> > >   fixed,    256,  256.183,  538.629,               0.476
> > > random,      2,   11.194,   11.556,               0.969
> > > random,      4,   17.516,   17.205,               1.018
> > > random,      8,   23.501,   20.985,                1.12
> > > random,     16,   28.131,   29.212,               0.963
> > > random,     32,   35.436,   38.662,               0.917
> > > random,     64,    45.74,   58.868,               0.777
> > > random,    128,   75.394,  121.963,               0.618
> > > random,    256,  139.524,  260.726,               0.535
> > > ---
> > >   nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
> > >   1 file changed, 42 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> > > index 27a348ea9b..c6a375f386 100644
> > > --- a/nss/nss_hash.c
> > > +++ b/nss/nss_hash.c
> > > @@ -19,58 +19,63 @@
> > >
> > >   /* This is from libc/db/hash/hash_func.c, hash3 is static there */
> > >   /*
> > > - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> > > + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
> > >    * units.  On the first time through the loop we get the "leftover bytes"
> > > - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> > > - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> > > - * this routine is heavily used enough, it's worth the ugly coding.
> > > + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> > > + * HASHC. Further unrolling does not appear to help.
> > >    *
> > >    * OZ's original sdbm hash
> > >    */
> > >   uint32_t
> > >   __nss_hash (const void *keyarg, size_t len)
> > >   {
> > > +  enum
> > > +  {
> > > +    HASH_CONST_P0 = 1,              /* (uint32_t)(65599 ^ 0).  */
> > > +    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
> > > +    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
> > > +    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
> > > +    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
> > > +  };
> > > +
> > >     const unsigned char *key;
> > > -  size_t loop;
> > >     uint32_t h;
> > >
> > > -#define HASHC   h = *key++ + 65599 * h
> > > +#define HASHC        h = *key++ + HASH_CONST_P1 * h
> > >
> > >     h = 0;
> > >     key = keyarg;
> > >     if (len > 0)
> > >       {
> > > -      loop = (len + 8 - 1) >> 3;
> > > -      switch (len & (8 - 1))
> > > -        {
> > > -        case 0:
> > > -          do
> > > -            {
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 7:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 6:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 5:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 4:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 3:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 2:
> > > -              HASHC;
> > > -              /* FALLTHROUGH */
> > > -            case 1:
> > > -              HASHC;
> > > -            }
> > > -       while (--loop);
> > > -        }
> > > +      switch ((len & (4 - 1)))
> > > +     {
> > > +     case 0:
> > > +       /* h starts out as zero so no need to include the multiply. */
> > > +       h = *key++;
> > > +       /* FALLTHROUGH */
> > > +     case 3:
> > > +       HASHC;
> > > +       /* FALLTHROUGH */
> > > +     case 2:
> > > +       HASHC;
> > > +       /* FALLTHROUGH */
> > > +     case 1:
> > > +       HASHC;
> > > +       /* FALLTHROUGH */
> > > +     }
> >
> > The first 4 bytes, also sufficient for len <= 4.  OK.
> >
> > > +
> > > +      uint32_t c0, c1, c2, c3;
> > > +      for (--len; len >= 4; len -= 4)
> > > +     {
> > > +       c0 = (unsigned char) *(key + 0);
> > > +       c1 = (unsigned char) *(key + 1);
> > > +       c2 = (unsigned char) *(key + 2);
> > > +       c3 = (unsigned char) *(key + 3);
> > > +       h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
> > > +           + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
> > > +
> > > +       key += 4;
> > > +     }
> >
> > Remaining larger lengths.  OK.
> >
> > >       }
> > >     return h;
> > >   }
> >
> > TBH this wins solely on the front of the code being easier to
> > understand.  The fact that it is also faster in some cases is a bonus :)
> >
> > LGTM.
> >
> > Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
>
> No change to this file in V10.

NB: I added __simple_nss_hash to a new header file as I don't think
it's really justifiable to add a new function that can't easily be DCE
for testing/benchmarking.