vppinfra/vppinfra/vhash.h

   1 /*
   2  * Copyright (c) 2015 Cisco and/or its affiliates.
   3  * Licensed under the Apache License, Version 2.0 (the "License");
   4  * you may not use this file except in compliance with the License.
   5  * You may obtain a copy of the License at:
   6  *
   7  *     http://www.apache.org/licenses/LICENSE-2.0
   8  *
   9  * Unless required by applicable law or agreed to in writing, software
  10  * distributed under the License is distributed on an "AS IS" BASIS,
  11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12  * See the License for the specific language governing permissions and
  13  * limitations under the License.
  14  */
  15 /*
  16   Copyright (c) 2010 Eliot Dresselhaus
  17
  18   Permission is hereby granted, free of charge, to any person obtaining
  19   a copy of this software and associated documentation files (the
  20   "Software"), to deal in the Software without restriction, including
  21   without limitation the rights to use, copy, modify, merge, publish,
  22   distribute, sublicense, and/or sell copies of the Software, and to
  23   permit persons to whom the Software is furnished to do so, subject to
  24   the following conditions:
  25
  26   The above copyright notice and this permission notice shall be
  27   included in all copies or substantial portions of the Software.
  28
  29   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  30   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  31   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  32   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  33   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  34   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  35   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  36 */
  37
  38 #ifndef included_clib_vhash_h
  39 #define included_clib_vhash_h
  40
  41 #include <vppinfra/vector.h>
  42
  43 #ifdef CLIB_HAVE_VEC128
  44
  45 #include <vppinfra/cache.h>
  46 #include <vppinfra/hash.h>
  47 #include <vppinfra/pipeline.h>
  48
  49 /* Gathers 32 bits worth of key with given index. */
  50 typedef u32 (vhash_key_function_t) (void * state, u32 vector_index, u32 key_word_index);
  51 typedef u32x4 (vhash_4key_function_t) (void * state, u32 vector_index, u32 key_word_index);
  52 /* Sets/gets result of hash lookup. */
  53 typedef u32 (vhash_result_function_t) (void * state, u32 vector_index, u32 result, u32 n_key_u32);
  54 typedef u32x4 (vhash_4result_function_t) (void * state, u32 vector_index, u32x4 results, u32 n_key_u32);
  55
  56 typedef struct {
  57   u32x4_union_t hashed_key[3];
  58 } vhash_hashed_key_t;
  59
  60 /* Search buckets are really this structure. */
  61 typedef struct {
  62   /* 4 results for this bucket.
  63      Zero is used to mark empty results.  This means user can't use the result ~0
  64      since user results differ from internal results stored in buckets by 1.
  65      e.g. internal result = user result + 1. */
  66   u32x4_union_t result;
  67
  68   /* n_key_u32s u32x4s of key data follow. */
  69   u32x4_union_t key[0];
  70 } vhash_search_bucket_t;
  71
  72 typedef struct {
  73   u32x4_union_t * search_buckets;
  74
  75   /* Vector of bucket free indices. */
  76   u32 * free_indices;
  77
  78   /* Number of entries in this overflow bucket. */
  79   u32 n_overflow;
  80 } vhash_overflow_buckets_t;
  81
  82 typedef struct {
  83   /* 2^log2_n_keys keys grouped in groups of 4.
  84      Each bucket contains 4 results plus 4 keys for a
  85      total of (1 + n_key_u32) u32x4s. */
  86   u32x4_union_t * search_buckets;
  87
  88   /* When a bucket of 4 results/keys are full we search
  89      the overflow.  hash_key is used to select which overflow
  90      bucket. */
  91   vhash_overflow_buckets_t overflow_buckets[16];
  92
  93   /* Total count of occupied elements in hash table. */
  94   u32 n_elts;
  95
  96   u32 log2_n_keys;
  97
  98   /* Number of 32 bit words in a hash key. */
  99   u32 n_key_u32;
 100
 101   u32x4_union_t bucket_mask;
 102
 103   /* table[i] = min_log2 (first_set (~i)). */
 104   u8 find_first_zero_table[16];
 105
 106   /* Hash seeds for Jenkins hash. */
 107   u32x4_union_t hash_seeds[3];
 108
 109   /* Key work space is a vector of length
 110      n_key_u32s << log2_n_key_word_len_u32x. */
 111   u32 log2_n_key_word_len_u32x;
 112
 113   /* Work space to store keys between pipeline stages. */
 114   u32x4_union_t * key_work_space;
 115
 116   /* Hash work space to store Jenkins hash values between
 117      pipeline stages. */
 118   vhash_hashed_key_t * hash_work_space;
 119 } vhash_t;
 120
 121 always_inline vhash_overflow_buckets_t *
 122 vhash_get_overflow_buckets (vhash_t * h, u32 key)
 123 {
 124   u32 i = (((key & h->bucket_mask.as_u32[0]) >> 2) & 0xf);
 125   ASSERT (i < ARRAY_LEN (h->overflow_buckets));
 126   return h->overflow_buckets + i;
 127 }
 128
 129 always_inline uword
 130 vhash_is_non_empty_overflow_bucket (vhash_t * h, u32 key)
 131 {
 132   u32 i = (((key & h->bucket_mask.as_u32[0]) >> 2) & 0xf);
 133   ASSERT (i < ARRAY_LEN (h->overflow_buckets));
 134   return h->overflow_buckets[i].n_overflow > 0;
 135 }
 136
 137 always_inline void
 138 vhash_free_overflow_buckets (vhash_overflow_buckets_t * obs)
 139 {
 140   vec_free (obs->search_buckets);
 141   vec_free (obs->free_indices);
 142 }
 143
 144 always_inline void
 145 vhash_free (vhash_t * h)
 146 {
 147   uword i;
 148   for (i = 0; i < ARRAY_LEN (h->overflow_buckets); i++)
 149     vhash_free_overflow_buckets (&h->overflow_buckets[i]);
 150   vec_free (h->search_buckets);
 151   vec_free (h->key_work_space);
 152   vec_free (h->hash_work_space);
 153 }
 154
 155 always_inline void
 156 vhash_set_key_word (vhash_t * h, u32 wi, u32 vi, u32 value)
 157 {
 158   u32 i0 = (wi << h->log2_n_key_word_len_u32x) + (vi / 4);
 159   u32 i1 = vi % 4;
 160   vec_elt (h->key_work_space, i0).as_u32[i1] = value;
 161 }
 162
 163 always_inline void
 164 vhash_set_key_word_u32x (vhash_t * h, u32 wi, u32 vi, u32x value)
 165 {
 166   u32 i0 = (wi << h->log2_n_key_word_len_u32x) + (vi / 4);
 167   vec_elt (h->key_work_space, i0).as_u32x4 = value;
 168 }
 169
 170 always_inline u32
 171 vhash_get_key_word (vhash_t * h, u32 wi, u32 vi)
 172 {
 173   u32 i0 = (wi << h->log2_n_key_word_len_u32x) + (vi / 4);
 174   u32 i1 = vi % 4;
 175   return vec_elt (h->key_work_space, i0).as_u32[i1];
 176 }
 177
 178 always_inline u32x
 179 vhash_get_key_word_u32x (vhash_t * h, u32 wi, u32 vi)
 180 {
 181   u32 i0 = (wi << h->log2_n_key_word_len_u32x) + vi;
 182   return vec_elt (h->key_work_space, i0).as_u32x4;
 183 }
 184
 185 always_inline void
 186 vhash_validate_sizes (vhash_t * h, u32 n_key_u32, u32 n_vectors)
 187 {
 188   u32 n, l;
 189
 190   n = max_pow2 (n_vectors) / 4;
 191   n = clib_max (n, 8);
 192
 193   h->log2_n_key_word_len_u32x = l = min_log2 (n);
 194   vec_validate_aligned (h->key_work_space, (n_key_u32 << l) - 1, CLIB_CACHE_LINE_BYTES);
 195   vec_validate_aligned (h->hash_work_space, n - 1, CLIB_CACHE_LINE_BYTES);
 196 }
 197
 198 always_inline void
 199 vhash_gather_key_stage (vhash_t * h,
 200                         u32 vector_index,
 201                         u32 n_vectors,
 202                         vhash_key_function_t key_function,
 203                         void * state,
 204                         u32 n_key_u32s)
 205 {
 206   u32 i, j, vi;
 207
 208   /* Gather keys for 4 packets (for 128 bit vector length e.g. u32x4). */
 209   for (i = 0; i < n_vectors; i++)
 210     {
 211       vi = vector_index * 4 + i;
 212       for (j = 0; j < n_key_u32s; j++)
 213         vhash_set_key_word (h, j, vi,
 214                             key_function (state, vi, j));
 215     }
 216 }
 217
 218 always_inline void
 219 vhash_gather_4key_stage (vhash_t * h,
 220                          u32 vector_index,
 221                          vhash_4key_function_t key_function,
 222                          void * state,
 223                          u32 n_key_u32s)
 224 {
 225   u32 j, vi;
 226   vi = vector_index * 4;
 227   for (j = 0; j < n_key_u32s; j++)
 228     vhash_set_key_word_u32x (h, j, vi, key_function (state, vi, j));
 229 }
 230
 231 always_inline void
 232 vhash_mix_stage (vhash_t * h,
 233                  u32 vector_index,
 234                  u32 n_key_u32s)
 235 {
 236   i32 i, n_left;
 237   u32x a, b, c;
 238
 239   /* Only need to do this for keys longer than 12 bytes. */
 240   ASSERT (n_key_u32s > 3);
 241
 242   a = h->hash_seeds[0].as_u32x4;
 243   b = h->hash_seeds[1].as_u32x4;
 244   c = h->hash_seeds[2].as_u32x4;
 245   for (i = 0, n_left = n_key_u32s - 3; n_left > 0; n_left -= 3, i += 3)
 246     {
 247       a += vhash_get_key_word_u32x (h, n_key_u32s - 1 - (i + 0), vector_index);
 248       if (n_left > 1)
 249         b += vhash_get_key_word_u32x (h, n_key_u32s - 1 - (i + 1), vector_index);
 250       if (n_left > 2)
 251         c += vhash_get_key_word_u32x (h, n_key_u32s - 1 - (i + 2), vector_index);
 252
 253       hash_v3_mix_u32x (a, b, c);
 254     }
 255
 256   /* Save away a, b, c for later finalize. */
 257   {
 258     vhash_hashed_key_t * hk = vec_elt_at_index (h->hash_work_space, vector_index);
 259     hk->hashed_key[0].as_u32x4 = a;
 260     hk->hashed_key[1].as_u32x4 = b;
 261     hk->hashed_key[2].as_u32x4 = c;
 262   }
 263 }
 264
 265 always_inline vhash_search_bucket_t *
 266 vhash_get_search_bucket_with_index (vhash_t * h, u32 i, u32 n_key_u32s)
 267 {
 268   return ((vhash_search_bucket_t *)
 269           vec_elt_at_index (h->search_buckets,
 270                             (i / 4) * ((sizeof (vhash_search_bucket_t) / sizeof (u32x4)) + n_key_u32s)));
 271 }
 272
 273 always_inline vhash_search_bucket_t *
 274 vhash_get_search_bucket (vhash_t * h, u32 key_hash, u32 n_key_u32s)
 275 {
 276   u32 i = key_hash & h->bucket_mask.as_u32[0];
 277   return vhash_get_search_bucket_with_index (h, i, n_key_u32s);
 278 }
 279
 280 always_inline u32x4
 281 vhash_get_4_search_bucket_byte_offsets (vhash_t * h, u32x4 key_hash, u32 n_key_u32s)
 282 {
 283   vhash_search_bucket_t * b;
 284   u32 n_bytes_per_bucket = sizeof (b[0]) + n_key_u32s * sizeof (b->key[0]);
 285   u32x4 r = key_hash & h->bucket_mask.as_u32x4;
 286
 287   /* Multiply with shifts and adds to get bucket byte offset. */
 288 #define _(x) u32x4_ishift_left (r, (x) - 2)
 289   if (n_bytes_per_bucket == (1 << 5))
 290     r = _ (5);
 291   else if (n_bytes_per_bucket == ((1 << 5) + (1 << 4)))
 292     r = _ (5) + _ (4);
 293   else if (n_bytes_per_bucket == (1 << 6))
 294     r = _ (6);
 295   else if (n_bytes_per_bucket == ((1 << 6) + (1 << 4)))
 296     r = _ (6) + _ (4);
 297   else if (n_bytes_per_bucket == ((1 << 6) + (1 << 5)))
 298     r = _ (6) + _ (5);
 299   else if (n_bytes_per_bucket == ((1 << 6) + (1 << 5) + (1 << 4)))
 300     r = _ (6) + _ (5) + _ (4);
 301   else
 302     ASSERT (0);
 303 #undef _
 304   return r;
 305 }
 306
 307 always_inline void
 308 vhash_finalize_stage (vhash_t * h,
 309                       u32 vector_index,
 310                       u32 n_key_u32s)
 311 {
 312   i32 n_left;
 313   u32x a, b, c;
 314   vhash_hashed_key_t * hk = vec_elt_at_index (h->hash_work_space, vector_index);
 315
 316   if (n_key_u32s <= 3)
 317     {
 318       a = h->hash_seeds[0].as_u32x4;
 319       b = h->hash_seeds[1].as_u32x4;
 320       c = h->hash_seeds[2].as_u32x4;
 321       n_left = n_key_u32s;
 322     }
 323   else
 324     {
 325       a = hk->hashed_key[0].as_u32x4;
 326       b = hk->hashed_key[1].as_u32x4;
 327       c = hk->hashed_key[2].as_u32x4;
 328       n_left = 3;
 329     }
 330
 331   if (n_left > 0)
 332     a += vhash_get_key_word_u32x (h, 0, vector_index);
 333   if (n_left > 1)
 334     b += vhash_get_key_word_u32x (h, 1, vector_index);
 335   if (n_left > 2)
 336     c += vhash_get_key_word_u32x (h, 2, vector_index);
 337
 338   hash_v3_finalize_u32x (a, b, c);
 339
 340   /* Only save away last 32 bits of hash code. */
 341   hk->hashed_key[2].as_u32x4 = c;
 342
 343   /* Prefetch buckets.  This costs a bit for small tables but saves
 344      big for large ones. */
 345   {
 346     vhash_search_bucket_t * b0, * b1, * b2, * b3;
 347     u32x4_union_t kh;
 348
 349     kh.as_u32x4 = vhash_get_4_search_bucket_byte_offsets (h, c, n_key_u32s);
 350     hk->hashed_key[1].as_u32x4 = kh.as_u32x4;
 351
 352     b0 = (void *) h->search_buckets + kh.as_u32[0];
 353     b1 = (void *) h->search_buckets + kh.as_u32[1];
 354     b2 = (void *) h->search_buckets + kh.as_u32[2];
 355     b3 = (void *) h->search_buckets + kh.as_u32[3];
 356
 357     CLIB_PREFETCH (b0, sizeof (b0[0]) + n_key_u32s * sizeof (b0->key[0]), READ);
 358     CLIB_PREFETCH (b1, sizeof (b1[0]) + n_key_u32s * sizeof (b1->key[0]), READ);
 359     CLIB_PREFETCH (b2, sizeof (b2[0]) + n_key_u32s * sizeof (b2->key[0]), READ);
 360     CLIB_PREFETCH (b3, sizeof (b3[0]) + n_key_u32s * sizeof (b3->key[0]), READ);
 361   }
 362 }
 363
 364 always_inline u32
 365 vhash_merge_results (u32x4 r)
 366 {
 367   r = r | u32x4_word_shift_right (r, 2);
 368   r = r | u32x4_word_shift_right (r, 1);
 369   return u32x4_get0 (r);
 370 }
 371
 372 /* Bucket is full if none of its 4 results are 0. */
 373 always_inline u32
 374 vhash_search_bucket_is_full (u32x4 r)
 375 { return u32x4_zero_byte_mask (r) == 0; }
 376
 377 always_inline u32
 378 vhash_non_empty_result_index (u32x4 x)
 379 {
 380   u32 empty_mask = u32x4_zero_byte_mask (x);
 381   ASSERT (empty_mask != 0xffff);
 382   return min_log2 (0xffff &~ empty_mask) / 4;
 383 }
 384
 385 always_inline u32
 386 vhash_empty_result_index (u32x4 x)
 387 {
 388   u32 empty_mask = u32x4_zero_byte_mask (x);
 389   ASSERT (empty_mask != 0);
 390   return min_log2 (0xffff & empty_mask) / 4;
 391 }
 392
 393 always_inline u32x4
 394 vhash_bucket_compare (vhash_t * h,
 395                       u32x4_union_t * bucket,
 396                       u32 key_word_index,
 397                       u32 vi)
 398 {
 399   u32 k = vhash_get_key_word (h, key_word_index, vi);
 400   u32x4 x = {k, k, k, k};
 401   return u32x4_is_equal (bucket[key_word_index].as_u32x4, x);
 402 }
 403
 404 #define vhash_bucket_compare_4(h,wi,vi,b0,b1,b2,b3,cmp0,cmp1,cmp2,cmp3) \
 405 do {                                                                    \
 406   u32x4 _k4 = vhash_get_key_word_u32x ((h), (wi), (vi));                \
 407   u32x4 _k0 = u32x4_splat_word (_k4, 0);                                \
 408   u32x4 _k1 = u32x4_splat_word (_k4, 1);                                \
 409   u32x4 _k2 = u32x4_splat_word (_k4, 2);                                \
 410   u32x4 _k3 = u32x4_splat_word (_k4, 3);                                \
 411                                                                         \
 412   cmp0 = u32x4_is_equal (b0->key[wi].as_u32x4, _k0);                    \
 413   cmp1 = u32x4_is_equal (b1->key[wi].as_u32x4, _k1);                    \
 414   cmp2 = u32x4_is_equal (b2->key[wi].as_u32x4, _k2);                    \
 415   cmp3 = u32x4_is_equal (b3->key[wi].as_u32x4, _k3);                    \
 416 } while (0)
 417
 418 u32 vhash_get_overflow (vhash_t * h,
 419                         u32 key_hash,
 420                         u32 vi,
 421                         u32 n_key_u32s);
 422
 423 always_inline void
 424 vhash_get_stage (vhash_t * h,
 425                  u32 vector_index,
 426                  u32 n_vectors,
 427                  vhash_result_function_t result_function,
 428                  void * state,
 429                  u32 n_key_u32s)
 430 {
 431   u32 i, j;
 432   vhash_hashed_key_t * hk = vec_elt_at_index (h->hash_work_space, vector_index);
 433   vhash_search_bucket_t * b;
 434
 435   for (i = 0; i < n_vectors; i++)
 436     {
 437       u32 vi = vector_index * 4 + i;
 438       u32 key_hash = hk->hashed_key[2].as_u32[i];
 439       u32 result;
 440       u32x4 r, r0;
 441
 442       b = vhash_get_search_bucket (h, key_hash, n_key_u32s);
 443
 444       r = r0 = b->result.as_u32x4;
 445       for (j = 0; j < n_key_u32s; j++)
 446         r &= vhash_bucket_compare (h, &b->key[0], j, vi);
 447
 448       /* At this point only one of 4 results should be non-zero.
 449          So we can or all 4 together and get the valid result (if there is one). */
 450       result = vhash_merge_results (r);
 451
 452       if (! result && vhash_search_bucket_is_full (r0))
 453         result = vhash_get_overflow (h, key_hash, vi, n_key_u32s);
 454
 455       result_function (state, vi, result - 1, n_key_u32s);
 456     }
 457 }
 458
 459 always_inline void
 460 vhash_get_4_stage (vhash_t * h,
 461                    u32 vector_index,
 462                    vhash_4result_function_t result_function,
 463                    void * state,
 464                    u32 n_key_u32s)
 465 {
 466   u32 i, vi;
 467   vhash_hashed_key_t * hk = vec_elt_at_index (h->hash_work_space, vector_index);
 468   vhash_search_bucket_t * b0, * b1, * b2, * b3;
 469   u32x4 r0, r1, r2, r3, r0_before, r1_before, r2_before, r3_before;
 470   u32x4_union_t kh;
 471
 472   kh.as_u32x4 = hk->hashed_key[1].as_u32x4;
 473
 474   b0 = (void *) h->search_buckets + kh.as_u32[0];
 475   b1 = (void *) h->search_buckets + kh.as_u32[1];
 476   b2 = (void *) h->search_buckets + kh.as_u32[2];
 477   b3 = (void *) h->search_buckets + kh.as_u32[3];
 478
 479   r0 = r0_before = b0->result.as_u32x4;
 480   r1 = r1_before = b1->result.as_u32x4;
 481   r2 = r2_before = b2->result.as_u32x4;
 482   r3 = r3_before = b3->result.as_u32x4;
 483
 484   vi = vector_index * 4;
 485
 486   for (i = 0; i < n_key_u32s; i++)
 487     {
 488       u32x4 c0, c1, c2, c3;
 489       vhash_bucket_compare_4 (h, i, vector_index,
 490                               b0, b1, b2, b3,
 491                               c0, c1, c2, c3);
 492       r0 &= c0;
 493       r1 &= c1;
 494       r2 &= c2;
 495       r3 &= c3;
 496     }
 497
 498   u32x4_transpose (r0, r1, r2, r3);
 499
 500   /* Gather together 4 results. */
 501   {
 502     u32x4_union_t r;
 503     u32x4 ones = {1,1,1,1};
 504     u32 not_found_mask;
 505
 506     r.as_u32x4 = r0 | r1 | r2 | r3;
 507     not_found_mask = u32x4_zero_byte_mask (r.as_u32x4);
 508     not_found_mask &= ((vhash_search_bucket_is_full (r0_before) << (4*0))
 509                        | (vhash_search_bucket_is_full (r1_before) << (4*1))
 510                        | (vhash_search_bucket_is_full (r2_before) << (4*2))
 511                        | (vhash_search_bucket_is_full (r3_before) << (4*3)));
 512     if (not_found_mask)
 513       {
 514         u32x4_union_t key_hash;
 515
 516         key_hash.as_u32x4 = hk->hashed_key[2].as_u32x4 & h->bucket_mask.as_u32x4;
 517
 518         /* Slow path: one of the buckets may have been full and we need to search overflow. */
 519         if (not_found_mask & (1 << (4*0)))
 520           r.as_u32[0] = vhash_get_overflow (h, key_hash.as_u32[0],
 521                                               vi + 0, n_key_u32s);
 522         if (not_found_mask & (1 << (4*1)))
 523           r.as_u32[1] = vhash_get_overflow (h, key_hash.as_u32[1],
 524                                               vi + 1, n_key_u32s);
 525         if (not_found_mask & (1 << (4*2)))
 526           r.as_u32[2] = vhash_get_overflow (h, key_hash.as_u32[2],
 527                                               vi + 2, n_key_u32s);
 528         if (not_found_mask & (1 << (4*3)))
 529           r.as_u32[3] = vhash_get_overflow (h, key_hash.as_u32[3],
 530                                               vi + 3, n_key_u32s);
 531       }
 532
 533     result_function (state, vi, r.as_u32x4 - ones, n_key_u32s);
 534   }
 535 }
 536
 537 u32
 538 vhash_set_overflow (vhash_t * h,
 539                     u32 key_hash,
 540                     u32 vi,
 541                     u32 new_result,
 542                     u32 n_key_u32s);
 543
 544 always_inline void
 545 vhash_set_stage (vhash_t * h,
 546                  u32 vector_index,
 547                  u32 n_vectors,
 548                  vhash_result_function_t result_function,
 549                  void * state,
 550                  u32 n_key_u32s)
 551 {
 552   u32 i, j, n_new_elts = 0;
 553   vhash_hashed_key_t * hk = vec_elt_at_index (h->hash_work_space, vector_index);
 554   vhash_search_bucket_t * b;
 555
 556   for (i = 0; i < n_vectors; i++)
 557     {
 558       u32 vi = vector_index * 4 + i;
 559       u32 key_hash = hk->hashed_key[2].as_u32[i];
 560       u32 old_result, new_result;
 561       u32 i_set;
 562       u32x4 r, r0, cmp;
 563
 564       b = vhash_get_search_bucket (h, key_hash, n_key_u32s);
 565
 566       cmp = vhash_bucket_compare (h, &b->key[0], 0, vi);
 567       for (j = 1; j < n_key_u32s; j++)
 568         cmp &= vhash_bucket_compare (h, &b->key[0], j, vi);
 569
 570       r0 = b->result.as_u32x4;
 571       r = r0 & cmp;
 572
 573       /* At this point only one of 4 results should be non-zero.
 574          So we can or all 4 together and get the valid result (if there is one). */
 575       old_result = vhash_merge_results (r);
 576
 577       if (! old_result && vhash_search_bucket_is_full (r0))
 578         old_result = vhash_get_overflow (h, key_hash, vi, n_key_u32s);
 579
 580       /* Get new result; possibly do something with old result. */
 581       new_result = result_function (state, vi, old_result - 1, n_key_u32s);
 582
 583       /* User cannot use ~0 as a hash result since a result of 0 is
 584          used to mark unused bucket entries. */
 585       ASSERT (new_result + 1 != 0);
 586       new_result += 1;
 587
 588       /* Set over-writes existing result. */
 589       if (old_result)
 590         {
 591           i_set = vhash_non_empty_result_index (r);
 592           b->result.as_u32[i_set] = new_result;
 593         }
 594       else
 595         {
 596           /* Set allocates new result. */
 597           u32 valid_mask;
 598
 599           valid_mask = (((b->result.as_u32[0] != 0) << 0)
 600                         | ((b->result.as_u32[1] != 0) << 1)
 601                         | ((b->result.as_u32[2] != 0) << 2)
 602                         | ((b->result.as_u32[3] != 0) << 3));
 603
 604           /* Rotate 4 bit valid mask so that key_hash corresponds to bit 0. */
 605           i_set = key_hash & 3;
 606           valid_mask = ((valid_mask >> i_set) | (valid_mask << (4 - i_set))) & 0xf;
 607
 608           /* Insert into first empty position in bucket after key_hash. */
 609           i_set = (i_set + h->find_first_zero_table[valid_mask]) & 3;
 610
 611           if (valid_mask != 0xf)
 612             {
 613               n_new_elts += 1;
 614
 615               b->result.as_u32[i_set] = new_result;
 616
 617               /* Insert new key into search bucket. */
 618               for (j = 0; j < n_key_u32s; j++)
 619                 b->key[j].as_u32[i_set] = vhash_get_key_word (h, j, vi);
 620             }
 621           else
 622             vhash_set_overflow (h, key_hash, vi, new_result, n_key_u32s);
 623         }
 624     }
 625
 626   h->n_elts += n_new_elts;
 627 }
 628
 629 u32
 630 vhash_unset_overflow (vhash_t * h,
 631                       u32 key_hash,
 632                       u32 vi,
 633                       u32 n_key_u32s);
 634
 635 void
 636 vhash_unset_refill_from_overflow (vhash_t * h,
 637                                   vhash_search_bucket_t * b,
 638                                   u32 key_hash,
 639                                   u32 n_key_u32s);
 640
 641 /* Note: Eliot tried doing 4 unsets at once and could not get a speed up
 642    and abandoned vhash_unset_4_stage. */
 643 always_inline void
 644 vhash_unset_stage (vhash_t * h,
 645                    u32 vector_index,
 646                    u32 n_vectors,
 647                    vhash_result_function_t result_function,
 648                    void * state,
 649                    u32 n_key_u32s)
 650 {
 651   u32 i, j, n_elts_unset = 0;
 652   vhash_hashed_key_t * hk = vec_elt_at_index (h->hash_work_space, vector_index);
 653   vhash_search_bucket_t * b;
 654
 655   for (i = 0; i < n_vectors; i++)
 656     {
 657       u32 vi = vector_index * 4 + i;
 658       u32 key_hash = hk->hashed_key[2].as_u32[i];
 659       u32 old_result;
 660       u32x4 cmp, r0;
 661
 662       b = vhash_get_search_bucket (h, key_hash, n_key_u32s);
 663
 664       cmp = vhash_bucket_compare (h, &b->key[0], 0, vi);
 665       for (j = 1; j < n_key_u32s; j++)
 666         cmp &= vhash_bucket_compare (h, &b->key[0], j, vi);
 667
 668       r0 = b->result.as_u32x4;
 669
 670       /* At this point cmp is all ones where key matches and zero otherwise.
 671          So, this will invalidate results for matching key and do nothing otherwise. */
 672       b->result.as_u32x4 = r0 & ~cmp;
 673
 674       old_result = vhash_merge_results (r0 & cmp);
 675
 676       n_elts_unset += old_result != 0;
 677
 678       if (vhash_search_bucket_is_full (r0))
 679         {
 680           if (old_result)
 681             vhash_unset_refill_from_overflow (h, b, key_hash, n_key_u32s);
 682           else
 683             old_result = vhash_unset_overflow (h, key_hash, vi, n_key_u32s);
 684         }
 685
 686       result_function (state, vi, old_result - 1, n_key_u32s);
 687     }
 688   ASSERT (h->n_elts >= n_elts_unset);
 689   h->n_elts -= n_elts_unset;
 690 }
 691
 692 void vhash_init (vhash_t * h, u32 log2_n_keys, u32 n_key_u32,
 693                  u32 * hash_seeds);
 694
 695 void vhash_resize (vhash_t * old, u32 log2_n_keys);
 696
 697 typedef struct {
 698   vhash_t * vhash;
 699
 700   union {
 701     struct {
 702       u32 * keys;
 703       u32 * results;
 704     };
 705
 706     /* Vector layout for get keys. */
 707     struct {
 708       u32x4_union_t * get_keys;
 709       u32x4_union_t * get_results;
 710     };
 711   };
 712
 713   u32 n_vectors_div_4;
 714   u32 n_vectors_mod_4;
 715
 716   u32 n_key_u32;
 717
 718   u32 n_keys;
 719 } vhash_main_t;
 720
 721 always_inline u32
 722 vhash_get_alloc_keys (vhash_main_t * vm, u32 n_keys, u32 n_key_u32)
 723 {
 724   u32 i, n;
 725
 726   i = vm->n_keys;
 727   vm->n_keys = i + n_keys;
 728
 729   n = (round_pow2 (vm->n_keys, 4) / 4) * n_key_u32;
 730
 731   vec_validate_aligned (vm->get_keys, n - 1, sizeof (vm->get_keys[0]));
 732   vec_validate_aligned (vm->get_results, n - 1, sizeof (vm->get_results[0]));
 733
 734   return i;
 735 }
 736
 737 always_inline void
 738 vhash_get_set_key_word (vhash_main_t * vm, u32 vi, u32 wi, u32 n_key_u32,
 739                         u32 value)
 740 {
 741   u32x4_union_t * k = vec_elt_at_index (vm->get_keys, (vi / 4) * n_key_u32);
 742   ASSERT (wi < n_key_u32);
 743   k[wi].as_u32[vi % 4] = value;
 744 }
 745
 746 always_inline u32
 747 vhash_get_fetch_result (vhash_main_t * vm, u32 vi)
 748 {
 749   u32x4_union_t * r = vec_elt_at_index (vm->get_results, vi / 4);
 750   return r->as_u32[vi % 4];
 751 }
 752
 753 void vhash_main_get (vhash_main_t * vm);
 754
 755 always_inline u32
 756 vhash_set_alloc_keys (vhash_main_t * vm, u32 n_keys, u32 n_key_u32)
 757 {
 758   u32 i;
 759
 760   i = vm->n_keys;
 761   vm->n_keys = i + n_keys;
 762
 763   vec_resize (vm->keys, n_keys * n_key_u32);
 764   vec_resize (vm->results, n_keys);
 765
 766   return i;
 767 }
 768
 769 always_inline void
 770 vhash_set_set_key_word (vhash_main_t * vm, u32 vi, u32 wi, u32 n_key_u32,
 771                         u32 value)
 772 {
 773   u32 * k = vec_elt_at_index (vm->keys, vi * n_key_u32);
 774   ASSERT (wi < n_key_u32);
 775   k[wi] = value;
 776 }
 777
 778 always_inline void
 779 vhash_set_set_result (vhash_main_t * vm, u32 vi, u32 result)
 780 {
 781   u32 * r = vec_elt_at_index (vm->results, vi);
 782   r[0] = result;
 783 }
 784
 785 always_inline u32
 786 vhash_set_fetch_old_result (vhash_main_t * vm, u32 vi)
 787 {
 788   u32 * r = vec_elt_at_index (vm->results, vi);
 789   return r[0];
 790 }
 791
 792 void vhash_main_set (vhash_main_t * vm);
 793
 794 always_inline u32
 795 vhash_unset_alloc_keys (vhash_main_t * vm, u32 n_keys, u32 n_key_u32)
 796 { return vhash_set_alloc_keys (vm, n_keys, n_key_u32); }
 797
 798 always_inline void
 799 vhash_unset_set_key_word (vhash_main_t * vm, u32 vi, u32 wi, u32 n_key_u32,
 800                         u32 value)
 801 { vhash_set_set_key_word (vm, vi, wi, n_key_u32, value); }
 802
 803 always_inline void
 804 vhash_unset_set_result (vhash_main_t * vm, u32 vi, u32 result)
 805 { vhash_set_set_result (vm, vi, result); }
 806
 807 always_inline u32
 808 vhash_unset_fetch_old_result (vhash_main_t * vm, u32 vi)
 809 { return vhash_set_fetch_old_result (vm, vi); }
 810
 811 void vhash_main_unset (vhash_main_t * vm);
 812
 813 typedef struct {
 814   vhash_main_t new;
 815
 816   vhash_t * old;
 817 } vhash_resize_t;
 818
 819 u32 vhash_resize_incremental (vhash_resize_t * vr, u32 vector_index, u32 n_vectors);
 820
 821 #endif /* CLIB_HAVE_VEC128 */
 822
 823 #endif /* included_clib_vhash_h */