2 * Copyright (c) 2015 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
16 /** @cond DOCUMENTATION_IS_IN_BIHASH_DOC_H */
18 static inline void *BV (alloc_aligned) (BVT (clib_bihash) * h, uword nbytes)
22 /* Round to an even number of cache lines */
23 nbytes += CLIB_CACHE_LINE_BYTES - 1;
24 nbytes &= ~(CLIB_CACHE_LINE_BYTES - 1);
26 rv = alloc_arena_next (h);
27 alloc_arena_next (h) += nbytes;
29 if (rv >= (alloc_arena (h) + alloc_arena_size (h)))
36 void BV (clib_bihash_init)
37 (BVT (clib_bihash) * h, char *name, u32 nbuckets, uword memory_size)
41 nbuckets = 1 << (max_log2 (nbuckets));
43 h->name = (u8 *) name;
44 h->nbuckets = nbuckets;
45 h->log2_nbuckets = max_log2 (nbuckets);
48 * Make sure the requested size is rational. The max table
49 * size without playing the alignment card is 64 Gbytes.
50 * If someone starts complaining that's not enough, we can shift
51 * the offset by CLIB_LOG2_CACHE_LINE_BYTES...
53 ASSERT (memory_size < (1ULL << BIHASH_BUCKET_OFFSET_BITS));
55 alloc_arena (h) = (uword) clib_mem_vm_alloc (memory_size);
56 alloc_arena_next (h) = alloc_arena (h);
57 alloc_arena_size (h) = memory_size;
59 bucket_size = nbuckets * sizeof (h->buckets[0]);
60 h->buckets = BV (alloc_aligned) (h, bucket_size);
62 h->alloc_lock = BV (alloc_aligned) (h, CLIB_CACHE_LINE_BYTES);
69 #if !defined (MFD_ALLOW_SEALING)
70 #define MFD_ALLOW_SEALING 0x0002U
73 void BV (clib_bihash_master_init_svm)
74 (BVT (clib_bihash) * h, char *name, u32 nbuckets,
75 u64 base_address, u64 memory_size)
79 vec_header_t *freelist_vh;
82 ASSERT (base_address);
83 ASSERT (base_address + memory_size < (1ULL << 32));
85 /* Set up for memfd sharing */
86 if ((fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1)
88 clib_unix_warning ("memfd_create");
92 if (ftruncate (fd, memory_size) < 0)
94 clib_unix_warning ("ftruncate");
98 /* Not mission-critical, complain and continue */
99 if ((fcntl (fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
100 clib_unix_warning ("fcntl (F_ADD_SEALS)");
102 mmap_addr = mmap (u64_to_pointer (base_address), memory_size,
103 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd,
106 if (mmap_addr == MAP_FAILED)
108 clib_unix_warning ("mmap failed");
112 h->sh = (void *) mmap_addr;
114 nbuckets = 1 << (max_log2 (nbuckets));
116 h->name = (u8 *) name;
117 h->sh->nbuckets = h->nbuckets = nbuckets;
118 h->log2_nbuckets = max_log2 (nbuckets);
120 alloc_arena (h) = (u64) (uword) mmap_addr;
121 alloc_arena_next (h) = alloc_arena (h) + CLIB_CACHE_LINE_BYTES;
122 alloc_arena_size (h) = memory_size;
124 bucket_size = nbuckets * sizeof (h->buckets[0]);
125 h->buckets = BV (alloc_aligned) (h, bucket_size);
126 h->sh->buckets_as_u64 = (u64) (uword) h->buckets;
128 h->alloc_lock = BV (alloc_aligned) (h, CLIB_CACHE_LINE_BYTES);
129 h->alloc_lock[0] = 0;
131 h->sh->alloc_lock_as_u64 = (u64) (uword) (h->alloc_lock);
132 freelist_vh = BV (alloc_aligned) (h, sizeof (vec_header_t) +
133 BIHASH_FREELIST_LENGTH * sizeof (u64));
134 freelist_vh->len = BIHASH_FREELIST_LENGTH;
135 freelist_vh->dlmalloc_header_offset = 0xDEADBEEF;
136 h->sh->freelists_as_u64 = (u64) (uword) freelist_vh->vector_data;
137 h->freelists = (void *) (uword) (h->sh->freelists_as_u64);
142 void BV (clib_bihash_slave_init_svm)
143 (BVT (clib_bihash) * h, char *name, int fd)
146 u64 base_address, memory_size;
147 BVT (clib_bihash_shared_header) * sh;
149 /* Trial mapping, to place the segment */
150 mmap_addr = mmap (0, 4096, PROT_READ, MAP_SHARED, fd, 0 /* offset */ );
151 if (mmap_addr == MAP_FAILED)
153 clib_unix_warning ("trial mmap failed");
157 sh = (BVT (clib_bihash_shared_header) *) mmap_addr;
159 base_address = sh->alloc_arena;
160 memory_size = sh->alloc_arena_size;
162 munmap (mmap_addr, 4096);
164 /* Actual mapping, at the required address */
165 mmap_addr = mmap (u64_to_pointer (base_address), memory_size,
166 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd,
169 if (mmap_addr == MAP_FAILED)
171 clib_unix_warning ("mmap failed");
177 h->sh = (void *) mmap_addr;
180 h->name = (u8 *) name;
181 h->buckets = u64_to_pointer (h->sh->buckets_as_u64);
182 h->nbuckets = h->sh->nbuckets;
183 h->log2_nbuckets = max_log2 (h->nbuckets);
185 h->alloc_lock = u64_to_pointer (h->sh->alloc_lock_as_u64);
186 h->freelists = u64_to_pointer (h->sh->freelists_as_u64);
189 #endif /* BIHASH_32_64_SVM */
191 void BV (clib_bihash_set_kvp_format_fn) (BVT (clib_bihash) * h,
192 format_function_t * fmt_fn)
197 void BV (clib_bihash_free) (BVT (clib_bihash) * h)
199 vec_free (h->working_copies);
200 #if BIHASH_32_64_SVM == 0
201 vec_free (h->freelists);
204 (void) close (h->memfd);
206 clib_mem_vm_free ((void *) (uword) (alloc_arena (h)), alloc_arena_size (h));
207 memset (h, 0, sizeof (*h));
211 BVT (clib_bihash_value) *
212 BV (value_alloc) (BVT (clib_bihash) * h, u32 log2_pages)
214 BVT (clib_bihash_value) * rv = 0;
216 ASSERT (h->alloc_lock[0]);
219 ASSERT (log2_pages < vec_len (h->freelists));
222 if (log2_pages >= vec_len (h->freelists) || h->freelists[log2_pages] == 0)
224 vec_validate_init_empty (h->freelists, log2_pages, 0);
225 rv = BV (alloc_aligned) (h, (sizeof (*rv) * (1 << log2_pages)));
228 rv = (void *) (uword) h->freelists[log2_pages];
229 h->freelists[log2_pages] = rv->next_free_as_u64;
234 * Latest gcc complains that the length arg is zero
235 * if we replace (1<<log2_pages) with vec_len(rv).
238 memset (rv, 0xff, sizeof (*rv) * (1 << log2_pages));
243 BV (value_free) (BVT (clib_bihash) * h, BVT (clib_bihash_value) * v,
246 ASSERT (h->alloc_lock[0]);
248 ASSERT (vec_len (h->freelists) > log2_pages);
251 memset (v, 0xFE, sizeof (*v) * (1 << log2_pages));
253 v->next_free_as_u64 = (u64) h->freelists[log2_pages];
254 h->freelists[log2_pages] = (u64) (uword) v;
258 BV (make_working_copy) (BVT (clib_bihash) * h, BVT (clib_bihash_bucket) * b)
260 BVT (clib_bihash_value) * v;
261 BVT (clib_bihash_bucket) working_bucket __attribute__ ((aligned (8)));
262 BVT (clib_bihash_value) * working_copy;
263 u32 thread_index = os_get_thread_index ();
264 int log2_working_copy_length;
266 ASSERT (h->alloc_lock[0]);
268 if (thread_index >= vec_len (h->working_copies))
270 vec_validate (h->working_copies, thread_index);
271 vec_validate_init_empty (h->working_copy_lengths, thread_index, ~0);
275 * working_copies are per-cpu so that near-simultaneous
276 * updates from multiple threads will not result in sporadic, spurious
279 working_copy = h->working_copies[thread_index];
280 log2_working_copy_length = h->working_copy_lengths[thread_index];
282 h->saved_bucket.as_u64 = b->as_u64;
284 if (b->log2_pages > log2_working_copy_length)
287 * It's not worth the bookkeeping to free working copies
289 * clib_mem_free (working_copy);
291 working_copy = BV (alloc_aligned)
292 (h, sizeof (working_copy[0]) * (1 << b->log2_pages));
293 h->working_copy_lengths[thread_index] = b->log2_pages;
294 h->working_copies[thread_index] = working_copy;
297 v = BV (clib_bihash_get_value) (h, b->offset);
299 clib_memcpy (working_copy, v, sizeof (*v) * (1 << b->log2_pages));
300 working_bucket.as_u64 = b->as_u64;
301 working_bucket.offset = BV (clib_bihash_get_offset) (h, working_copy);
302 CLIB_MEMORY_BARRIER ();
303 b->as_u64 = working_bucket.as_u64;
304 h->working_copies[thread_index] = working_copy;
308 BVT (clib_bihash_value) *
309 BV (split_and_rehash)
310 (BVT (clib_bihash) * h,
311 BVT (clib_bihash_value) * old_values, u32 old_log2_pages,
314 BVT (clib_bihash_value) * new_values, *new_v;
315 int i, j, length_in_kvs;
317 ASSERT (h->alloc_lock[0]);
319 new_values = BV (value_alloc) (h, new_log2_pages);
320 length_in_kvs = (1 << old_log2_pages) * BIHASH_KVP_PER_PAGE;
322 for (i = 0; i < length_in_kvs; i++)
326 /* Entry not in use? Forget it */
327 if (BV (clib_bihash_is_free) (&(old_values->kvp[i])))
330 /* rehash the item onto its new home-page */
331 new_hash = BV (clib_bihash_hash) (&(old_values->kvp[i]));
332 new_hash >>= h->log2_nbuckets;
333 new_hash &= (1 << new_log2_pages) - 1;
334 new_v = &new_values[new_hash];
336 /* Across the new home-page */
337 for (j = 0; j < BIHASH_KVP_PER_PAGE; j++)
340 if (BV (clib_bihash_is_free) (&(new_v->kvp[j])))
342 clib_memcpy (&(new_v->kvp[j]), &(old_values->kvp[i]),
343 sizeof (new_v->kvp[j]));
347 /* Crap. Tell caller to try again */
348 BV (value_free) (h, new_values, new_log2_pages);
357 BVT (clib_bihash_value) *
358 BV (split_and_rehash_linear)
359 (BVT (clib_bihash) * h,
360 BVT (clib_bihash_value) * old_values, u32 old_log2_pages,
363 BVT (clib_bihash_value) * new_values;
364 int i, j, new_length, old_length;
366 ASSERT (h->alloc_lock[0]);
368 new_values = BV (value_alloc) (h, new_log2_pages);
369 new_length = (1 << new_log2_pages) * BIHASH_KVP_PER_PAGE;
370 old_length = (1 << old_log2_pages) * BIHASH_KVP_PER_PAGE;
373 /* Across the old value array */
374 for (i = 0; i < old_length; i++)
376 /* Find a free slot in the new linear scan bucket */
377 for (; j < new_length; j++)
379 /* Old value not in use? Forget it. */
380 if (BV (clib_bihash_is_free) (&(old_values->kvp[i])))
383 /* New value should never be in use */
384 if (BV (clib_bihash_is_free) (&(new_values->kvp[j])))
386 /* Copy the old value and move along */
387 clib_memcpy (&(new_values->kvp[j]), &(old_values->kvp[i]),
388 sizeof (new_values->kvp[j]));
393 /* This should never happen... */
394 clib_warning ("BUG: linear rehash failed!");
395 BV (value_free) (h, new_values, new_log2_pages);
403 static inline int BV (clib_bihash_add_del_inline)
404 (BVT (clib_bihash) * h, BVT (clib_bihash_kv) * add_v, int is_add,
405 int (*is_stale_cb) (BVT (clib_bihash_kv) *, void *), void *arg)
408 BVT (clib_bihash_bucket) * b, tmp_b;
409 BVT (clib_bihash_value) * v, *new_v, *save_new_v, *working_copy;
412 u32 new_log2_pages, old_log2_pages;
413 u32 thread_index = os_get_thread_index ();
414 int mark_bucket_linear;
417 hash = BV (clib_bihash_hash) (add_v);
419 bucket_index = hash & (h->nbuckets - 1);
420 b = &h->buckets[bucket_index];
422 hash >>= h->log2_nbuckets;
424 BV (clib_bihash_lock_bucket) (b);
426 /* First elt in the bucket? */
427 if (BV (clib_bihash_bucket_is_empty) (b))
431 BV (clib_bihash_unlock_bucket) (b);
435 BV (clib_bihash_alloc_lock) (h);
436 v = BV (value_alloc) (h, 0);
437 BV (clib_bihash_alloc_unlock) (h);
440 tmp_b.as_u64 = 0; /* clears bucket lock */
441 tmp_b.offset = BV (clib_bihash_get_offset) (h, v);
443 CLIB_MEMORY_BARRIER ();
445 b->as_u64 = tmp_b.as_u64;
446 BV (clib_bihash_unlock_bucket) (b);
450 /* WARNING: we're still looking at the live copy... */
451 limit = BIHASH_KVP_PER_PAGE;
452 v = BV (clib_bihash_get_value) (h, b->offset);
454 v += (b->linear_search == 0) ? hash & ((1 << b->log2_pages) - 1) : 0;
455 if (b->linear_search)
456 limit <<= b->log2_pages;
461 * Because reader threads are looking at live data,
462 * we have to be extra careful. Readers do NOT hold the
463 * bucket lock. We need to be SLOWER than a search, past the
464 * point where readers CHECK the bucket lock.
468 * For obvious (in hindsight) reasons, see if we're supposed to
469 * replace an existing key, then look for an empty slot.
471 for (i = 0; i < limit; i++)
473 if (!memcmp (&(v->kvp[i]), &add_v->key, sizeof (add_v->key)))
475 CLIB_MEMORY_BARRIER (); /* Add a delay */
476 clib_memcpy (&(v->kvp[i]), add_v, sizeof (*add_v));
477 BV (clib_bihash_unlock_bucket) (b);
482 * Look for an empty slot. If found, use it
484 for (i = 0; i < limit; i++)
486 if (BV (clib_bihash_is_free) (&(v->kvp[i])))
489 * Copy the value first, so that if a reader manages
490 * to match the new key, the value will be right...
492 clib_memcpy (&(v->kvp[i].value),
493 &add_v->value, sizeof (add_v->value));
494 CLIB_MEMORY_BARRIER (); /* Make sure the value has settled */
495 clib_memcpy (&(v->kvp[i]), &add_v->key, sizeof (add_v->key));
497 ASSERT (b->refcnt > 0);
498 BV (clib_bihash_unlock_bucket) (b);
502 /* look for stale data to overwrite */
505 for (i = 0; i < limit; i++)
507 if (is_stale_cb (&(v->kvp[i]), arg))
509 CLIB_MEMORY_BARRIER ();
510 clib_memcpy (&(v->kvp[i]), add_v, sizeof (*add_v));
511 BV (clib_bihash_unlock_bucket) (b);
516 /* Out of space in this bucket, split the bucket... */
518 else /* delete case */
520 for (i = 0; i < limit; i++)
522 /* Found the key? Kill it... */
523 if (!memcmp (&(v->kvp[i]), &add_v->key, sizeof (add_v->key)))
525 memset (&(v->kvp[i]), 0xff, sizeof (*(add_v)));
526 /* Is the bucket empty? */
527 if (PREDICT_TRUE (b->refcnt > 1))
530 BV (clib_bihash_unlock_bucket) (b);
533 else /* yes, free it */
535 /* Save old bucket value, need log2_pages to free it */
536 tmp_b.as_u64 = b->as_u64;
537 CLIB_MEMORY_BARRIER ();
539 /* Kill and unlock the bucket */
542 /* And free the backing storage */
543 BV (clib_bihash_alloc_lock) (h);
544 /* Note: v currently points into the middle of the bucket */
545 v = BV (clib_bihash_get_value) (h, tmp_b.offset);
546 BV (value_free) (h, v, tmp_b.log2_pages);
547 BV (clib_bihash_alloc_unlock) (h);
553 BV (clib_bihash_unlock_bucket) (b);
557 /* Move readers to a (locked) temp copy of the bucket */
558 BV (clib_bihash_alloc_lock) (h);
559 BV (make_working_copy) (h, b);
561 v = BV (clib_bihash_get_value) (h, h->saved_bucket.offset);
563 old_log2_pages = h->saved_bucket.log2_pages;
564 new_log2_pages = old_log2_pages + 1;
565 mark_bucket_linear = 0;
567 working_copy = h->working_copies[thread_index];
570 new_v = BV (split_and_rehash) (h, working_copy, old_log2_pages,
577 /* Try re-splitting. If that fails, fall back to linear search */
578 new_v = BV (split_and_rehash) (h, working_copy, old_log2_pages,
584 /* pinned collisions, use linear search */
586 BV (split_and_rehash_linear) (h, working_copy, old_log2_pages,
588 mark_bucket_linear = 1;
592 /* Try to add the new entry */
594 new_hash = BV (clib_bihash_hash) (add_v);
595 limit = BIHASH_KVP_PER_PAGE;
596 if (mark_bucket_linear)
597 limit <<= new_log2_pages;
598 new_hash >>= h->log2_nbuckets;
599 new_hash &= (1 << new_log2_pages) - 1;
600 new_v += mark_bucket_linear ? 0 : new_hash;
602 for (i = 0; i < limit; i++)
604 if (BV (clib_bihash_is_free) (&(new_v->kvp[i])))
606 clib_memcpy (&(new_v->kvp[i]), add_v, sizeof (*add_v));
611 /* Crap. Try again */
612 BV (value_free) (h, save_new_v, new_log2_pages);
614 * If we've already doubled the size of the bucket once,
615 * fall back to linear search now.
623 tmp_b.log2_pages = new_log2_pages;
624 tmp_b.offset = BV (clib_bihash_get_offset) (h, save_new_v);
625 tmp_b.linear_search = mark_bucket_linear;
626 tmp_b.refcnt = h->saved_bucket.refcnt + 1;
627 ASSERT (tmp_b.refcnt > 0);
629 CLIB_MEMORY_BARRIER ();
630 b->as_u64 = tmp_b.as_u64;
631 BV (clib_bihash_alloc_unlock) (h);
635 int BV (clib_bihash_add_del)
636 (BVT (clib_bihash) * h, BVT (clib_bihash_kv) * add_v, int is_add)
638 return BV (clib_bihash_add_del_inline) (h, add_v, is_add, 0, 0);
641 int BV (clib_bihash_add_or_overwrite_stale)
642 (BVT (clib_bihash) * h, BVT (clib_bihash_kv) * add_v,
643 int (*stale_callback) (BVT (clib_bihash_kv) *, void *), void *arg)
645 return BV (clib_bihash_add_del_inline) (h, add_v, 1, stale_callback, arg);
648 int BV (clib_bihash_search)
649 (BVT (clib_bihash) * h,
650 BVT (clib_bihash_kv) * search_key, BVT (clib_bihash_kv) * valuep)
654 BVT (clib_bihash_value) * v;
655 BVT (clib_bihash_bucket) * b;
660 hash = BV (clib_bihash_hash) (search_key);
662 bucket_index = hash & (h->nbuckets - 1);
663 b = &h->buckets[bucket_index];
665 if (BV (clib_bihash_bucket_is_empty) (b))
668 if (PREDICT_FALSE (b->lock))
670 volatile BVT (clib_bihash_bucket) * bv = b;
675 hash >>= h->log2_nbuckets;
677 v = BV (clib_bihash_get_value) (h, b->offset);
678 limit = BIHASH_KVP_PER_PAGE;
679 v += (b->linear_search == 0) ? hash & ((1 << b->log2_pages) - 1) : 0;
680 if (PREDICT_FALSE (b->linear_search))
681 limit <<= b->log2_pages;
683 for (i = 0; i < limit; i++)
685 if (BV (clib_bihash_key_compare) (v->kvp[i].key, search_key->key))
694 u8 *BV (format_bihash) (u8 * s, va_list * args)
696 BVT (clib_bihash) * h = va_arg (*args, BVT (clib_bihash) *);
697 int verbose = va_arg (*args, int);
698 BVT (clib_bihash_bucket) * b;
699 BVT (clib_bihash_value) * v;
701 u64 active_elements = 0;
702 u64 active_buckets = 0;
703 u64 linear_buckets = 0;
706 s = format (s, "Hash table %s\n", h->name ? h->name : (u8 *) "(unnamed)");
708 for (i = 0; i < h->nbuckets; i++)
711 if (BV (clib_bihash_bucket_is_empty) (b))
714 s = format (s, "[%d]: empty\n", i);
720 if (b->linear_search)
725 s = format (s, "[%d]: heap offset %lld, len %d, linear %d\n", i,
726 b->offset, (1 << b->log2_pages), b->linear_search);
729 v = BV (clib_bihash_get_value) (h, b->offset);
730 for (j = 0; j < (1 << b->log2_pages); j++)
732 for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
734 if (BV (clib_bihash_is_free) (&v->kvp[k]))
737 s = format (s, " %d: empty\n",
738 j * BIHASH_KVP_PER_PAGE + k);
745 s = format (s, " %d: %U\n",
746 j * BIHASH_KVP_PER_PAGE + k,
747 h->fmt_fn, &(v->kvp[k]));
751 s = format (s, " %d: %U\n",
752 j * BIHASH_KVP_PER_PAGE + k,
753 BV (format_bihash_kvp), &(v->kvp[k]));
762 s = format (s, " %lld active elements %lld active buckets\n",
763 active_elements, active_buckets);
764 s = format (s, " %d free lists\n", vec_len (h->freelists));
766 for (i = 0; i < vec_len (h->freelists); i++)
769 BVT (clib_bihash_value) * free_elt;
771 free_elt = (void *) (uword) h->freelists[i];
775 free_elt = (void *) (uword) free_elt->next_free_as_u64;
778 if (nfree || verbose)
779 s = format (s, " [len %d] %u free elts\n", 1 << i, nfree);
782 s = format (s, " %lld linear search buckets\n", linear_buckets);
783 used_bytes = alloc_arena_next (h) - alloc_arena (h);
785 " arena: base %llx, next %llx\n"
786 " used %lld b (%lld Mbytes) of %lld b (%lld Mbytes)\n",
787 alloc_arena (h), alloc_arena_next (h),
788 used_bytes, used_bytes >> 20,
789 alloc_arena_size (h), alloc_arena_size (h) >> 20);
793 void BV (clib_bihash_foreach_key_value_pair)
794 (BVT (clib_bihash) * h, void *callback, void *arg)
797 BVT (clib_bihash_bucket) * b;
798 BVT (clib_bihash_value) * v;
799 void (*fp) (BVT (clib_bihash_kv) *, void *) = callback;
801 for (i = 0; i < h->nbuckets; i++)
804 if (BV (clib_bihash_bucket_is_empty) (b))
807 v = BV (clib_bihash_get_value) (h, b->offset);
808 for (j = 0; j < (1 << b->log2_pages); j++)
810 for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
812 if (BV (clib_bihash_is_free) (&v->kvp[k]))
815 (*fp) (&v->kvp[k], arg);
817 * In case the callback deletes the last entry in the bucket...
819 if (BV (clib_bihash_bucket_is_empty) (b))
832 * fd.io coding-style-patch-verification: ON
835 * eval: (c-set-style "gnu")