2 * Copyright (c) 2015 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
15 #include <vppinfra/bitmap.h>
16 #include <vppinfra/hash.h>
17 #include <vppinfra/pool.h>
18 #include <vppinfra/timing_wheel.h>
21 timing_wheel_init (timing_wheel_t * w, u64 current_cpu_time, f64 cpu_clocks_per_second)
23 if (w->max_sched_time <= w->min_sched_time)
25 w->min_sched_time = 1e-6;
26 w->max_sched_time = 1e-3;
29 w->cpu_clocks_per_second = cpu_clocks_per_second;
30 w->log2_clocks_per_bin = max_log2 (w->cpu_clocks_per_second * w->min_sched_time);
31 w->log2_bins_per_wheel = max_log2 (w->cpu_clocks_per_second * w->max_sched_time);
32 w->log2_bins_per_wheel -= w->log2_clocks_per_bin;
33 w->log2_clocks_per_wheel = w->log2_bins_per_wheel + w->log2_clocks_per_bin;
34 w->bins_per_wheel = 1 << w->log2_bins_per_wheel;
35 w->bins_per_wheel_mask = w->bins_per_wheel - 1;
37 w->current_time_index = current_cpu_time >> w->log2_clocks_per_bin;
39 if (w->n_wheel_elt_time_bits <= 0 ||
40 w->n_wheel_elt_time_bits >= STRUCT_BITS_OF (timing_wheel_elt_t, cpu_time_relative_to_base))
41 w->n_wheel_elt_time_bits = STRUCT_BITS_OF (timing_wheel_elt_t, cpu_time_relative_to_base) - 1;
43 w->cpu_time_base = current_cpu_time;
44 w->time_index_next_cpu_time_base_update
45 = w->current_time_index + ((u64) 1 << (w->n_wheel_elt_time_bits - w->log2_clocks_per_bin));
49 get_level_and_relative_time (timing_wheel_t * w, u64 cpu_time, uword * rtime_result)
54 dt = (cpu_time >> w->log2_clocks_per_bin);
56 /* Time should always move forward. */
57 ASSERT (dt >= w->current_time_index);
59 dt -= w->current_time_index;
61 /* Find level and offset within level. Level i has bins of size 2^((i+1)*M) */
63 for (level_index = 0; (rtime >> w->log2_bins_per_wheel) != 0; level_index++)
64 rtime = (rtime >> w->log2_bins_per_wheel) - 1;
66 /* Return offset within level and level index. */
67 ASSERT (rtime < w->bins_per_wheel);
68 *rtime_result = rtime;
73 time_index_to_wheel_index (timing_wheel_t * w, uword level_index, u64 ti)
74 { return (ti >> (level_index * w->log2_bins_per_wheel)) & w->bins_per_wheel_mask; }
76 /* Find current time on this level. */
78 current_time_wheel_index (timing_wheel_t * w, uword level_index)
79 { return time_index_to_wheel_index (w, level_index, w->current_time_index); }
81 /* Circular wheel indexing. */
83 wheel_add (timing_wheel_t * w, word x)
84 { return x & w->bins_per_wheel_mask; }
87 rtime_to_wheel_index (timing_wheel_t * w, uword level_index, uword rtime)
89 uword t = current_time_wheel_index (w, level_index);
90 return wheel_add (w, t + rtime);
94 validate_level (timing_wheel_t * w, uword level_index, uword * n_elts)
96 timing_wheel_level_t * level;
97 timing_wheel_elt_t * e;
99 clib_error_t * error = 0;
103 error = CLIB_ERROR_ASSERT (x); \
105 if (error) return error; \
108 level = vec_elt_at_index (w->levels, level_index);
109 for (wi = 0; wi < vec_len (level->elts); wi++)
111 /* Validate occupancy bitmap. */
112 _ (clib_bitmap_get_no_check (level->occupancy_bitmap, wi) == (vec_len (level->elts[wi]) > 0));
114 *n_elts += vec_len (level->elts[wi]);
116 vec_foreach (e, level->elts[wi])
118 /* Validate time bin and level. */
120 uword e_ti, e_li, e_wi;
122 e_time = e->cpu_time_relative_to_base + w->cpu_time_base;
123 e_li = get_level_and_relative_time (w, e_time, &e_ti);
124 e_wi = rtime_to_wheel_index (w, level_index, e_ti);
126 if (e_li == level_index - 1)
127 /* If this element was scheduled on the previous level
128 it must be wrapped. */
129 _ (e_ti + current_time_wheel_index (w, level_index - 1)
130 >= w->bins_per_wheel);
133 _ (e_li == level_index);
137 _ (e_wi == wi || e_wi + 1 == wi || e_wi - 1 == wi);
147 void timing_wheel_validate (timing_wheel_t * w)
150 clib_error_t * error = 0;
156 n_elts = pool_elts (w->overflow_pool);
157 for (l = 0; l < vec_len (w->levels); l++)
159 error = validate_level (w, l, &n_elts);
161 clib_error_report (error);
166 free_elt_vector (timing_wheel_t * w, timing_wheel_elt_t * ev)
168 /* Poison free elements so we never use them by mistake. */
170 memset (ev, ~0, vec_len (ev) * sizeof (ev[0]));
172 vec_add1 (w->free_elt_vectors, ev);
175 static timing_wheel_elt_t *
176 insert_helper (timing_wheel_t * w,
180 timing_wheel_level_t * level;
181 timing_wheel_elt_t * e;
184 /* Circular buffer. */
185 vec_validate (w->levels, level_index);
186 level = vec_elt_at_index (w->levels, level_index);
188 if (PREDICT_FALSE (! level->elts))
190 uword max = w->bins_per_wheel - 1;
191 clib_bitmap_validate (level->occupancy_bitmap, max);
192 vec_validate (level->elts, max);
195 wheel_index = rtime_to_wheel_index (w, level_index, rtime);
197 level->occupancy_bitmap = clib_bitmap_ori (level->occupancy_bitmap, wheel_index);
199 /* Allocate an elt vector from free list if there is one. */
200 if (! level->elts[wheel_index] && vec_len (w->free_elt_vectors))
201 level->elts[wheel_index] = vec_pop (w->free_elt_vectors);
203 /* Add element to vector for this time bin. */
204 vec_add2 (level->elts[wheel_index], e, 1);
209 /* Insert user data on wheel at given CPU time stamp. */
210 static void timing_wheel_insert_helper (timing_wheel_t * w, u64 insert_cpu_time, u32 user_data)
212 timing_wheel_elt_t * e;
214 uword rtime, level_index;
216 level_index = get_level_and_relative_time (w, insert_cpu_time, &rtime);
218 dt = insert_cpu_time - w->cpu_time_base;
219 if (PREDICT_TRUE (0 == (dt >> BITS (e->cpu_time_relative_to_base))))
221 e = insert_helper (w, level_index, rtime);
222 e->user_data = user_data;
223 e->cpu_time_relative_to_base = dt;
227 /* Time too far in the future: add to overflow vector. */
228 timing_wheel_overflow_elt_t * oe;
229 pool_get (w->overflow_pool, oe);
230 oe->user_data = user_data;
231 oe->cpu_time = insert_cpu_time;
236 elt_is_deleted (timing_wheel_t * w, u32 user_data)
238 return (hash_elts (w->deleted_user_data_hash) > 0
239 && hash_get (w->deleted_user_data_hash, user_data));
242 static timing_wheel_elt_t *
243 delete_user_data (timing_wheel_elt_t * elts, u32 user_data)
246 timing_wheel_elt_t * e, * new_elts;
248 /* Quickly scan to see if there are any elements to delete
251 vec_foreach (e, elts)
253 found_match = e->user_data == user_data;
260 /* Re-scan to build vector of new elts with matching user_data deleted. */
262 vec_foreach (e, elts)
264 if (e->user_data != user_data)
265 vec_add1 (new_elts, e[0]);
272 /* Insert user data on wheel at given CPU time stamp. */
273 void timing_wheel_insert (timing_wheel_t * w, u64 insert_cpu_time, u32 user_data)
275 /* Remove previously deleted elements. */
276 if (elt_is_deleted (w, user_data))
278 timing_wheel_level_t * l;
281 /* Delete elts with given user data so that stale events don't expire. */
282 vec_foreach (l, w->levels)
284 clib_bitmap_foreach (wi, l->occupancy_bitmap, ({
285 l->elts[wi] = delete_user_data (l->elts[wi], user_data);
286 if (vec_len (l->elts[wi]) == 0)
287 l->occupancy_bitmap = clib_bitmap_andnoti (l->occupancy_bitmap, wi);
292 timing_wheel_overflow_elt_t * oe;
293 pool_foreach (oe, w->overflow_pool, ({
294 if (oe->user_data == user_data)
295 pool_put (w->overflow_pool, oe);
299 hash_unset (w->deleted_user_data_hash, user_data);
302 timing_wheel_insert_helper (w, insert_cpu_time, user_data);
305 void timing_wheel_delete (timing_wheel_t * w, u32 user_data)
307 if (! w->deleted_user_data_hash)
308 w->deleted_user_data_hash = hash_create (/* capacity */ 0, /* value bytes */ 0);
310 hash_set1 (w->deleted_user_data_hash, user_data);
313 /* Returns time of next expiring element. */
314 u64 timing_wheel_next_expiring_elt_time (timing_wheel_t * w)
316 timing_wheel_level_t * l;
317 timing_wheel_elt_t * e;
325 vec_foreach (l, w->levels)
327 if (! l->occupancy_bitmap)
331 wi0 = wi = current_time_wheel_index (w, li);
335 if (clib_bitmap_get_no_check (l->occupancy_bitmap, wi))
337 vec_foreach (e, l->elts[wi])
338 min_dt = clib_min (min_dt, e->cpu_time_relative_to_base);
340 if (wrapped && li + 1 < vec_len (w->levels))
342 uword wi1 = current_time_wheel_index (w, li + 1);
343 if (l[1].occupancy_bitmap && clib_bitmap_get_no_check (l[1].occupancy_bitmap, wi1))
345 vec_foreach (e, l[1].elts[wi1]) {
346 min_dt = clib_min (min_dt, e->cpu_time_relative_to_base);
351 min_t = w->cpu_time_base + min_dt;
355 wi = wheel_add (w, wi + 1);
359 wrapped = wi != wi + 1;
364 timing_wheel_overflow_elt_t * oe;
367 min_t = w->cpu_time_base + min_dt;
369 pool_foreach (oe, w->overflow_pool,
370 ({ min_t = clib_min (min_t, oe->cpu_time); }));
378 insert_elt (timing_wheel_t * w, timing_wheel_elt_t * e)
380 u64 t = w->cpu_time_base + e->cpu_time_relative_to_base;
381 timing_wheel_insert_helper (w, t, e->user_data);
385 elt_cpu_time (timing_wheel_t * w, timing_wheel_elt_t * e)
386 { return w->cpu_time_base + e->cpu_time_relative_to_base; }
389 validate_expired_elt (timing_wheel_t * w, timing_wheel_elt_t * e,
390 u64 current_cpu_time)
394 u64 e_time = elt_cpu_time (w, e);
396 /* Verify that element is actually expired. */
397 ASSERT ((e_time >> w->log2_clocks_per_bin)
398 <= (current_cpu_time >> w->log2_clocks_per_bin));
403 expire_bin (timing_wheel_t * w,
406 u64 advance_cpu_time,
407 u32 * expired_user_data)
409 timing_wheel_level_t * level = vec_elt_at_index (w->levels, level_index);
410 timing_wheel_elt_t * e;
414 e = vec_elt (level->elts, wheel_index);
417 vec_add2 (expired_user_data, x, e_len);
418 for (i = j = 0; i < e_len; i++)
420 validate_expired_elt (w, &e[i], advance_cpu_time);
421 x[j] = e[i].user_data;
423 /* Only advance if elt is not to be deleted. */
424 j += ! elt_is_deleted (w, e[i].user_data);
427 /* Adjust for deleted elts. */
429 _vec_len (expired_user_data) -= e_len - j;
431 free_elt_vector (w, e);
433 level->elts[wheel_index] = 0;
434 clib_bitmap_set_no_check (level->occupancy_bitmap, wheel_index, 0);
436 return expired_user_data;
439 /* Called rarely. 32 bit times should only overflow every 4 seconds or so on a fast machine. */
441 advance_cpu_time_base (timing_wheel_t * w, u32 * expired_user_data)
443 timing_wheel_level_t * l;
444 timing_wheel_elt_t * e;
447 w->stats.cpu_time_base_advances++;
448 delta = ((u64) 1 << w->n_wheel_elt_time_bits);
449 w->cpu_time_base += delta;
450 w->time_index_next_cpu_time_base_update += delta >> w->log2_clocks_per_bin;
452 vec_foreach (l, w->levels)
455 clib_bitmap_foreach (wi, l->occupancy_bitmap, ({
456 vec_foreach (e, l->elts[wi])
458 /* This should always be true since otherwise we would have already expired
460 ASSERT (e->cpu_time_relative_to_base >= delta);
461 e->cpu_time_relative_to_base -= delta;
466 /* See which overflow elements fit now. */
468 timing_wheel_overflow_elt_t * oe;
469 pool_foreach (oe, w->overflow_pool, ({
470 /* It fits now into 32 bits. */
471 if (0 == ((oe->cpu_time - w->cpu_time_base) >> BITS (e->cpu_time_relative_to_base)))
473 u64 ti = oe->cpu_time >> w->log2_clocks_per_bin;
474 if (ti < w->current_time_index)
476 /* This can happen when timing wheel is not advanced for a long time
477 (for example when at a gdb breakpoint for a while). */
478 if (! elt_is_deleted (w, oe->user_data))
479 vec_add1 (expired_user_data, oe->user_data);
482 timing_wheel_insert_helper (w, oe->cpu_time, oe->user_data);
483 pool_put (w->overflow_pool, oe);
490 refill_level (timing_wheel_t * w,
492 u64 advance_cpu_time,
493 uword from_wheel_index,
494 uword to_wheel_index,
495 u32 * expired_user_data)
497 timing_wheel_level_t * level;
498 timing_wheel_elt_t * to_insert = w->unexpired_elts_pending_insert;
499 u64 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
501 vec_validate (w->stats.refills, level_index);
502 w->stats.refills[level_index] += 1;
504 if (level_index + 1 >= vec_len (w->levels))
507 level = vec_elt_at_index (w->levels, level_index + 1);
508 if (! level->occupancy_bitmap)
513 timing_wheel_elt_t * e, * es;
515 if (clib_bitmap_get_no_check (level->occupancy_bitmap, from_wheel_index))
517 es = level->elts[from_wheel_index];
518 level->elts[from_wheel_index] = 0;
519 clib_bitmap_set_no_check (level->occupancy_bitmap, from_wheel_index, 0);
523 u64 e_time = elt_cpu_time (w, e);
524 u64 ti = e_time >> w->log2_clocks_per_bin;
525 if (ti <= advance_time_index)
527 validate_expired_elt (w, e, advance_cpu_time);
528 if (! elt_is_deleted (w, e->user_data))
529 vec_add1 (expired_user_data, e->user_data);
532 vec_add1 (to_insert, e[0]);
534 free_elt_vector (w, es);
537 if (from_wheel_index == to_wheel_index)
540 from_wheel_index = wheel_add (w, from_wheel_index + 1);
543 timing_wheel_validate (w);
545 w->unexpired_elts_pending_insert = to_insert;
546 return expired_user_data;
549 /* Advance wheel and return any expired user data in vector. */
551 timing_wheel_advance (timing_wheel_t * w, u64 advance_cpu_time, u32 * expired_user_data,
552 u64 * next_expiring_element_cpu_time)
554 timing_wheel_level_t * level;
555 uword level_index, advance_rtime, advance_level_index, advance_wheel_index;
556 uword n_expired_user_data_before;
557 u64 current_time_index, advance_time_index;
559 n_expired_user_data_before = vec_len (expired_user_data);
561 /* Re-fill lower levels when time wraps. */
562 current_time_index = w->current_time_index;
563 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
566 u64 current_ti, advance_ti;
568 current_ti = current_time_index >> w->log2_bins_per_wheel;
569 advance_ti = advance_time_index >> w->log2_bins_per_wheel;
571 if (PREDICT_FALSE (current_ti != advance_ti))
573 if (w->unexpired_elts_pending_insert)
574 _vec_len (w->unexpired_elts_pending_insert) = 0;
577 while (current_ti != advance_ti)
580 c = current_ti & (w->bins_per_wheel - 1);
581 a = advance_ti & (w->bins_per_wheel - 1);
583 expired_user_data = refill_level (w,
588 current_ti >>= w->log2_bins_per_wheel;
589 advance_ti >>= w->log2_bins_per_wheel;
595 advance_level_index = get_level_and_relative_time (w, advance_cpu_time, &advance_rtime);
596 advance_wheel_index = rtime_to_wheel_index (w, advance_level_index, advance_rtime);
598 /* Empty all occupied bins for entire levels that we advance past. */
599 for (level_index = 0; level_index < advance_level_index; level_index++)
603 if (level_index >= vec_len (w->levels))
606 level = vec_elt_at_index (w->levels, level_index);
607 clib_bitmap_foreach (wi, level->occupancy_bitmap, ({
608 expired_user_data = expire_bin (w, level_index, wi, advance_cpu_time,
613 if (PREDICT_TRUE (level_index < vec_len (w->levels)))
616 level = vec_elt_at_index (w->levels, level_index);
617 wi = current_time_wheel_index (w, level_index);
618 if (level->occupancy_bitmap)
621 if (clib_bitmap_get_no_check (level->occupancy_bitmap, wi))
622 expired_user_data = expire_bin (w, advance_level_index, wi, advance_cpu_time,
625 if (wi == advance_wheel_index)
628 wi = wheel_add (w, wi + 1);
632 /* Advance current time index. */
633 w->current_time_index = advance_time_index;
635 if (vec_len (w->unexpired_elts_pending_insert) > 0)
637 timing_wheel_elt_t * e;
638 vec_foreach (e, w->unexpired_elts_pending_insert)
640 _vec_len (w->unexpired_elts_pending_insert) = 0;
643 /* Don't advance until necessary. */
644 while (PREDICT_FALSE (advance_time_index >= w->time_index_next_cpu_time_base_update))
645 advance_cpu_time_base (w, expired_user_data);
647 if (next_expiring_element_cpu_time)
651 /* Anything expired? If so we need to recompute next expiring elt time. */
652 if (vec_len (expired_user_data) == n_expired_user_data_before
653 && w->cached_min_cpu_time_on_wheel != 0ULL)
654 min_t = w->cached_min_cpu_time_on_wheel;
657 min_t = timing_wheel_next_expiring_elt_time (w);
658 w->cached_min_cpu_time_on_wheel = min_t;
661 *next_expiring_element_cpu_time = min_t;
664 return expired_user_data;
667 u8 * format_timing_wheel (u8 * s, va_list * va)
669 timing_wheel_t * w = va_arg (*va, timing_wheel_t *);
670 int verbose = va_arg (*va, int);
671 uword indent = format_get_indent (s);
673 s = format (s, "level 0: %.4e - %.4e secs, 2^%d - 2^%d clocks",
674 (f64) (1 << w->log2_clocks_per_bin) / w->cpu_clocks_per_second,
675 (f64) (1 << w->log2_clocks_per_wheel) / w->cpu_clocks_per_second,
676 w->log2_clocks_per_bin, w->log2_clocks_per_wheel);
682 s = format (s, "\n%Utime base advances %Ld, every %.4e secs",
683 format_white_space, indent + 2,
684 w->stats.cpu_time_base_advances,
685 (f64) ((u64) 1 << w->n_wheel_elt_time_bits) / w->cpu_clocks_per_second);
687 for (l = 0; l < vec_len (w->levels); l++)
688 s = format (s, "\n%Ulevel %d: refills %Ld",
689 format_white_space, indent + 2,
690 l, l < vec_len (w->stats.refills) ? w->stats.refills[l] : (u64) 0);