2 * Copyright (c) 2015 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
15 #include <vppinfra/bitmap.h>
16 #include <vppinfra/hash.h>
17 #include <vppinfra/pool.h>
18 #include <vppinfra/timing_wheel.h>
21 timing_wheel_init (timing_wheel_t * w, u64 current_cpu_time,
22 f64 cpu_clocks_per_second)
24 if (w->max_sched_time <= w->min_sched_time)
26 w->min_sched_time = 1e-6;
27 w->max_sched_time = 1e-3;
30 w->cpu_clocks_per_second = cpu_clocks_per_second;
31 w->log2_clocks_per_bin =
32 max_log2 (w->cpu_clocks_per_second * w->min_sched_time);
33 w->log2_bins_per_wheel =
34 max_log2 (w->cpu_clocks_per_second * w->max_sched_time);
35 w->log2_bins_per_wheel -= w->log2_clocks_per_bin;
36 w->log2_clocks_per_wheel = w->log2_bins_per_wheel + w->log2_clocks_per_bin;
37 w->bins_per_wheel = 1 << w->log2_bins_per_wheel;
38 w->bins_per_wheel_mask = w->bins_per_wheel - 1;
40 w->current_time_index = current_cpu_time >> w->log2_clocks_per_bin;
42 if (w->n_wheel_elt_time_bits <= 0 ||
43 w->n_wheel_elt_time_bits >= STRUCT_BITS_OF (timing_wheel_elt_t,
44 cpu_time_relative_to_base))
45 w->n_wheel_elt_time_bits =
46 STRUCT_BITS_OF (timing_wheel_elt_t, cpu_time_relative_to_base) - 1;
48 w->cpu_time_base = current_cpu_time;
49 w->time_index_next_cpu_time_base_update
51 w->current_time_index +
52 ((u64) 1 << (w->n_wheel_elt_time_bits - w->log2_clocks_per_bin));
56 get_level_and_relative_time (timing_wheel_t * w, u64 cpu_time,
62 dt = (cpu_time >> w->log2_clocks_per_bin);
64 /* Time should always move forward. */
65 ASSERT (dt >= w->current_time_index);
67 dt -= w->current_time_index;
69 /* Find level and offset within level. Level i has bins of size 2^((i+1)*M) */
71 for (level_index = 0; (rtime >> w->log2_bins_per_wheel) != 0; level_index++)
72 rtime = (rtime >> w->log2_bins_per_wheel) - 1;
74 /* Return offset within level and level index. */
75 ASSERT (rtime < w->bins_per_wheel);
76 *rtime_result = rtime;
81 time_index_to_wheel_index (timing_wheel_t * w, uword level_index, u64 ti)
83 return (ti >> (level_index * w->log2_bins_per_wheel)) &
84 w->bins_per_wheel_mask;
87 /* Find current time on this level. */
89 current_time_wheel_index (timing_wheel_t * w, uword level_index)
91 return time_index_to_wheel_index (w, level_index, w->current_time_index);
94 /* Circular wheel indexing. */
96 wheel_add (timing_wheel_t * w, word x)
98 return x & w->bins_per_wheel_mask;
102 rtime_to_wheel_index (timing_wheel_t * w, uword level_index, uword rtime)
104 uword t = current_time_wheel_index (w, level_index);
105 return wheel_add (w, t + rtime);
108 static clib_error_t *
109 validate_level (timing_wheel_t * w, uword level_index, uword * n_elts)
111 timing_wheel_level_t *level;
112 timing_wheel_elt_t *e;
114 clib_error_t *error = 0;
118 error = CLIB_ERROR_ASSERT (x); \
120 if (error) return error; \
123 level = vec_elt_at_index (w->levels, level_index);
124 for (wi = 0; wi < vec_len (level->elts); wi++)
126 /* Validate occupancy bitmap. */
127 _(clib_bitmap_get_no_check (level->occupancy_bitmap, wi) ==
128 (vec_len (level->elts[wi]) > 0));
130 *n_elts += vec_len (level->elts[wi]);
132 vec_foreach (e, level->elts[wi])
134 /* Validate time bin and level. */
136 uword e_ti, e_li, e_wi;
138 e_time = e->cpu_time_relative_to_base + w->cpu_time_base;
139 e_li = get_level_and_relative_time (w, e_time, &e_ti);
140 e_wi = rtime_to_wheel_index (w, level_index, e_ti);
142 if (e_li == level_index - 1)
143 /* If this element was scheduled on the previous level
144 it must be wrapped. */
145 _(e_ti + current_time_wheel_index (w, level_index - 1)
146 >= w->bins_per_wheel);
149 _(e_li == level_index);
153 _(e_wi == wi || e_wi + 1 == wi || e_wi - 1 == wi);
164 timing_wheel_validate (timing_wheel_t * w)
167 clib_error_t *error = 0;
173 n_elts = pool_elts (w->overflow_pool);
174 for (l = 0; l < vec_len (w->levels); l++)
176 error = validate_level (w, l, &n_elts);
178 clib_error_report (error);
183 free_elt_vector (timing_wheel_t * w, timing_wheel_elt_t * ev)
185 /* Poison free elements so we never use them by mistake. */
187 clib_memset (ev, ~0, vec_len (ev) * sizeof (ev[0]));
189 vec_add1 (w->free_elt_vectors, ev);
192 static timing_wheel_elt_t *
193 insert_helper (timing_wheel_t * w, uword level_index, uword rtime)
195 timing_wheel_level_t *level;
196 timing_wheel_elt_t *e;
199 /* Circular buffer. */
200 vec_validate (w->levels, level_index);
201 level = vec_elt_at_index (w->levels, level_index);
203 if (PREDICT_FALSE (!level->elts))
205 uword max = w->bins_per_wheel - 1;
206 clib_bitmap_validate (level->occupancy_bitmap, max);
207 vec_validate (level->elts, max);
210 wheel_index = rtime_to_wheel_index (w, level_index, rtime);
212 level->occupancy_bitmap =
213 clib_bitmap_ori (level->occupancy_bitmap, wheel_index);
215 /* Allocate an elt vector from free list if there is one. */
216 if (!level->elts[wheel_index] && vec_len (w->free_elt_vectors))
217 level->elts[wheel_index] = vec_pop (w->free_elt_vectors);
219 /* Add element to vector for this time bin. */
220 vec_add2 (level->elts[wheel_index], e, 1);
225 /* Insert user data on wheel at given CPU time stamp. */
227 timing_wheel_insert_helper (timing_wheel_t * w, u64 insert_cpu_time,
230 timing_wheel_elt_t *e;
232 uword rtime, level_index;
234 level_index = get_level_and_relative_time (w, insert_cpu_time, &rtime);
236 dt = insert_cpu_time - w->cpu_time_base;
237 if (PREDICT_TRUE (0 == (dt >> BITS (e->cpu_time_relative_to_base))))
239 e = insert_helper (w, level_index, rtime);
240 e->user_data = user_data;
241 e->cpu_time_relative_to_base = dt;
242 if (insert_cpu_time < w->cached_min_cpu_time_on_wheel)
243 w->cached_min_cpu_time_on_wheel = insert_cpu_time;
247 /* Time too far in the future: add to overflow vector. */
248 timing_wheel_overflow_elt_t *oe;
249 pool_get (w->overflow_pool, oe);
250 oe->user_data = user_data;
251 oe->cpu_time = insert_cpu_time;
256 elt_is_deleted (timing_wheel_t * w, u32 user_data)
258 return (hash_elts (w->deleted_user_data_hash) > 0
259 && hash_get (w->deleted_user_data_hash, user_data));
262 static timing_wheel_elt_t *
263 delete_user_data (timing_wheel_elt_t * elts, u32 user_data)
266 timing_wheel_elt_t *e, *new_elts;
268 /* Quickly scan to see if there are any elements to delete
271 vec_foreach (e, elts)
273 found_match = e->user_data == user_data;
280 /* Re-scan to build vector of new elts with matching user_data deleted. */
282 vec_foreach (e, elts)
284 if (e->user_data != user_data)
285 vec_add1 (new_elts, e[0]);
292 /* Insert user data on wheel at given CPU time stamp. */
294 timing_wheel_insert (timing_wheel_t * w, u64 insert_cpu_time, u32 user_data)
296 /* Remove previously deleted elements. */
297 if (elt_is_deleted (w, user_data))
299 timing_wheel_level_t *l;
302 /* Delete elts with given user data so that stale events don't expire. */
303 vec_foreach (l, w->levels)
306 clib_bitmap_foreach (wi, l->occupancy_bitmap) {
307 l->elts[wi] = delete_user_data (l->elts[wi], user_data);
308 if (vec_len (l->elts[wi]) == 0)
309 l->occupancy_bitmap = clib_bitmap_andnoti (l->occupancy_bitmap, wi);
315 timing_wheel_overflow_elt_t *oe;
317 pool_foreach (oe, w->overflow_pool) {
318 if (oe->user_data == user_data)
319 pool_put (w->overflow_pool, oe);
324 hash_unset (w->deleted_user_data_hash, user_data);
327 timing_wheel_insert_helper (w, insert_cpu_time, user_data);
331 timing_wheel_delete (timing_wheel_t * w, u32 user_data)
333 if (!w->deleted_user_data_hash)
334 w->deleted_user_data_hash =
335 hash_create ( /* capacity */ 0, /* value bytes */ 0);
337 hash_set1 (w->deleted_user_data_hash, user_data);
340 /* Returns time of next expiring element. */
342 timing_wheel_next_expiring_elt_time (timing_wheel_t * w)
344 timing_wheel_level_t *l;
345 timing_wheel_elt_t *e;
353 vec_foreach (l, w->levels)
355 if (!l->occupancy_bitmap)
359 wi0 = wi = current_time_wheel_index (w, li);
363 if (clib_bitmap_get_no_check (l->occupancy_bitmap, wi))
365 vec_foreach (e, l->elts[wi])
366 min_dt = clib_min (min_dt, e->cpu_time_relative_to_base);
368 if (wrapped && li + 1 < vec_len (w->levels))
370 uword wi1 = current_time_wheel_index (w, li + 1);
371 if (l[1].occupancy_bitmap
372 && clib_bitmap_get_no_check (l[1].occupancy_bitmap, wi1))
374 vec_foreach (e, l[1].elts[wi1])
377 clib_min (min_dt, e->cpu_time_relative_to_base);
382 min_t = w->cpu_time_base + min_dt;
386 wi = wheel_add (w, wi + 1);
390 wrapped = wi != wi + 1;
395 timing_wheel_overflow_elt_t *oe;
398 min_t = w->cpu_time_base + min_dt;
401 pool_foreach (oe, w->overflow_pool)
402 { min_t = clib_min (min_t, oe->cpu_time); }
411 insert_elt (timing_wheel_t * w, timing_wheel_elt_t * e)
413 u64 t = w->cpu_time_base + e->cpu_time_relative_to_base;
414 timing_wheel_insert_helper (w, t, e->user_data);
418 elt_cpu_time (timing_wheel_t * w, timing_wheel_elt_t * e)
420 return w->cpu_time_base + e->cpu_time_relative_to_base;
424 validate_expired_elt (timing_wheel_t * w, timing_wheel_elt_t * e,
425 u64 current_cpu_time)
429 u64 e_time = elt_cpu_time (w, e);
431 /* Verify that element is actually expired. */
432 ASSERT ((e_time >> w->log2_clocks_per_bin)
433 <= (current_cpu_time >> w->log2_clocks_per_bin));
438 expire_bin (timing_wheel_t * w,
440 uword wheel_index, u64 advance_cpu_time, u32 * expired_user_data)
442 timing_wheel_level_t *level = vec_elt_at_index (w->levels, level_index);
443 timing_wheel_elt_t *e;
447 e = vec_elt (level->elts, wheel_index);
450 vec_add2 (expired_user_data, x, e_len);
451 for (i = j = 0; i < e_len; i++)
453 validate_expired_elt (w, &e[i], advance_cpu_time);
454 x[j] = e[i].user_data;
456 /* Only advance if elt is not to be deleted. */
457 j += !elt_is_deleted (w, e[i].user_data);
460 /* Adjust for deleted elts. */
462 _vec_len (expired_user_data) -= e_len - j;
464 free_elt_vector (w, e);
466 level->elts[wheel_index] = 0;
467 clib_bitmap_set_no_check (level->occupancy_bitmap, wheel_index, 0);
469 return expired_user_data;
472 /* Called rarely. 32 bit times should only overflow every 4 seconds or so on a fast machine. */
474 advance_cpu_time_base (timing_wheel_t * w, u32 * expired_user_data)
476 timing_wheel_level_t *l;
477 timing_wheel_elt_t *e;
480 w->stats.cpu_time_base_advances++;
481 delta = ((u64) 1 << w->n_wheel_elt_time_bits);
482 w->cpu_time_base += delta;
483 w->time_index_next_cpu_time_base_update += delta >> w->log2_clocks_per_bin;
485 vec_foreach (l, w->levels)
489 clib_bitmap_foreach (wi, l->occupancy_bitmap) {
490 vec_foreach (e, l->elts[wi])
492 /* This should always be true since otherwise we would have already expired
493 this element. Note that in the second half of this function we need
494 to take care not to place the expired elements ourselves. */
495 ASSERT (e->cpu_time_relative_to_base >= delta);
496 e->cpu_time_relative_to_base -= delta;
502 /* See which overflow elements fit now. */
504 timing_wheel_overflow_elt_t *oe;
506 pool_foreach (oe, w->overflow_pool) {
507 /* It fits now into 32 bits. */
508 if (0 == ((oe->cpu_time - w->cpu_time_base) >> BITS (e->cpu_time_relative_to_base)))
510 u64 ti = oe->cpu_time >> w->log2_clocks_per_bin;
511 if (ti <= w->current_time_index)
513 /* This can happen when timing wheel is not advanced for a long time
514 (for example when at a gdb breakpoint for a while). */
515 /* Note: the ti == w->current_time_index means it is also an expired timer */
516 if (! elt_is_deleted (w, oe->user_data))
517 vec_add1 (expired_user_data, oe->user_data);
520 timing_wheel_insert_helper (w, oe->cpu_time, oe->user_data);
521 pool_put (w->overflow_pool, oe);
526 return expired_user_data;
530 refill_level (timing_wheel_t * w,
532 u64 advance_cpu_time,
533 uword from_wheel_index,
534 uword to_wheel_index, u32 * expired_user_data)
536 timing_wheel_level_t *level;
537 timing_wheel_elt_t *to_insert = w->unexpired_elts_pending_insert;
538 u64 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
540 vec_validate (w->stats.refills, level_index);
541 w->stats.refills[level_index] += 1;
543 if (level_index + 1 >= vec_len (w->levels))
546 level = vec_elt_at_index (w->levels, level_index + 1);
547 if (!level->occupancy_bitmap)
552 timing_wheel_elt_t *e, *es;
554 if (clib_bitmap_get_no_check
555 (level->occupancy_bitmap, from_wheel_index))
557 es = level->elts[from_wheel_index];
558 level->elts[from_wheel_index] = 0;
559 clib_bitmap_set_no_check (level->occupancy_bitmap, from_wheel_index,
564 u64 e_time = elt_cpu_time (w, e);
565 u64 ti = e_time >> w->log2_clocks_per_bin;
566 if (ti <= advance_time_index)
568 validate_expired_elt (w, e, advance_cpu_time);
569 if (!elt_is_deleted (w, e->user_data))
570 vec_add1 (expired_user_data, e->user_data);
573 vec_add1 (to_insert, e[0]);
575 free_elt_vector (w, es);
578 if (from_wheel_index == to_wheel_index)
581 from_wheel_index = wheel_add (w, from_wheel_index + 1);
584 timing_wheel_validate (w);
586 w->unexpired_elts_pending_insert = to_insert;
587 return expired_user_data;
590 /* Advance wheel and return any expired user data in vector. */
592 timing_wheel_advance (timing_wheel_t * w, u64 advance_cpu_time,
593 u32 * expired_user_data,
594 u64 * next_expiring_element_cpu_time)
596 timing_wheel_level_t *level;
597 uword level_index, advance_rtime, advance_level_index, advance_wheel_index;
598 uword n_expired_user_data_before;
599 u64 current_time_index, advance_time_index;
601 n_expired_user_data_before = vec_len (expired_user_data);
603 /* Re-fill lower levels when time wraps. */
604 current_time_index = w->current_time_index;
605 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
608 u64 current_ti, advance_ti;
610 current_ti = current_time_index >> w->log2_bins_per_wheel;
611 advance_ti = advance_time_index >> w->log2_bins_per_wheel;
613 if (PREDICT_FALSE (current_ti != advance_ti))
615 if (w->unexpired_elts_pending_insert)
616 _vec_len (w->unexpired_elts_pending_insert) = 0;
619 while (current_ti != advance_ti)
622 c = current_ti & (w->bins_per_wheel - 1);
623 a = advance_ti & (w->bins_per_wheel - 1);
625 expired_user_data = refill_level (w,
628 c, a, expired_user_data);
629 current_ti >>= w->log2_bins_per_wheel;
630 advance_ti >>= w->log2_bins_per_wheel;
636 advance_level_index =
637 get_level_and_relative_time (w, advance_cpu_time, &advance_rtime);
638 advance_wheel_index =
639 rtime_to_wheel_index (w, advance_level_index, advance_rtime);
641 /* Empty all occupied bins for entire levels that we advance past. */
642 for (level_index = 0; level_index < advance_level_index; level_index++)
646 if (level_index >= vec_len (w->levels))
649 level = vec_elt_at_index (w->levels, level_index);
651 clib_bitmap_foreach (wi, level->occupancy_bitmap) {
652 expired_user_data = expire_bin (w, level_index, wi, advance_cpu_time,
658 if (PREDICT_TRUE (level_index < vec_len (w->levels)))
661 level = vec_elt_at_index (w->levels, level_index);
662 wi = current_time_wheel_index (w, level_index);
663 if (level->occupancy_bitmap)
666 if (clib_bitmap_get_no_check (level->occupancy_bitmap, wi))
668 expire_bin (w, advance_level_index, wi, advance_cpu_time,
671 /* When we jump out, we have already just expired the bin,
672 corresponding to advance_wheel_index */
673 if (wi == advance_wheel_index)
676 wi = wheel_add (w, wi + 1);
680 /* Advance current time index. */
681 w->current_time_index = advance_time_index;
683 if (vec_len (w->unexpired_elts_pending_insert) > 0)
685 timing_wheel_elt_t *e;
686 vec_foreach (e, w->unexpired_elts_pending_insert) insert_elt (w, e);
687 _vec_len (w->unexpired_elts_pending_insert) = 0;
690 /* Don't advance until necessary. */
691 /* However, if the timing_wheel_advance() hasn't been called for some time,
692 the while() loop will ensure multiple calls to advance_cpu_time_base()
693 in a row until the w->cpu_time_base is fresh enough. */
695 (advance_time_index >= w->time_index_next_cpu_time_base_update))
696 expired_user_data = advance_cpu_time_base (w, expired_user_data);
698 if (next_expiring_element_cpu_time)
702 /* Anything expired? If so we need to recompute next expiring elt time. */
703 if (vec_len (expired_user_data) == n_expired_user_data_before
704 && w->cached_min_cpu_time_on_wheel != 0ULL)
705 min_t = w->cached_min_cpu_time_on_wheel;
708 min_t = timing_wheel_next_expiring_elt_time (w);
709 w->cached_min_cpu_time_on_wheel = min_t;
712 *next_expiring_element_cpu_time = min_t;
715 return expired_user_data;
719 format_timing_wheel (u8 * s, va_list * va)
721 timing_wheel_t *w = va_arg (*va, timing_wheel_t *);
722 int verbose = va_arg (*va, int);
723 u32 indent = format_get_indent (s);
725 s = format (s, "level 0: %.4e - %.4e secs, 2^%d - 2^%d clocks",
726 (f64) (1 << w->log2_clocks_per_bin) / w->cpu_clocks_per_second,
727 (f64) (1 << w->log2_clocks_per_wheel) /
728 w->cpu_clocks_per_second, w->log2_clocks_per_bin,
729 w->log2_clocks_per_wheel);
735 s = format (s, "\n%Utime base advances %Ld, every %.4e secs",
736 format_white_space, indent + 2,
737 w->stats.cpu_time_base_advances,
738 (f64) ((u64) 1 << w->n_wheel_elt_time_bits) /
739 w->cpu_clocks_per_second);
741 for (l = 0; l < vec_len (w->levels); l++)
742 s = format (s, "\n%Ulevel %d: refills %Ld",
743 format_white_space, indent + 2,
746 vec_len (w->stats.refills) ? w->stats.
747 refills[l] : (u64) 0);
754 * fd.io coding-style-patch-verification: ON
757 * eval: (c-set-style "gnu")