2 * Copyright (c) 2015 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
15 #include <vppinfra/bitmap.h>
16 #include <vppinfra/hash.h>
17 #include <vppinfra/pool.h>
18 #include <vppinfra/timing_wheel.h>
21 timing_wheel_init (timing_wheel_t * w, u64 current_cpu_time,
22 f64 cpu_clocks_per_second)
24 if (w->max_sched_time <= w->min_sched_time)
26 w->min_sched_time = 1e-6;
27 w->max_sched_time = 1e-3;
30 w->cpu_clocks_per_second = cpu_clocks_per_second;
31 w->log2_clocks_per_bin =
32 max_log2 (w->cpu_clocks_per_second * w->min_sched_time);
33 w->log2_bins_per_wheel =
34 max_log2 (w->cpu_clocks_per_second * w->max_sched_time);
35 w->log2_bins_per_wheel -= w->log2_clocks_per_bin;
36 w->log2_clocks_per_wheel = w->log2_bins_per_wheel + w->log2_clocks_per_bin;
37 w->bins_per_wheel = 1 << w->log2_bins_per_wheel;
38 w->bins_per_wheel_mask = w->bins_per_wheel - 1;
40 w->current_time_index = current_cpu_time >> w->log2_clocks_per_bin;
42 if (w->n_wheel_elt_time_bits <= 0 ||
43 w->n_wheel_elt_time_bits >= STRUCT_BITS_OF (timing_wheel_elt_t,
44 cpu_time_relative_to_base))
45 w->n_wheel_elt_time_bits =
46 STRUCT_BITS_OF (timing_wheel_elt_t, cpu_time_relative_to_base) - 1;
48 w->cpu_time_base = current_cpu_time;
49 w->time_index_next_cpu_time_base_update
51 w->current_time_index +
52 ((u64) 1 << (w->n_wheel_elt_time_bits - w->log2_clocks_per_bin));
56 get_level_and_relative_time (timing_wheel_t * w, u64 cpu_time,
62 dt = (cpu_time >> w->log2_clocks_per_bin);
64 /* Time should always move forward. */
65 ASSERT (dt >= w->current_time_index);
67 dt -= w->current_time_index;
69 /* Find level and offset within level. Level i has bins of size 2^((i+1)*M) */
71 for (level_index = 0; (rtime >> w->log2_bins_per_wheel) != 0; level_index++)
72 rtime = (rtime >> w->log2_bins_per_wheel) - 1;
74 /* Return offset within level and level index. */
75 ASSERT (rtime < w->bins_per_wheel);
76 *rtime_result = rtime;
81 time_index_to_wheel_index (timing_wheel_t * w, uword level_index, u64 ti)
83 return (ti >> (level_index * w->log2_bins_per_wheel)) &
84 w->bins_per_wheel_mask;
87 /* Find current time on this level. */
89 current_time_wheel_index (timing_wheel_t * w, uword level_index)
91 return time_index_to_wheel_index (w, level_index, w->current_time_index);
94 /* Circular wheel indexing. */
96 wheel_add (timing_wheel_t * w, word x)
98 return x & w->bins_per_wheel_mask;
102 rtime_to_wheel_index (timing_wheel_t * w, uword level_index, uword rtime)
104 uword t = current_time_wheel_index (w, level_index);
105 return wheel_add (w, t + rtime);
108 static clib_error_t *
109 validate_level (timing_wheel_t * w, uword level_index, uword * n_elts)
111 timing_wheel_level_t *level;
112 timing_wheel_elt_t *e;
114 clib_error_t *error = 0;
118 error = CLIB_ERROR_ASSERT (x); \
120 if (error) return error; \
123 level = vec_elt_at_index (w->levels, level_index);
124 for (wi = 0; wi < vec_len (level->elts); wi++)
126 /* Validate occupancy bitmap. */
127 _(clib_bitmap_get_no_check (level->occupancy_bitmap, wi) ==
128 (vec_len (level->elts[wi]) > 0));
130 *n_elts += vec_len (level->elts[wi]);
132 vec_foreach (e, level->elts[wi])
134 /* Validate time bin and level. */
136 uword e_ti, e_li, e_wi;
138 e_time = e->cpu_time_relative_to_base + w->cpu_time_base;
139 e_li = get_level_and_relative_time (w, e_time, &e_ti);
140 e_wi = rtime_to_wheel_index (w, level_index, e_ti);
142 if (e_li == level_index - 1)
143 /* If this element was scheduled on the previous level
144 it must be wrapped. */
145 _(e_ti + current_time_wheel_index (w, level_index - 1)
146 >= w->bins_per_wheel);
149 _(e_li == level_index);
153 _(e_wi == wi || e_wi + 1 == wi || e_wi - 1 == wi);
164 timing_wheel_validate (timing_wheel_t * w)
167 clib_error_t *error = 0;
173 n_elts = pool_elts (w->overflow_pool);
174 for (l = 0; l < vec_len (w->levels); l++)
176 error = validate_level (w, l, &n_elts);
178 clib_error_report (error);
183 free_elt_vector (timing_wheel_t * w, timing_wheel_elt_t * ev)
185 /* Poison free elements so we never use them by mistake. */
187 clib_memset (ev, ~0, vec_len (ev) * sizeof (ev[0]));
189 vec_add1 (w->free_elt_vectors, ev);
192 static timing_wheel_elt_t *
193 insert_helper (timing_wheel_t * w, uword level_index, uword rtime)
195 timing_wheel_level_t *level;
196 timing_wheel_elt_t *e;
199 /* Circular buffer. */
200 vec_validate (w->levels, level_index);
201 level = vec_elt_at_index (w->levels, level_index);
203 if (PREDICT_FALSE (!level->elts))
205 uword max = w->bins_per_wheel - 1;
206 clib_bitmap_validate (level->occupancy_bitmap, max);
207 vec_validate (level->elts, max);
210 wheel_index = rtime_to_wheel_index (w, level_index, rtime);
212 level->occupancy_bitmap =
213 clib_bitmap_ori (level->occupancy_bitmap, wheel_index);
215 /* Allocate an elt vector from free list if there is one. */
216 if (!level->elts[wheel_index] && vec_len (w->free_elt_vectors))
217 level->elts[wheel_index] = vec_pop (w->free_elt_vectors);
219 /* Add element to vector for this time bin. */
220 vec_add2 (level->elts[wheel_index], e, 1);
225 /* Insert user data on wheel at given CPU time stamp. */
227 timing_wheel_insert_helper (timing_wheel_t * w, u64 insert_cpu_time,
230 timing_wheel_elt_t *e;
232 uword rtime, level_index;
234 level_index = get_level_and_relative_time (w, insert_cpu_time, &rtime);
236 dt = insert_cpu_time - w->cpu_time_base;
237 if (PREDICT_TRUE (0 == (dt >> BITS (e->cpu_time_relative_to_base))))
239 e = insert_helper (w, level_index, rtime);
240 e->user_data = user_data;
241 e->cpu_time_relative_to_base = dt;
242 if (insert_cpu_time < w->cached_min_cpu_time_on_wheel)
243 w->cached_min_cpu_time_on_wheel = insert_cpu_time;
247 /* Time too far in the future: add to overflow vector. */
248 timing_wheel_overflow_elt_t *oe;
249 pool_get (w->overflow_pool, oe);
250 oe->user_data = user_data;
251 oe->cpu_time = insert_cpu_time;
256 elt_is_deleted (timing_wheel_t * w, u32 user_data)
258 return (hash_elts (w->deleted_user_data_hash) > 0
259 && hash_get (w->deleted_user_data_hash, user_data));
262 static timing_wheel_elt_t *
263 delete_user_data (timing_wheel_elt_t * elts, u32 user_data)
266 timing_wheel_elt_t *e, *new_elts;
268 /* Quickly scan to see if there are any elements to delete
271 vec_foreach (e, elts)
273 found_match = e->user_data == user_data;
280 /* Re-scan to build vector of new elts with matching user_data deleted. */
282 vec_foreach (e, elts)
284 if (e->user_data != user_data)
285 vec_add1 (new_elts, e[0]);
292 /* Insert user data on wheel at given CPU time stamp. */
294 timing_wheel_insert (timing_wheel_t * w, u64 insert_cpu_time, u32 user_data)
296 /* Remove previously deleted elements. */
297 if (elt_is_deleted (w, user_data))
299 timing_wheel_level_t *l;
302 /* Delete elts with given user data so that stale events don't expire. */
303 vec_foreach (l, w->levels)
305 clib_bitmap_foreach (wi, l->occupancy_bitmap) {
306 l->elts[wi] = delete_user_data (l->elts[wi], user_data);
307 if (vec_len (l->elts[wi]) == 0)
308 l->occupancy_bitmap = clib_bitmap_andnoti (l->occupancy_bitmap, wi);
313 timing_wheel_overflow_elt_t *oe;
314 pool_foreach (oe, w->overflow_pool) {
315 if (oe->user_data == user_data)
316 pool_put (w->overflow_pool, oe);
320 hash_unset (w->deleted_user_data_hash, user_data);
323 timing_wheel_insert_helper (w, insert_cpu_time, user_data);
327 timing_wheel_delete (timing_wheel_t * w, u32 user_data)
329 if (!w->deleted_user_data_hash)
330 w->deleted_user_data_hash =
331 hash_create ( /* capacity */ 0, /* value bytes */ 0);
333 hash_set1 (w->deleted_user_data_hash, user_data);
336 /* Returns time of next expiring element. */
338 timing_wheel_next_expiring_elt_time (timing_wheel_t * w)
340 timing_wheel_level_t *l;
341 timing_wheel_elt_t *e;
349 vec_foreach (l, w->levels)
351 if (!l->occupancy_bitmap)
355 wi0 = wi = current_time_wheel_index (w, li);
359 if (clib_bitmap_get_no_check (l->occupancy_bitmap, wi))
361 vec_foreach (e, l->elts[wi])
362 min_dt = clib_min (min_dt, e->cpu_time_relative_to_base);
364 if (wrapped && li + 1 < vec_len (w->levels))
366 uword wi1 = current_time_wheel_index (w, li + 1);
367 if (l[1].occupancy_bitmap
368 && clib_bitmap_get_no_check (l[1].occupancy_bitmap, wi1))
370 vec_foreach (e, l[1].elts[wi1])
373 clib_min (min_dt, e->cpu_time_relative_to_base);
378 min_t = w->cpu_time_base + min_dt;
382 wi = wheel_add (w, wi + 1);
386 wrapped = wi != wi + 1;
391 timing_wheel_overflow_elt_t *oe;
394 min_t = w->cpu_time_base + min_dt;
396 pool_foreach (oe, w->overflow_pool)
397 { min_t = clib_min (min_t, oe->cpu_time); }
405 insert_elt (timing_wheel_t * w, timing_wheel_elt_t * e)
407 u64 t = w->cpu_time_base + e->cpu_time_relative_to_base;
408 timing_wheel_insert_helper (w, t, e->user_data);
412 elt_cpu_time (timing_wheel_t * w, timing_wheel_elt_t * e)
414 return w->cpu_time_base + e->cpu_time_relative_to_base;
418 validate_expired_elt (timing_wheel_t * w, timing_wheel_elt_t * e,
419 u64 current_cpu_time)
423 u64 e_time = elt_cpu_time (w, e);
425 /* Verify that element is actually expired. */
426 ASSERT ((e_time >> w->log2_clocks_per_bin)
427 <= (current_cpu_time >> w->log2_clocks_per_bin));
432 expire_bin (timing_wheel_t * w,
434 uword wheel_index, u64 advance_cpu_time, u32 * expired_user_data)
436 timing_wheel_level_t *level = vec_elt_at_index (w->levels, level_index);
437 timing_wheel_elt_t *e;
441 e = vec_elt (level->elts, wheel_index);
444 vec_add2 (expired_user_data, x, e_len);
445 for (i = j = 0; i < e_len; i++)
447 validate_expired_elt (w, &e[i], advance_cpu_time);
448 x[j] = e[i].user_data;
450 /* Only advance if elt is not to be deleted. */
451 j += !elt_is_deleted (w, e[i].user_data);
454 /* Adjust for deleted elts. */
456 vec_dec_len (expired_user_data, e_len - j);
458 free_elt_vector (w, e);
460 level->elts[wheel_index] = 0;
461 clib_bitmap_set_no_check (level->occupancy_bitmap, wheel_index, 0);
463 return expired_user_data;
466 /* Called rarely. 32 bit times should only overflow every 4 seconds or so on a fast machine. */
468 advance_cpu_time_base (timing_wheel_t * w, u32 * expired_user_data)
470 timing_wheel_level_t *l;
471 timing_wheel_elt_t *e;
474 w->stats.cpu_time_base_advances++;
475 delta = ((u64) 1 << w->n_wheel_elt_time_bits);
476 w->cpu_time_base += delta;
477 w->time_index_next_cpu_time_base_update += delta >> w->log2_clocks_per_bin;
479 vec_foreach (l, w->levels)
482 clib_bitmap_foreach (wi, l->occupancy_bitmap) {
483 vec_foreach (e, l->elts[wi])
485 /* This should always be true since otherwise we would have already expired
486 this element. Note that in the second half of this function we need
487 to take care not to place the expired elements ourselves. */
488 ASSERT (e->cpu_time_relative_to_base >= delta);
489 e->cpu_time_relative_to_base -= delta;
494 /* See which overflow elements fit now. */
496 timing_wheel_overflow_elt_t *oe;
497 pool_foreach (oe, w->overflow_pool) {
498 /* It fits now into 32 bits. */
499 if (0 == ((oe->cpu_time - w->cpu_time_base) >> BITS (e->cpu_time_relative_to_base)))
501 u64 ti = oe->cpu_time >> w->log2_clocks_per_bin;
502 if (ti <= w->current_time_index)
504 /* This can happen when timing wheel is not advanced for a long time
505 (for example when at a gdb breakpoint for a while). */
506 /* Note: the ti == w->current_time_index means it is also an expired timer */
507 if (! elt_is_deleted (w, oe->user_data))
508 vec_add1 (expired_user_data, oe->user_data);
511 timing_wheel_insert_helper (w, oe->cpu_time, oe->user_data);
512 pool_put (w->overflow_pool, oe);
516 return expired_user_data;
520 refill_level (timing_wheel_t * w,
522 u64 advance_cpu_time,
523 uword from_wheel_index,
524 uword to_wheel_index, u32 * expired_user_data)
526 timing_wheel_level_t *level;
527 timing_wheel_elt_t *to_insert = w->unexpired_elts_pending_insert;
528 u64 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
530 vec_validate (w->stats.refills, level_index);
531 w->stats.refills[level_index] += 1;
533 if (level_index + 1 >= vec_len (w->levels))
536 level = vec_elt_at_index (w->levels, level_index + 1);
537 if (!level->occupancy_bitmap)
542 timing_wheel_elt_t *e, *es;
544 if (clib_bitmap_get_no_check
545 (level->occupancy_bitmap, from_wheel_index))
547 es = level->elts[from_wheel_index];
548 level->elts[from_wheel_index] = 0;
549 clib_bitmap_set_no_check (level->occupancy_bitmap, from_wheel_index,
554 u64 e_time = elt_cpu_time (w, e);
555 u64 ti = e_time >> w->log2_clocks_per_bin;
556 if (ti <= advance_time_index)
558 validate_expired_elt (w, e, advance_cpu_time);
559 if (!elt_is_deleted (w, e->user_data))
560 vec_add1 (expired_user_data, e->user_data);
563 vec_add1 (to_insert, e[0]);
565 free_elt_vector (w, es);
568 if (from_wheel_index == to_wheel_index)
571 from_wheel_index = wheel_add (w, from_wheel_index + 1);
574 timing_wheel_validate (w);
576 w->unexpired_elts_pending_insert = to_insert;
577 return expired_user_data;
580 /* Advance wheel and return any expired user data in vector. */
582 timing_wheel_advance (timing_wheel_t * w, u64 advance_cpu_time,
583 u32 * expired_user_data,
584 u64 * next_expiring_element_cpu_time)
586 timing_wheel_level_t *level;
587 uword level_index, advance_rtime, advance_level_index, advance_wheel_index;
588 uword n_expired_user_data_before;
589 u64 current_time_index, advance_time_index;
591 n_expired_user_data_before = vec_len (expired_user_data);
593 /* Re-fill lower levels when time wraps. */
594 current_time_index = w->current_time_index;
595 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
598 u64 current_ti, advance_ti;
600 current_ti = current_time_index >> w->log2_bins_per_wheel;
601 advance_ti = advance_time_index >> w->log2_bins_per_wheel;
603 if (PREDICT_FALSE (current_ti != advance_ti))
605 if (w->unexpired_elts_pending_insert)
606 vec_set_len (w->unexpired_elts_pending_insert, 0);
609 while (current_ti != advance_ti)
612 c = current_ti & (w->bins_per_wheel - 1);
613 a = advance_ti & (w->bins_per_wheel - 1);
615 expired_user_data = refill_level (w,
618 c, a, expired_user_data);
619 current_ti >>= w->log2_bins_per_wheel;
620 advance_ti >>= w->log2_bins_per_wheel;
626 advance_level_index =
627 get_level_and_relative_time (w, advance_cpu_time, &advance_rtime);
628 advance_wheel_index =
629 rtime_to_wheel_index (w, advance_level_index, advance_rtime);
631 /* Empty all occupied bins for entire levels that we advance past. */
632 for (level_index = 0; level_index < advance_level_index; level_index++)
636 if (level_index >= vec_len (w->levels))
639 level = vec_elt_at_index (w->levels, level_index);
640 clib_bitmap_foreach (wi, level->occupancy_bitmap) {
641 expired_user_data = expire_bin (w, level_index, wi, advance_cpu_time,
646 if (PREDICT_TRUE (level_index < vec_len (w->levels)))
649 level = vec_elt_at_index (w->levels, level_index);
650 wi = current_time_wheel_index (w, level_index);
651 if (level->occupancy_bitmap)
654 if (clib_bitmap_get_no_check (level->occupancy_bitmap, wi))
656 expire_bin (w, advance_level_index, wi, advance_cpu_time,
659 /* When we jump out, we have already just expired the bin,
660 corresponding to advance_wheel_index */
661 if (wi == advance_wheel_index)
664 wi = wheel_add (w, wi + 1);
668 /* Advance current time index. */
669 w->current_time_index = advance_time_index;
671 if (vec_len (w->unexpired_elts_pending_insert) > 0)
673 timing_wheel_elt_t *e;
674 vec_foreach (e, w->unexpired_elts_pending_insert) insert_elt (w, e);
675 vec_set_len (w->unexpired_elts_pending_insert, 0);
678 /* Don't advance until necessary. */
679 /* However, if the timing_wheel_advance() hasn't been called for some time,
680 the while() loop will ensure multiple calls to advance_cpu_time_base()
681 in a row until the w->cpu_time_base is fresh enough. */
683 (advance_time_index >= w->time_index_next_cpu_time_base_update))
684 expired_user_data = advance_cpu_time_base (w, expired_user_data);
686 if (next_expiring_element_cpu_time)
690 /* Anything expired? If so we need to recompute next expiring elt time. */
691 if (vec_len (expired_user_data) == n_expired_user_data_before
692 && w->cached_min_cpu_time_on_wheel != 0ULL)
693 min_t = w->cached_min_cpu_time_on_wheel;
696 min_t = timing_wheel_next_expiring_elt_time (w);
697 w->cached_min_cpu_time_on_wheel = min_t;
700 *next_expiring_element_cpu_time = min_t;
703 return expired_user_data;
707 format_timing_wheel (u8 * s, va_list * va)
709 timing_wheel_t *w = va_arg (*va, timing_wheel_t *);
710 int verbose = va_arg (*va, int);
711 u32 indent = format_get_indent (s);
713 s = format (s, "level 0: %.4e - %.4e secs, 2^%d - 2^%d clocks",
714 (f64) (1 << w->log2_clocks_per_bin) / w->cpu_clocks_per_second,
715 (f64) (1 << w->log2_clocks_per_wheel) /
716 w->cpu_clocks_per_second, w->log2_clocks_per_bin,
717 w->log2_clocks_per_wheel);
723 s = format (s, "\n%Utime base advances %Ld, every %.4e secs",
724 format_white_space, indent + 2,
725 w->stats.cpu_time_base_advances,
726 (f64) ((u64) 1 << w->n_wheel_elt_time_bits) /
727 w->cpu_clocks_per_second);
729 for (l = 0; l < vec_len (w->levels); l++)
730 s = format (s, "\n%Ulevel %d: refills %Ld",
731 format_white_space, indent + 2,
734 vec_len (w->stats.refills) ? w->stats.
735 refills[l] : (u64) 0);
742 * fd.io coding-style-patch-verification: ON
745 * eval: (c-set-style "gnu")