2 * Copyright (c) 2015 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
15 #include <vppinfra/bitmap.h>
16 #include <vppinfra/hash.h>
17 #include <vppinfra/pool.h>
18 #include <vppinfra/timing_wheel.h>
21 timing_wheel_init (timing_wheel_t * w, u64 current_cpu_time,
22 f64 cpu_clocks_per_second)
24 if (w->max_sched_time <= w->min_sched_time)
26 w->min_sched_time = 1e-6;
27 w->max_sched_time = 1e-3;
30 w->cpu_clocks_per_second = cpu_clocks_per_second;
31 w->log2_clocks_per_bin =
32 max_log2 (w->cpu_clocks_per_second * w->min_sched_time);
33 w->log2_bins_per_wheel =
34 max_log2 (w->cpu_clocks_per_second * w->max_sched_time);
35 w->log2_bins_per_wheel -= w->log2_clocks_per_bin;
36 w->log2_clocks_per_wheel = w->log2_bins_per_wheel + w->log2_clocks_per_bin;
37 w->bins_per_wheel = 1 << w->log2_bins_per_wheel;
38 w->bins_per_wheel_mask = w->bins_per_wheel - 1;
40 w->current_time_index = current_cpu_time >> w->log2_clocks_per_bin;
42 if (w->n_wheel_elt_time_bits <= 0 ||
43 w->n_wheel_elt_time_bits >= STRUCT_BITS_OF (timing_wheel_elt_t,
44 cpu_time_relative_to_base))
45 w->n_wheel_elt_time_bits =
46 STRUCT_BITS_OF (timing_wheel_elt_t, cpu_time_relative_to_base) - 1;
48 w->cpu_time_base = current_cpu_time;
49 w->time_index_next_cpu_time_base_update
51 w->current_time_index +
52 ((u64) 1 << (w->n_wheel_elt_time_bits - w->log2_clocks_per_bin));
56 get_level_and_relative_time (timing_wheel_t * w, u64 cpu_time,
62 dt = (cpu_time >> w->log2_clocks_per_bin);
64 /* Time should always move forward. */
65 ASSERT (dt >= w->current_time_index);
67 dt -= w->current_time_index;
69 /* Find level and offset within level. Level i has bins of size 2^((i+1)*M) */
71 for (level_index = 0; (rtime >> w->log2_bins_per_wheel) != 0; level_index++)
72 rtime = (rtime >> w->log2_bins_per_wheel) - 1;
74 /* Return offset within level and level index. */
75 ASSERT (rtime < w->bins_per_wheel);
76 *rtime_result = rtime;
81 time_index_to_wheel_index (timing_wheel_t * w, uword level_index, u64 ti)
83 return (ti >> (level_index * w->log2_bins_per_wheel)) &
84 w->bins_per_wheel_mask;
87 /* Find current time on this level. */
89 current_time_wheel_index (timing_wheel_t * w, uword level_index)
91 return time_index_to_wheel_index (w, level_index, w->current_time_index);
94 /* Circular wheel indexing. */
96 wheel_add (timing_wheel_t * w, word x)
98 return x & w->bins_per_wheel_mask;
102 rtime_to_wheel_index (timing_wheel_t * w, uword level_index, uword rtime)
104 uword t = current_time_wheel_index (w, level_index);
105 return wheel_add (w, t + rtime);
108 static clib_error_t *
109 validate_level (timing_wheel_t * w, uword level_index, uword * n_elts)
111 timing_wheel_level_t *level;
112 timing_wheel_elt_t *e;
114 clib_error_t *error = 0;
118 error = CLIB_ERROR_ASSERT (x); \
120 if (error) return error; \
123 level = vec_elt_at_index (w->levels, level_index);
124 for (wi = 0; wi < vec_len (level->elts); wi++)
126 /* Validate occupancy bitmap. */
127 _(clib_bitmap_get_no_check (level->occupancy_bitmap, wi) ==
128 (vec_len (level->elts[wi]) > 0));
130 *n_elts += vec_len (level->elts[wi]);
132 vec_foreach (e, level->elts[wi])
134 /* Validate time bin and level. */
136 uword e_ti, e_li, e_wi;
138 e_time = e->cpu_time_relative_to_base + w->cpu_time_base;
139 e_li = get_level_and_relative_time (w, e_time, &e_ti);
140 e_wi = rtime_to_wheel_index (w, level_index, e_ti);
142 if (e_li == level_index - 1)
143 /* If this element was scheduled on the previous level
144 it must be wrapped. */
145 _(e_ti + current_time_wheel_index (w, level_index - 1)
146 >= w->bins_per_wheel);
149 _(e_li == level_index);
153 _(e_wi == wi || e_wi + 1 == wi || e_wi - 1 == wi);
164 timing_wheel_validate (timing_wheel_t * w)
167 clib_error_t *error = 0;
173 n_elts = pool_elts (w->overflow_pool);
174 for (l = 0; l < vec_len (w->levels); l++)
176 error = validate_level (w, l, &n_elts);
178 clib_error_report (error);
183 free_elt_vector (timing_wheel_t * w, timing_wheel_elt_t * ev)
185 /* Poison free elements so we never use them by mistake. */
187 memset (ev, ~0, vec_len (ev) * sizeof (ev[0]));
189 vec_add1 (w->free_elt_vectors, ev);
192 static timing_wheel_elt_t *
193 insert_helper (timing_wheel_t * w, uword level_index, uword rtime)
195 timing_wheel_level_t *level;
196 timing_wheel_elt_t *e;
199 /* Circular buffer. */
200 vec_validate (w->levels, level_index);
201 level = vec_elt_at_index (w->levels, level_index);
203 if (PREDICT_FALSE (!level->elts))
205 uword max = w->bins_per_wheel - 1;
206 clib_bitmap_validate (level->occupancy_bitmap, max);
207 vec_validate (level->elts, max);
210 wheel_index = rtime_to_wheel_index (w, level_index, rtime);
212 level->occupancy_bitmap =
213 clib_bitmap_ori (level->occupancy_bitmap, wheel_index);
215 /* Allocate an elt vector from free list if there is one. */
216 if (!level->elts[wheel_index] && vec_len (w->free_elt_vectors))
217 level->elts[wheel_index] = vec_pop (w->free_elt_vectors);
219 /* Add element to vector for this time bin. */
220 vec_add2 (level->elts[wheel_index], e, 1);
225 /* Insert user data on wheel at given CPU time stamp. */
227 timing_wheel_insert_helper (timing_wheel_t * w, u64 insert_cpu_time,
230 timing_wheel_elt_t *e;
232 uword rtime, level_index;
234 level_index = get_level_and_relative_time (w, insert_cpu_time, &rtime);
236 dt = insert_cpu_time - w->cpu_time_base;
237 if (PREDICT_TRUE (0 == (dt >> BITS (e->cpu_time_relative_to_base))))
239 e = insert_helper (w, level_index, rtime);
240 e->user_data = user_data;
241 e->cpu_time_relative_to_base = dt;
245 /* Time too far in the future: add to overflow vector. */
246 timing_wheel_overflow_elt_t *oe;
247 pool_get (w->overflow_pool, oe);
248 oe->user_data = user_data;
249 oe->cpu_time = insert_cpu_time;
254 elt_is_deleted (timing_wheel_t * w, u32 user_data)
256 return (hash_elts (w->deleted_user_data_hash) > 0
257 && hash_get (w->deleted_user_data_hash, user_data));
260 static timing_wheel_elt_t *
261 delete_user_data (timing_wheel_elt_t * elts, u32 user_data)
264 timing_wheel_elt_t *e, *new_elts;
266 /* Quickly scan to see if there are any elements to delete
269 vec_foreach (e, elts)
271 found_match = e->user_data == user_data;
278 /* Re-scan to build vector of new elts with matching user_data deleted. */
280 vec_foreach (e, elts)
282 if (e->user_data != user_data)
283 vec_add1 (new_elts, e[0]);
290 /* Insert user data on wheel at given CPU time stamp. */
292 timing_wheel_insert (timing_wheel_t * w, u64 insert_cpu_time, u32 user_data)
294 /* Remove previously deleted elements. */
295 if (elt_is_deleted (w, user_data))
297 timing_wheel_level_t *l;
300 /* Delete elts with given user data so that stale events don't expire. */
301 vec_foreach (l, w->levels)
304 clib_bitmap_foreach (wi, l->occupancy_bitmap, ({
305 l->elts[wi] = delete_user_data (l->elts[wi], user_data);
306 if (vec_len (l->elts[wi]) == 0)
307 l->occupancy_bitmap = clib_bitmap_andnoti (l->occupancy_bitmap, wi);
313 timing_wheel_overflow_elt_t *oe;
315 pool_foreach (oe, w->overflow_pool, ({
316 if (oe->user_data == user_data)
317 pool_put (w->overflow_pool, oe);
322 hash_unset (w->deleted_user_data_hash, user_data);
325 timing_wheel_insert_helper (w, insert_cpu_time, user_data);
329 timing_wheel_delete (timing_wheel_t * w, u32 user_data)
331 if (!w->deleted_user_data_hash)
332 w->deleted_user_data_hash =
333 hash_create ( /* capacity */ 0, /* value bytes */ 0);
335 hash_set1 (w->deleted_user_data_hash, user_data);
338 /* Returns time of next expiring element. */
340 timing_wheel_next_expiring_elt_time (timing_wheel_t * w)
342 timing_wheel_level_t *l;
343 timing_wheel_elt_t *e;
351 vec_foreach (l, w->levels)
353 if (!l->occupancy_bitmap)
357 wi0 = wi = current_time_wheel_index (w, li);
361 if (clib_bitmap_get_no_check (l->occupancy_bitmap, wi))
363 vec_foreach (e, l->elts[wi])
364 min_dt = clib_min (min_dt, e->cpu_time_relative_to_base);
366 if (wrapped && li + 1 < vec_len (w->levels))
368 uword wi1 = current_time_wheel_index (w, li + 1);
369 if (l[1].occupancy_bitmap
370 && clib_bitmap_get_no_check (l[1].occupancy_bitmap, wi1))
372 vec_foreach (e, l[1].elts[wi1])
375 clib_min (min_dt, e->cpu_time_relative_to_base);
380 min_t = w->cpu_time_base + min_dt;
384 wi = wheel_add (w, wi + 1);
388 wrapped = wi != wi + 1;
393 timing_wheel_overflow_elt_t *oe;
396 min_t = w->cpu_time_base + min_dt;
399 pool_foreach (oe, w->overflow_pool,
400 ({ min_t = clib_min (min_t, oe->cpu_time); }));
409 insert_elt (timing_wheel_t * w, timing_wheel_elt_t * e)
411 u64 t = w->cpu_time_base + e->cpu_time_relative_to_base;
412 timing_wheel_insert_helper (w, t, e->user_data);
416 elt_cpu_time (timing_wheel_t * w, timing_wheel_elt_t * e)
418 return w->cpu_time_base + e->cpu_time_relative_to_base;
422 validate_expired_elt (timing_wheel_t * w, timing_wheel_elt_t * e,
423 u64 current_cpu_time)
427 u64 e_time = elt_cpu_time (w, e);
429 /* Verify that element is actually expired. */
430 ASSERT ((e_time >> w->log2_clocks_per_bin)
431 <= (current_cpu_time >> w->log2_clocks_per_bin));
436 expire_bin (timing_wheel_t * w,
438 uword wheel_index, u64 advance_cpu_time, u32 * expired_user_data)
440 timing_wheel_level_t *level = vec_elt_at_index (w->levels, level_index);
441 timing_wheel_elt_t *e;
445 e = vec_elt (level->elts, wheel_index);
448 vec_add2 (expired_user_data, x, e_len);
449 for (i = j = 0; i < e_len; i++)
451 validate_expired_elt (w, &e[i], advance_cpu_time);
452 x[j] = e[i].user_data;
454 /* Only advance if elt is not to be deleted. */
455 j += !elt_is_deleted (w, e[i].user_data);
458 /* Adjust for deleted elts. */
460 _vec_len (expired_user_data) -= e_len - j;
462 free_elt_vector (w, e);
464 level->elts[wheel_index] = 0;
465 clib_bitmap_set_no_check (level->occupancy_bitmap, wheel_index, 0);
467 return expired_user_data;
470 /* Called rarely. 32 bit times should only overflow every 4 seconds or so on a fast machine. */
472 advance_cpu_time_base (timing_wheel_t * w, u32 * expired_user_data)
474 timing_wheel_level_t *l;
475 timing_wheel_elt_t *e;
478 w->stats.cpu_time_base_advances++;
479 delta = ((u64) 1 << w->n_wheel_elt_time_bits);
480 w->cpu_time_base += delta;
481 w->time_index_next_cpu_time_base_update += delta >> w->log2_clocks_per_bin;
483 vec_foreach (l, w->levels)
487 clib_bitmap_foreach (wi, l->occupancy_bitmap, ({
488 vec_foreach (e, l->elts[wi])
490 /* This should always be true since otherwise we would have already expired
492 ASSERT (e->cpu_time_relative_to_base >= delta);
493 e->cpu_time_relative_to_base -= delta;
499 /* See which overflow elements fit now. */
501 timing_wheel_overflow_elt_t *oe;
503 pool_foreach (oe, w->overflow_pool, ({
504 /* It fits now into 32 bits. */
505 if (0 == ((oe->cpu_time - w->cpu_time_base) >> BITS (e->cpu_time_relative_to_base)))
507 u64 ti = oe->cpu_time >> w->log2_clocks_per_bin;
508 if (ti < w->current_time_index)
510 /* This can happen when timing wheel is not advanced for a long time
511 (for example when at a gdb breakpoint for a while). */
512 if (! elt_is_deleted (w, oe->user_data))
513 vec_add1 (expired_user_data, oe->user_data);
516 timing_wheel_insert_helper (w, oe->cpu_time, oe->user_data);
517 pool_put (w->overflow_pool, oe);
522 return expired_user_data;
526 refill_level (timing_wheel_t * w,
528 u64 advance_cpu_time,
529 uword from_wheel_index,
530 uword to_wheel_index, u32 * expired_user_data)
532 timing_wheel_level_t *level;
533 timing_wheel_elt_t *to_insert = w->unexpired_elts_pending_insert;
534 u64 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
536 vec_validate (w->stats.refills, level_index);
537 w->stats.refills[level_index] += 1;
539 if (level_index + 1 >= vec_len (w->levels))
542 level = vec_elt_at_index (w->levels, level_index + 1);
543 if (!level->occupancy_bitmap)
548 timing_wheel_elt_t *e, *es;
550 if (clib_bitmap_get_no_check
551 (level->occupancy_bitmap, from_wheel_index))
553 es = level->elts[from_wheel_index];
554 level->elts[from_wheel_index] = 0;
555 clib_bitmap_set_no_check (level->occupancy_bitmap, from_wheel_index,
560 u64 e_time = elt_cpu_time (w, e);
561 u64 ti = e_time >> w->log2_clocks_per_bin;
562 if (ti <= advance_time_index)
564 validate_expired_elt (w, e, advance_cpu_time);
565 if (!elt_is_deleted (w, e->user_data))
566 vec_add1 (expired_user_data, e->user_data);
569 vec_add1 (to_insert, e[0]);
571 free_elt_vector (w, es);
574 if (from_wheel_index == to_wheel_index)
577 from_wheel_index = wheel_add (w, from_wheel_index + 1);
580 timing_wheel_validate (w);
582 w->unexpired_elts_pending_insert = to_insert;
583 return expired_user_data;
586 /* Advance wheel and return any expired user data in vector. */
588 timing_wheel_advance (timing_wheel_t * w, u64 advance_cpu_time,
589 u32 * expired_user_data,
590 u64 * next_expiring_element_cpu_time)
592 timing_wheel_level_t *level;
593 uword level_index, advance_rtime, advance_level_index, advance_wheel_index;
594 uword n_expired_user_data_before;
595 u64 current_time_index, advance_time_index;
597 n_expired_user_data_before = vec_len (expired_user_data);
599 /* Re-fill lower levels when time wraps. */
600 current_time_index = w->current_time_index;
601 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
604 u64 current_ti, advance_ti;
606 current_ti = current_time_index >> w->log2_bins_per_wheel;
607 advance_ti = advance_time_index >> w->log2_bins_per_wheel;
609 if (PREDICT_FALSE (current_ti != advance_ti))
611 if (w->unexpired_elts_pending_insert)
612 _vec_len (w->unexpired_elts_pending_insert) = 0;
615 while (current_ti != advance_ti)
618 c = current_ti & (w->bins_per_wheel - 1);
619 a = advance_ti & (w->bins_per_wheel - 1);
621 expired_user_data = refill_level (w,
624 c, a, expired_user_data);
625 current_ti >>= w->log2_bins_per_wheel;
626 advance_ti >>= w->log2_bins_per_wheel;
632 advance_level_index =
633 get_level_and_relative_time (w, advance_cpu_time, &advance_rtime);
634 advance_wheel_index =
635 rtime_to_wheel_index (w, advance_level_index, advance_rtime);
637 /* Empty all occupied bins for entire levels that we advance past. */
638 for (level_index = 0; level_index < advance_level_index; level_index++)
642 if (level_index >= vec_len (w->levels))
645 level = vec_elt_at_index (w->levels, level_index);
647 clib_bitmap_foreach (wi, level->occupancy_bitmap, ({
648 expired_user_data = expire_bin (w, level_index, wi, advance_cpu_time,
654 if (PREDICT_TRUE (level_index < vec_len (w->levels)))
657 level = vec_elt_at_index (w->levels, level_index);
658 wi = current_time_wheel_index (w, level_index);
659 if (level->occupancy_bitmap)
662 if (clib_bitmap_get_no_check (level->occupancy_bitmap, wi))
664 expire_bin (w, advance_level_index, wi, advance_cpu_time,
667 if (wi == advance_wheel_index)
670 wi = wheel_add (w, wi + 1);
674 /* Advance current time index. */
675 w->current_time_index = advance_time_index;
677 if (vec_len (w->unexpired_elts_pending_insert) > 0)
679 timing_wheel_elt_t *e;
680 vec_foreach (e, w->unexpired_elts_pending_insert) insert_elt (w, e);
681 _vec_len (w->unexpired_elts_pending_insert) = 0;
684 /* Don't advance until necessary. */
686 (advance_time_index >= w->time_index_next_cpu_time_base_update))
687 expired_user_data = advance_cpu_time_base (w, expired_user_data);
689 if (next_expiring_element_cpu_time)
693 /* Anything expired? If so we need to recompute next expiring elt time. */
694 if (vec_len (expired_user_data) == n_expired_user_data_before
695 && w->cached_min_cpu_time_on_wheel != 0ULL)
696 min_t = w->cached_min_cpu_time_on_wheel;
699 min_t = timing_wheel_next_expiring_elt_time (w);
700 w->cached_min_cpu_time_on_wheel = min_t;
703 *next_expiring_element_cpu_time = min_t;
706 return expired_user_data;
710 format_timing_wheel (u8 * s, va_list * va)
712 timing_wheel_t *w = va_arg (*va, timing_wheel_t *);
713 int verbose = va_arg (*va, int);
714 uword indent = format_get_indent (s);
716 s = format (s, "level 0: %.4e - %.4e secs, 2^%d - 2^%d clocks",
717 (f64) (1 << w->log2_clocks_per_bin) / w->cpu_clocks_per_second,
718 (f64) (1 << w->log2_clocks_per_wheel) /
719 w->cpu_clocks_per_second, w->log2_clocks_per_bin,
720 w->log2_clocks_per_wheel);
726 s = format (s, "\n%Utime base advances %Ld, every %.4e secs",
727 format_white_space, indent + 2,
728 w->stats.cpu_time_base_advances,
729 (f64) ((u64) 1 << w->n_wheel_elt_time_bits) /
730 w->cpu_clocks_per_second);
732 for (l = 0; l < vec_len (w->levels); l++)
733 s = format (s, "\n%Ulevel %d: refills %Ld",
734 format_white_space, indent + 2,
737 vec_len (w->stats.refills) ? w->stats.
738 refills[l] : (u64) 0);
745 * fd.io coding-style-patch-verification: ON
748 * eval: (c-set-style "gnu")