2 * Copyright (c) 2019 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
15 * TCP byte tracker that can generate delivery rate estimates. Based on
16 * draft-cheng-iccrg-delivery-rate-estimation-00
19 #include <vnet/tcp/tcp.h>
/* Look up a byte-tracker sample by pool index.
 * NOTE(review): the if-branch body is not visible in this view — presumably
 * returns 0 for a freed slot so callers can treat the result as optional;
 * confirm against full source. */
21 static tcp_bt_sample_t *
22 bt_get_sample (tcp_byte_tracker_t * bt, u32 bts_index)
24 if (pool_is_free_index (bt->samples, bts_index))
26 return pool_elt_at_index (bt->samples, bts_index);
/* Follow the doubly-linked-list 'next' index of @bts and return that sample
 * (or the bt_get_sample miss result when next is invalid/freed). */
29 static tcp_bt_sample_t *
30 bt_next_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
32 return bt_get_sample (bt, bts->next);
/* Follow the doubly-linked-list 'prev' index of @bts and return that sample
 * (or the bt_get_sample miss result when prev is invalid/freed). */
35 static tcp_bt_sample_t *
36 bt_prev_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
38 return bt_get_sample (bt, bts->prev);
/* Convert a sample pointer back to its pool index; TCP_BTS_INVALID_INDEX
 * for a null pointer. NOTE(review): return-type line and the null check's
 * condition line are missing from this view. */
42 bt_sample_index (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
45 return TCP_BTS_INVALID_INDEX;
46 return bts - bt->samples;
50 bt_seq_lt (u32 a, u32 b)
/* Allocate a zeroed sample covering [min_seq, max_seq), detached from the
 * linked list (next/prev invalid), and index it in the sample_lookup rb-tree
 * keyed by min_seq. Return statement not visible in this view. */
55 static tcp_bt_sample_t *
56 bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq, u32 max_seq)
60 pool_get_zero (bt->samples, bts);
61 bts->next = bts->prev = TCP_BTS_INVALID_INDEX;
62 bts->min_seq = min_seq;
63 bts->max_seq = max_seq;
/* Key the rb-tree node on min_seq; opaque payload is the pool index. */
64 rb_tree_add_custom (&bt->sample_lookup, bts->min_seq, bts - bt->samples,
/* Unlink @bts from the doubly linked list (patching neighbors' prev/next),
 * remove its rb-tree lookup node, poison the memory and return it to the
 * pool. NOTE(review): else-branches (likely head/tail fixups) are missing
 * from this view. */
70 bt_free_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
72 if (bts->prev != TCP_BTS_INVALID_INDEX)
74 tcp_bt_sample_t *prev = bt_prev_sample (bt, bts);
75 prev->next = bts->next;
80 if (bts->next != TCP_BTS_INVALID_INDEX)
82 tcp_bt_sample_t *next = bt_next_sample (bt, bts);
83 next->prev = bts->prev;
88 rb_tree_del_custom (&bt->sample_lookup, bts->min_seq, bt_seq_lt);
/* 0xfc poison makes use-after-free obvious in a debugger. */
90 memset (bts, 0xfc, sizeof (*bts));
91 pool_put (bt->samples, bts);
/* Split @bts at @seq into [min_seq, seq) and a new sample ns = [seq, max_seq),
 * inserting ns right after bts in the list and updating bt->tail when bts was
 * the tail. Returns (not visible here) presumably the new sample. */
94 static tcp_bt_sample_t *
95 bt_split_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts, u32 seq)
97 tcp_bt_sample_t *ns, *next;
100 bts_index = bt_sample_index (bt, bts);
102 ASSERT (seq_leq (bts->min_seq, seq) && seq_lt (seq, bts->max_seq));
104 ns = bt_alloc_sample (bt, seq, bts->max_seq);
/* Re-fetch: bt_alloc_sample may reallocate the pool and move bts. */
105 bts = bt_get_sample (bt, bts_index);
111 next = bt_next_sample (bt, bts);
113 next->prev = bt_sample_index (bt, ns);
115 bt->tail = bt_sample_index (bt, ns);
117 bts->next = bt_sample_index (bt, ns);
118 ns->prev = bt_sample_index (bt, bts);
/* Merge two adjacent samples by extending @prev to cover @cur's range and
 * freeing @cur (fixing bt->tail if cur was the tail). Requires contiguity:
 * prev->max_seq == cur->min_seq. Return statement not visible in this view. */
123 static tcp_bt_sample_t *
124 bt_merge_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * prev,
125 tcp_bt_sample_t * cur)
127 ASSERT (prev->max_seq == cur->min_seq);
128 prev->max_seq = cur->max_seq;
129 if (bt_sample_index (bt, cur) == bt->tail)
130 bt->tail = bt_sample_index (bt, prev);
131 bt_free_sample (bt, cur);
/* Find the sample whose [min_seq, max_seq) range contains @seq by descending
 * the min_seq-keyed rb-tree. On an exact-key miss, backs up to the in-order
 * predecessor (the sample starting just before seq) and verifies coverage.
 * NOTE(review): several control-flow lines (loop body close, prev tracking,
 * final return 0 path) are missing from this view. */
135 static tcp_bt_sample_t *
136 bt_lookup_seq (tcp_byte_tracker_t * bt, u32 seq)
138 rb_tree_t *rt = &bt->sample_lookup;
139 rb_node_t *cur, *prev;
140 tcp_bt_sample_t *bts;
142 cur = rb_node (rt, rt->root);
143 if (rb_node_is_tnil (rt, cur))
/* Standard BST descent on the sequence-number key. */
146 while (seq != cur->key)
149 if (seq_lt (seq, cur->key))
150 cur = rb_node_left (rt, cur);
152 cur = rb_node_right (rt, cur);
154 if (rb_node_is_tnil (rt, cur))
156 /* Hit tnil as a left child. Find predecessor */
157 if (seq_lt (seq, prev->key))
159 cur = rb_tree_predecessor (rt, prev);
160 if (rb_node_is_tnil (rt, cur))
162 bts = bt_get_sample (bt, cur->opaque);
164 /* Hit tnil as a right child */
167 bts = bt_get_sample (bt, prev->opaque);
/* Candidate starts at/before seq; caller logic (partially elided here)
 * presumably also checks seq < max_seq — confirm against full source. */
170 if (seq_geq (seq, bts->min_seq))
/* Exact key match: node found during descent. */
177 if (!rb_node_is_tnil (rt, cur))
178 return bt_get_sample (bt, cur->opaque);
/* Change a sample's min_seq (its rb-tree key): delete the old tree node,
 * update the field (assignment line not visible in this view), then re-insert
 * under the new key. */
184 bt_update_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts, u32 seq)
186 rb_tree_del_custom (&bt->sample_lookup, bts->min_seq, bt_seq_lt);
188 rb_tree_add_custom (&bt->sample_lookup, bts->min_seq,
189 bt_sample_index (bt, bts), bt_seq_lt);
/* Walk forward from @start freeing samples fully below @seq, then trim the
 * first partially-overlapped sample so ranges never overlap a newly tracked
 * [.., seq) region. NOTE(review): third parameter (is_end at call sites) and
 * the return are not visible in this view. */
192 static tcp_bt_sample_t *
193 bt_fix_overlapped (tcp_byte_tracker_t * bt, tcp_bt_sample_t * start,
196 tcp_bt_sample_t *cur, *next;
/* Drop every sample that ends at or before seq. */
199 while (cur && seq_leq (cur->max_seq, seq))
201 next = bt_next_sample (bt, cur);
202 bt_free_sample (bt, cur);
/* Partial overlap: advance the survivor's min_seq up to seq. */
206 if (cur && seq_lt (cur->min_seq, seq))
207 bt_update_sample (bt, cur, seq);
/* Debug-only consistency check of the byte tracker: pool/tree element counts
 * must agree (tree has one extra tnil node), head/tail invariants must hold,
 * and the list must be strictly ordered by min_seq with consistent prev/next
 * links and lookups. NOTE(review): many early-return lines are elided from
 * this view; each shown check presumably returns 0 on failure. */
213 tcp_bt_is_sane (tcp_byte_tracker_t * bt)
215 tcp_bt_sample_t *bts, *tmp;
/* Every pool sample must have exactly one rb-tree node (minus tnil). */
217 if (pool_elts (bt->samples) != pool_elts (bt->sample_lookup.nodes) - 1)
/* Empty tracker: head invalid implies tail invalid and empty pool. */
220 if (bt->head == TCP_BTS_INVALID_INDEX)
222 if (bt->tail != TCP_BTS_INVALID_INDEX)
224 if (pool_elts (bt->samples) != 0)
229 bts = bt_get_sample (bt, bt->tail);
233 bts = bt_get_sample (bt, bt->head);
234 if (!bts || bts->prev != TCP_BTS_INVALID_INDEX)
/* Each sample must be findable via its own min_seq. */
239 tmp = bt_lookup_seq (bt, bts->min_seq);
244 tmp = bt_next_sample (bt, bts);
247 if (tmp->prev != bt_sample_index (bt, bts))
249 clib_warning ("next %u thinks prev is %u should be %u",
250 bts->next, tmp->prev, bt_sample_index (bt, bts));
/* List must be strictly increasing in min_seq. */
253 if (!seq_lt (bts->min_seq, tmp->min_seq))
/* Last sample reached: it must be the recorded tail with no next. */
258 if (bt->tail != bt_sample_index (bt, bts))
260 if (bts->next != TCP_BTS_INVALID_INDEX)
/* Allocate a tx sample for [min_seq, max_seq) and stamp it with the
 * connection's current delivery state (per the delivery-rate-estimation
 * draft: delivered, delivered_time, tx_time, first_tx_time, app-limited
 * flag). Return statement not visible in this view. */
268 static tcp_bt_sample_t *
269 tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq, u32 max_seq)
271 tcp_bt_sample_t *bts;
272 bts = bt_alloc_sample (tc->bt, min_seq, max_seq);
273 bts->delivered = tc->delivered;
274 bts->delivered_time = tc->delivered_time;
275 bts->tx_time = tcp_time_now_us (tc->c_thread_index);
276 bts->first_tx_time = tc->first_tx_time;
277 bts->flags |= tc->app_limited ? TCP_BTS_IS_APP_LIMITED : 0;
/* Mark the connection app-limited when the sender cannot fill cwnd: too few
 * bytes available+in flight, and lost bytes already retransmitted. Records
 * delivered + flight_size (never 0 — ?: forces at least 1) as the
 * app_limited watermark, matching the rate-estimation draft. */
282 tcp_bt_check_app_limited (tcp_connection_t * tc)
284 u32 available_bytes, flight_size;
286 available_bytes = transport_max_tx_dequeue (&tc->connection);
287 flight_size = tcp_flight_size (tc);
289 /* Not enough bytes to fill the cwnd */
290 if (available_bytes + flight_size + tc->snd_mss < tc->cwnd
291 /* Bytes considered lost have been retransmitted */
292 && tc->sack_sb.lost_bytes <= tc->snd_rxt_bytes)
293 tc->app_limited = tc->delivered + flight_size ? : 1;
/* Track @len newly transmitted bytes starting at snd_nxt. Fast path: extend
 * the tail sample when it is contiguous and was sent in the same microsecond
 * tick. Otherwise allocate a new sample and append it (or make it head+tail
 * when the list is empty). NOTE(review): early-return/brace lines are elided
 * from this view. */
297 tcp_bt_track_tx (tcp_connection_t * tc, u32 len)
299 tcp_byte_tracker_t *bt = tc->bt;
300 tcp_bt_sample_t *bts, *tail;
303 tail = bt_get_sample (bt, bt->tail);
/* Same send burst: just grow the existing tail sample. */
304 if (tail && tail->max_seq == tc->snd_nxt
305 && tail->tx_time == tcp_time_now_us (tc->c_thread_index))
307 tail->max_seq += len;
/* Nothing in flight: restart the delivery clock. */
311 if (tc->snd_una == tc->snd_nxt)
313 tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
314 tc->first_tx_time = tc->delivered_time;
317 bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt, tc->snd_nxt + len);
318 bts_index = bt_sample_index (bt, bts);
/* Re-fetch tail: the alloc may have moved the pool. */
319 tail = bt_get_sample (bt, bt->tail);
322 tail->next = bts_index;
323 bts->prev = bt->tail;
324 bt->tail = bts_index;
/* Empty list: new sample becomes both head and tail. */
328 bt->tail = bt->head = bts_index;
/* Track retransmission of [start, end). Cases handled:
 *  1) contiguous with the last out-of-order (last_ooo) rxt sample sent in
 *     the same tick — extend it and trim overlapped successors;
 *  2) rxt starts exactly at an existing sample's min_seq — replace the
 *     overlapped span with a fresh RXT-flagged sample spliced into the list;
 *  3) rxt starts mid-sample — truncate the original at start, insert the
 *     RXT sample after it, and if end < original max_seq also allocate a
 *     trailing remainder sample (split), else just tail-splice.
 * A sample already flagged RXT that is retransmitted again is marked
 * RXT_LOST. NOTE(review): numerous brace/return/guard lines are elided from
 * this view; pool pointers are re-fetched after each allocation because
 * tcp_bt_alloc_tx_sample may move the samples pool. */
333 tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end)
335 tcp_byte_tracker_t *bt = tc->bt;
336 tcp_bt_sample_t *bts, *next, *cur, *prev, *nbts;
337 u32 bts_index, cur_index, next_index, prev_index, max_seq;
338 u8 is_end = end == tc->snd_nxt;
340 /* Contiguous blocks retransmitted at the same time */
341 bts = bt_get_sample (bt, bt->last_ooo);
342 if (bts && bts->max_seq == start
343 && bts->tx_time == tcp_time_now_us (tc->c_thread_index))
346 next = bt_next_sample (bt, bts);
348 bt_fix_overlapped (bt, next, end, is_end);
353 /* Find original tx sample */
354 bts = bt_lookup_seq (bt, start);
356 ASSERT (bts != 0 && seq_geq (start, bts->min_seq));
358 /* Head in the past */
359 if (seq_lt (bts->min_seq, tc->snd_una))
360 bt_update_sample (bt, bts, tc->snd_una);
/* Case 2: rxt range starts exactly on a sample boundary. */
363 if (bts->min_seq == start)
365 prev_index = bts->prev;
366 next = bt_fix_overlapped (bt, bts, end, is_end);
367 next_index = bt_sample_index (bt, next);
369 cur = tcp_bt_alloc_tx_sample (tc, start, end);
370 cur->flags |= TCP_BTS_IS_RXT;
/* Second retransmission of the same bytes => treat as lost-rxt. */
371 if (bts->flags & TCP_BTS_IS_RXT)
372 cur->flags |= TCP_BTS_IS_RXT_LOST;
373 cur->next = next_index;
374 cur->prev = prev_index;
376 cur_index = bt_sample_index (bt, cur);
378 if (next_index != TCP_BTS_INVALID_INDEX)
380 next = bt_get_sample (bt, next_index);
381 next->prev = cur_index;
385 bt->tail = cur_index;
388 if (prev_index != TCP_BTS_INVALID_INDEX)
390 prev = bt_get_sample (bt, prev_index);
391 prev->next = cur_index;
395 bt->head = cur_index;
/* Remember for the contiguous-rxt fast path next time. */
398 bt->last_ooo = cur_index;
/* Case 3: rxt range starts inside a sample. */
402 bts_index = bt_sample_index (bt, bts);
403 next = bt_next_sample (bt, bts);
405 next = bt_fix_overlapped (bt, next, end, is_end);
407 max_seq = bts->max_seq;
408 ASSERT (seq_lt (start, max_seq));
410 /* Have to split or tail overlap */
411 cur = tcp_bt_alloc_tx_sample (tc, start, end);
412 cur->flags |= TCP_BTS_IS_RXT;
413 if (bts->flags & TCP_BTS_IS_RXT)
414 cur->flags |= TCP_BTS_IS_RXT_LOST;
415 cur->prev = bts_index;
416 cur_index = bt_sample_index (bt, cur);
418 /* Split. Allocate another sample */
419 if (seq_lt (end, max_seq))
421 nbts = tcp_bt_alloc_tx_sample (tc, end, bts->max_seq);
/* Re-fetch after alloc: pool may have been reallocated. */
422 cur = bt_get_sample (bt, cur_index);
423 bts = bt_get_sample (bt, bts_index);
428 if (nbts->next != TCP_BTS_INVALID_INDEX)
430 next = bt_get_sample (bt, nbts->next);
431 next->prev = bt_sample_index (bt, nbts);
434 bt->tail = bt_sample_index (bt, nbts);
436 bts->next = nbts->prev = cur_index;
437 cur->next = bt_sample_index (bt, nbts);
/* Truncate the original sample at the rxt start. */
439 bts->max_seq = start;
440 bt->last_ooo = cur_index;
442 /* Tail completely overlapped */
445 bts = bt_get_sample (bt, bts_index);
446 bts->max_seq = start;
448 if (bts->next != TCP_BTS_INVALID_INDEX)
450 next = bt_get_sample (bt, bts->next);
451 next->prev = cur_index;
454 bt->tail = cur_index;
456 cur->next = bts->next;
457 bts->next = cur_index;
459 bt->last_ooo = cur_index;
/* Fold a tx sample into the connection's rate sample @rs, keeping the sample
 * with the highest 'delivered' count (per the delivery-rate-estimation
 * draft). SACKed samples are skipped (guard body elided from this view).
 * Also advances tc->first_tx_time to this sample's tx time. */
464 tcp_bt_sample_to_rate_sample (tcp_connection_t * tc, tcp_bt_sample_t * bts,
465 tcp_rate_sample_t * rs)
467 if (bts->flags & TCP_BTS_IS_SACKED)
/* Keep only the sample with the largest prior-delivered count. */
470 if (rs->prior_delivered && rs->prior_delivered >= bts->delivered)
473 rs->prior_delivered = bts->delivered;
474 rs->prior_time = bts->delivered_time;
/* Send-side interval: time between first tx of the flight and this tx. */
475 rs->interval_time = bts->tx_time - bts->first_tx_time;
476 rs->rtt_time = tc->delivered_time - bts->tx_time;
477 rs->flags = bts->flags;
478 tc->first_tx_time = bts->tx_time;
/* Consume samples fully acknowledged by snd_una: fold each into @rs and free
 * it. A sample partially covered by snd_una is folded but kept (its min_seq
 * update is elided from this view — confirm against full source). */
482 tcp_bt_walk_samples (tcp_connection_t * tc, tcp_rate_sample_t * rs)
484 tcp_byte_tracker_t *bt = tc->bt;
485 tcp_bt_sample_t *next, *cur;
487 cur = bt_get_sample (bt, bt->head);
488 while (cur && seq_leq (cur->max_seq, tc->snd_una))
490 next = bt_next_sample (bt, cur);
491 tcp_bt_sample_to_rate_sample (tc, cur, rs);
492 bt_free_sample (bt, cur);
/* Head partially acked: account for it but keep it in the list. */
496 if (cur && seq_lt (cur->min_seq, tc->snd_una))
497 tcp_bt_sample_to_rate_sample (tc, cur, rs);
/* Process the peer's SACK blocks: for each newly sacked byte range, fold the
 * covered samples into @rs, flag them SACKED, and merge adjacent SACKED
 * samples to keep the list compact. Samples straddling a block edge are
 * split so only the sacked part is flagged. NOTE(review): several guard and
 * brace lines are elided from this view. */
501 tcp_bt_walk_samples_ooo (tcp_connection_t * tc, tcp_rate_sample_t * rs)
503 sack_block_t *blks = tc->rcv_opts.sacks, *blk;
504 tcp_byte_tracker_t *bt = tc->bt;
505 tcp_bt_sample_t *cur, *prev, *next;
508 for (i = 0; i < vec_len (blks); i++)
512 /* Ignore blocks that are already covered by snd_una */
513 if (seq_lt (blk->end, tc->snd_una))
516 cur = bt_lookup_seq (bt, blk->start);
520 ASSERT (seq_geq (blk->start, cur->min_seq)
521 && seq_lt (blk->start, cur->max_seq));
523 /* Current should be split. Second part will be consumed */
524 if (PREDICT_FALSE (cur->min_seq != blk->start))
526 cur = bt_split_sample (bt, cur, blk->start);
527 prev = bt_prev_sample (bt, cur);
530 prev = bt_prev_sample (bt, cur);
/* Consume all samples fully inside the SACK block. */
532 while (cur && seq_leq (cur->max_seq, blk->end))
534 if (!(cur->flags & TCP_BTS_IS_SACKED))
536 tcp_bt_sample_to_rate_sample (tc, cur, rs);
537 cur->flags |= TCP_BTS_IS_SACKED;
/* Coalesce with a SACKED predecessor... */
538 if (prev && (prev->flags & TCP_BTS_IS_SACKED))
540 cur = bt_merge_sample (bt, prev, cur);
541 next = bt_next_sample (bt, cur);
545 next = bt_next_sample (bt, cur);
/* ...and with a SACKED successor. */
546 if (next && (next->flags & TCP_BTS_IS_SACKED))
548 cur = bt_merge_sample (bt, cur, next);
549 next = bt_next_sample (bt, cur);
554 next = bt_next_sample (bt, cur);
/* Sample straddles the block end: account for the sacked prefix. */
560 if (cur && seq_lt (cur->min_seq, blk->end))
562 tcp_bt_sample_to_rate_sample (tc, cur, rs);
563 prev = bt_prev_sample (bt, cur);
564 /* Extend previous to include the newly sacked bytes */
565 if (prev && (prev->flags & TCP_BTS_IS_SACKED))
567 prev->max_seq = blk->end;
568 bt_update_sample (bt, cur, blk->end);
570 /* Split sample into two. First part is consumed */
573 next = bt_split_sample (bt, cur, blk->end);
574 cur = bt_prev_sample (bt, next);
575 cur->flags |= TCP_BTS_IS_SACKED;
/* Main entry point: generate a delivery rate sample @rs for this ACK, per
 * draft-cheng-iccrg-delivery-rate-estimation. Updates tc->delivered and
 * delivered_time, clears the app-limited marker once exceeded (body elided),
 * walks acked then sacked samples, and derives interval/delivered/lost.
 * NOTE(review): early-return and brace lines are elided from this view. */
582 tcp_bt_sample_delivery_rate (tcp_connection_t * tc, tcp_rate_sample_t * rs)
/* No rate samples once a FIN has been sent. */
586 if (PREDICT_FALSE (tc->flags & TCP_CONN_FINSNT))
589 delivered = tc->bytes_acked + tc->sack_sb.last_sacked_bytes;
590 if (!delivered || tc->bt->head == TCP_BTS_INVALID_INDEX)
593 /* Do not count bytes that were previously sacked again */
594 tc->delivered += delivered - tc->sack_sb.last_bytes_delivered;
595 tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
/* App-limited period ends once delivery passes the recorded watermark. */
597 if (tc->app_limited && tc->delivered > tc->app_limited)
601 tcp_bt_walk_samples (tc, rs);
603 if (tc->sack_sb.last_sacked_bytes)
604 tcp_bt_walk_samples_ooo (tc, rs);
/* Use the longer of send and ack intervals (second operand elided here). */
606 rs->interval_time = clib_max ((tc->delivered_time - rs->prior_time),
608 rs->delivered = tc->delivered - rs->prior_delivered;
609 rs->acked_and_sacked = delivered;
610 rs->lost = tc->sack_sb.last_lost_bytes;
/* Free every tracked sample. Indices are first collected into a vector
 * because freeing while pool_foreach iterates the pool is unsafe; vector is
 * presumably freed after the loop (line elided from this view). */
614 tcp_bt_flush_samples (tcp_connection_t * tc)
616 tcp_byte_tracker_t *bt = tc->bt;
617 tcp_bt_sample_t *bts;
618 u32 *samples = 0, *si;
/* Pre-size the vector to the pool population, then reset for appends. */
620 vec_validate (samples, pool_elts (bt->samples) - 1);
621 vec_reset_length (samples);
624 pool_foreach (bts, bt->samples, ({
625 vec_add1 (samples, bts - bt->samples);
629 vec_foreach (si, samples)
631 bts = bt_get_sample (bt, *si);
632 bt_free_sample (bt, bts);
/* Release all byte-tracker storage for a closing connection: rb-tree nodes
 * and the sample pool. The tracker struct itself is presumably freed by a
 * line elided from this view — confirm against full source. */
639 tcp_bt_cleanup (tcp_connection_t * tc)
641 tcp_byte_tracker_t *bt = tc->bt;
643 rb_tree_free_nodes (&bt->sample_lookup);
644 pool_free (bt->samples);
/* Allocate and zero a byte tracker for @tc, initialize the sample-lookup
 * rb-tree and mark the sample list empty. Assignment of bt to tc->bt is
 * presumably on a line elided from this view. */
650 tcp_bt_init (tcp_connection_t * tc)
652 tcp_byte_tracker_t *bt;
654 bt = clib_mem_alloc (sizeof (tcp_byte_tracker_t));
655 clib_memset (bt, 0, sizeof (tcp_byte_tracker_t));
657 rb_tree_init (&bt->sample_lookup);
658 bt->head = bt->tail = TCP_BTS_INVALID_INDEX;
/* vlib format helper for one sample: sequence range relative to the initial
 * send sequence (iss), delivered count, and ages of the delivery/tx/first-tx
 * timestamps relative to now. Return statement elided from this view. */
663 format_tcp_bt_sample (u8 * s, va_list * args)
665 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
666 tcp_bt_sample_t *bts = va_arg (*args, tcp_bt_sample_t *);
667 f64 now = tcp_time_now_us (tc->c_thread_index);
668 s = format (s, "[%u, %u] d %u dt %.3f txt %.3f ftxt %.3f flags 0x%x",
669 bts->min_seq - tc->iss, bts->max_seq - tc->iss, bts->delivered,
670 now - bts->delivered_time, now - bts->tx_time,
671 now - bts->first_tx_time, bts->flags);
/* vlib format helper for the whole tracker: print every sample from head to
 * tail, one per line. Loop header and return are elided from this view. */
676 format_tcp_bt (u8 * s, va_list * args)
678 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
679 tcp_byte_tracker_t *bt = tc->bt;
680 tcp_bt_sample_t *bts;
682 bts = bt_get_sample (bt, bt->head);
685 s = format (s, "%U\n", format_tcp_bt_sample, tc, bts);
686 bts = bt_next_sample (bt, bts);
693 * fd.io coding-style-patch-verification: ON
696 * eval: (c-set-style "gnu")