/*
 * Copyright (c) 2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @file
 * TCP byte tracker that can generate delivery rate estimates. Based on
 * draft-cheng-iccrg-delivery-rate-estimation-00
 */
19 #include <vnet/tcp/tcp.h>
21 static tcp_bt_sample_t *
22 bt_get_sample (tcp_byte_tracker_t * bt, u32 bts_index)
24 if (pool_is_free_index (bt->samples, bts_index))
26 return pool_elt_at_index (bt->samples, bts_index);
29 static tcp_bt_sample_t *
30 bt_next_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
32 return bt_get_sample (bt, bts->next);
35 static tcp_bt_sample_t *
36 bt_prev_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
38 return bt_get_sample (bt, bts->prev);
42 bt_sample_index (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
45 return TCP_BTS_INVALID_INDEX;
46 return bts - bt->samples;
50 bt_seq_lt (u32 a, u32 b)
55 static tcp_bt_sample_t *
56 bt_alloc_sample (tcp_byte_tracker_t * bt, u32 min_seq)
60 pool_get_zero (bt->samples, bts);
61 bts->next = bts->prev = TCP_BTS_INVALID_INDEX;
62 bts->min_seq = min_seq;
63 rb_tree_add_custom (&bt->sample_lookup, bts->min_seq, bts - bt->samples,
69 bt_free_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts)
71 if (bts->prev != TCP_BTS_INVALID_INDEX)
73 tcp_bt_sample_t *prev = bt_prev_sample (bt, bts);
74 prev->next = bts->next;
79 if (bts->next != TCP_BTS_INVALID_INDEX)
81 tcp_bt_sample_t *next = bt_next_sample (bt, bts);
82 next->prev = bts->prev;
87 rb_tree_del_custom (&bt->sample_lookup, bts->min_seq, bt_seq_lt);
89 memset (bts, 0xfc, sizeof (*bts));
90 pool_put (bt->samples, bts);
/**
 * Find the sample covering @a seq, i.e., the sample with the largest
 * min_seq that is <= seq, by descending the rb-tree keyed on min_seq.
 * Returns 0 when no sample covers @a seq.
 *
 * NOTE(review): this capture of the file is missing lines (braces,
 * returns, the bts declaration) — only comments were added here, the
 * surviving code is byte-identical.
 */
93 static tcp_bt_sample_t *
94 bt_lookup_seq (tcp_byte_tracker_t * bt, u32 seq)
96 rb_tree_t *rt = &bt->sample_lookup;
97 rb_node_t *cur, *prev;
/* Empty tree: root is the sentinel tnil node */
100 cur = rb_node (rt, rt->root);
101 if (rb_node_is_tnil (rt, cur))
/* Standard BST descent; prev tracks the last real node visited */
104 while (seq != cur->key)
107 if (seq_lt (seq, cur->key))
108 cur = rb_node_left (rt, cur);
110 cur = rb_node_right (rt, cur);
112 if (rb_node_is_tnil (rt, cur))
114 /* Hit tnil as a left child. Find predecessor */
115 if (seq_lt (seq, prev->key))
117 cur = rb_tree_predecessor (rt, prev);
118 if (rb_node_is_tnil (rt, cur))
/* node opaque stores the sample's pool index */
120 bts = bt_get_sample (bt, cur->opaque)
122 /* Hit tnil as a right child */
125 bts = bt_get_sample (bt, prev->opaque);
/* Candidate covers seq only if seq is at/after its min_seq */
128 if (seq_geq (seq, bts->min_seq))
/* Exact-key hit: return the matching sample */
135 if (!rb_node_is_tnil (rt, cur))
136 return bt_get_sample (bt, cur->opaque);
142 bt_update_sample (tcp_byte_tracker_t * bt, tcp_bt_sample_t * bts, u32 seq)
144 rb_tree_del_custom (&bt->sample_lookup, bts->min_seq, bt_seq_lt);
146 rb_tree_add_custom (&bt->sample_lookup, bts->min_seq,
147 bt_sample_index (bt, bts), bt_seq_lt);
/**
 * Starting at @a start, drop/trim samples that are overlapped by a
 * (re)transmission that ends at @a seq. Fully overlapped samples are
 * freed; a partially overlapped one has its head trimmed to @a seq.
 *
 * NOTE(review): truncated capture — the cur initialization, loop body
 * scaffolding and return statements are missing; code left byte-identical.
 */
150 static tcp_bt_sample_t *
151 bt_fix_overlapped (tcp_byte_tracker_t * bt, tcp_bt_sample_t * start,
154 tcp_bt_sample_t *cur, *next;
/* Free every sample whose successor still begins before seq */
157 while ((next = bt_next_sample (bt, cur)) && seq_lt (next->min_seq, seq))
159 bt_free_sample (bt, cur);
/* A successor at/after seq remains: current sample is consumed */
165 bt_free_sample (bt, cur);
169 /* Overlapping current entirely */
172 bt_free_sample (bt, cur);
176 /* Overlapping head of current but not all */
177 bt_update_sample (bt, cur, seq);
/**
 * Debug consistency check: pool vs lookup-tree element counts, head/tail
 * invariants, and the doubly-linked list's back-pointers and strict
 * min_seq ordering. Presumably returns 0 on the first violation and 1
 * when everything checks out — TODO confirm, returns are missing from
 * this truncated capture (only comments added; code byte-identical).
 */
182 tcp_bt_is_sane (tcp_byte_tracker_t * bt)
184 tcp_bt_sample_t *bts, *tmp;
/* rb-tree keeps one sentinel node, hence the -1 */
186 if (pool_elts (bt->samples) != pool_elts (bt->sample_lookup.nodes) - 1)
/* Empty tracker: tail must also be invalid and the pool empty */
189 if (bt->head == TCP_BTS_INVALID_INDEX)
191 if (bt->tail != TCP_BTS_INVALID_INDEX)
193 if (pool_elts (bt->samples) != 0)
/* Tail and head must resolve to live samples; head has no predecessor */
198 bts = bt_get_sample (bt, bt->tail);
202 bts = bt_get_sample (bt, bt->head);
203 if (!bts || bts->prev != TCP_BTS_INVALID_INDEX)
/* Each sample must be findable through the lookup tree by its min_seq */
208 tmp = bt_lookup_seq (bt, bts->min_seq);
213 tmp = bt_next_sample (bt, bts);
/* Successor's prev must point back at the current sample */
216 if (tmp->prev != bt_sample_index (bt, bts))
218 clib_warning ("next %u thinks prev is %u should be %u",
219 bts->next, tmp->prev, bt_sample_index (bt, bts));
/* Samples must be strictly ordered by min_seq along the list */
222 if (!seq_lt (bts->min_seq, tmp->min_seq))
/* No successor: this sample must be the recorded tail */
227 if (bt->tail != bt_sample_index (bt, bts))
229 if (bts->next != TCP_BTS_INVALID_INDEX)
237 static tcp_bt_sample_t *
238 tcp_bt_alloc_tx_sample (tcp_connection_t * tc, u32 min_seq)
240 tcp_bt_sample_t *bts;
241 bts = bt_alloc_sample (tc->bt, min_seq);
242 bts->delivered = tc->delivered;
243 bts->delivered_time = tc->delivered_time;
244 bts->tx_rate = transport_connection_tx_pacer_rate (&tc->connection);
245 bts->flags |= tc->app_limited ? TCP_BTS_IS_APP_LIMITED : 0;
250 tcp_bt_check_app_limited (tcp_connection_t * tc)
252 u32 available_bytes, flight_size;
254 available_bytes = transport_max_tx_dequeue (&tc->connection);
255 flight_size = tcp_flight_size (tc);
257 /* Not enough bytes to fill the cwnd */
258 if (available_bytes + flight_size + tc->snd_mss < tc->cwnd
259 /* Bytes considered lost have been retransmitted */
260 && tc->sack_sb.lost_bytes <= tc->snd_rxt_bytes)
261 tc->app_limited = tc->delivered + flight_size ? : 1;
265 tcp_bt_track_tx (tcp_connection_t * tc)
267 tcp_byte_tracker_t *bt = tc->bt;
268 tcp_bt_sample_t *bts, *tail;
271 if (!tcp_flight_size (tc))
272 tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
274 bts = tcp_bt_alloc_tx_sample (tc, tc->snd_nxt);
275 bts_index = bt_sample_index (bt, bts);
276 tail = bt_get_sample (bt, bt->tail);
279 tail->next = bts_index;
280 bts->prev = bt->tail;
281 bt->tail = bts_index;
285 bt->tail = bt->head = bts_index;
/**
 * Track a retransmission of [start, end): the covered portion of the old
 * tx sample(s) is replaced by a fresh sample flagged TCP_BTS_IS_RXT,
 * splitting the original when the rxt covers only its head.
 *
 * NOTE(review): truncated capture — braces, returns and several
 * statements (e.g. presumably max_seq updates) are missing; only
 * comments were added, the surviving code is byte-identical.
 */
290 tcp_bt_track_rxt (tcp_connection_t * tc, u32 start, u32 end)
292 tcp_byte_tracker_t *bt = tc->bt;
293 tcp_bt_sample_t *bts, *next, *cur, *prev, *nbts;
294 u32 bts_index, cur_index, next_index, prev_index, min_seq;
/* end == snd_nxt means the rxt range runs to the end of tracked data */
295 u8 is_end = end == tc->snd_nxt;
/* Fast path: this rxt extends the most recent out-of-order rxt sample */
297 bts = bt_get_sample (bt, bt->last_ooo);
298 if (bts && bts->max_seq == start)
301 next = bt_next_sample (bt, bts);
303 bt_fix_overlapped (bt, next, end, is_end);
308 /* Find original tx sample */
309 bts = bt_lookup_seq (bt, start);
311 ASSERT (bts != 0 && seq_geq (start, bts->min_seq));
313 /* Head in the past */
314 if (seq_lt (bts->min_seq, tc->snd_una))
315 bt_update_sample (bt, bts, tc->snd_una);
/* Case 1: rxt starts exactly at the sample's min_seq — replace it */
318 if (bts->min_seq == start)
320 prev_index = bts->prev;
321 next = bt_fix_overlapped (bt, bts, end, is_end);
322 next_index = bt_sample_index (bt, next);
/* New rxt sample takes the old sample's place in the list */
324 cur = tcp_bt_alloc_tx_sample (tc, start);
326 cur->flags |= TCP_BTS_IS_RXT;
327 cur->next = next_index;
328 cur->prev = prev_index;
330 cur_index = bt_sample_index (bt, cur);
/* Re-link successor (or make the new sample the tail) */
332 if (next_index != TCP_BTS_INVALID_INDEX)
334 next = bt_get_sample (bt, next_index);
335 next->prev = cur_index;
339 bt->tail = cur_index;
/* Re-link predecessor (or make the new sample the head) */
342 if (prev_index != TCP_BTS_INVALID_INDEX)
344 prev = bt_get_sample (bt, prev_index);
345 prev->next = cur_index;
349 bt->head = cur_index;
/* Remember the new sample for the fast-path extension above */
352 bt->last_ooo = cur_index;
/* Case 2: rxt starts inside the sample — split or tail overlap */
356 bts_index = bt_sample_index (bt, bts);
357 next = bt_next_sample (bt, bts);
359 next = bt_fix_overlapped (bt, next, end, is_end);
/* Upper bound of the region owned by bts after overlap fixups */
361 min_seq = next ? next->min_seq : tc->snd_nxt;
362 ASSERT (seq_lt (start, min_seq));
364 /* Have to split or tail overlap */
365 cur = tcp_bt_alloc_tx_sample (tc, start);
367 cur->flags |= TCP_BTS_IS_RXT;
368 cur->prev = bts_index;
369 cur_index = bt_sample_index (bt, cur);
371 /* Split. Allocate another sample */
372 if (seq_lt (end, min_seq))
/* nbts keeps the un-retransmitted remainder starting at end.
 * Note: alloc may move the pool, so cur/bts are re-fetched */
374 nbts = tcp_bt_alloc_tx_sample (tc, end);
375 cur = bt_get_sample (bt, cur_index);
376 bts = bt_get_sample (bt, bts_index);
381 if (nbts->next != TCP_BTS_INVALID_INDEX)
383 next = bt_get_sample (bt, nbts->next);
384 next->prev = bt_sample_index (bt, nbts);
387 bt->tail = bt_sample_index (bt, nbts);
/* Final order: bts -> cur (rxt) -> nbts (remainder) */
389 bts->next = nbts->prev = cur_index;
390 cur->next = bt_sample_index (bt, nbts);
392 bt->last_ooo = cur_index;
394 /* Tail completely overlapped */
397 bts = bt_get_sample (bt, bts_index);
399 if (bts->next != TCP_BTS_INVALID_INDEX)
401 next = bt_get_sample (bt, bts->next);
402 next->prev = cur_index;
405 bt->tail = cur_index;
/* Insert cur (rxt) right after the trimmed bts */
407 cur->next = bts->next;
408 bts->next = cur_index;
410 bt->last_ooo = cur_index;
415 tcp_bt_sample_to_rate_sample (tcp_connection_t * tc, tcp_bt_sample_t * bts,
416 tcp_rate_sample_t * rs)
418 if (rs->sample_delivered && rs->sample_delivered >= bts->delivered)
421 rs->sample_delivered = bts->delivered;
422 rs->delivered = tc->delivered - bts->delivered;
423 rs->ack_time = tc->delivered_time - bts->delivered_time;
424 rs->tx_rate = bts->tx_rate;
425 rs->flags = bts->flags;
/**
 * Walk samples from the head, folding every sample fully acked by
 * snd_una into the rate sample and freeing it.
 *
 * NOTE(review): truncated capture — braces, returns and loop advance
 * statements are missing; only comments added, code byte-identical.
 */
429 tcp_bt_walk_samples (tcp_connection_t * tc, tcp_rate_sample_t * rs)
431 tcp_byte_tracker_t *bt = tc->bt;
432 tcp_bt_sample_t *next, *cur;
434 cur = bt_get_sample (bt, bt->head);
435 tcp_bt_sample_to_rate_sample (tc, cur, rs);
/* Free samples whose successor still starts before snd_una, i.e.,
 * samples that are entirely acked */
436 while ((next = bt_get_sample (bt, cur->next))
437 && seq_lt (next->min_seq, tc->snd_una))
439 bt_free_sample (bt, cur);
440 tcp_bt_sample_to_rate_sample (tc, next, rs);
444 ASSERT (seq_lt (cur->min_seq, tc->snd_una));
446 /* All samples acked */
447 if (tc->snd_una == tc->snd_nxt)
449 ASSERT (pool_elts (bt->samples) == 1);
450 bt_free_sample (bt, cur);
454 /* Current sample completely consumed */
455 if (next && next->min_seq == tc->snd_una)
457 bt_free_sample (bt, cur);
/**
 * Walk samples covered by the peer's newly reported SACK blocks, folding
 * each sacked sample into the rate sample and freeing those consumed
 * entirely by a block.
 *
 * NOTE(review): truncated capture — the blk assignment, braces and loop
 * scaffolding are missing; only comments added, code byte-identical.
 */
463 tcp_bt_walk_samples_ooo (tcp_connection_t * tc, tcp_rate_sample_t * rs)
465 sack_block_t *blks = tc->rcv_opts.sacks, *blk;
466 tcp_byte_tracker_t *bt = tc->bt;
467 tcp_bt_sample_t *next, *cur;
/* Iterate the SACK blocks from the last options received */
470 for (i = 0; i < vec_len (blks); i++)
474 /* Ignore blocks that are already covered by snd_una */
475 if (seq_lt (blk->end, tc->snd_una))
/* Sample covering the block's start, if any is still tracked */
478 cur = bt_lookup_seq (bt, blk->start);
482 tcp_bt_sample_to_rate_sample (tc, cur, rs);
484 /* Current shouldn't be removed */
485 if (cur->min_seq != blk->start)
487 cur = bt_next_sample (bt, cur);
/* Free samples wholly inside the block, folding each into rs */
492 while ((next = bt_get_sample (bt, cur->next))
493 && seq_lt (next->min_seq, blk->end))
495 bt_free_sample (bt, cur);
496 tcp_bt_sample_to_rate_sample (tc, next, rs);
500 /* Current consumed entirely */
501 if (next && next->min_seq == blk->end)
502 bt_free_sample (bt, cur);
507 tcp_bt_sample_delivery_rate (tcp_connection_t * tc, tcp_rate_sample_t * rs)
511 if (PREDICT_FALSE (tc->flags & TCP_CONN_FINSNT))
514 delivered = tc->bytes_acked + tc->sack_sb.last_sacked_bytes;
515 if (!delivered || tc->bt->head == TCP_BTS_INVALID_INDEX)
518 /* Do not count bytes that were previously sacked again */
519 tc->delivered += delivered - tc->sack_sb.last_bytes_delivered;
520 tc->delivered_time = tcp_time_now_us (tc->c_thread_index);
522 if (tc->app_limited && tc->delivered > tc->app_limited)
526 tcp_bt_walk_samples (tc, rs);
528 if (tc->sack_sb.last_sacked_bytes)
529 tcp_bt_walk_samples_ooo (tc, rs);
533 tcp_bt_flush_samples (tcp_connection_t * tc)
535 tcp_byte_tracker_t *bt = tc->bt;
536 tcp_bt_sample_t *bts;
537 u32 *samples = 0, *si;
539 vec_validate (samples, pool_elts (bt->samples) - 1);
542 pool_foreach (bts, bt->samples, ({
543 vec_add1 (samples, bts - bt->samples);
547 vec_foreach (si, samples)
549 bts = bt_get_sample (bt, *si);
550 bt_free_sample (bt, bts);
557 tcp_bt_cleanup (tcp_connection_t * tc)
559 tcp_byte_tracker_t *bt = tc->bt;
561 rb_tree_free_nodes (&bt->sample_lookup);
562 pool_free (bt->samples);
568 tcp_bt_init (tcp_connection_t * tc)
570 tcp_byte_tracker_t *bt;
572 bt = clib_mem_alloc (sizeof (tcp_byte_tracker_t));
573 clib_memset (bt, 0, sizeof (tcp_byte_tracker_t));
575 rb_tree_init (&bt->sample_lookup);
576 bt->head = bt->tail = TCP_BTS_INVALID_INDEX;
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */