From ca1c8f3e782dc68a51aa2792771d9b4aac696ddd Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 23 May 2018 21:01:30 -0700 Subject: [PATCH] tcp: loss recovery improvements/fixes - fix newreno cwnd computation - reset snd_una_max on entering recovery - accept acks beyond snd_nxt but less than snd_congestion when in recovery - avoid entering fast recovery multiple times when using sacks - avoid as much as possible sending small segments when doing fast retransmit - more event logging Change-Id: I19dd151d7704e39d4eae06de3a26f5e124875366 Signed-off-by: Florin Coras --- src/vnet/session/session.c | 2 +- src/vnet/session/session_node.c | 9 +- src/vnet/tcp/tcp.c | 6 +- src/vnet/tcp/tcp.h | 6 +- src/vnet/tcp/tcp_debug.h | 274 +++++++++++++++++++++++++--------------- src/vnet/tcp/tcp_input.c | 69 ++++++---- src/vnet/tcp/tcp_newreno.c | 2 +- src/vnet/tcp/tcp_output.c | 26 ++-- 8 files changed, 248 insertions(+), 146 deletions(-) diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 2697c26381e..c0163255fb6 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -774,7 +774,7 @@ stream_session_reset_notify (transport_connection_t * tc) stream_session_t *s; application_t *app; s = session_get (tc->s_index, tc->thread_index); - + s->session_state = SESSION_STATE_CLOSED; app = application_get (s->app_index); app->cb_fns.session_reset_callback (s); } diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 46fc4dc8745..e046efba81f 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -260,6 +260,8 @@ session_tx_not_ready (stream_session_t * s, u8 peek_data) * session is not ready or closed */ if (s->session_state < SESSION_STATE_READY) return 1; + if (s->session_state == SESSION_STATE_CLOSED) + return 2; } return 0; } @@ -364,11 +366,12 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, session_tx_context_t *ctx = &smm->ctx[thread_index]; transport_proto_t tp; vlib_buffer_t *pb; - u16 n_bufs; + u16 n_bufs, rv; - if (PREDICT_FALSE (session_tx_not_ready (s, peek_data))) + if (PREDICT_FALSE ((rv = session_tx_not_ready (s, peek_data)))) { - vec_add1 (smm->pending_event_vector[thread_index], *e); + if (rv < 2) + vec_add1 (smm->pending_event_vector[thread_index], *e); return 0; } diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 25292d1e588..15ac7d37edc 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -734,9 +734,9 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n", tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, tc->snd_wl2 - tc->iss); - s = format (s, " flight size %u send space %u rcv_wnd_av %d\n", + s = format (s, " flight size %u out space %u cc space %u rcv_wnd_av %u\n", tcp_flight_size (tc), tcp_available_output_snd_space (tc), - tcp_rcv_wnd_available (tc)); + tcp_available_cc_snd_space (tc), tcp_rcv_wnd_available (tc)); s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); @@ -1022,7 +1022,7 @@ tcp_snd_space (tcp_connection_t * tc) * bytes of previously unsent data. */ if (tcp_in_fastrecovery (tc) && !tcp_fastrecovery_sent_1_smss (tc)) { - if (tcp_available_output_snd_space (tc) < tc->snd_mss) + if (tcp_available_cc_snd_space (tc) < tc->snd_mss) return 0; tcp_fastrecovery_1_smss_on (tc); return tc->snd_mss; diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 837b5b4d0d2..10aa721a4eb 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -119,7 +119,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; _(FAST_RECOVERY, "Fast Recovery") \ _(FR_1_SMSS, "Sent 1 SMSS") \ _(HALF_OPEN_DONE, "Half-open completed") \ - _(FINPNDG, "FIN pending") + _(FINPNDG, "FIN pending") \ typedef enum _tcp_connection_flag_bits { @@ -617,7 +617,7 @@ tcp_available_output_snd_space (const tcp_connection_t * tc) * Estimate of how many bytes we can still push into the network */ always_inline u32 -tcp_available_snd_space (const tcp_connection_t * tc) +tcp_available_cc_snd_space (const tcp_connection_t * tc) { u32 available_wnd = tcp_available_snd_wnd (tc); u32 flight_size = tcp_flight_size (tc); @@ -652,6 +652,7 @@ fib_node_index_t tcp_lookup_rmt_in_fib (tcp_connection_t * tc); /* Made public for unit testing only */ void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); +u32 tcp_sack_list_bytes (tcp_connection_t * tc); always_inline u32 tcp_time_now (void) @@ -791,7 +792,6 @@ tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) void scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole); -void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb); sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, u32 start, u32 end); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 4af4f2e7052..a52efe00720 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -56,6 +56,9 @@ _(CC_PACK, "cc partial ack") \ _(CC_STAT, "cc stats") \ _(CC_RTO_STAT, "cc rto stats") \ + _(CC_SCOREBOARD, "scoreboard stats") \ + _(CC_SACKS, "snd sacks stats") \ + _(CC_INPUT, "ooo data delivered") \ _(SEG_INVALID, "invalid segment") \ _(PAWS_FAIL, "failed paws check") \ _(ACK_RCV_ERR, "invalid ack") \ @@ -192,7 +195,7 @@ typedef enum _tcp_dbg_evt ed->data[0] = _tc->c_c_index; \ } -#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...) \ +#define TCP_EVT_SYN_RCVD_HANDLER(_tc,_init, ...) \ { \ if (_init) \ TCP_EVT_INIT_HANDLER(_tc, 0); \ @@ -277,9 +280,9 @@ typedef enum _tcp_dbg_evt }; \ DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->iss; \ - ed->data[1] = _tc->snd_una - _tc->iss; \ + ed->data[1] = _tc->snd_una - _tc->iss; \ ed->data[2] = _tc->snd_una_max - _tc->iss; \ - ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } @@ -288,14 +291,14 @@ typedef enum _tcp_dbg_evt ELOG_TYPE_DECLARE (_e) = \ { \ .format = "synack-tx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\ - .format_args = "i4i4i4i4i4", \ + .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->iss; \ ed->data[1] = _tc->irs; \ - ed->data[2] = _tc->snd_una - _tc->iss; \ - ed->data[3] = _tc->snd_nxt - _tc->iss; \ - ed->data[4] = _tc->rcv_nxt - _tc->irs; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ } #define TCP_EVT_SYNACK_RCVD_HANDLER(_tc, ...) \ @@ -303,14 +306,14 @@ typedef enum _tcp_dbg_evt ELOG_TYPE_DECLARE (_e) = \ { \ .format = "synack-rx: iss %u irs %u snd_una %u snd_nxt %u rcv_nxt %u",\ - .format_args = "i4i4i4i4i4", \ + .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->iss; \ ed->data[1] = _tc->irs; \ - ed->data[2] = _tc->snd_una - _tc->iss; \ - ed->data[3] = _tc->snd_nxt - _tc->iss; \ - ed->data[4] = _tc->rcv_nxt - _tc->irs; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ TCP_EVT_STATE_CHANGE_HANDLER(_tc); \ } @@ -371,7 +374,7 @@ if (_tc) \ ELOG_TYPE_DECLARE (_e) = \ { \ .format = "%s-rxt: iss %u irs %u snd_nxt %u rcv_nxt %u", \ - .format_args = "t4i4i4i4i4", \ + .format_args = "t4i4i4i4i4", \ .n_enum_strings = 2, \ .enum_strings = { \ "syn", \ @@ -382,10 +385,9 @@ if (_tc) \ ed->data[0] = _type; \ ed->data[1] = _tc->iss; \ ed->data[2] = _tc->irs; \ - ed->data[3] = _tc->snd_nxt - _tc->iss; \ - ed->data[4] = _tc->rcv_nxt - _tc->irs; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->rcv_nxt - _tc->irs; \ } - #else #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) #define TCP_EVT_SYNACK_SENT_HANDLER(_tc, ...) @@ -399,6 +401,81 @@ if (_tc) \ #endif #if TCP_DEBUG_SM > 1 +#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _btcp, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _btcp.seq_number - _tc->irs; \ + ed->data[1] = _btcp.seq_end - _tc->irs; \ + ed->data[2] = _tc->rcv_las - _tc->irs; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_wnd; \ +} + +#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "paws-err: seq %u end %u tsval %u tsval_recent %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _seq - _tc->irs; \ + ed->data[1] = _end - _tc->irs; \ + ed->data[2] = _tc->rcv_opts.tsval; \ + ed->data[3] = _tc->tsval_recent; \ +} + +#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 3, \ + .enum_strings = { \ + "invalid", \ + "old", \ + "future", \ + }, \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _type; \ + ed->data[1] = _ack - _tc->iss; \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->snd_una_max - _tc->iss; \ +} + +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \ +{ \ +if (_av > 0) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_wnd; \ + ed->data[1] = _obs; \ + ed->data[2] = _av; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_las - _tc->irs; \ +} \ +} +#else +#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _btcp, ...) +#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) +#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) +#endif + +#if TCP_DEBUG_SM > 2 #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) \ { \ @@ -505,90 +582,18 @@ if (_tc) \ _tc_index); \ } \ } - -#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) \ -{ \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\ - .format_args = "i4i4i4i4i4", \ - }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _seq - _tc->irs; \ - ed->data[1] = _end - _tc->irs; \ - ed->data[2] = _tc->rcv_las - _tc->irs; \ - ed->data[3] = _tc->rcv_nxt - _tc->irs; \ - ed->data[4] = _tc->rcv_wnd; \ -} - -#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) \ -{ \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "paws-err: seq %u end %u tsval %u tsval_recent %u", \ - .format_args = "i4i4i4i4", \ - }; \ - DECLARE_ETD(_tc, _e, 4); \ - ed->data[0] = _seq - _tc->irs; \ - ed->data[1] = _end - _tc->irs; \ - ed->data[2] = _tc->rcv_opts.tsval; \ - ed->data[3] = _tc->tsval_recent; \ -} - -#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) \ -{ \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "ack-err: %s ack %u snd_una %u snd_nxt %u una_max %u", \ - .format_args = "t4i4i4i4i4", \ - .n_enum_strings = 3, \ - .enum_strings = { \ - "invalid", \ - "old", \ - "future", \ - }, \ - }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _type; \ - ed->data[1] = _ack - _tc->iss; \ - ed->data[2] = _tc->snd_una - _tc->iss; \ - ed->data[3] = _tc->snd_nxt - _tc->iss; \ - ed->data[4] = _tc->snd_una_max - _tc->iss; \ -} - -#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \ -{ \ -if (_av > 0) \ -{ \ - ELOG_TYPE_DECLARE (_e) = \ - { \ - .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ - .format_args = "i4i4i4i4i4", \ - }; \ - DECLARE_ETD(_tc, _e, 5); \ - ed->data[0] = _tc->rcv_wnd; \ - ed->data[1] = _obs; \ - ed->data[2] = _av; \ - ed->data[3] = _tc->rcv_nxt - _tc->irs; \ - ed->data[4] = _tc->rcv_las - _tc->irs; \ -} \ -} #else #define TCP_EVT_ACK_SENT_HANDLER(_tc, ...) #define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) #define TCP_EVT_PKTIZE_HANDLER(_tc, ...) #define TCP_EVT_INPUT_HANDLER(_tc, _type, _len, _written, ...) #define TCP_EVT_TIMER_POP_HANDLER(_tc_index, _timer_id, ...) -#define TCP_EVT_SEG_INVALID_HANDLER(_tc, _seq, _end, ...) -#define TCP_EVT_PAWS_FAIL_HANDLER(_tc, _seq, _end, ...) -#define TCP_EVT_ACK_RCV_ERR_HANDLER(_tc, _type, _ack, ...) -#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) #endif /* * State machine verbose */ -#if TCP_DEBUG_SM > 2 +#if TCP_DEBUG_SM > 3 #define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ @@ -626,9 +631,9 @@ if (_av > 0) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "cc: %s snd_space %u snd_cong %u rxt_bytes %u", \ - .format_args = "t4i4i4i4", \ - .n_enum_strings = 6, \ + .format = "cc: %s snd_space %u snd_una %u out %u flight %u", \ + .format_args = "t4i4i4i4i4", \ + .n_enum_strings = 7, \ .enum_strings = { \ "fast-rxt", \ "rxt-timeout", \ @@ -636,13 +641,15 @@ if (_av > 0) \ "recovered", \ "congestion", \ "undo", \ + "recovery", \ }, \ }; \ - DECLARE_ETD(_tc, _e, 4); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _sub_evt; \ - ed->data[1] = tcp_available_snd_space (_tc); \ - ed->data[2] = _tc->snd_congestion - _tc->iss; \ - ed->data[3] = _tc->snd_rxt_bytes; \ + ed->data[1] = tcp_available_cc_snd_space (_tc); \ + ed->data[2] = _tc->snd_una - _tc->iss; \ + ed->data[3] = tcp_bytes_out(_tc); \ + ed->data[4] = tcp_flight_size (_tc); \ } #define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) \ @@ -659,19 +666,19 @@ if (_av > 0) \ ed->data[3] = _tc->snd_rxt_bytes; \ } -#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, _btcp, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "dack-tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\ + .format = "dack-tx: rcv_nxt %u seq %u rcv_wnd %u snd_nxt %u av_wnd %u",\ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->rcv_nxt - _tc->irs; \ - ed->data[1] = _tc->rcv_wnd; \ - ed->data[2] = _tc->snd_nxt - _tc->iss; \ - ed->data[3] = tcp_available_snd_wnd(_tc); \ - ed->data[4] = _tc->snd_wnd; \ + ed->data[1] = _btcp.seq_number - _tc->irs; \ + ed->data[2] = _tc->rcv_wnd; \ + ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = tcp_available_snd_wnd(_tc); \ } #define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) \ @@ -700,12 +707,75 @@ if (_av > 0) \ ed->data[0] = _tc->snd_una - _tc->iss; \ ed->data[1] = _tc->snd_una_max - _tc->iss; \ } +#define TCP_EVT_CC_SCOREBOARD_HANDLER(_tc, ...) \ +{ \ +if (TCP_DEBUG_CC > 1 && _tc->sack_sb.last_sacked_bytes) \ + { \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "sb1: holes %u lost %u sacked %u high %u highrxt %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = pool_elts(_tc->sack_sb.holes); \ + ed->data[1] = _tc->sack_sb.lost_bytes; \ + ed->data[2] = _tc->sack_sb.sacked_bytes; \ + ed->data[3] = _tc->sack_sb.high_sacked - _tc->iss; \ + ed->data[4] = _tc->sack_sb.high_rxt - _tc->iss; \ + } \ +if (TCP_DEBUG_CC > 1 && _tc->sack_sb.last_sacked_bytes) \ + { \ + sack_scoreboard_hole_t *hole; \ + hole = scoreboard_first_hole (&_tc->sack_sb); \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "sb2: first start: %u end %u last start %u end %u", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = hole ? hole->start - _tc->iss : 0; \ + ed->data[1] = hole ? hole->end - _tc->iss : 0; \ + hole = scoreboard_last_hole (&_tc->sack_sb); \ + ed->data[2] = hole ? hole->start - _tc->iss : 0; \ + ed->data[3] = hole ? hole->end - _tc->iss : 0; \ + } \ +} +#define TCP_EVT_CC_SACKS_HANDLER(_tc, ...) \ +{ \ +if (TCP_DEBUG_CC > 1) \ + { \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "sacks: blocks %u bytes %u", \ + .format_args = "i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 2); \ + ed->data[0] = vec_len (_tc->snd_sacks); \ + ed->data[1] = tcp_sack_list_bytes (_tc); \ + } \ +} +#define TCP_EVT_CC_INPUT_HANDLER(_tc, _len, _written, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "cc input: len %u written %d rcv_nxt %u rcv_wnd(o) %d", \ + .format_args = "i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 4); \ + ed->data[0] = _len; \ + ed->data[1] = _written; \ + ed->data[2] = _tc->rcv_nxt - _tc->irs; \ + ed->data[3] = _tc->rcv_wnd - (_tc->rcv_nxt - _tc->rcv_las); \ +} #else #define TCP_EVT_CC_RTX_HANDLER(_tc, offset, n_bytes, ...) -#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) +#define TCP_EVT_DUPACK_SENT_HANDLER(_tc, _btcp, ...) #define TCP_EVT_DUPACK_RCVD_HANDLER(_tc, ...) #define TCP_EVT_CC_PACK_HANDLER(_tc, ...) #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) +#define TCP_EVT_CC_SCOREBOARD_HANDLER(_tc, ...) +#define TCP_EVT_CC_SACKS_HANDLER(_tc, ...) +#define TCP_EVT_CC_INPUT_HANDLER(_tc, _len, _written, ...) #endif /* diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index c86432037fd..19ecc7deef8 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -275,6 +275,14 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0, u32 * error0) { + /* We could get a burst of RSTs interleaved with acks */ + if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED)) + { + tcp_send_reset (tc0); + *error0 = TCP_ERROR_CONNECTION_CLOSED; + goto drop; + } + if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) { *error0 = TCP_ERROR_SEGMENT_INVALID; @@ -292,13 +300,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, { *error0 = TCP_ERROR_PAWS; if (CLIB_DEBUG > 2) - { - clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2); - clib_warning ("seq %u seq_end %u ack %u", - vnet_buffer (b0)->tcp.seq_number - tc0->irs, - vnet_buffer (b0)->tcp.seq_end - tc0->irs, - vnet_buffer (b0)->tcp.ack_number - tc0->iss); - } + clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2); TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end); @@ -317,7 +319,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (!tcp_rst (th0)) { tcp_make_ack (tc0, b0); - TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp); goto error; } } @@ -329,7 +331,6 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, vnet_buffer (b0)->tcp.seq_end)) { *error0 = TCP_ERROR_RCV_WND; - /* If our window is 0 and the packet is in sequence, let it pass * through for ack processing. It should be dropped later. */ if (!(tc0->rcv_wnd == 0 @@ -339,7 +340,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (!tcp_rst (th0)) { tcp_make_ack (tc0, b0); - TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp); goto error; } goto drop; @@ -889,13 +890,14 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) scoreboard_update_bytes (tc, sb); sb->last_sacked_bytes = sb->sacked_bytes - (old_sacked_bytes - sb->last_bytes_delivered); - ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes); + ASSERT (sb->last_sacked_bytes <= sb->sacked_bytes || tcp_in_recovery (tc)); ASSERT (sb->sacked_bytes == 0 || sb->sacked_bytes < tc->snd_una_max - seq_max (tc->snd_una, ack)); ASSERT (sb->last_sacked_bytes + sb->lost_bytes <= tc->snd_una_max - seq_max (tc->snd_una, ack)); ASSERT (sb->head == TCP_INVALID_SACK_HOLE_INDEX || tcp_in_recovery (tc) || sb->holes[sb->head].start == ack + sb->snd_una_adv); + TCP_EVT_DBG (TCP_EVT_CC_SCOREBOARD, tc); } /** @@ -1063,11 +1065,18 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { u32 rxt_delivered; + if (tcp_in_fastrecovery (tc) && tcp_opts_sack_permitted (&tc->rcv_opts)) + { + if (tc->bytes_acked) + goto partial_ack; + tcp_fast_retransmit (tc); + return; + } /* * Duplicate ACK. Check if we should enter fast recovery, or if already in * it account for the bytes that left the network. */ - if (is_dack && !tcp_in_recovery (tc)) + else if (is_dack && !tcp_in_recovery (tc)) { TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); ASSERT (tc->snd_una != tc->snd_una_max @@ -1128,7 +1137,6 @@ tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) { tcp_fast_retransmit_no_sack (tc); } - return; } else if (!tc->bytes_acked @@ -1237,6 +1245,16 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))) { + /* When we entered recovery, we reset snd_nxt to snd_una. Seems peer + * still has the data so accept the ack */ + if (tcp_in_recovery (tc) + && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_congestion) + && seq_geq (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) + { + tc->snd_una_max = tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + goto process_ack; + } + /* If we have outstanding data and this is within the window, accept it, * probably retransmit has timed out. Otherwise ACK segment and then * drop it */ @@ -1264,9 +1282,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1, vnet_buffer (b)->tcp.ack_number); if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) - { - tcp_cc_handle_event (tc, 1); - } + tcp_cc_handle_event (tc, 1); /* Don't drop yet */ return 0; } @@ -1274,7 +1290,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, /* * Looks okay, process feedback */ - +process_ack: if (tcp_opts_sack_permitted (&tc->rcv_opts)) tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); @@ -1390,6 +1406,15 @@ tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) ASSERT (tcp_sack_vector_is_sane (tc->snd_sacks)); } +u32 +tcp_sack_list_bytes (tcp_connection_t * tc) +{ + u32 bytes = 0, i; + for (i = 0; i < vec_len (tc->snd_sacks); i++) + bytes += tc->snd_sacks[i].end - tc->snd_sacks[i].start; + return bytes; +} + /** Enqueue data for delivery to application */ always_inline int tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, @@ -1416,6 +1441,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, /* Send ACK confirming the update */ tc->flags |= TCP_CONN_SNDACK; + TCP_EVT_DBG (TCP_EVT_CC_INPUT, tc, data_len, written); } else if (written > 0) { @@ -1488,6 +1514,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, end = start + ooo_segment_length (s0->server_rx_fifo, newest); tcp_update_sack_list (tc, start, end); svm_fifo_newest_ooo_segment_reset (s0->server_rx_fifo); + TCP_EVT_DBG (TCP_EVT_CC_SACKS, tc); } } @@ -1508,7 +1535,7 @@ tcp_can_delack (tcp_connection_t * tc) /* constrained to send ack */ || (tc->flags & TCP_CONN_SNDACK) != 0 /* we're almost out of tx wnd */ - || tcp_available_snd_space (tc) < 4 * tc->snd_mss) + || tcp_available_cc_snd_space (tc) < 4 * tc->snd_mss) return 0; return 1; @@ -1592,7 +1619,7 @@ tcp_segment_rcv (tcp_connection_t * tc, vlib_buffer_t * b, u32 * next0) *next0 = tcp_next_output (tc->c_is_ip4); tcp_make_ack (tc, b); vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK; - TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc); + TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc, vnet_buffer (b)->tcp); goto done; } @@ -1773,9 +1800,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, &error0))) { tcp_maybe_inc_err_counter (err_counters, error0); - TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0, - vnet_buffer (b0)->tcp.seq_number, - vnet_buffer (b0)->tcp.seq_end); + TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0, vnet_buffer (b0)->tcp); goto done; } diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index 0f43d21dfde..a9ec58c262f 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -41,8 +41,8 @@ newreno_rcv_ack (tcp_connection_t * tc) if (tc->cwnd_acc_bytes >= tc->cwnd) { u32 inc = tc->cwnd_acc_bytes / tc->cwnd; - tc->cwnd += inc * tc->snd_mss; tc->cwnd_acc_bytes -= inc * tc->cwnd; + tc->cwnd += inc * tc->snd_mss; } tc->cwnd = clib_min (tc->cwnd, transport_tx_fifo_size (&tc->connection)); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 27450654f71..a036287a51c 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -389,6 +389,7 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, { case TCP_STATE_ESTABLISHED: case TCP_STATE_FIN_WAIT_1: + case TCP_STATE_CLOSED: return tcp_make_established_options (tc, opts); case TCP_STATE_SYN_RCVD: return tcp_make_synack_options (tc, opts); @@ -1337,8 +1338,9 @@ done: * Reset congestion control, switch cwnd to loss window and try again. */ static void -tcp_rtx_timeout_cc (tcp_connection_t * tc) +tcp_rxt_timeout_cc (tcp_connection_t * tc) { + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 6); tc->prev_ssthresh = tc->ssthresh; tc->prev_cwnd = tc->cwnd; @@ -1383,6 +1385,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; } + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); + if (tc->state >= TCP_STATE_ESTABLISHED) { /* Lost FIN, retransmit and return */ @@ -1414,13 +1418,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* First retransmit timeout */ if (tc->rto_boff == 1) - tcp_rtx_timeout_cc (tc); + tcp_rxt_timeout_cc (tc); tc->snd_una_max = tc->snd_nxt = tc->snd_una; tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - /* Send one segment. Note that n_bytes may be zero due to buffer shortfall */ n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); @@ -1627,7 +1629,7 @@ void tcp_fast_retransmit_sack (tcp_connection_t * tc) { vlib_main_t *vm = vlib_get_main (); - u32 n_written = 0, offset, max_bytes; + u32 n_written = 0, offset, max_bytes, n_segs = 0; vlib_buffer_t *b = 0; sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; @@ -1636,14 +1638,17 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) u8 snd_limited = 0, can_rescue = 0; ASSERT (tcp_in_fastrecovery (tc)); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); old_snd_nxt = tc->snd_nxt; sb = &tc->sack_sb; - snd_space = tcp_available_snd_space (tc); + snd_space = tcp_available_cc_snd_space (tc); + if (snd_space < tc->snd_mss) + goto done; + + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); - while (hole && snd_space > 0) + while (hole && snd_space > 0 && n_segs++ < VLIB_FRAME_SIZE) { hole = scoreboard_next_rxt_hole (sb, hole, tcp_fastrecovery_sent_1_smss (tc), @@ -1717,7 +1722,7 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) /* Start resending from first un-acked segment */ old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_snd_space (tc); + snd_space = tcp_available_cc_snd_space (tc); while (snd_space > 0) { @@ -1743,8 +1748,7 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) void tcp_fast_retransmit (tcp_connection_t * tc) { - if (tcp_opts_sack_permitted (&tc->rcv_opts) - && scoreboard_first_hole (&tc->sack_sb)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) tcp_fast_retransmit_sack (tc); else tcp_fast_retransmit_no_sack (tc); -- 2.16.6