From 3e350af5d3e9744a4529a28dd293b2d4601442f7 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Thu, 30 Mar 2017 02:54:28 -0700 Subject: [PATCH] TCP cc/window management fixes and debugging - added persist timer - update rcv_las whenever sending an ack - moved fifo size to its own cache line - improved session and builtin client debugging Change-Id: Ia649cf942cf0c061a713e8b67f0eb6974a6cd55b Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/svm/svm_fifo.c | 37 +++++----- src/svm/svm_fifo.h | 8 ++- src/vnet/session/node.c | 24 ++++--- src/vnet/session/session.c | 6 +- src/vnet/session/session.h | 12 +++- src/vnet/session/session_debug.h | 32 ++++++++- src/vnet/tcp/builtin_client.c | 146 +++++++++++++++++++++++++++------------ src/vnet/tcp/builtin_server.c | 12 +++- src/vnet/tcp/tcp.c | 6 +- src/vnet/tcp/tcp.h | 36 +++++++++- src/vnet/tcp/tcp_debug.h | 66 +++++++++++++----- src/vnet/tcp/tcp_input.c | 68 +++++++++--------- src/vnet/tcp/tcp_output.c | 79 +++++++++++++++++---- 13 files changed, 386 insertions(+), 146 deletions(-) diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 07b0d2dfbe3..cc84feb9a6d 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -254,6 +254,10 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) { ooo_segment_t *s; u32 index, bytes = 0, diff; + u32 cursize; + + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); @@ -286,8 +290,8 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) /* If tail is adjacent to an ooo segment, 'consume' it */ if (diff == 0) { - bytes = ((f->nitems - f->cursize) >= s->length) ? s->length : - f->nitems - f->cursize; + bytes = ((f->nitems - cursize) >= s->length) ? s->length : + f->nitems - cursize; f->tail += bytes; f->tail %= f->nitems; @@ -305,11 +309,12 @@ svm_fifo_enqueue_internal (svm_fifo_t * f, u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; - if (PREDICT_FALSE (f->cursize == f->nitems)) + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); + + if (PREDICT_FALSE (cursize == f->nitems)) return -2; /* fifo stuffed */ - /* read cursize, which can only decrease while we're working */ - cursize = f->cursize; nitems = f->nitems; /* Number of bytes we're going to copy */ @@ -382,8 +387,8 @@ svm_fifo_enqueue_with_offset_internal (svm_fifo_t * f, ASSERT (offset > 0); - /* read cursize, which can only decrease while we're working */ - cursize = f->cursize; + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); nitems = f->nitems; /* Will this request fit? 
*/ @@ -437,11 +442,11 @@ svm_fifo_dequeue_internal (svm_fifo_t * f, u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems; - if (PREDICT_FALSE (f->cursize == 0)) + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); + if (PREDICT_FALSE (cursize == 0)) return -2; /* nothing in the fifo */ - /* read cursize, which can only increase while we're working */ - cursize = f->cursize; nitems = f->nitems; /* Number of bytes we're going to copy */ @@ -495,11 +500,11 @@ svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; u32 cursize, nitems, real_head; - if (PREDICT_FALSE (f->cursize == 0)) + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); + if (PREDICT_FALSE (cursize == 0)) return -2; /* nothing in the fifo */ - /* read cursize, which can only increase while we're working */ - cursize = f->cursize; nitems = f->nitems; real_head = f->head + offset; real_head = real_head >= nitems ? real_head - nitems : real_head; @@ -532,11 +537,11 @@ svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes) u32 total_drop_bytes, first_drop_bytes, second_drop_bytes; u32 cursize, nitems; - if (PREDICT_FALSE (f->cursize == 0)) + /* read cursize, which can only increase while we're working */ + cursize = svm_fifo_max_dequeue (f); + if (PREDICT_FALSE (cursize == 0)) return -2; /* nothing in the fifo */ - /* read cursize, which can only increase while we're working */ - cursize = f->cursize; nitems = f->nitems; /* Number of bytes we're going to drop */ diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h index 39556173e4d..80e5b0f2d39 100644 --- a/src/svm/svm_fifo.h +++ b/src/svm/svm_fifo.h @@ -44,14 +44,16 @@ typedef struct typedef struct { + volatile u32 cursize; /**< current fifo size */ + u32 nitems; + CLIB_CACHE_LINE_ALIGN_MARK (end_cursize); + pthread_mutex_t mutex; /* 8 bytes */ pthread_cond_t condvar; /* 8 bytes */ svm_lock_tag_t tag; - volatile u32 cursize; /**< current fifo size */ volatile u8 has_event; /**< non-zero if deq event exists */ u32 owner_pid; - u32 nitems; /* Backpointers */ u32 server_session_index; @@ -105,7 +107,7 @@ svm_fifo_max_dequeue (svm_fifo_t * f) static inline u32 svm_fifo_max_enqueue (svm_fifo_t * f) { - return f->nitems - f->cursize; + return f->nitems - svm_fifo_max_dequeue (f); } static inline u8 diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index 8681105c284..b86e87d9bac 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -119,15 +119,20 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, /* Nothing to read return */ if (max_dequeue0 == 0) - { - return 0; - } + return 0; /* Ensure we're not writing more than transport window allows */ - max_len_to_snd0 = clib_min (max_dequeue0, snd_space0); - - /* TODO check if transport is willing to send len_to_snd0 - * bytes (Nagle) */ + if (max_dequeue0 < snd_space0) + { + /* Constrained by tx queue. Try to send only fully formed segments */ + max_len_to_snd0 = (max_dequeue0 > snd_mss0) ? + max_dequeue0 - max_dequeue0 % snd_mss0 : max_dequeue0; + /* TODO Nagle ? 
*/ + } + else + { + max_len_to_snd0 = snd_space0; + } n_frame_bytes = snd_mss0 * VLIB_FRAME_SIZE; n_frames_per_evt = ceil ((double) max_len_to_snd0 / n_frame_bytes); @@ -308,11 +313,14 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, int n_tx_packets = 0; u32 my_thread_index = vm->cpu_index; int i, rv; + f64 now = vlib_time_now (vm); + + SESSION_EVT_DBG (SESSION_EVT_POLL_GAP_TRACK, smm, my_thread_index); /* * Update TCP time */ - tcp_update_time (vlib_time_now (vm), my_thread_index); + tcp_update_time (now, my_thread_index); /* * Get vpp queue events diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index f10918aa3c4..8e2b2616f58 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -556,7 +556,7 @@ session_manager_allocate_session_fifos (session_manager_main_t * smm, u8 * added_a_segment) { svm_fifo_segment_private_t *fifo_segment; - u32 fifo_size, default_fifo_size = 128 << 10; /* TODO config */ + u32 fifo_size, default_fifo_size = 1 << 16; /* TODO config */ int i; *added_a_segment = 0; @@ -1293,6 +1293,10 @@ session_manager_main_enable (vlib_main_t * vm) vec_validate (smm->current_enqueue_epoch, num_threads - 1); vec_validate (smm->vpp_event_queues, num_threads - 1); +#if SESSION_DBG + vec_validate (smm->last_event_poll_by_thread, num_threads - 1); +#endif + /* $$$$ preallocate hack config parameter */ for (i = 0; i < 200000; i++) { diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index a39bc06ffca..6878b4d25c8 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -20,6 +20,7 @@ #include #include #include +#include #define HALF_OPEN_LOOKUP_INVALID_VALUE ((u64)~0) #define INVALID_INDEX ((u32)~0) @@ -36,7 +37,7 @@ typedef enum FIFO_EVENT_BUILTIN_RX } fifo_event_type_t; -#define foreach_session_input_error \ +#define foreach_session_input_error \ _(NO_SESSION, "No session drops") \ _(NO_LISTENER, "No listener for dst port drops") \ _(ENQUEUED, "Packets pushed into rx fifo") \ @@ -218,6 +219,15 @@ struct _session_manager_main /* Convenience */ vlib_main_t *vlib_main; vnet_main_t *vnet_main; + +#if SESSION_DBG + /** + * last event poll time by thread + * Debug only. 
Will cause false cache-line sharing as-is + */ + f64 *last_event_poll_by_thread; +#endif + }; extern session_manager_main_t session_manager_main; diff --git a/src/vnet/session/session_debug.h b/src/vnet/session/session_debug.h index 80a97cd5f09..eb11f1a00b9 100644 --- a/src/vnet/session/session_debug.h +++ b/src/vnet/session/session_debug.h @@ -16,13 +16,13 @@ #define SRC_VNET_SESSION_SESSION_DEBUG_H_ #include -#include #include #define foreach_session_dbg_evt \ _(ENQ, "enqueue") \ _(DEQ, "dequeue") \ - _(DEQ_NODE, "dequeue") + _(DEQ_NODE, "dequeue") \ + _(POLL_GAP_TRACK, "poll gap track") \ typedef enum _session_evt_dbg { @@ -33,6 +33,7 @@ typedef enum _session_evt_dbg #define SESSION_DBG (0) #define SESSION_DEQ_NODE_EVTS (0) +#define SESSION_EVT_POLL_DBG (1) #if TRANSPORT_DEBUG && SESSION_DBG @@ -97,9 +98,34 @@ typedef enum _session_evt_dbg #define SESSION_EVT_DEQ_NODE_HANDLER(_node_evt) #endif +#if SESSION_DBG && SESSION_EVT_POLL_DBG +#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "nixon-gap: %d MS", \ + .format_args = "i4", \ + }; \ + DEC_SESSION_ED(_e, 1); \ + ed->data[0] = (u32) ((now - \ + _smm->last_event_poll_by_thread[my_thread_index])*1000.0); \ +} +#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index) \ +{ \ + if (PREDICT_TRUE( \ + smm->last_event_poll_by_thread[my_thread_index] != 0.0)) \ + if (now > smm->last_event_poll_by_thread[_my_thread_index] + 500e-6)\ + SESSION_EVT_POLL_GAP(smm, my_thread_index); \ + _smm->last_event_poll_by_thread[my_thread_index] = now; \ +} + +#else +#define SESSION_EVT_POLL_GAP(_smm, _my_thread_index) +#define SESSION_EVT_POLL_GAP_TRACK_HANDLER(_smm, _my_thread_index) +#endif + #define CONCAT_HELPER(_a, _b) _a##_b #define CC(_a, _b) CONCAT_HELPER(_a, _b) - #define SESSION_EVT_DBG(_evt, _args...) CC(_evt, _HANDLER)(_args) #else diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 83cdbc1be6d..e3705060400 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -43,6 +43,10 @@ #include #undef vl_printfun +#define TCP_BUILTIN_CLIENT_DBG (1) +#define TCP_BUILTIN_CLIENT_VPP_THREAD (0) +#define TCP_BUILTIN_CLIENT_PTHREAD (!TCP_BUILTIN_CLIENT_VPP_THREAD) + static void send_test_chunk (tclient_main_t * tm, session_t * s) { @@ -52,35 +56,50 @@ send_test_chunk (tclient_main_t * tm, session_t * s) session_fifo_event_t evt; static int serial_number = 0; int rv; + test_buf_offset = s->bytes_sent % vec_len (test_data); + bytes_this_chunk = vec_len (test_data) - test_buf_offset; - while (s->bytes_to_send > 0) - { + bytes_this_chunk = bytes_this_chunk < s->bytes_to_send + ? bytes_this_chunk : s->bytes_to_send; - test_buf_offset = s->bytes_sent % vec_len (test_data); - bytes_this_chunk = vec_len (test_data) - test_buf_offset; + rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ , + bytes_this_chunk, + test_data + test_buf_offset); - bytes_this_chunk = bytes_this_chunk < s->bytes_to_send - ? bytes_this_chunk : s->bytes_to_send; + /* If we managed to enqueue data... */ + if (rv > 0) + { + if (TCP_BUILTIN_CLIENT_DBG) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "tx-enq: %d bytes", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 data[1]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = rv; + } - rv = svm_fifo_enqueue_nowait (s->server_tx_fifo, 0 /*pid */ , - bytes_this_chunk, - test_data + test_buf_offset); + /* Account for it... 
*/ + s->bytes_to_send -= rv; + s->bytes_sent += rv; - if (rv > 0) + /* Poke the TCP state machine */ + if (svm_fifo_set_event (s->server_tx_fifo)) { - s->bytes_to_send -= rv; - s->bytes_sent += rv; + /* Fabricate TX event, send to vpp */ + evt.fifo = s->server_tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + evt.event_id = serial_number++; - if (svm_fifo_set_event (s->server_tx_fifo)) - { - /* Fabricate TX event, send to vpp */ - evt.fifo = s->server_tx_fifo; - evt.event_type = FIFO_EVENT_SERVER_TX; - evt.event_id = serial_number++; - - unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, - 0 /* do wait for mutex */ ); - } + unix_shared_memory_queue_add (tm->vpp_event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); } } } @@ -89,39 +108,55 @@ static void receive_test_chunk (tclient_main_t * tm, session_t * s) { svm_fifo_t *rx_fifo = s->server_rx_fifo; - int n_read, bytes, i; + int n_read, test_bytes = 0; - bytes = svm_fifo_max_dequeue (rx_fifo); /* Allow enqueuing of new event */ - svm_fifo_unset_event (rx_fifo); + // svm_fifo_unset_event (rx_fifo); - /* Read the bytes */ - do + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf), + tm->rx_buf); + if (n_read > 0) { - n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len (tm->rx_buf), - tm->rx_buf); - if (n_read > 0) + if (TCP_BUILTIN_CLIENT_DBG) { - bytes -= n_read; + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "rx-deq: %d bytes", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 data[1]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = n_read; + } + + if (test_bytes) + { + int i; for (i = 0; i < n_read; i++) { if (tm->rx_buf[i] != ((s->bytes_received + i) & 0xff)) { clib_warning ("read %d error at byte %lld, 0x%x not 0x%x", - n_read, s->bytes_received + i, - tm->rx_buf[i], + n_read, s->bytes_received + i, tm->rx_buf[i], ((s->bytes_received + i) & 0xff)); } } - s->bytes_to_receive -= n_read; - s->bytes_received += n_read; } - + s->bytes_to_receive -= n_read; + s->bytes_received += n_read; } - while (n_read < 0 || bytes > 0); } +#if TCP_BUILTIN_CLIENT_VPP_THREAD +static void +#else static void * +#endif tclient_thread_fn (void *arg) { tclient_main_t *tm = &tclient_main; @@ -139,6 +174,8 @@ tclient_thread_fn (void *arg) pthread_sigmask (SIG_SETMASK, &s, 0); } + clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0]; + while (1) { /* Wait until we're told to get busy */ @@ -186,12 +223,12 @@ tclient_thread_fn (void *arg) /* Disconnect sessions... */ vec_reset_length (session_indices); - pool_foreach (sp, tm->sessions, ( - { - vec_add1 (session_indices, - sp - tm->sessions); - } - )); + + /* *INDENT-OFF* */ + pool_foreach (sp, tm->sessions, ({ + vec_add1 (session_indices, sp - tm->sessions); + })); + /* *INDENT-ON* */ for (i = 0; i < vec_len (session_indices); i++) { @@ -207,7 +244,9 @@ tclient_thread_fn (void *arg) } } /* NOTREACHED */ +#if TCP_BUILTIN_CLIENT_PTHREAD return 0; +#endif } /* So we don't get "no handler for... 
" msgs */ @@ -333,7 +372,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { - u8 *connect_uri = (u8 *) "tcp://6.0.1.2/1234"; + u8 *connect_uri = (u8 *) "tcp://6.0.1.1/1234"; u8 *uri; tclient_main_t *tm = &tclient_main; int i; @@ -349,7 +388,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, ; else if (unformat (input, "iterations %d", &tm->n_iterations)) ; - else if (unformat (input, "bytes %d", &tm->bytes_to_send)) + else if (unformat (input, "bytes %lld", &tm->bytes_to_send)) ; else if (unformat (input, "uri %s", &tm->connect_uri)) ; @@ -366,17 +405,20 @@ test_tcp_clients_command_fn (vlib_main_t * vm, create_api_loopback (tm); +#if TCP_BUILTIN_CLIENT_PTHREAD /* Start a transmit thread */ if (tm->client_thread_handle == 0) { int rv = pthread_create (&tm->client_thread_handle, - NULL /*attr */ , tclient_thread_fn, 0); + NULL /*attr */ , + tclient_thread_fn, 0); if (rv) { tm->client_thread_handle = 0; return clib_error_return (0, "pthread_create returned %d", rv); } } +#endif /* Fire off connect requests, in something approaching a normal manner */ for (i = 0; i < n_clients; i++) @@ -397,6 +439,18 @@ test_tcp_clients_command_fn (vlib_main_t * vm, return 0; } +#if TCP_BUILTIN_CLIENT_VPP_THREAD +/* *INDENT-OFF* */ +VLIB_REGISTER_THREAD (builtin_client_reg, static) = { + .name = "tcp-builtin-client", + .function = tclient_thread_fn, + .fixed_count = 1, + .count = 1, + .no_data_structure_clone = 1, +}; +/* *INDENT-ON* */ +#endif + /* *INDENT-OFF* */ VLIB_CLI_COMMAND (test_clients_command, static) = { diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index efd26e91791..917d4bd3b94 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -127,6 +127,7 @@ builtin_server_rx_callback (stream_session_t * s) { /* XXX timeout for session that are stuck */ + rx_event: /* Program self-tap to retry */ if (svm_fifo_set_event (rx_fifo)) { @@ -158,7 +159,9 @@ builtin_server_rx_callback (stream_session_t * s) n_written = svm_fifo_enqueue_nowait (tx_fifo, 0, actual_transfer, bsm->rx_buf); - ASSERT (n_written == max_transfer); + + if (n_written != max_transfer) + clib_warning ("short trout!"); if (svm_fifo_set_event (tx_fifo)) { @@ -171,6 +174,9 @@ builtin_server_rx_callback (stream_session_t * s) (u8 *) & evt, 0 /* do wait for mutex */ ); } + if (PREDICT_FALSE (max_enqueue < max_dequeue)) + goto rx_event; + return 0; } @@ -204,8 +210,8 @@ server_create (vlib_main_t * vm) a->session_cb_vft = &builtin_session_cb_vft; a->options = options; a->options[SESSION_OPTIONS_SEGMENT_SIZE] = 128 << 20; - a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 64 << 10; - a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 64 << 10; + a->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 1 << 16; + a->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 1 << 16; a->segment_name = segment_name; a->segment_name_length = ARRAY_LEN (segment_name); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index c3df5bc1d3e..b2a371e2cd8 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -578,7 +578,9 @@ tcp_session_send_space (transport_connection_t * trans_conn) /* If we can't write at least a segment, don't try at all */ if (snd_space < tc->snd_mss) return 0; - return snd_space; + + /* round down to mss multiple */ + return snd_space - (snd_space % tc->snd_mss); } /* If in fast recovery, send 1 SMSS if wnd allows */ @@ -706,7 +708,7 @@ static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] = { tcp_timer_retransmit_handler, 
tcp_timer_delack_handler, - 0, + tcp_timer_persist_handler, tcp_timer_keep_handler, tcp_timer_waitclose_handler, tcp_timer_retransmit_syn_handler, diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index b4286bc4f5a..2f5da108813 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -81,6 +81,7 @@ typedef void (timer_expiration_handler) (u32 index); extern timer_expiration_handler tcp_timer_delack_handler; extern timer_expiration_handler tcp_timer_retransmit_handler; +extern timer_expiration_handler tcp_timer_persist_handler; extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_TIMER_HANDLE_INVALID ((u32) ~0) @@ -253,13 +254,25 @@ struct _tcp_cc_algorithm #define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY #define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY +#define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY +#define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY #define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) -#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) +#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY)) #define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS) #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS) #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS) +#define tcp_in_cong_recovery(tc) ((tc)->flags & \ + (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) + +always_inline void +tcp_cong_recovery_off (tcp_connection_t * tc) +{ + tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY); + tcp_fastrecovery_1_smss_off (tc); +} + typedef enum { TCP_IP4, @@ -538,6 +551,27 @@ tcp_retransmit_timer_reset (tcp_connection_t * tc) tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); } +always_inline void +tcp_persist_timer_set (tcp_connection_t * tc) +{ + /* Reuse RTO. 
It's backed off in handler */ + tcp_timer_set (tc, TCP_TIMER_PERSIST, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_persist_timer_update (tcp_connection_t * tc) +{ + tcp_timer_update (tc, TCP_TIMER_PERSIST, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_persist_timer_reset (tcp_connection_t * tc) +{ + tcp_timer_reset (tc, TCP_TIMER_PERSIST); +} + always_inline u8 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) { diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 5a71694ec13..0090e15e581 100644 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -31,6 +31,7 @@ _(UNBIND, "unbind") \ _(DELETE, "delete") \ _(SYN_SENT, "SYN sent") \ + _(SYN_RTX, "SYN retransmit") \ _(FIN_SENT, "FIN sent") \ _(ACK_SENT, "ACK sent") \ _(DUPACK_SENT, "DUPACK sent") \ @@ -50,6 +51,7 @@ _(CC_PACK, "cc partial ack") \ _(SEG_INVALID, "invalid segment") \ _(ACK_RCV_ERR, "invalid ack") \ + _(RCV_WND_SHRUNK, "shrunk rcv_wnd") \ typedef enum _tcp_dbg { @@ -159,35 +161,48 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "ack_prep: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u", \ - .format_args = "i4i4i4i4", \ + .format = "ack_tx: acked %u rcv_nxt %u rcv_wnd %u snd_nxt %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 4); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->rcv_nxt - _tc->rcv_las; \ ed->data[1] = _tc->rcv_nxt - _tc->irs; \ ed->data[2] = _tc->rcv_wnd; \ ed->data[3] = _tc->snd_nxt - _tc->iss; \ + ed->data[4] = _tc->snd_wnd; \ } #define TCP_EVT_DUPACK_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av-wnd %u", \ - .format_args = "i4i4i4i4", \ + .format = "dack_tx: rcv_nxt %u rcv_wnd %u snd_nxt %u av_wnd %u snd_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 4); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->rcv_nxt - _tc->irs; \ ed->data[1] = _tc->rcv_wnd; \ ed->data[2] = _tc->snd_nxt - _tc->iss; \ ed->data[3] = tcp_available_wnd(_tc); \ + ed->data[4] = _tc->snd_wnd; \ } #define TCP_EVT_SYN_SENT_HANDLER(_tc, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "SYNtx: iss %u", \ + .format = "SYNtx: iss %u", \ + .format_args = "i4", \ + }; \ + DECLARE_ETD(_tc, _e, 1); \ + ed->data[0] = _tc->iss; \ +} + +#define TCP_EVT_SYN_RTX_HANDLER(_tc, ...) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "SYNrtx: iss %u", \ .format_args = "i4", \ }; \ DECLARE_ETD(_tc, _e, 1); \ @@ -254,17 +269,17 @@ typedef enum _tcp_dbg_evt ed->data[1] = _tc->rcv_nxt - _tc->irs; \ } -#define TCP_EVT_ACK_RCVD_HANDLER(_tc, _ack, ...) \ +#define TCP_EVT_ACK_RCVD_HANDLER(_tc, ...) 
\ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "acked: %u snd_una %u ack %u cwnd %u inflight %u", \ + .format = "acked: %u snd_una %u snd_wnd %u cwnd %u inflight %u", \ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->bytes_acked; \ ed->data[1] = _tc->snd_una - _tc->iss; \ - ed->data[2] = _ack - _tc->iss; \ + ed->data[2] = _tc->snd_wnd; \ ed->data[3] = _tc->cwnd; \ ed->data[4] = tcp_flight_size(_tc); \ } @@ -273,14 +288,15 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u inflight %u", \ - .format_args = "i4i4i4i4", \ + .format = "dack_rx: snd_una %u cwnd %u snd_wnd %u flight %u rcv_wnd %u",\ + .format_args = "i4i4i4i4i4", \ }; \ - DECLARE_ETD(_tc, _e, 4); \ + DECLARE_ETD(_tc, _e, 5); \ ed->data[0] = _tc->snd_una - _tc->iss; \ ed->data[1] = _tc->cwnd; \ ed->data[2] = _tc->snd_wnd; \ ed->data[3] = tcp_flight_size(_tc); \ + ed->data[4] = _tc->rcv_wnd; \ } #define TCP_EVT_PKTIZE_HANDLER(_tc, ...) \ @@ -302,7 +318,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "in: %s len %u written %d rcv_nxt %u free wnd %d", \ + .format = "in: %s len %u written %d rcv_nxt %u rcv_wnd(o) %d", \ .format_args = "t4i4i4i4i4", \ .n_enum_strings = 2, \ .enum_strings = { \ @@ -338,7 +354,7 @@ typedef enum _tcp_dbg_evt .enum_strings = { \ "retransmit", \ "delack", \ - "BUG", \ + "persist", \ "keep", \ "waitclose", \ "retransmit syn", \ @@ -354,7 +370,7 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u wnd %u", \ + .format = "seg-inv: seq %u end %u rcv_las %u rcv_nxt %u rcv_wnd %u",\ .format_args = "i4i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 5); \ @@ -445,6 +461,24 @@ typedef enum _tcp_dbg_evt #define TCP_EVT_CC_PACK_HANDLER(_tc, ...) #endif +#define TCP_EVT_RCV_WND_SHRUNK_HANDLER(_tc, _obs, _av, ...) \ +{ \ +if (_av > 0) \ +{ \ + ELOG_TYPE_DECLARE (_e) = \ + { \ + .format = "huh?: rcv_wnd %u obsd %u av %u rcv_nxt %u rcv_las %u", \ + .format_args = "i4i4i4i4i4", \ + }; \ + DECLARE_ETD(_tc, _e, 5); \ + ed->data[0] = _tc->rcv_wnd; \ + ed->data[1] = _obs; \ + ed->data[2] = _av; \ + ed->data[3] = _tc->rcv_nxt - _tc->irs; \ + ed->data[4] = _tc->rcv_las - _tc->irs; \ +} \ +} + #if TCP_DBG_VERBOSE #define TCP_EVT_SND_WND_HANDLER(_tc, ...) \ { \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 5d11985faa7..a8224dc2450 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -276,8 +276,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (tc0->rcv_wnd == 0 && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number) { - /* Make it look as if there's nothing to dequeue */ - vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number; + /* TODO Should segment be tagged? 
*/ } else { @@ -375,7 +374,6 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) { mrtt = tcp_time_now () - tc->rtt_ts; - tc->rtt_seq = 0; } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances @@ -395,6 +393,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + /* Allow measuring of RTT and make sure boff is 0 */ + tc->rtt_seq = 0; + tc->rto_boff = 0; + return 1; } @@ -408,11 +410,7 @@ tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); /* Update rtt and rto */ - if (tcp_update_rtt (tc, ack)) - { - /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */ - tc->rto_boff = 0; - } + tcp_update_rtt (tc, ack); } /** @@ -672,6 +670,13 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) tc->snd_wl1 = seq; tc->snd_wl2 = ack; TCP_EVT_DBG (TCP_EVT_SND_WND, tc); + + /* Set probe timer if we just got 0 wnd */ + if (tc->snd_wnd < tc->snd_mss + && !tcp_timer_is_active (tc, TCP_TIMER_PERSIST)) + tcp_persist_timer_set (tc); + else + tcp_persist_timer_reset (tc); } } @@ -686,6 +691,10 @@ tcp_cc_congestion (tcp_connection_t * tc) void tcp_cc_recover (tcp_connection_t * tc) { + /* TODO: check if time to recover was small. It might be that RTO popped + * too soon. + */ + tc->cc_algo->recovered (tc); tc->rtx_bytes = 0; @@ -695,8 +704,7 @@ tcp_cc_recover (tcp_connection_t * tc) tc->cc_algo->rcv_ack (tc); tc->tsecr_last_ack = tc->opt.tsecr; - tcp_fastrecovery_1_smss_off (tc); - tcp_fastrecovery_off (tc); + tcp_cong_recovery_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -706,7 +714,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; - if (tcp_in_recovery (tc)) + if (tcp_in_cong_recovery (tc)) { partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); if (!partial_ack) @@ -724,10 +732,10 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) /* In case snd_nxt is still in the past and output tries to * shove some new bytes */ - tc->snd_nxt = tc->snd_una; + tc->snd_nxt = tc->snd_una_max; /* XXX need proper RFC6675 support */ - if (tc->sack_sb.last_sacked_bytes) + if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc)) { tcp_fast_retransmit (tc); } @@ -735,9 +743,6 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { /* Retransmit first unacked segment */ tcp_retransmit_first_unacked (tc); - /* If window allows, send 1 SMSS of new data */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tc->snd_nxt = tc->snd_congestion; } } } @@ -814,10 +819,11 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, return -1; } - tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; - *error = TCP_ERROR_ACK_FUTURE; TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2, vnet_buffer (b)->tcp.ack_number); + + tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + *error = TCP_ERROR_ACK_FUTURE; } /* If old ACK, probably it's an old dupack */ @@ -863,7 +869,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, * timer. */ if (tc->bytes_acked) { - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number); + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); /* Updates congestion control (slow start/congestion avoidance) */ tcp_cc_rcv_ack (tc, b); @@ -966,11 +972,14 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, tc->rcv_nxt += written; /* Depending on how fast the app is, all remaining buffers in burst will - * not be enqueued. 
Should we inform peer of the damage? XXX */ + * not be enqueued. Inform peer */ + tc->flags |= TCP_CONN_SNDACK; + return TCP_ERROR_PARTIALLY_ENQUEUED; } else { + tc->flags |= TCP_CONN_SNDACK; return TCP_ERROR_FIFO_FULL; } @@ -1101,25 +1110,17 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, goto done; } - if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL)) - *next0 = TCP_NEXT_DROP; - /* Check if ACK can be delayed */ - if (!tcp_can_delack (tc)) - { - /* Nothing to do for pure ACKs XXX */ - if (n_data_bytes == 0) - goto done; - - *next0 = tcp_next_output (tc->c_is_ip4); - tcp_make_ack (tc, b); - } - else + if (tcp_can_delack (tc)) { if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK)) tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME); + goto done; } + *next0 = tcp_next_output (tc->c_is_ip4); + tcp_make_ack (tc, b); + done: return error; } @@ -2084,6 +2085,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->irs = vnet_buffer (b0)->tcp.seq_number; child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; + child0->rcv_las = child0->rcv_nxt; child0->state = TCP_STATE_SYN_RCVD; /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a671f728824..ea157bd76b7 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -155,8 +155,7 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) max_fifo = stream_session_fifo_size (&tc->connection); ASSERT (tc->opt.mss < max_fifo); - - if (available_space < tc->opt.mss && available_space < max_fifo / 8) + if (available_space < tc->opt.mss && available_space < max_fifo >> 3) available_space = 0; /* @@ -170,16 +169,21 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) /* Bad. Thou shalt not shrink */ if (available_space < observed_wnd) { - /* Does happen! */ wnd = observed_wnd; + TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space); } else { wnd = available_space; } - if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd)) - wnd += 1 << tc->rcv_wscale; + /* Make sure we have a multiple of rcv_wscale */ + if (wnd && tc->rcv_wscale) + { + wnd &= ~(1 << tc->rcv_wscale); + if (wnd == 0) + wnd = 1 << tc->rcv_wscale; + } tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); } @@ -462,8 +466,9 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + tc->rcv_las = tc->rcv_nxt; } /** @@ -908,6 +913,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, vnet_buffer (b)->tcp.connection_index = tc->c_c_index; tc->snd_nxt += data_len; + /* TODO this is updated in output as well ... */ if (tc->snd_nxt > tc->snd_una_max) tc->snd_una_max = tc->snd_nxt; @@ -929,7 +935,6 @@ tcp_send_ack (tcp_connection_t * tc) /* Fill in the ACK */ tcp_make_ack (tc, b); - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } @@ -942,7 +947,6 @@ tcp_timer_delack_handler (u32 index) tc = tcp_connection_get (index, thread_index); tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; -// tc->flags &= ~TCP_CONN_DELACK; tcp_send_ack (tc); } @@ -995,7 +999,7 @@ done: * Reset congestion control, switch cwnd to loss window and try again. 
*/ static void -tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) +tcp_rtx_timeout_cc (tcp_connection_t * tc) { /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) @@ -1008,6 +1012,7 @@ tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) } /* Start again from the beginning */ + tcp_recovery_on (tc); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; } @@ -1048,7 +1053,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { /* First retransmit timeout */ if (tc->rto_boff == 1) - tcp_rtx_timeout_cc_recover (tc); + tcp_rtx_timeout_cc (tc); /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); @@ -1114,6 +1119,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_SYN_SENT); + TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc); + /* This goes straight to ipx_lookup */ tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); @@ -1136,6 +1143,55 @@ tcp_timer_retransmit_syn_handler (u32 index) tcp_timer_retransmit_handler_i (index, 1); } +/** + * Got 0 snd_wnd from peer, try to do something about it. + * + */ +void +tcp_timer_persist_handler (u32 index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 thread_index = os_get_cpu_number (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi, n_bytes; + + tc = tcp_connection_get (index, thread_index); + + /* Make sure timer handle is set to invalid */ + tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; + + /* Problem already solved or worse */ + if (tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + return; + + /* Increment RTO backoff */ + tc->rto_boff += 1; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + /* Try to force the first unsent segment */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (b), + tc->snd_una_max - tc->snd_una, + tc->snd_mss); + /* Nothing to send */ + if (n_bytes == 0) + { + tcp_return_buffer (tm); + return; + } + + b->current_length = n_bytes; + tcp_push_hdr_i (tc, b, tc->state); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + /* Re-enable persist timer */ + tcp_persist_timer_set (tc); +} + /** * Retransmit first unacked segment */ @@ -1329,9 +1385,6 @@ tcp46_output_inline (vlib_main_t * vm, } } - /* Retransmitted SYNs do reach this but it should be harmless */ - tc0->rcv_las = tc0->rcv_nxt; - /* Stop DELACK timer and fix flags */ tc0->flags &= ~(TCP_CONN_SNDACK); if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) -- 2.16.6
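The zero-window handling this patch adds is split across tcp_input.c, where tcp_update_snd_wnd() arms the new persist timer once the peer's advertised window drops below one MSS, and tcp_output.c, where tcp_timer_persist_handler() backs off the RTO, forces out one probe segment, and re-arms itself. A minimal standalone sketch of that control flow, using simplified stand-in types and a printf in place of the real tcp_connection_t, timer wheel, and segment transmit path:

```c
/* Simplified model of the persist-timer logic in this patch.
 * Types and the "timer" are stand-ins; the real code uses
 * tcp_connection_t and the tw_timer wheel. */
#include <stdio.h>
#include <stdint.h>

#define TCP_RTO_MAX 60      /* seconds, stand-in for the VPP constant */

typedef struct
{
  uint32_t snd_wnd;         /* peer's advertised send window */
  uint32_t snd_mss;
  uint32_t rto;             /* retransmit timeout, seconds */
  uint32_t rto_boff;        /* backoff count */
  int persist_armed;
} conn_t;

/* Mirrors the check added to tcp_update_snd_wnd(): arm the persist
 * timer when the usable window falls below one MSS, clear it otherwise. */
static void
update_snd_wnd (conn_t * c, uint32_t new_wnd)
{
  c->snd_wnd = new_wnd;
  if (c->snd_wnd < c->snd_mss)
    c->persist_armed = 1;
  else
    c->persist_armed = 0;
}

/* Mirrors tcp_timer_persist_handler(): back off the RTO, send one
 * probe segment, re-arm. Probe transmission is just a printf here. */
static void
persist_timer_expired (conn_t * c)
{
  if (c->snd_wnd > c->snd_mss)
    return;                 /* window reopened, nothing to do */
  c->rto_boff += 1;
  c->rto = (c->rto << 1) < TCP_RTO_MAX ? (c->rto << 1) : TCP_RTO_MAX;
  printf ("probe: forcing one segment, next probe in %u s\n", c->rto);
  c->persist_armed = 1;
}

int
main (void)
{
  conn_t c = {.snd_wnd = 0,.snd_mss = 1460,.rto = 1 };
  update_snd_wnd (&c, 0);       /* peer advertises a zero window */
  while (c.rto < TCP_RTO_MAX && c.persist_armed)
    persist_timer_expired (&c); /* timer pops until window reopens */
  update_snd_wnd (&c, 8192);    /* window reopens, timer is cleared */
  return 0;
}
```

In the patch itself the arming also checks that the timer is not already running, and the handler bails out when the connection is in RTO recovery; those details are omitted from the sketch above.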