X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_output.c;h=33e599ece32c1b942284cafcf4afe82f4a7c2f13;hb=f6d68ed2db2bcd41c9b7ddde5e411073c1566c29;hp=a671f72882474757ae22b78db9e36d2780acc509;hpb=6792ec059696a358b6c98d8d86e9740b34c01e24;p=vpp.git diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a671f728824..33e599ece32 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -42,44 +42,28 @@ static char *tcp_error_strings[] = { typedef struct { - u16 src_port; - u16 dst_port; - u8 state; + tcp_header_t tcp_header; + tcp_connection_t tcp_connection; } tcp_tx_trace_t; -u16 dummy_mtu = 400; +u16 dummy_mtu = 1460; u8 * format_tcp_tx_trace (u8 * s, va_list * args) { CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *); + uword indent = format_get_indent (s); - s = format (s, "TBD\n"); + s = format (s, "%U\n%U%U", + format_tcp_header, &t->tcp_header, 128, + format_white_space, indent, + format_tcp_connection_verbose, &t->tcp_connection); return s; } -void -tcp_set_snd_mss (tcp_connection_t * tc) -{ - u16 snd_mss; - - /* TODO find our iface MTU */ - snd_mss = dummy_mtu; - - /* TODO cache mss and consider PMTU discovery */ - snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss; - - tc->snd_mss = snd_mss; - - if (tc->snd_mss == 0) - { - clib_warning ("snd mss is 0"); - tc->snd_mss = dummy_mtu; - } -} - static u8 tcp_window_compute_scale (u32 available_space) { @@ -96,7 +80,7 @@ tcp_window_compute_scale (u32 available_space) always_inline u32 tcp_initial_wnd_unscaled (tcp_connection_t * tc) { - return TCP_IW_N_SEGMENTS * dummy_mtu; + return TCP_IW_N_SEGMENTS * tc->mss; } /** @@ -155,8 +139,7 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) max_fifo = stream_session_fifo_size (&tc->connection); ASSERT (tc->opt.mss < max_fifo); - - if (available_space < tc->opt.mss && available_space < max_fifo / 8) + if (available_space < tc->opt.mss && available_space < max_fifo >> 3) available_space = 0; /* @@ -170,16 +153,21 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) /* Bad. Thou shalt not shrink */ if (available_space < observed_wnd) { - /* Does happen! */ wnd = observed_wnd; + TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space); } else { wnd = available_space; } - if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd)) - wnd += 1 << tc->rcv_wscale; + /* Make sure we have a multiple of rcv_wscale */ + if (wnd && tc->rcv_wscale) + { + wnd &= ~(1 << tc->rcv_wscale); + if (wnd == 0) + wnd = 1 << tc->rcv_wscale; + } tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); } @@ -302,7 +290,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) u8 len = 0; opts->flags |= TCP_OPTS_FLAG_MSS; - opts->mss = dummy_mtu; /*XXX discover that */ + opts->mss = tc->mss; len += TCP_OPTION_LEN_MSS; if (tcp_opts_wscale (&tc->opt)) @@ -351,7 +339,8 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) { opts->flags |= TCP_OPTS_FLAG_SACK; opts->sacks = tc->snd_sacks; - opts->n_sack_blocks = vec_len (tc->snd_sacks); + opts->n_sack_blocks = clib_min (vec_len (tc->snd_sacks), + TCP_OPTS_MAX_SACK_BLOCKS); len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks; } } @@ -380,11 +369,62 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, } } +/** + * Update max segment size we're able to process. + * + * The value is constrained by our interface's MTU and IP options. It is + * also what we advertise to our peer. + */ +void +tcp_update_rcv_mss (tcp_connection_t * tc) +{ + /* TODO find our iface MTU */ + tc->mss = dummy_mtu; +} + +/** + * Update snd_mss to reflect the effective segment size that we can send + * by taking into account all TCP options, including SACKs + */ +void +tcp_update_snd_mss (tcp_connection_t * tc) +{ + /* Compute options to be used for connection. These may be reused when + * sending data or to compute the effective mss (snd_mss) */ + tc->snd_opts_len = + tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); + + /* XXX check if MTU has been updated */ + tc->snd_mss = clib_min (tc->mss, tc->opt.mss) - tc->snd_opts_len; +} + +void +tcp_init_mss (tcp_connection_t * tc) +{ + tcp_update_rcv_mss (tc); + + /* TODO cache mss and consider PMTU discovery */ + tc->snd_mss = clib_min (tc->opt.mss, tc->mss); + + if (tc->snd_mss == 0) + { + clib_warning ("snd mss is 0"); + tc->snd_mss = tc->mss; + } + + /* We should have enough space for 40 bytes of options */ + ASSERT (tc->snd_mss > 45); + + /* If we use timestamp option, account for it */ + if (tcp_opts_tstamp (&tc->opt)) + tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; +} + #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ @@ -392,7 +432,7 @@ do { \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ tm->vlib_main, my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[cpu_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -404,8 +444,8 @@ do { \ #define tcp_return_buffer(tm) \ do { \ u32 *my_tx_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ _vec_len (my_tx_buffers) +=1; \ } while (0) @@ -462,8 +502,9 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + tc->rcv_las = tc->rcv_nxt; } /** @@ -876,20 +917,20 @@ tcp_make_state_flags (tcp_state_t next_state) */ static void tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, - tcp_state_t next_state) + tcp_state_t next_state, u8 compute_opts) { u32 advertise_wnd, data_len; - u8 tcp_opts_len, tcp_hdr_opts_len, opts_write_len, flags; - tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_hdr_opts_len, opts_write_len, flags; tcp_header_t *th; - data_len = b->current_length; + data_len = b->current_length + b->total_length_not_including_first_buffer; vnet_buffer (b)->tcp.flags = 0; - /* Make and write options */ - memset (snd_opts, 0, sizeof (*snd_opts)); - tcp_opts_len = tcp_make_options (tc, snd_opts, next_state); - tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + if (compute_opts) + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + + /* Write pre-computed options */ + tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); /* Get rcv window to advertise */ advertise_wnd = tcp_window_to_advertise (tc, next_state); @@ -900,17 +941,25 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); - opts_write_len = tcp_options_write ((u8 *) (th + 1), snd_opts); + opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); - ASSERT (opts_write_len == tcp_opts_len); + ASSERT (opts_write_len == tc->snd_opts_len); /* Tag the buffer with the connection index */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; tc->snd_nxt += data_len; + tc->rcv_las = tc->rcv_nxt; + /* TODO this is updated in output as well ... */ if (tc->snd_nxt > tc->snd_una_max) tc->snd_una_max = tc->snd_nxt; + + if (tc->rtt_ts == 0) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -929,7 +978,6 @@ tcp_send_ack (tcp_connection_t * tc) /* Fill in the ACK */ tcp_make_ack (tc, b); - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } @@ -937,12 +985,11 @@ tcp_send_ack (tcp_connection_t * tc) void tcp_timer_delack_handler (u32 index) { - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; tc = tcp_connection_get (index, thread_index); tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; -// tc->flags &= ~TCP_CONN_DELACK; tcp_send_ack (tc); } @@ -977,6 +1024,8 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, goto done; } + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + ASSERT (max_bytes <= tc->snd_mss); n_bytes = stream_session_peek_bytes (&tc->connection, @@ -984,7 +1033,8 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, max_bytes); ASSERT (n_bytes != 0); b->current_length = n_bytes; - tcp_push_hdr_i (tc, b, tc->state); + tcp_push_hdr_i (tc, b, tc->state, 0); + tc->rtx_bytes += n_bytes; done: TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); @@ -995,7 +1045,7 @@ done: * Reset congestion control, switch cwnd to loss window and try again. */ static void -tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) +tcp_rtx_timeout_cc (tcp_connection_t * tc) { /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) @@ -1008,8 +1058,10 @@ tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) } /* Start again from the beginning */ + tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; + tcp_recovery_on (tc); } static void @@ -1017,10 +1069,10 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, snd_space, n_bytes; + u32 bi, n_bytes; if (is_syn) { @@ -1048,40 +1100,21 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { /* First retransmit timeout */ if (tc->rto_boff == 1) - tcp_rtx_timeout_cc_recover (tc); + tcp_rtx_timeout_cc (tc); /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Figure out what and how many bytes we can send */ - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - if (snd_space == 0) - { - clib_warning ("no wnd to retransmit"); - tcp_return_buffer (tm); - - /* Force one segment */ - tcp_retransmit_first_unacked (tc); + /* Send one segment. No fancy recovery for now! */ + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + scoreboard_clear (&tc->sack_sb); - /* Re-enable retransmit timer. Output may be unwilling - * to do it for us */ - tcp_retransmit_timer_set (tc); - - return; - } - else + if (n_bytes == 0) { - /* No fancy recovery for now! */ - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space); - scoreboard_clear (&tc->sack_sb); - - if (n_bytes == 0) - return; - - tc->rtx_bytes += n_bytes; + clib_warning ("could not retransmit"); + return; } } else @@ -1097,7 +1130,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_push_hdr_i (tc, b, tc->state); + tcp_push_hdr_i (tc, b, tc->state, 1); /* Account for the SYN */ tc->snd_nxt += 1; @@ -1114,6 +1147,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_SYN_SENT); + TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc); + /* This goes straight to ipx_lookup */ tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); @@ -1136,6 +1171,56 @@ tcp_timer_retransmit_syn_handler (u32 index) tcp_timer_retransmit_handler_i (index, 1); } +/** + * Got 0 snd_wnd from peer, try to do something about it. + * + */ +void +tcp_timer_persist_handler (u32 index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi, n_bytes; + + tc = tcp_connection_get (index, thread_index); + + /* Make sure timer handle is set to invalid */ + tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; + + /* Problem already solved or worse */ + if (tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + return; + + /* Increment RTO backoff */ + tc->rto_boff += 1; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + /* Try to force the first unsent segment */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (b), + tc->snd_una_max - tc->snd_una, + tc->snd_mss); + /* Nothing to send */ + if (n_bytes == 0) + { + tcp_return_buffer (tm); + return; + } + + b->current_length = n_bytes; + tcp_push_hdr_i (tc, b, tc->state, 0); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + /* Re-enable persist timer */ + tcp_persist_timer_set (tc); +} + /** * Retransmit first unacked segment */ @@ -1157,10 +1242,12 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); if (n_bytes == 0) - return; + goto done; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); - tc->rtx_bytes += n_bytes; + +done: + tc->snd_nxt = tc->snd_una_max; } sack_scoreboard_hole_t * @@ -1234,7 +1321,6 @@ tcp_fast_retransmit (tcp_connection_t * tc) } tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); - tc->rtx_bytes += n_written; snd_space -= n_written; } @@ -1257,7 +1343,7 @@ tcp46_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1275,7 +1361,8 @@ tcp46_output_inline (vlib_main_t * vm, u32 bi0; vlib_buffer_t *b0; tcp_connection_t *tc0; - tcp_header_t *th0; + tcp_tx_trace_t *t0; + tcp_header_t *th0 = 0; u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1329,9 +1416,6 @@ tcp46_output_inline (vlib_main_t * vm, } } - /* Retransmitted SYNs do reach this but it should be harmless */ - tc0->rcv_las = tc0->rcv_nxt; - /* Stop DELACK timer and fix flags */ tc0->flags &= ~(TCP_CONN_SNDACK); if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) @@ -1340,7 +1424,7 @@ tcp46_output_inline (vlib_main_t * vm, } /* If not retransmitting - * 1) update snd_una_max (SYN, SYNACK, new data, FIN) + * 1) update snd_una_max (SYN, SYNACK, FIN) * 2) If we're not tracking an ACK, start tracking */ if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) { @@ -1371,7 +1455,17 @@ tcp46_output_inline (vlib_main_t * vm, b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + if (th0) + { + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + } + else + { + memset (&t0->tcp_header, 0, sizeof (t0->tcp_header)); + } + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1447,7 +1541,7 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tcp_connection_t *tc; tc = (tcp_connection_t *) tconn; - tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED); + tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); return 0; } @@ -1471,7 +1565,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1488,6 +1582,8 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_tx_trace_t *t0; + tcp_header_t *th0; u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1516,7 +1612,13 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + th0 = vlib_buffer_get_current (b0); + if (is_ip4) + th0 = ip4_next_header ((ip4_header_t *) th0); + else + th0 = ip6_next_header ((ip6_header_t *) th0); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1554,6 +1656,7 @@ VLIB_REGISTER_NODE (tcp4_reset_node) = { foreach_tcp4_reset_next #undef _ }, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */ @@ -1572,6 +1675,7 @@ VLIB_REGISTER_NODE (tcp6_reset_node) = { foreach_tcp6_reset_next #undef _ }, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */