X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_output.c;h=47c94e6dacaf0edf273e71db317cbee29e7b1a15;hb=93992a9048cb6e5dcd22de5091e72de778122627;hp=a671f72882474757ae22b78db9e36d2780acc509;hpb=6792ec059696a358b6c98d8d86e9740b34c01e24;p=vpp.git diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index a671f728824..47c94e6daca 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -42,44 +42,28 @@ static char *tcp_error_strings[] = { typedef struct { - u16 src_port; - u16 dst_port; - u8 state; + tcp_header_t tcp_header; + tcp_connection_t tcp_connection; } tcp_tx_trace_t; -u16 dummy_mtu = 400; +u16 dummy_mtu = 1460; u8 * format_tcp_tx_trace (u8 * s, va_list * args) { CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *); + uword indent = format_get_indent (s); - s = format (s, "TBD\n"); + s = format (s, "%U\n%U%U", + format_tcp_header, &t->tcp_header, 128, + format_white_space, indent, + format_tcp_connection, &t->tcp_connection, 1); return s; } -void -tcp_set_snd_mss (tcp_connection_t * tc) -{ - u16 snd_mss; - - /* TODO find our iface MTU */ - snd_mss = dummy_mtu; - - /* TODO cache mss and consider PMTU discovery */ - snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss; - - tc->snd_mss = snd_mss; - - if (tc->snd_mss == 0) - { - clib_warning ("snd mss is 0"); - tc->snd_mss = dummy_mtu; - } -} - static u8 tcp_window_compute_scale (u32 available_space) { @@ -96,7 +80,7 @@ tcp_window_compute_scale (u32 available_space) always_inline u32 tcp_initial_wnd_unscaled (tcp_connection_t * tc) { - return TCP_IW_N_SEGMENTS * dummy_mtu; + return TCP_IW_N_SEGMENTS * tc->mss; } /** @@ -152,11 +136,10 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) * Figure out how much space we have available */ available_space = stream_session_max_rx_enqueue (&tc->connection); - max_fifo = stream_session_fifo_size (&tc->connection); - - ASSERT (tc->opt.mss < max_fifo); + max_fifo = stream_session_rx_fifo_size (&tc->connection); - if (available_space < tc->opt.mss && available_space < max_fifo / 8) + ASSERT (tc->rcv_opts.mss < max_fifo); + if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3) available_space = 0; /* @@ -170,16 +153,21 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) /* Bad. Thou shalt not shrink */ if (available_space < observed_wnd) { - /* Does happen! */ wnd = observed_wnd; + TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space); } else { wnd = available_space; } - if (wnd && ((wnd << tc->rcv_wscale) >> tc->rcv_wscale != wnd)) - wnd += 1 << tc->rcv_wscale; + /* Make sure we have a multiple of rcv_wscale */ + if (wnd && tc->rcv_wscale) + { + wnd &= ~(1 << tc->rcv_wscale); + if (wnd == 0) + wnd = 1 << tc->rcv_wscale; + } tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); } @@ -288,8 +276,11 @@ tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) opts->tsecr = 0; len += TCP_OPTION_LEN_TIMESTAMP; - opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; - len += TCP_OPTION_LEN_SACK_PERMITTED; + if (TCP_USE_SACKS) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } /* Align to needed boundary */ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; @@ -302,17 +293,17 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) u8 len = 0; opts->flags |= TCP_OPTS_FLAG_MSS; - opts->mss = dummy_mtu; /*XXX discover that */ + opts->mss = tc->mss; len += TCP_OPTION_LEN_MSS; - if (tcp_opts_wscale (&tc->opt)) + if (tcp_opts_wscale (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_WSCALE; opts->wscale = tc->rcv_wscale; len += TCP_OPTION_LEN_WINDOW_SCALE; } - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); @@ -320,7 +311,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; len += TCP_OPTION_LEN_SACK_PERMITTED; @@ -338,20 +329,21 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) opts->flags = 0; - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); opts->tsecr = tc->tsval_recent; len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { if (vec_len (tc->snd_sacks)) { opts->flags |= TCP_OPTS_FLAG_SACK; opts->sacks = tc->snd_sacks; - opts->n_sack_blocks = vec_len (tc->snd_sacks); + opts->n_sack_blocks = clib_min (vec_len (tc->snd_sacks), + TCP_OPTS_MAX_SACK_BLOCKS); len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks; } } @@ -380,11 +372,66 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, } } +/** + * Update max segment size we're able to process. + * + * The value is constrained by our interface's MTU and IP options. It is + * also what we advertise to our peer. + */ +void +tcp_update_rcv_mss (tcp_connection_t * tc) +{ + /* TODO find our iface MTU */ + tc->mss = dummy_mtu; +} + +/** + * Update snd_mss to reflect the effective segment size that we can send + * by taking into account all TCP options, including SACKs + */ +void +tcp_update_snd_mss (tcp_connection_t * tc) +{ + /* Compute options to be used for connection. These may be reused when + * sending data or to compute the effective mss (snd_mss) */ + tc->snd_opts_len = + tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); + + /* XXX check if MTU has been updated */ + tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len; + ASSERT (tc->snd_mss > 0); +} + +void +tcp_init_mss (tcp_connection_t * tc) +{ + u16 default_min_mss = 536; + tcp_update_rcv_mss (tc); + + /* TODO cache mss and consider PMTU discovery */ + tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss); + + if (tc->snd_mss < 45) + { + clib_warning ("snd mss is 0"); + /* Assume that at least the min default mss works */ + tc->snd_mss = default_min_mss; + tc->rcv_opts.mss = default_min_mss; + } + + /* We should have enough space for 40 bytes of options */ + ASSERT (tc->snd_mss > 45); + + /* If we use timestamp option, account for it */ + if (tcp_opts_tstamp (&tc->rcv_opts)) + tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; +} + #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ @@ -392,7 +439,7 @@ do { \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ tm->vlib_main, my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[cpu_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -404,8 +451,8 @@ do { \ #define tcp_return_buffer(tm) \ do { \ u32 *my_tx_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ _vec_len (my_tx_buffers) +=1; \ } while (0) @@ -462,8 +509,9 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + tc->rcv_las = tc->rcv_nxt; } /** @@ -556,54 +604,41 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, int tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, - tcp_state_t state, u32 my_thread_index, u8 is_ip4) + tcp_state_t state, u8 thread_index, u8 is_ip4) { - u8 tcp_hdr_len = sizeof (tcp_header_t); ip4_header_t *ih4; ip6_header_t *ih6; tcp_header_t *th0; - ip4_address_t src_ip40; - ip6_address_t src_ip60; - u16 src_port0; + ip4_address_t src_ip40, dst_ip40; + ip6_address_t src_ip60, dst_ip60; + u16 src_port, dst_port; u32 tmp; + u32 seq, ack; + u8 flags; /* Find IP and TCP headers */ - if (is_ip4) - { - ih4 = vlib_buffer_get_current (b0); - th0 = ip4_next_header (ih4); - } - else - { - ih6 = vlib_buffer_get_current (b0); - th0 = ip6_next_header (ih6); - } + th0 = tcp_buffer_hdr (b0); - /* Swap src and dst ip */ + /* Save src and dst ip */ if (is_ip4) { + ih4 = vlib_buffer_get_current (b0); ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40); src_ip40.as_u32 = ih4->src_address.as_u32; - ih4->src_address.as_u32 = ih4->dst_address.as_u32; - ih4->dst_address.as_u32 = src_ip40.as_u32; - - /* Chop the end of the pkt */ - b0->current_length += ip4_header_bytes (ih4) + tcp_hdr_len; + dst_ip40.as_u32 = ih4->dst_address.as_u32; } else { + ih6 = vlib_buffer_get_current (b0); ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60); clib_memcpy (&src_ip60, &ih6->src_address, sizeof (ip6_address_t)); - clib_memcpy (&ih6->src_address, &ih6->dst_address, - sizeof (ip6_address_t)); - clib_memcpy (&ih6->dst_address, &src_ip60, sizeof (ip6_address_t)); - - /* Chop the end of the pkt */ - b0->current_length += sizeof (ip6_header_t) + tcp_hdr_len; + clib_memcpy (&dst_ip60, &ih6->dst_address, sizeof (ip6_address_t)); } - /* Try to determine what/why we're actually resetting and swap - * src and dst ports */ + src_port = th0->src_port; + dst_port = th0->dst_port; + + /* Try to determine what/why we're actually resetting */ if (state == TCP_STATE_CLOSED) { if (!tcp_syn (th0)) @@ -612,33 +647,32 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, tmp = clib_net_to_host_u32 (th0->seq_number); /* Got a SYN for no listener. */ - th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; - th0->ack_number = clib_host_to_net_u32 (tmp + 1); - th0->seq_number = 0; - + flags = TCP_FLAG_RST | TCP_FLAG_ACK; + ack = clib_host_to_net_u32 (tmp + 1); + seq = 0; } - else if (state >= TCP_STATE_SYN_SENT) + else { - th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; - th0->seq_number = th0->ack_number; - th0->ack_number = 0; + flags = TCP_FLAG_RST; + seq = th0->ack_number; + ack = 0; } - src_port0 = th0->src_port; - th0->src_port = th0->dst_port; - th0->dst_port = src_port0; - th0->window = 0; - th0->data_offset_and_reserved = (tcp_hdr_len >> 2) << 4; - th0->urgent_pointer = 0; + tcp_reuse_buffer (vm, b0); + th0 = vlib_buffer_push_tcp_net_order (b0, dst_port, src_port, seq, ack, + sizeof (tcp_header_t), flags, 0); - /* Compute checksum */ if (is_ip4) { + ih4 = vlib_buffer_push_ip4 (vm, b0, &dst_ip40, &src_ip40, + IP_PROTOCOL_TCP); th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4); } else { int bogus = ~0; + ih6 = vlib_buffer_push_ip6 (vm, b0, &dst_ip60, &src_ip60, + IP_PROTOCOL_TCP); th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih6, &bogus); ASSERT (!bogus); } @@ -848,6 +882,7 @@ tcp_send_fin (tcp_connection_t * tc) tcp_make_fin (tc, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } @@ -876,22 +911,19 @@ tcp_make_state_flags (tcp_state_t next_state) */ static void tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, - tcp_state_t next_state) + tcp_state_t next_state, u8 compute_opts) { u32 advertise_wnd, data_len; - u8 tcp_opts_len, tcp_hdr_opts_len, opts_write_len, flags; - tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_hdr_opts_len, opts_write_len, flags; tcp_header_t *th; - data_len = b->current_length; + data_len = b->current_length + b->total_length_not_including_first_buffer; vnet_buffer (b)->tcp.flags = 0; - /* Make and write options */ - memset (snd_opts, 0, sizeof (*snd_opts)); - tcp_opts_len = tcp_make_options (tc, snd_opts, next_state); - tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + if (compute_opts) + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - /* Get rcv window to advertise */ + tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); advertise_wnd = tcp_window_to_advertise (tc, next_state); flags = tcp_make_state_flags (next_state); @@ -899,18 +931,25 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); + opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); - opts_write_len = tcp_options_write ((u8 *) (th + 1), snd_opts); - - ASSERT (opts_write_len == tcp_opts_len); - - /* Tag the buffer with the connection index */ + ASSERT (opts_write_len == tc->snd_opts_len); vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + /* + * Update connection variables + */ + tc->snd_nxt += data_len; + tc->rcv_las = tc->rcv_nxt; + /* TODO this is updated in output as well ... */ - if (tc->snd_nxt > tc->snd_una_max) - tc->snd_una_max = tc->snd_nxt; + if (seq_gt (tc->snd_nxt, tc->snd_una_max)) + { + tc->snd_una_max = tc->snd_nxt; + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + } + TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -929,7 +968,6 @@ tcp_send_ack (tcp_connection_t * tc) /* Fill in the ACK */ tcp_make_ack (tc, b); - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } @@ -937,12 +975,11 @@ tcp_send_ack (tcp_connection_t * tc) void tcp_timer_delack_handler (u32 index) { - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; tc = tcp_connection_get (index, thread_index); tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; -// tc->flags &= ~TCP_CONN_DELACK; tcp_send_ack (tc); } @@ -950,13 +987,14 @@ tcp_timer_delack_handler (u32 index) * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit - * */ + */ u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, u32 offset, u32 max_bytes) { vlib_main_t *vm = vlib_get_main (); - u32 n_bytes = 0; + int n_bytes = 0; + u32 start; tcp_reuse_buffer (vm, b); @@ -964,27 +1002,33 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, ASSERT (max_bytes != 0); max_bytes = clib_min (tc->snd_mss, max_bytes); + start = tc->snd_una + offset; /* Start is beyond snd_congestion */ - if (seq_geq (tc->snd_una + offset, tc->snd_congestion)) + if (seq_geq (start, tc->snd_congestion)) goto done; /* Don't overshoot snd_congestion */ - if (seq_gt (tc->snd_nxt + max_bytes, tc->snd_congestion)) + if (seq_gt (start + max_bytes, tc->snd_congestion)) { - max_bytes = tc->snd_congestion - tc->snd_nxt; + max_bytes = tc->snd_congestion - start; if (max_bytes == 0) goto done; } + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + ASSERT (max_bytes <= tc->snd_mss); n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), offset, max_bytes); - ASSERT (n_bytes != 0); + ASSERT (n_bytes > 0); b->current_length = n_bytes; - tcp_push_hdr_i (tc, b, tc->state); + tcp_push_hdr_i (tc, b, tc->state, 0); + + if (tcp_in_fastrecovery (tc)) + tc->snd_rxt_bytes += n_bytes; done: TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); @@ -995,21 +1039,20 @@ done: * Reset congestion control, switch cwnd to loss window and try again. */ static void -tcp_rtx_timeout_cc_recover (tcp_connection_t * tc) +tcp_rtx_timeout_cc (tcp_connection_t * tc) { + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; + /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) - { - tcp_cc_recover (tc); - } - else - { - tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); - } + tcp_cc_fastrecovery_exit (tc); /* Start again from the beginning */ + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; + tcp_recovery_on (tc); } static void @@ -1017,10 +1060,10 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, snd_space, n_bytes; + u32 bi, n_bytes; if (is_syn) { @@ -1034,62 +1077,64 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + if (!tcp_in_recovery (tc) && tc->rto_boff > 0 + && tc->state >= TCP_STATE_ESTABLISHED) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + /* Increment RTO backoff (also equal to number of retries) */ tc->rto_boff += 1; /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) { + /* Lost FIN, retransmit and return */ + if (tc->flags & TCP_CONN_FINSNT) + { + tcp_send_fin (tc); + return; + } + /* First retransmit timeout */ if (tc->rto_boff == 1) - tcp_rtx_timeout_cc_recover (tc); + tcp_rtx_timeout_cc (tc); /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Figure out what and how many bytes we can send */ - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - if (snd_space == 0) - { - clib_warning ("no wnd to retransmit"); - tcp_return_buffer (tm); + /* Send one segment */ + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + /* TODO be less aggressive about this */ + scoreboard_clear (&tc->sack_sb); - /* Force one segment */ - tcp_retransmit_first_unacked (tc); + if (n_bytes == 0) + { + clib_warning ("could not retransmit anything"); + clib_warning ("%U", format_tcp_connection, tc, 2); - /* Re-enable retransmit timer. Output may be unwilling - * to do it for us */ + /* Try again eventually */ tcp_retransmit_timer_set (tc); - + ASSERT (0 || (tc->rto_boff > 1 + && tc->snd_una == tc->snd_congestion)); return; } - else - { - /* No fancy recovery for now! */ - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, snd_space); - scoreboard_clear (&tc->sack_sb); - - if (n_bytes == 0) - return; - tc->rtx_bytes += n_bytes; - } + /* For first retransmit, record timestamp (Eifel detection RFC3522) */ + if (tc->rto_boff == 1) + tc->snd_rxt_ts = tcp_time_now (); } - else + /* Retransmit for SYN/SYNACK */ + else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) { - /* Retransmit for SYN/SYNACK */ - ASSERT (tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_SYN_SENT); - /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ if (tc->rto_boff > TCP_RTO_SYN_RETRIES) @@ -1097,11 +1142,17 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_push_hdr_i (tc, b, tc->state); + tcp_push_hdr_i (tc, b, tc->state, 1); /* Account for the SYN */ tc->snd_nxt += 1; } + else + { + ASSERT (tc->state == TCP_STATE_CLOSED); + clib_warning ("connection closed ..."); + return; + } if (!is_syn) { @@ -1114,6 +1165,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_SYN_SENT); + TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc); + /* This goes straight to ipx_lookup */ tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); @@ -1136,6 +1189,70 @@ tcp_timer_retransmit_syn_handler (u32 index) tcp_timer_retransmit_handler_i (index, 1); } +/** + * Got 0 snd_wnd from peer, try to do something about it. + * + */ +void +tcp_timer_persist_handler (u32 index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi, old_snd_nxt; + int n_bytes = 0; + + tc = tcp_connection_get_if_valid (index, thread_index); + + if (!tc) + return; + + /* Make sure timer handle is set to invalid */ + tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; + + /* Problem already solved or worse */ + if (tc->state == TCP_STATE_CLOSED + || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + return; + + /* Increment RTO backoff */ + tc->rto_boff += 1; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + /* Try to force the first unsent segment */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (b), + tc->snd_una_max - tc->snd_una, + tc->snd_mss); + /* Nothing to send */ + if (n_bytes <= 0) + { + clib_warning ("persist found nothing to send"); + tcp_return_buffer (tm); + return; + } + + b->current_length = n_bytes; + ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 + || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + + /* Allow updating of snd_una_max but don't update snd_nxt */ + old_snd_nxt = tc->snd_nxt; + tcp_push_hdr_i (tc, b, tc->state, 0); + tc->snd_nxt = old_snd_nxt; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + /* Re-enable persist timer */ + tcp_persist_timer_set (tc); +} + /** * Retransmit first unacked segment */ @@ -1145,8 +1262,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, n_bytes, old_snd_nxt; + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; /* Get buffer */ @@ -1157,73 +1275,117 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); if (n_bytes == 0) - return; + { + tcp_return_buffer (tm); + goto done; + } tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); - tc->rtx_bytes += n_bytes; + +done: + tc->snd_nxt = old_snd_nxt; } -sack_scoreboard_hole_t * -scoreboard_first_rtx_hole (sack_scoreboard_t * sb) +/** + * Do fast retransmit with SACKs + */ +void +tcp_fast_retransmit_sack (tcp_connection_t * tc) { - sack_scoreboard_hole_t *hole = 0; - -// hole = scoreboard_first_hole (&tc->sack_sb); -// if (hole) -// { -// -// offset = hole->start - tc->snd_una; -// hole_size = hole->end - hole->start; -// -// ASSERT(hole_size); -// -// if (hole_size < max_bytes) -// max_bytes = hole_size; -// } - return hole; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset = 0, max_bytes; + vlib_buffer_t *b; + sack_scoreboard_hole_t *hole; + sack_scoreboard_t *sb; + u32 bi, old_snd_nxt; + int snd_space; + u8 snd_limited = 0, can_rescue = 0; + + ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + snd_space = tcp_available_snd_space (tc); + + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + while (hole && snd_space > 0) + { + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + hole = scoreboard_next_rxt_hole (sb, hole, + tcp_fastrecovery_sent_1_smss (tc), + &can_rescue, &snd_limited); + if (!hole) + { + if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) + || seq_gt (sb->rescue_rxt, + tc->snd_congestion))) + break; + + /* If rescue rxt undefined or less than snd_una then one segment of + * up to SMSS octets that MUST include the highest outstanding + * unSACKed sequence number SHOULD be returned, and RescueRxt set to + * RecoveryPoint. HighRxt MUST NOT be updated. + */ + max_bytes = clib_min (tc->snd_mss, snd_space); + offset = tc->snd_congestion - tc->snd_una - max_bytes; + sb->rescue_rxt = tc->snd_congestion; + tc->snd_nxt = tc->snd_una + offset; + tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + break; + } + + max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; + offset = sb->high_rxt - tc->snd_una; + tc->snd_nxt = tc->snd_una + offset; + n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + + /* Nothing left to retransmit */ + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } + + sb->high_rxt += n_written; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + snd_space -= n_written; + } + + /* If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; } /** - * Do fast retransmit. + * Fast retransmit without SACK info */ void -tcp_fast_retransmit (tcp_connection_t * tc) +tcp_fast_retransmit_no_sack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 bi; + u32 n_written = 0, offset = 0, bi, old_snd_nxt; int snd_space; - u32 n_written = 0, offset = 0; vlib_buffer_t *b; - u8 use_sacks = 0; ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); /* Start resending from first un-acked segment */ + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - - /* If we have SACKs use them */ - if (tcp_opts_sack_permitted (&tc->opt) - && scoreboard_first_hole (&tc->sack_sb)) - use_sacks = 0; while (snd_space > 0) { tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); - if (use_sacks) - { - scoreboard_first_rtx_hole (&tc->sack_sb); - } - else - { - offset += n_written; - } - + offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); /* Nothing left to retransmit */ @@ -1234,13 +1396,24 @@ tcp_fast_retransmit (tcp_connection_t * tc) } tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); - tc->rtx_bytes += n_written; snd_space -= n_written; } - /* If window allows, send 1 SMSS of new data */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tc->snd_nxt = tc->snd_congestion; + /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Do fast retransmit + */ +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) + tcp_fast_retransmit_sack (tc); + else + tcp_fast_retransmit_no_sack (tc); } always_inline u32 @@ -1257,7 +1430,7 @@ tcp46_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1275,7 +1448,8 @@ tcp46_output_inline (vlib_main_t * vm, u32 bi0; vlib_buffer_t *b0; tcp_connection_t *tc0; - tcp_header_t *th0; + tcp_tx_trace_t *t0; + tcp_header_t *th0 = 0; u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1329,9 +1503,6 @@ tcp46_output_inline (vlib_main_t * vm, } } - /* Retransmitted SYNs do reach this but it should be harmless */ - tc0->rcv_las = tc0->rcv_nxt; - /* Stop DELACK timer and fix flags */ tc0->flags &= ~(TCP_CONN_SNDACK); if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) @@ -1340,7 +1511,7 @@ tcp46_output_inline (vlib_main_t * vm, } /* If not retransmitting - * 1) update snd_una_max (SYN, SYNACK, new data, FIN) + * 1) update snd_una_max (SYN, SYNACK, FIN) * 2) If we're not tracking an ACK, start tracking */ if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) { @@ -1371,7 +1542,17 @@ tcp46_output_inline (vlib_main_t * vm, b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + if (th0) + { + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + } + else + { + memset (&t0->tcp_header, 0, sizeof (t0->tcp_header)); + } + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1447,7 +1628,13 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tcp_connection_t *tc; tc = (tcp_connection_t *) tconn; - tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED); + tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + + if (tc->rtt_ts == 0) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } return 0; } @@ -1471,7 +1658,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1488,6 +1675,8 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_tx_trace_t *t0; + tcp_header_t *th0; u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1516,7 +1705,13 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + th0 = vlib_buffer_get_current (b0); + if (is_ip4) + th0 = ip4_next_header ((ip4_header_t *) th0); + else + th0 = ip6_next_header ((ip6_header_t *) th0); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1554,6 +1749,7 @@ VLIB_REGISTER_NODE (tcp4_reset_node) = { foreach_tcp4_reset_next #undef _ }, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */ @@ -1572,6 +1768,7 @@ VLIB_REGISTER_NODE (tcp6_reset_node) = { foreach_tcp6_reset_next #undef _ }, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */