X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_output.c;h=b418e8baa5498d34687b9dcc5a67b7aa7318fa0f;hb=2c25a62cc1cc4937165de740a3b32d78429c72d6;hp=aa43e9f37f0ceb54406916189d9bb84252519254;hpb=d79b41e993981df80245b0e6d90eb691bdaae648;p=vpp.git diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index aa43e9f37f0..b418e8baa54 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -19,7 +19,7 @@ vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; -typedef enum _tcp_output_nect +typedef enum _tcp_output_next { TCP_OUTPUT_NEXT_DROP, TCP_OUTPUT_NEXT_IP_LOOKUP, @@ -42,44 +42,28 @@ static char *tcp_error_strings[] = { typedef struct { - u16 src_port; - u16 dst_port; - u8 state; + tcp_header_t tcp_header; + tcp_connection_t tcp_connection; } tcp_tx_trace_t; -u16 dummy_mtu = 400; +u16 dummy_mtu = 1460; u8 * format_tcp_tx_trace (u8 * s, va_list * args) { CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *); + uword indent = format_get_indent (s); - s = format (s, "TBD\n"); + s = format (s, "%U\n%U%U", + format_tcp_header, &t->tcp_header, 128, + format_white_space, indent, + format_tcp_connection, &t->tcp_connection, 1); return s; } -void -tcp_set_snd_mss (tcp_connection_t * tc) -{ - u16 snd_mss; - - /* TODO find our iface MTU */ - snd_mss = dummy_mtu; - - /* TODO cache mss and consider PMTU discovery */ - snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss; - - tc->snd_mss = snd_mss; - - if (tc->snd_mss == 0) - { - clib_warning ("snd mss is 0"); - tc->snd_mss = dummy_mtu; - } -} - static u8 tcp_window_compute_scale (u32 available_space) { @@ -96,7 +80,7 @@ tcp_window_compute_scale (u32 available_space) always_inline u32 tcp_initial_wnd_unscaled (tcp_connection_t * tc) { - return TCP_IW_N_SEGMENTS * dummy_mtu; + return TCP_IW_N_SEGMENTS * tc->mss; } /** @@ -125,43 +109,67 @@ tcp_initial_window_to_advertise (tcp_connection_t * tc) u32 tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) { - u32 available_space, max_fifo, observed_wnd; - if (state < TCP_STATE_ESTABLISHED) return tcp_initial_window_to_advertise (tc); + tcp_update_rcv_wnd (tc); + + if (tc->rcv_wnd == 0) + { + tc->flags |= TCP_CONN_SENT_RCV_WND0; + } + else + { + tc->flags &= ~TCP_CONN_SENT_RCV_WND0; + } + + return tc->rcv_wnd >> tc->rcv_wscale; +} + +void +tcp_update_rcv_wnd (tcp_connection_t * tc) +{ + i32 observed_wnd; + u32 available_space, max_fifo, wnd; + /* * Figure out how much space we have available */ - available_space = stream_session_max_enqueue (&tc->connection); - max_fifo = stream_session_fifo_size (&tc->connection); + available_space = stream_session_max_rx_enqueue (&tc->connection); + max_fifo = stream_session_rx_fifo_size (&tc->connection); - ASSERT (tc->opt.mss < max_fifo); - - if (available_space < tc->opt.mss && available_space < max_fifo / 8) + ASSERT (tc->rcv_opts.mss < max_fifo); + if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3) available_space = 0; /* * Use the above and what we know about what we've previously advertised * to compute the new window */ - observed_wnd = tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); + observed_wnd = (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); + if (observed_wnd < 0) + observed_wnd = 0; /* Bad. Thou shalt not shrink */ if (available_space < observed_wnd) { - if (available_space == 0) - clib_warning ("Didn't shrink rcv window despite not having space"); + wnd = observed_wnd; + TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space); + } + else + { + wnd = available_space; } - tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); - - if (tc->rcv_wnd == 0) + /* Make sure we have a multiple of rcv_wscale */ + if (wnd && tc->rcv_wscale) { - tc->flags |= TCP_CONN_SENT_RCV_WND0; + wnd &= ~(1 << tc->rcv_wscale); + if (wnd == 0) + wnd = 1 << tc->rcv_wscale; } - return tc->rcv_wnd >> tc->rcv_wscale; + tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); } /** @@ -268,8 +276,11 @@ tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) opts->tsecr = 0; len += TCP_OPTION_LEN_TIMESTAMP; - opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; - len += TCP_OPTION_LEN_SACK_PERMITTED; + if (TCP_USE_SACKS) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } /* Align to needed boundary */ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; @@ -282,17 +293,17 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) u8 len = 0; opts->flags |= TCP_OPTS_FLAG_MSS; - opts->mss = dummy_mtu; /*XXX discover that */ + opts->mss = tc->mss; len += TCP_OPTION_LEN_MSS; - if (tcp_opts_wscale (&tc->opt)) + if (tcp_opts_wscale (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_WSCALE; opts->wscale = tc->rcv_wscale; len += TCP_OPTION_LEN_WINDOW_SCALE; } - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); @@ -300,7 +311,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; len += TCP_OPTION_LEN_SACK_PERMITTED; @@ -318,20 +329,21 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) opts->flags = 0; - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); opts->tsecr = tc->tsval_recent; len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { if (vec_len (tc->snd_sacks)) { opts->flags |= TCP_OPTS_FLAG_SACK; opts->sacks = tc->snd_sacks; - opts->n_sack_blocks = vec_len (tc->snd_sacks); + opts->n_sack_blocks = clib_min (vec_len (tc->snd_sacks), + TCP_OPTS_MAX_SACK_BLOCKS); len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks; } } @@ -360,19 +372,74 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, } } +/** + * Update max segment size we're able to process. + * + * The value is constrained by our interface's MTU and IP options. It is + * also what we advertise to our peer. + */ +void +tcp_update_rcv_mss (tcp_connection_t * tc) +{ + /* TODO find our iface MTU */ + tc->mss = dummy_mtu; +} + +/** + * Update snd_mss to reflect the effective segment size that we can send + * by taking into account all TCP options, including SACKs + */ +void +tcp_update_snd_mss (tcp_connection_t * tc) +{ + /* Compute options to be used for connection. These may be reused when + * sending data or to compute the effective mss (snd_mss) */ + tc->snd_opts_len = + tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); + + /* XXX check if MTU has been updated */ + tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len; + ASSERT (tc->snd_mss > 0); +} + +void +tcp_init_mss (tcp_connection_t * tc) +{ + u16 default_min_mss = 536; + tcp_update_rcv_mss (tc); + + /* TODO cache mss and consider PMTU discovery */ + tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss); + + if (tc->snd_mss < 45) + { + clib_warning ("snd mss is 0"); + /* Assume that at least the min default mss works */ + tc->snd_mss = default_min_mss; + tc->rcv_opts.mss = default_min_mss; + } + + /* We should have enough space for 40 bytes of options */ + ASSERT (tc->snd_mss > 45); + + /* If we use timestamp option, account for it */ + if (tcp_opts_tstamp (&tc->rcv_opts)) + tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; +} + #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = tm->vlib_main->cpu_index; \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ vec_validate (my_tx_buffers, n_free_buffers - 1); \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - tm->vlib_main, my_tx_buffers, n_free_buffers, \ + vlib_get_main(), my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[cpu_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -381,6 +448,14 @@ do { \ _vec_len (my_tx_buffers) -= 1; \ } while (0) +#define tcp_return_buffer(tm) \ +do { \ + u32 *my_tx_buffers; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ + _vec_len (my_tx_buffers) +=1; \ +} while (0) + always_inline void tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { @@ -421,8 +496,6 @@ tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd); tcp_options_write ((u8 *) (th + 1), snd_opts); - - /* Mark as ACK */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; } @@ -432,12 +505,13 @@ tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, void tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); + TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + tc->rcv_las = tc->rcv_nxt; } /** @@ -446,17 +520,12 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 flags = 0; tcp_reuse_buffer (vm, b); - if (tc->rcv_las == tc->rcv_nxt) - flags = TCP_FLAG_FIN; - else - flags = TCP_FLAG_FIN | TCP_FLAG_ACK; - + flags = TCP_FLAG_FIN | TCP_FLAG_ACK; tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, flags); /* Reset flags, make sure ack is sent */ @@ -471,8 +540,7 @@ tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) void tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_options_t _snd_opts, *snd_opts = &_snd_opts; u8 tcp_opts_len, tcp_hdr_opts_len; tcp_header_t *th; @@ -536,54 +604,41 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, int tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, - tcp_state_t state, u32 my_thread_index, u8 is_ip4) + tcp_state_t state, u8 thread_index, u8 is_ip4) { - u8 tcp_hdr_len = sizeof (tcp_header_t); ip4_header_t *ih4; ip6_header_t *ih6; tcp_header_t *th0; - ip4_address_t src_ip40; - ip6_address_t src_ip60; - u16 src_port0; + ip4_address_t src_ip40, dst_ip40; + ip6_address_t src_ip60, dst_ip60; + u16 src_port, dst_port; u32 tmp; + u32 seq, ack; + u8 flags; /* Find IP and TCP headers */ - if (is_ip4) - { - ih4 = vlib_buffer_get_current (b0); - th0 = ip4_next_header (ih4); - } - else - { - ih6 = vlib_buffer_get_current (b0); - th0 = ip6_next_header (ih6); - } + th0 = tcp_buffer_hdr (b0); - /* Swap src and dst ip */ + /* Save src and dst ip */ if (is_ip4) { + ih4 = vlib_buffer_get_current (b0); ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40); src_ip40.as_u32 = ih4->src_address.as_u32; - ih4->src_address.as_u32 = ih4->dst_address.as_u32; - ih4->dst_address.as_u32 = src_ip40.as_u32; - - /* Chop the end of the pkt */ - b0->current_length += ip4_header_bytes (ih4) + tcp_hdr_len; + dst_ip40.as_u32 = ih4->dst_address.as_u32; } else { + ih6 = vlib_buffer_get_current (b0); ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60); clib_memcpy (&src_ip60, &ih6->src_address, sizeof (ip6_address_t)); - clib_memcpy (&ih6->src_address, &ih6->dst_address, - sizeof (ip6_address_t)); - clib_memcpy (&ih6->dst_address, &src_ip60, sizeof (ip6_address_t)); - - /* Chop the end of the pkt */ - b0->current_length += sizeof (ip6_header_t) + tcp_hdr_len; + clib_memcpy (&dst_ip60, &ih6->dst_address, sizeof (ip6_address_t)); } - /* Try to determine what/why we're actually resetting and swap - * src and dst ports */ + src_port = th0->src_port; + dst_port = th0->dst_port; + + /* Try to determine what/why we're actually resetting */ if (state == TCP_STATE_CLOSED) { if (!tcp_syn (th0)) @@ -592,33 +647,32 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, tmp = clib_net_to_host_u32 (th0->seq_number); /* Got a SYN for no listener. */ - th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; - th0->ack_number = clib_host_to_net_u32 (tmp + 1); - th0->seq_number = 0; - + flags = TCP_FLAG_RST | TCP_FLAG_ACK; + ack = clib_host_to_net_u32 (tmp + 1); + seq = 0; } - else if (state >= TCP_STATE_SYN_SENT) + else { - th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; - th0->seq_number = th0->ack_number; - th0->ack_number = 0; + flags = TCP_FLAG_RST; + seq = th0->ack_number; + ack = 0; } - src_port0 = th0->src_port; - th0->src_port = th0->dst_port; - th0->dst_port = src_port0; - th0->window = 0; - th0->data_offset_and_reserved = (tcp_hdr_len >> 2) << 4; - th0->urgent_pointer = 0; + tcp_reuse_buffer (vm, b0); + th0 = vlib_buffer_push_tcp_net_order (b0, dst_port, src_port, seq, ack, + sizeof (tcp_header_t), flags, 0); - /* Compute checksum */ if (is_ip4) { + ih4 = vlib_buffer_push_ip4 (vm, b0, &dst_ip40, &src_ip40, + IP_PROTOCOL_TCP); th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4); } else { int bogus = ~0; + ih6 = vlib_buffer_push_ip6 (vm, b0, &dst_ip60, &src_ip60, + IP_PROTOCOL_TCP); th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih6, &bogus); ASSERT (!bogus); } @@ -635,7 +689,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 tcp_hdr_len, flags = 0; tcp_header_t *th, *pkt_th; u32 seq, ack; @@ -706,23 +760,22 @@ void tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) { tcp_header_t *th = vlib_buffer_get_current (b); - + vlib_main_t *vm = vlib_get_main (); if (tc->c_is_ip4) { ip4_header_t *ih; - ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4, + ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4, &tc->c_rmt_ip4, IP_PROTOCOL_TCP); - th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih); } else { ip6_header_t *ih; int bogus = ~0; - ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6, + ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6, &tc->c_rmt_ip6, IP_PROTOCOL_TCP); - th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih, - &bogus); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus); ASSERT (!bogus); } } @@ -740,7 +793,7 @@ tcp_send_syn (tcp_connection_t * tc) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); u8 tcp_hdr_opts_len, tcp_opts_len; tcp_header_t *th; u32 time_now; @@ -799,9 +852,16 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) /* Decide where to send the packet */ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; - f = vlib_get_frame_to_node (vm, next_index); + + /* Initialize the trajectory trace, if configured */ + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } /* Enqueue the packet */ + f = vlib_get_frame_to_node (vm, next_index); to_next = vlib_frame_vector_args (f); to_next[0] = bi; f->n_vectors = 1; @@ -817,7 +877,7 @@ tcp_send_fin (tcp_connection_t * tc) vlib_buffer_t *b; u32 bi; tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; + vlib_main_t *vm = vlib_get_main (); tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); @@ -828,6 +888,8 @@ tcp_send_fin (tcp_connection_t * tc) tcp_make_fin (tc, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tcp_retransmit_timer_force_update (tc); + TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } always_inline u8 @@ -855,22 +917,19 @@ tcp_make_state_flags (tcp_state_t next_state) */ static void tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, - tcp_state_t next_state) + tcp_state_t next_state, u8 compute_opts) { u32 advertise_wnd, data_len; - u8 tcp_opts_len, tcp_hdr_opts_len, opts_write_len, flags; - tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_hdr_opts_len, opts_write_len, flags; tcp_header_t *th; - data_len = b->current_length; + data_len = b->current_length + b->total_length_not_including_first_buffer; vnet_buffer (b)->tcp.flags = 0; - /* Make and write options */ - memset (snd_opts, 0, sizeof (*snd_opts)); - tcp_opts_len = tcp_make_options (tc, snd_opts, next_state); - tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + if (compute_opts) + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - /* Get rcv window to advertise */ + tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); advertise_wnd = tcp_window_to_advertise (tc, next_state); flags = tcp_make_state_flags (next_state); @@ -878,103 +937,140 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); + opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); - opts_write_len = tcp_options_write ((u8 *) (th + 1), snd_opts); - - ASSERT (opts_write_len == tcp_opts_len); - - /* Tag the buffer with the connection index */ + ASSERT (opts_write_len == tc->snd_opts_len); vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + /* + * Update connection variables + */ + tc->snd_nxt += data_len; + tc->rcv_las = tc->rcv_nxt; + + /* TODO this is updated in output as well ... */ + if (seq_gt (tc->snd_nxt, tc->snd_una_max)) + { + tc->snd_una_max = tc->snd_nxt; + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + } + + TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } -/* Send delayed ACK when timer expires */ void -tcp_timer_delack_handler (u32 index) +tcp_send_ack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; - u32 thread_index = os_get_cpu_number (); - tcp_connection_t *tc; + vlib_main_t *vm = vlib_get_main (); + vlib_buffer_t *b; u32 bi; - tc = tcp_connection_get (index, thread_index); - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); /* Fill in the ACK */ tcp_make_ack (tc, b); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); +} - tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; - tc->flags &= ~TCP_CONN_DELACK; +/* Send delayed ACK when timer expires */ +void +tcp_timer_delack_handler (u32 index) +{ + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tc = tcp_connection_get (index, thread_index); + tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; + tcp_send_ack (tc); } /** Build a retransmit segment * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit - * */ + */ u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 max_bytes) + u32 offset, u32 max_bytes) { - tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; - u32 n_bytes, offset = 0; - sack_scoreboard_hole_t *hole; - u32 hole_size; + vlib_main_t *vm = vlib_get_main (); + int n_bytes = 0; + u32 start; tcp_reuse_buffer (vm, b); ASSERT (tc->state >= TCP_STATE_ESTABLISHED); ASSERT (max_bytes != 0); - if (tcp_opts_sack_permitted (&tc->opt)) - { - /* XXX get first hole not retransmitted yet */ - hole = scoreboard_first_hole (&tc->sack_sb); - if (!hole) - return 0; + max_bytes = clib_min (tc->snd_mss, max_bytes); + start = tc->snd_una + offset; - offset = hole->start - tc->snd_una; - hole_size = hole->end - hole->start; + /* Start is beyond snd_congestion */ + if (seq_geq (start, tc->snd_congestion)) + goto done; - ASSERT (hole_size); - - if (hole_size < max_bytes) - max_bytes = hole_size; - } - else + /* Don't overshoot snd_congestion */ + if (seq_gt (start + max_bytes, tc->snd_congestion)) { - if (seq_geq (tc->snd_nxt, tc->snd_una_max)) - return 0; + max_bytes = tc->snd_congestion - start; + if (max_bytes == 0) + goto done; } + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + + ASSERT (max_bytes <= tc->snd_mss); + n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), offset, max_bytes); - ASSERT (n_bytes != 0); + ASSERT (n_bytes > 0); + b->current_length = n_bytes; + tcp_push_hdr_i (tc, b, tc->state, 0); - tcp_push_hdr_i (tc, b, tc->state); + if (tcp_in_fastrecovery (tc)) + tc->snd_rxt_bytes += n_bytes; +done: + TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); return n_bytes; } +/** + * Reset congestion control, switch cwnd to loss window and try again. + */ +static void +tcp_rtx_timeout_cc (tcp_connection_t * tc) +{ + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; + + /* Cleanly recover cc (also clears up fast retransmit) */ + if (tcp_in_fastrecovery (tc)) + tcp_cc_fastrecovery_exit (tc); + + /* Start again from the beginning */ + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); + tc->cwnd = tcp_loss_wnd (tc); + tc->snd_congestion = tc->snd_una_max; + + tcp_recovery_on (tc); +} + static void tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); - vlib_main_t *vm = tm->vlib_main; - u32 thread_index = os_get_cpu_number (); + vlib_main_t *vm = vlib_get_main (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, max_bytes, snd_space; + u32 bi, n_bytes; if (is_syn) { @@ -988,45 +1084,64 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + if (!tcp_in_recovery (tc) && tc->rto_boff > 0 + && tc->state >= TCP_STATE_ESTABLISHED) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + /* Increment RTO backoff (also equal to number of retries) */ tc->rto_boff += 1; /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) { - tcp_fastrecovery_off (tc); + /* Lost FIN, retransmit and return */ + if (tc->flags & TCP_CONN_FINSNT) + { + tcp_send_fin (tc); + return; + } + + /* First retransmit timeout */ + if (tc->rto_boff == 1) + tcp_rtx_timeout_cc (tc); /* Exponential backoff */ tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Figure out what and how many bytes we can send */ - snd_space = tcp_available_snd_space (tc); - max_bytes = clib_min (tc->snd_mss, snd_space); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - if (max_bytes == 0) + /* Send one segment */ + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + /* TODO be less aggressive about this */ + scoreboard_clear (&tc->sack_sb); + + if (n_bytes == 0) { - clib_warning ("no wnd to retransmit"); + clib_warning ("could not retransmit anything"); + clib_warning ("%U", format_tcp_connection, tc, 2); + + /* Try again eventually */ + tcp_retransmit_timer_set (tc); + ASSERT (0 || (tc->rto_boff > 1 + && tc->snd_una == tc->snd_congestion)); return; } - tcp_prepare_retransmit_segment (tc, b, max_bytes); - tc->rtx_bytes += max_bytes; - - /* No fancy recovery for now! */ - scoreboard_clear (&tc->sack_sb); + /* For first retransmit, record timestamp (Eifel detection RFC3522) */ + if (tc->rto_boff == 1) + tc->snd_rxt_ts = tcp_time_now (); } - else + /* Retransmit for SYN/SYNACK */ + else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) { - /* Retransmit for SYN/SYNACK */ - ASSERT (tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_SYN_SENT); - /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ if (tc->rto_boff > TCP_RTO_SYN_RETRIES) @@ -1034,10 +1149,17 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_push_hdr_i (tc, b, tc->state); + tcp_push_hdr_i (tc, b, tc->state, 1); /* Account for the SYN */ tc->snd_nxt += 1; + tc->rtt_ts = 0; + } + else + { + ASSERT (tc->state == TCP_STATE_CLOSED); + clib_warning ("connection closed ..."); + return; } if (!is_syn) @@ -1051,6 +1173,8 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_SYN_SENT); + TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc); + /* This goes straight to ipx_lookup */ tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); @@ -1074,63 +1198,230 @@ tcp_timer_retransmit_syn_handler (u32 index) } /** - * Retansmit first unacked segment */ + * Got 0 snd_wnd from peer, try to do something about it. + * + */ +void +tcp_timer_persist_handler (u32 index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 thread_index = vlib_get_thread_index (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi, old_snd_nxt; + int n_bytes = 0; + + tc = tcp_connection_get_if_valid (index, thread_index); + + if (!tc) + return; + + /* Make sure timer handle is set to invalid */ + tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; + + /* Problem already solved or worse */ + if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED + || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + return; + + /* Increment RTO backoff */ + tc->rto_boff += 1; + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + /* Try to force the first unsent segment */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (b), + tc->snd_una_max - tc->snd_una, + tc->snd_mss); + /* Nothing to send */ + if (n_bytes <= 0) + { + // clib_warning ("persist found nothing to send"); + tcp_return_buffer (tm); + return; + } + + b->current_length = n_bytes; + ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 + || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + + /* Allow updating of snd_una_max but don't update snd_nxt */ + old_snd_nxt = tc->snd_nxt; + tcp_push_hdr_i (tc, b, tc->state, 0); + tc->snd_nxt = old_snd_nxt; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + /* Re-enable persist timer */ + tcp_persist_timer_set (tc); +} + +/** + * Retransmit first unacked segment + */ void tcp_retransmit_first_unacked (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - u32 snd_nxt = tc->snd_nxt; + vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi; + u32 bi, n_bytes, old_snd_nxt; + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (tm->vlib_main, bi); + b = vlib_get_buffer (vm, bi); - tcp_prepare_retransmit_segment (tc, b, tc->snd_mss); - tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); - tc->snd_nxt = snd_nxt; - tc->rtx_bytes += tc->snd_mss; + n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + if (n_bytes == 0) + { + tcp_return_buffer (tm); + goto done; + } + + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + +done: + tc->snd_nxt = old_snd_nxt; } +/** + * Do fast retransmit with SACKs + */ void -tcp_fast_retransmit (tcp_connection_t * tc) +tcp_fast_retransmit_sack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); - u32 snd_space, max_bytes, n_bytes, bi; + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset = 0, max_bytes; vlib_buffer_t *b; + sack_scoreboard_hole_t *hole; + sack_scoreboard_t *sb; + u32 bi, old_snd_nxt; + int snd_space; + u8 snd_limited = 0, can_rescue = 0; ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + snd_space = tcp_available_snd_space (tc); - clib_warning ("fast retransmit!"); + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + while (hole && snd_space > 0) + { + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + hole = scoreboard_next_rxt_hole (sb, hole, + tcp_fastrecovery_sent_1_smss (tc), + &can_rescue, &snd_limited); + if (!hole) + { + if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) + || seq_gt (sb->rescue_rxt, + tc->snd_congestion))) + break; + + /* If rescue rxt undefined or less than snd_una then one segment of + * up to SMSS octets that MUST include the highest outstanding + * unSACKed sequence number SHOULD be returned, and RescueRxt set to + * RecoveryPoint. HighRxt MUST NOT be updated. + */ + max_bytes = clib_min (tc->snd_mss, snd_space); + offset = tc->snd_congestion - tc->snd_una - max_bytes; + sb->rescue_rxt = tc->snd_congestion; + tc->snd_nxt = tc->snd_una + offset; + tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + break; + } + + max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; + offset = sb->high_rxt - tc->snd_una; + tc->snd_nxt = tc->snd_una + offset; + n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + + /* Nothing left to retransmit */ + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } + + sb->high_rxt += n_written; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + snd_space -= n_written; + } + + /* If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Fast retransmit without SACK info + */ +void +tcp_fast_retransmit_no_sack (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset = 0, bi, old_snd_nxt; + int snd_space; + vlib_buffer_t *b; + + ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); /* Start resending from first un-acked segment */ + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_snd_space (tc); - while (snd_space) + while (snd_space > 0) { tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (tm->vlib_main, bi); + b = vlib_get_buffer (vm, bi); - max_bytes = clib_min (tc->snd_mss, snd_space); - n_bytes = tcp_prepare_retransmit_segment (tc, b, max_bytes); + offset += n_written; + n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); /* Nothing left to retransmit */ - if (n_bytes == 0) - return; - - tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } - snd_space -= n_bytes; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + snd_space -= n_written; } - /* If window allows, send new data */ - tc->snd_nxt = tc->snd_una_max; + /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Do fast retransmit + */ +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) + tcp_fast_retransmit_sack (tc); + else + tcp_fast_retransmit_no_sack (tc); } always_inline u32 @@ -1147,7 +1438,7 @@ tcp46_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1165,7 +1456,8 @@ tcp46_output_inline (vlib_main_t * vm, u32 bi0; vlib_buffer_t *b0; tcp_connection_t *tc0; - tcp_header_t *th0; + tcp_tx_trace_t *t0; + tcp_header_t *th0 = 0; u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1186,6 +1478,7 @@ tcp46_output_inline (vlib_main_t * vm, } th0 = vlib_buffer_get_current (b0); + TCP_EVT_DBG (TCP_EVT_OUTPUT, tc0, th0->flags, b0->current_length); if (is_ip4) { @@ -1210,8 +1503,6 @@ tcp46_output_inline (vlib_main_t * vm, if (PREDICT_FALSE (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) { - ASSERT (tc0->snt_dupacks > 0); - tc0->snt_dupacks--; if (!tcp_session_has_ooo_data (tc0)) { error0 = TCP_ERROR_FILTERED_DUPACKS; @@ -1220,19 +1511,12 @@ tcp46_output_inline (vlib_main_t * vm, } } - /* Retransmitted SYNs do reach this but it should be harmless */ - tc0->rcv_las = tc0->rcv_nxt; - /* Stop DELACK timer and fix flags */ - tc0->flags &= - ~(TCP_CONN_SNDACK | TCP_CONN_DELACK | TCP_CONN_BURSTACK); - if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) - { - tcp_timer_reset (tc0, TCP_TIMER_DELACK); - } + tc0->flags &= ~(TCP_CONN_SNDACK); + tcp_timer_reset (tc0, TCP_TIMER_DELACK); /* If not retransmitting - * 1) update snd_una_max (SYN, SYNACK, new data, FIN) + * 1) update snd_una_max (SYN, SYNACK, FIN) * 2) If we're not tracking an ACK, start tracking */ if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) { @@ -1242,22 +1526,6 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rtt_ts = tcp_time_now (); tc0->rtt_seq = tc0->snd_nxt; } - - if (1) - { - ELOG_TYPE_DECLARE (e) = - { - .format = - "output: snd_una %u snd_una_max %u",.format_args = - "i4i4",}; - struct - { - u32 data[2]; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->data[0] = tc0->snd_una - tc0->iss; - ed->data[1] = tc0->snd_una_max - tc0->iss; - } } /* Set the retransmit timer if not set already and not @@ -1269,18 +1537,48 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rto_boff = 0; } - /* set fib index to default and lookup node */ - /* XXX network virtualization (vrf/vni) */ +#if 0 + /* Make sure we haven't lost route to our peer */ + if (PREDICT_FALSE (tc0->last_fib_check + < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD)) + { + if (PREDICT_TRUE + (tc0->c_rmt_fei == tcp_lookup_rmt_in_fib (tc0))) + { + tc0->last_fib_check = tc0->snd_opts.tsval; + } + else + { + clib_warning ("lost connection to peer"); + tcp_connection_reset (tc0); + goto done; + } + } + + /* Use pre-computed dpo to set next node */ + next0 = tc0->c_rmt_dpo.dpoi_next_node; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index; +#endif + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; - done: - b0->error = error0 != 0 ? node->errors[error0] : 0; + b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + if (th0) + { + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + } + else + { + memset (&t0->tcp_header, 0, sizeof (t0->tcp_header)); + } + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1307,40 +1605,62 @@ tcp6_output (vlib_main_t * vm, vlib_node_runtime_t * node, return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ ); } +/* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp4_output_node) = { .function = tcp4_output,.name = "tcp4-output", /* Takes a vector of packets. */ - .vector_size = sizeof (u32),.n_errors = TCP_N_ERROR,.error_strings = - tcp_error_strings,.n_next_nodes = TCP_OUTPUT_N_NEXT,.next_nodes = - { + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_OUTPUT_N_NEXT, + .next_nodes = { #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, foreach_tcp4_output_next #undef _ - } -,.format_buffer = format_tcp_header,.format_trace = format_tcp_tx_trace,}; + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_tx_trace, +}; +/* *INDENT-ON* */ -VLIB_NODE_FUNCTION_MULTIARCH (tcp4_output_node, tcp4_output) +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_output_node, tcp4_output); + +/* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp6_output_node) = { - .function = tcp6_output,.name = "tcp6-output", + .function = tcp6_output, + .name = "tcp6-output", /* Takes a vector of packets. */ - .vector_size = sizeof (u32),.n_errors = TCP_N_ERROR,.error_strings = - tcp_error_strings,.n_next_nodes = TCP_OUTPUT_N_NEXT,.next_nodes = - { + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_OUTPUT_N_NEXT, + .next_nodes = { #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, foreach_tcp6_output_next #undef _ - } -,.format_buffer = format_tcp_header,.format_trace = format_tcp_tx_trace,}; + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_tx_trace, +}; +/* *INDENT-ON* */ -VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output) u32 +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output); + +u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) { tcp_connection_t *tc; tc = (tcp_connection_t *) tconn; - tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED); + tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + + if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } return 0; } @@ -1364,7 +1684,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1381,6 +1701,8 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_tx_trace_t *t0; + tcp_header_t *th0; u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; bi0 = from[0]; @@ -1405,11 +1727,17 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, next0 = TCP_RESET_NEXT_IP_LOOKUP; done: - b0->error = error0 != 0 ? node->errors[error0] : 0; + b0->error = node->errors[error0]; b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + th0 = vlib_buffer_get_current (b0); + if (is_ip4) + th0 = ip4_next_header ((ip4_header_t *) th0); + else + th0 = ip6_next_header ((ip6_header_t *) th0); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1447,9 +1775,12 @@ VLIB_REGISTER_NODE (tcp4_reset_node) = { foreach_tcp4_reset_next #undef _ }, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */ +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_reset_node, tcp4_send_reset); + /* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp6_reset_node) = { .function = tcp6_send_reset, @@ -1463,9 +1794,12 @@ VLIB_REGISTER_NODE (tcp6_reset_node) = { foreach_tcp6_reset_next #undef _ }, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */ +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_reset_node, tcp6_send_reset); + /* * fd.io coding-style-patch-verification: ON *