X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_output.c;h=78148cd569511abac2919cb8c8c99b64d4072936;hb=e8ea6be8dfb626b5bb4ff3355ce8037724ce1d83;hp=d07fb2ec26e61b14b321fd31e931c7267158851d;hpb=5484daa001ccbbbf8773b273f428dbcddc4750cc;p=vpp.git diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index d07fb2ec26e..78148cd5695 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -14,7 +14,10 @@ */ #include +#include #include +#include +#include typedef enum _tcp_output_next { @@ -37,8 +40,8 @@ typedef enum _tcp_output_next _ (IP_REWRITE, "ip6-rewrite") \ _ (IP_ARP, "ip6-discover-neighbor") -static char *tcp_error_strings[] = { -#define tcp_error(n,s) s, +static vlib_error_desc_t tcp_output_error_counters[] = { +#define tcp_error(f, n, s, d) { #n, d, VL_COUNTER_SEVERITY_##s }, #include #undef tcp_error }; @@ -119,11 +122,6 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) * Figure out how much space we have available */ available_space = transport_max_rx_enqueue (&tc->connection); - if (PREDICT_FALSE (available_space < tc->rcv_opts.mss)) - { - tc->rcv_wnd = 0; - return; - } /* * Use the above and what we know about what we've previously advertised @@ -131,24 +129,22 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) */ observed_wnd = (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); - /* Bad. Thou shalt not shrink */ + /* Check if we are about to retract the window. Do the comparison before + * rounding to avoid errors. Per RFC7323 sec. 2.4 we could remove this */ if (PREDICT_FALSE ((i32) available_space < observed_wnd)) { - wnd = clib_max (observed_wnd, 0); + wnd = round_down_pow2 (clib_max (observed_wnd, 0), 1 << tc->rcv_wscale); TCP_EVT (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space); } else { - wnd = available_space; + /* Make sure we have a multiple of 1 << rcv_wscale. We round down to + * avoid advertising a window larger than what can be buffered */ + wnd = round_down_pow2 (available_space, 1 << tc->rcv_wscale); } - /* Make sure we have a multiple of rcv_wscale */ - if (wnd && tc->rcv_wscale) - { - wnd &= ~((1 << tc->rcv_wscale) - 1); - if (wnd == 0) - wnd = 1 << tc->rcv_wscale; - } + if (PREDICT_FALSE (wnd < tc->rcv_opts.mss)) + wnd = 0; tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale); } @@ -166,90 +162,6 @@ tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) return tc->rcv_wnd >> tc->rcv_wscale; } -/** - * Write TCP options to segment. - */ -static u32 -tcp_options_write (u8 * data, tcp_options_t * opts) -{ - u32 opts_len = 0; - u32 buf, seq_len = 4; - - if (tcp_opts_mss (opts)) - { - *data++ = TCP_OPTION_MSS; - *data++ = TCP_OPTION_LEN_MSS; - buf = clib_host_to_net_u16 (opts->mss); - clib_memcpy_fast (data, &buf, sizeof (opts->mss)); - data += sizeof (opts->mss); - opts_len += TCP_OPTION_LEN_MSS; - } - - if (tcp_opts_wscale (opts)) - { - *data++ = TCP_OPTION_WINDOW_SCALE; - *data++ = TCP_OPTION_LEN_WINDOW_SCALE; - *data++ = opts->wscale; - opts_len += TCP_OPTION_LEN_WINDOW_SCALE; - } - - if (tcp_opts_sack_permitted (opts)) - { - *data++ = TCP_OPTION_SACK_PERMITTED; - *data++ = TCP_OPTION_LEN_SACK_PERMITTED; - opts_len += TCP_OPTION_LEN_SACK_PERMITTED; - } - - if (tcp_opts_tstamp (opts)) - { - *data++ = TCP_OPTION_TIMESTAMP; - *data++ = TCP_OPTION_LEN_TIMESTAMP; - buf = clib_host_to_net_u32 (opts->tsval); - clib_memcpy_fast (data, &buf, sizeof (opts->tsval)); - data += sizeof (opts->tsval); - buf = clib_host_to_net_u32 (opts->tsecr); - clib_memcpy_fast (data, &buf, sizeof (opts->tsecr)); - data += sizeof (opts->tsecr); - opts_len += TCP_OPTION_LEN_TIMESTAMP; - } - - if (tcp_opts_sack (opts)) - { - int i; - - if (opts->n_sack_blocks != 0) - { - *data++ = TCP_OPTION_SACK_BLOCK; - *data++ = 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; - for (i = 0; i < opts->n_sack_blocks; i++) - { - buf = clib_host_to_net_u32 (opts->sacks[i].start); - clib_memcpy_fast (data, &buf, seq_len); - data += seq_len; - buf = clib_host_to_net_u32 (opts->sacks[i].end); - clib_memcpy_fast (data, &buf, seq_len); - data += seq_len; - } - opts_len += 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; - } - } - - /* Terminate TCP options */ - if (opts_len % 4) - { - *data++ = TCP_OPTION_EOL; - opts_len += TCP_OPTION_LEN_EOL; - } - - /* Pad with zeroes to a u32 boundary */ - while (opts_len % 4) - { - *data++ = TCP_OPTION_NOOP; - opts_len += TCP_OPTION_LEN_NOOP; - } - return opts_len; -} - static int tcp_make_syn_options (tcp_connection_t * tc, tcp_options_t * opts) { @@ -264,7 +176,7 @@ tcp_make_syn_options (tcp_connection_t * tc, tcp_options_t * opts) len += TCP_OPTION_LEN_WINDOW_SCALE; opts->flags |= TCP_OPTS_FLAG_TSTAMP; - opts->tsval = tcp_time_now (); + opts->tsval = tcp_time_tstamp (tc->c_thread_index); opts->tsecr = 0; len += TCP_OPTION_LEN_TIMESTAMP; @@ -298,7 +210,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; - opts->tsval = tcp_time_now (); + opts->tsval = tcp_time_tstamp (tc->c_thread_index); opts->tsecr = tc->tsval_recent; len += TCP_OPTION_LEN_TIMESTAMP; } @@ -409,29 +321,16 @@ tcp_update_burst_snd_vars (tcp_connection_t * tc) if (tc->snd_una == tc->snd_nxt) { tcp_cc_event (tc, TCP_CC_EVT_START_TX); - tcp_connection_tx_pacer_reset (tc, tc->cwnd, TRANSPORT_PACER_MIN_BURST); } -} -#endif /* CLIB_MARCH_VARIANT */ - -static void * -tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) -{ - if (b->flags & VLIB_BUFFER_NEXT_PRESENT) - vlib_buffer_free_one (vm, b->next_buffer); - /* Zero all flags but free list index and trace flag */ - b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1; - b->current_data = 0; - b->current_length = 0; - b->total_length_not_including_first_buffer = 0; - vnet_buffer (b)->tcp.flags = 0; - - /* Leave enough space for headers */ - return vlib_buffer_make_headroom (b, TRANSPORT_MAX_HDRS_LEN); + if (tc->flags & TCP_CONN_PSH_PENDING) + { + u32 max_deq = transport_max_tx_dequeue (&tc->connection); + /* Last byte marked for push */ + tc->psh_seq = tc->snd_una + max_deq - 1; + } } -#ifndef CLIB_MARCH_VARIANT static void * tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b) { @@ -440,12 +339,10 @@ tcp_init_buffer (vlib_main_t * vm, vlib_buffer_t * b) b->total_length_not_including_first_buffer = 0; b->current_data = 0; vnet_buffer (b)->tcp.flags = 0; - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b); /* Leave enough space for headers */ return vlib_buffer_make_headroom (b, TRANSPORT_MAX_HDRS_LEN); } - /* Compute TCP checksum in software when offloading is disabled for a connection */ u16 ip6_tcp_compute_checksum_custom (vlib_main_t * vm, vlib_buffer_t * p0, @@ -511,7 +408,7 @@ tcp_compute_checksum (tcp_connection_t * tc, vlib_buffer_t * b) } else { - b->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + vnet_buffer_offload_flags_set (b, VNET_BUFFER_OFFLOAD_F_TCP_CKSUM); } return checksum; } @@ -523,7 +420,7 @@ static inline void tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, u8 flags) { - tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + tcp_options_t _snd_opts = {}, *snd_opts = &_snd_opts; u8 tcp_opts_len, tcp_hdr_opts_len; tcp_header_t *th; u16 wnd; @@ -544,7 +441,10 @@ tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, vnet_buffer (b)->tcp.connection_index = tc->c_c_index; if (wnd == 0) - tcp_zero_rwnd_sent_on (tc); + { + transport_rx_fifo_req_deq_ntf (&tc->connection); + tcp_zero_rwnd_sent_on (tc); + } else tcp_zero_rwnd_sent_off (tc); } @@ -563,7 +463,7 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) /** * Convert buffer to FIN-ACK */ -void +static void tcp_make_fin (tcp_connection_t * tc, vlib_buffer_t * b) { tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK); @@ -598,7 +498,7 @@ tcp_make_syn (tcp_connection_t * tc, vlib_buffer_t * b) /** * Convert buffer to SYN-ACK */ -void +static void tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) { tcp_options_t _snd_opts, *snd_opts = &_snd_opts; @@ -621,25 +521,19 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) } static void -tcp_enqueue_to_ip_lookup (tcp_worker_ctx_t * wrk, vlib_buffer_t * b, u32 bi, - u8 is_ip4, u32 fib_index) +tcp_enqueue_half_open (tcp_worker_ctx_t *wrk, tcp_connection_t *tc, + vlib_buffer_t *b, u32 bi) { - tcp_main_t *tm = &tcp_main; vlib_main_t *vm = wrk->vm; b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b->error = 0; - vnet_buffer (b)->sw_if_index[VLIB_TX] = fib_index; - vnet_buffer (b)->sw_if_index[VLIB_RX] = 0; - - tcp_trajectory_add_start (b, 1); - session_add_pending_tx_buffer (vm->thread_index, bi, - tm->ipl_next_node[!is_ip4]); + wrk->tco_next_node[!tc->c_is_ip4]); if (vm->thread_index == 0 && vlib_num_workers ()) - session_queue_run_on_main_thread (wrk->vm); + session_queue_run_on_main_thread (vm); } static void @@ -653,24 +547,24 @@ tcp_enqueue_to_output (tcp_worker_ctx_t * wrk, vlib_buffer_t * b, u32 bi, wrk->tco_next_node[!is_ip4]); } -#endif /* CLIB_MARCH_VARIANT */ - -static int -tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b, u8 is_ip4) +int +tcp_buffer_make_reset (vlib_main_t *vm, vlib_buffer_t *b, u8 is_ip4) { - ip4_header_t *ih4; - ip6_header_t *ih6; - tcp_header_t *th; - ip4_address_t src_ip4, dst_ip4; + ip4_address_t src_ip4 = {}, dst_ip4 = {}; ip6_address_t src_ip6, dst_ip6; u16 src_port, dst_port; u32 tmp, len, seq, ack; + ip4_header_t *ih4; + ip6_header_t *ih6; + tcp_header_t *th; u8 flags; - /* Find IP and TCP headers */ + /* + * Find IP and TCP headers and glean information from them. Assumes + * buffer was parsed by something like @ref tcp_input_lookup_buffer + */ th = tcp_buffer_hdr (b); - /* Save src and dst ip */ if (is_ip4) { ih4 = vlib_buffer_get_current (b); @@ -710,8 +604,23 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b, u8 is_ip4) seq = 0; } - tcp_reuse_buffer (vm, b); - tcp_trajectory_add_start (b, 4); + /* + * Clear and reuse current buffer for reset + */ + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + vlib_buffer_free_one (vm, b->next_buffer); + + /* Zero all flags but free list index and trace flag */ + b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1; + /* Make sure new tcp header comes after current ip */ + b->current_data = ((u8 *) th - b->data) + sizeof (tcp_header_t); + b->current_length = 0; + b->total_length_not_including_first_buffer = 0; + vnet_buffer (b)->tcp.flags = 0; + + /* + * Add TCP and IP headers + */ th = vlib_buffer_push_tcp_net_order (b, dst_port, src_port, seq, ack, sizeof (tcp_header_t), flags, 0); @@ -732,7 +641,6 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b, u8 is_ip4) return 0; } -#ifndef CLIB_MARCH_VARIANT /** * Send reset without reusing existing buffer * @@ -745,22 +653,21 @@ tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index); vlib_main_t *vm = wrk->vm; vlib_buffer_t *b; - u32 bi, sw_if_index, fib_index; u8 tcp_hdr_len, flags = 0; tcp_header_t *th, *pkt_th; - u32 seq, ack; - ip4_header_t *ih4, *pkt_ih4; - ip6_header_t *ih6, *pkt_ih6; - fib_protocol_t fib_proto; + u32 seq, ack, bi; + ip4_header_t *pkt_ih4; + ip6_header_t *pkt_ih6; if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) - return; + { + tcp_worker_stats_inc (wrk, no_buffer, 1); + return; + } b = vlib_get_buffer (vm, bi); - sw_if_index = vnet_buffer (pkt)->sw_if_index[VLIB_RX]; - fib_proto = is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6; - fib_index = fib_table_get_index_for_sw_if_index (fib_proto, sw_if_index); tcp_init_buffer (vm, b); + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; /* Make and write options */ tcp_hdr_len = sizeof (tcp_header_t); @@ -781,6 +688,7 @@ tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, flags = TCP_FLAG_RST; seq = pkt_th->ack_number; ack = (tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0; + ack = clib_host_to_net_u32 (ack); } else { @@ -791,30 +699,9 @@ tcp_send_reset_w_pkt (tcp_connection_t * tc, vlib_buffer_t * pkt, th = vlib_buffer_push_tcp_net_order (b, pkt_th->dst_port, pkt_th->src_port, seq, ack, tcp_hdr_len, flags, 0); + th->checksum = tcp_compute_checksum (tc, b); - /* Swap src and dst ip */ - if (is_ip4) - { - ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40); - ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address, - &pkt_ih4->src_address, IP_PROTOCOL_TCP, - tcp_csum_offload (tc)); - th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4); - } - else - { - int bogus = ~0; - ASSERT ((pkt_ih6->ip_version_traffic_class_and_flow_label & 0xF0) == - 0x60); - ih6 = vlib_buffer_push_ip6_custom (vm, b, &pkt_ih6->dst_address, - &pkt_ih6->src_address, - IP_PROTOCOL_TCP, - tc->ipv6_flow_label); - th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus); - ASSERT (!bogus); - } - - tcp_enqueue_to_ip_lookup (wrk, b, bi, is_ip4, fib_index); + tcp_enqueue_half_open (wrk, tc, b, bi); TCP_EVT (TCP_EVT_RST_SENT, tc); vlib_node_increment_counter (vm, tcp_node_index (output, tc->c_is_ip4), TCP_ERROR_RST_SENT, 1); @@ -835,14 +722,17 @@ tcp_send_reset (tcp_connection_t * tc) u8 flags; if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) - return; + { + tcp_worker_stats_inc (wrk, no_buffer, 1); + return; + } b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); advertise_wnd = tc->rcv_wnd >> tc->rcv_wscale; - flags = TCP_FLAG_RST; + flags = TCP_FLAG_RST | TCP_FLAG_ACK; th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); @@ -856,28 +746,13 @@ tcp_send_reset (tcp_connection_t * tc) TCP_ERROR_RST_SENT, 1); } -static void -tcp_push_ip_hdr (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, - vlib_buffer_t * b) -{ - if (tc->c_is_ip4) - { - vlib_buffer_push_ip4 (wrk->vm, b, &tc->c_lcl_ip4, &tc->c_rmt_ip4, - IP_PROTOCOL_TCP, tcp_csum_offload (tc)); - } - else - { - vlib_buffer_push_ip6_custom (wrk->vm, b, &tc->c_lcl_ip6, &tc->c_rmt_ip6, - IP_PROTOCOL_TCP, tc->ipv6_flow_label); - } -} - /** * Send SYN * - * Builds a SYN packet for a half-open connection and sends it to ipx_lookup. - * The packet is not forwarded through tcpx_output to avoid doing lookups - * in the half_open pool. + * Builds a SYN packet for a half-open connection and sends it to tcp-output. + * The packet is handled by main thread and because half-open and established + * connections use the same pool the connection can be retrieved without + * additional logic. */ void tcp_send_syn (tcp_connection_t * tc) @@ -891,12 +766,14 @@ tcp_send_syn (tcp_connection_t * tc) * Setup retransmit and establish timers before requesting buffer * such that we can return if we've ran out. */ - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, - tc->rto * TCP_TO_TIMER_TICK); + tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN, + (u32) tc->rto * TCP_TO_TIMER_TICK); if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) { - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, 1); + tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN, + tcp_cfg.alloc_err_timeout); + tcp_worker_stats_inc (wrk, no_buffer, 1); return; } @@ -909,8 +786,7 @@ tcp_send_syn (tcp_connection_t * tc) tc->rtt_seq = tc->snd_nxt; tc->rto_boff = 0; - tcp_push_ip_hdr (wrk, tc, b); - tcp_enqueue_to_ip_lookup (wrk, b, bi, tc->c_is_ip4, tc->c_fib_index); + tcp_enqueue_half_open (wrk, tc, b, bi); TCP_EVT (TCP_EVT_SYN_SENT, tc); } @@ -922,11 +798,14 @@ tcp_send_synack (tcp_connection_t * tc) vlib_buffer_t *b; u32 bi; - tcp_retransmit_timer_force_update (tc); + ASSERT (tc->snd_una != tc->snd_nxt); + tcp_retransmit_timer_update (&wrk->timer_wheel, tc); if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) { - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); + tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT, + tcp_cfg.alloc_err_timeout); + tcp_worker_stats_inc (wrk, no_buffer, 1); return; } @@ -957,12 +836,13 @@ tcp_send_fin (tcp_connection_t * tc) if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) { /* Out of buffers so program fin retransmit ASAP */ - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); - if (fin_snt) - tc->snd_nxt += 1; - else - /* Make sure retransmit retries a fin not data */ + tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT, + tcp_cfg.alloc_err_timeout); + tc->snd_nxt += 1; + /* Make sure retransmit retries a fin not data with right snd_nxt */ + if (!fin_snt) tc->flags |= TCP_CONN_FINSNT; + tcp_worker_stats_inc (wrk, no_buffer, 1); return; } @@ -970,7 +850,6 @@ tcp_send_fin (tcp_connection_t * tc) if ((tc->flags & TCP_CONN_SNDACK) && !tc->pending_dupacks) tc->flags &= ~TCP_CONN_SNDACK; - tcp_retransmit_timer_force_update (tc); b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); tcp_make_fin (tc, b); @@ -978,11 +857,11 @@ tcp_send_fin (tcp_connection_t * tc) TCP_EVT (TCP_EVT_FIN_SENT, tc); /* Account for the FIN */ tc->snd_nxt += 1; + tcp_retransmit_timer_update (&wrk->timer_wheel, tc); if (!fin_snt) { tc->flags |= TCP_CONN_FINSNT; tc->flags &= ~TCP_CONN_FINPNDG; - tc->snd_una_max = seq_max (tc->snd_una_max, tc->snd_nxt); } } @@ -1063,19 +942,47 @@ tcp_buffer_len (vlib_buffer_t * b) return data_len; } -u32 -tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b) +always_inline u32 +tcp_push_one_header (tcp_connection_t *tc, vlib_buffer_t *b) { - tcp_connection_t *tc = (tcp_connection_t *) tconn; - if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE) tcp_bt_track_tx (tc, tcp_buffer_len (b)); tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1, /* update_snd_nxt */ 1); - tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max); - tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + tcp_validate_txf_size (tc, tc->snd_nxt - tc->snd_una); + return 0; +} + +u32 +tcp_session_push_header (transport_connection_t *tconn, vlib_buffer_t **bs, + u32 n_bufs) +{ + tcp_connection_t *tc = (tcp_connection_t *) tconn; + + while (n_bufs >= 4) + { + vlib_prefetch_buffer_header (bs[2], STORE); + vlib_prefetch_buffer_header (bs[3], STORE); + + tcp_push_one_header (tc, bs[0]); + tcp_push_one_header (tc, bs[1]); + + n_bufs -= 2; + bs += 2; + } + while (n_bufs) + { + if (n_bufs > 1) + vlib_prefetch_buffer_header (bs[1], STORE); + + tcp_push_one_header (tc, bs[0]); + + n_bufs -= 1; + bs += 1; + } + /* If not tracking an ACK, start tracking */ if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) { @@ -1084,10 +991,10 @@ tcp_session_push_header (transport_connection_t * tconn, vlib_buffer_t * b) } if (PREDICT_FALSE (!tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))) { - tcp_retransmit_timer_set (tc); + tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index); + tcp_retransmit_timer_set (&wrk->timer_wheel, tc); tc->rto_boff = 0; } - tcp_trajectory_add_start (b, 3); return 0; } @@ -1102,6 +1009,7 @@ tcp_send_ack (tcp_connection_t * tc) if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) { tcp_update_rcv_wnd (tc); + tcp_worker_stats_inc (wrk, no_buffer, 1); return; } b = vlib_get_buffer (vm, bi); @@ -1142,17 +1050,6 @@ tcp_program_retransmit (tcp_connection_t * tc) } } -/** - * Delayed ack timer handler - * - * Sends delayed ACK when timer expires - */ -void -tcp_timer_delack_handler (tcp_connection_t * tc) -{ - tcp_send_ack (tc); -} - /** * Send window update ack * @@ -1211,12 +1108,15 @@ tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, if (PREDICT_TRUE (seg_size <= bytes_per_buffer)) { if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) - return 0; + { + tcp_worker_stats_inc (wrk, no_buffer, 1); + return 0; + } *b = vlib_get_buffer (vm, bi); data = tcp_init_buffer (vm, *b); n_bytes = session_tx_fifo_peek_bytes (&tc->connection, data, offset, max_deq_bytes); - ASSERT (n_bytes == max_deq_bytes); + ASSERT (n_bytes > 0); b[0]->current_length = n_bytes; tcp_push_hdr_i (tc, *b, tc->snd_una + offset, /* compute opts */ 0, /* burst */ 0, /* update_snd_nxt */ 0); @@ -1238,6 +1138,7 @@ tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, { if (n_bufs) vlib_buffer_free (vm, wrk->tx_buffers, n_bufs); + tcp_worker_stats_inc (wrk, no_buffer, 1); return 0; } @@ -1335,7 +1236,7 @@ tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk, tc->bytes_retrans += n_bytes; tc->segs_retrans += 1; - tcp_workerp_stats_inc (wrk, rxt_segs, 1); + tcp_worker_stats_inc (wrk, rxt_segs, 1); TCP_EVT (TCP_EVT_CC_RTX, tc, offset, n_bytes); return n_bytes; @@ -1376,6 +1277,8 @@ tcp_cc_init_rxt_timeout (tcp_connection_t * tc) tc->rtt_ts = 0; tc->cwnd_acc_bytes = 0; tc->tr_occurences += 1; + tc->sack_sb.reorder = TCP_DUPACK_THRESHOLD; + tc->sack_sb.rescue_rxt = tc->snd_una - 1; tcp_recovery_on (tc); } @@ -1387,7 +1290,7 @@ tcp_timer_retransmit_handler (tcp_connection_t * tc) vlib_buffer_t *b = 0; u32 bi, n_bytes; - tcp_workerp_stats_inc (wrk, tr_events, 1); + tcp_worker_stats_inc (wrk, tr_events, 1); /* Should be handled by a different handler */ if (PREDICT_FALSE (tc->state == TCP_STATE_SYN_SENT)) @@ -1410,11 +1313,8 @@ tcp_timer_retransmit_handler (tcp_connection_t * tc) return; } - /* Shouldn't be here. This condition is tricky because it has to take - * into account boff > 0 due to persist timeout. */ - if ((tc->rto_boff == 0 && tc->snd_una == tc->snd_nxt) - || (tc->rto_boff > 0 && seq_geq (tc->snd_una, tc->snd_congestion) - && !tcp_flight_size (tc))) + /* Shouldn't be here */ + if (tc->snd_una == tc->snd_nxt) { ASSERT (!tcp_in_recovery (tc)); tc->rto_boff = 0; @@ -1440,12 +1340,15 @@ tcp_timer_retransmit_handler (tcp_connection_t * tc) session_transport_closed_notify (&tc->connection); tcp_connection_timers_reset (tc); tcp_program_cleanup (wrk, tc); - tcp_workerp_stats_inc (wrk, tr_abort, 1); + tcp_worker_stats_inc (wrk, tr_abort, 1); return; } if (tcp_opts_sack_permitted (&tc->rcv_opts)) - tcp_check_sack_reneging (tc); + { + tcp_check_sack_reneging (tc); + scoreboard_rxt_mark_lost (&tc->sack_sb, tc->snd_una, tc->snd_nxt); + } /* Update send congestion to make sure that rxt has data to send */ tc->snd_congestion = tc->snd_nxt; @@ -1456,7 +1359,8 @@ tcp_timer_retransmit_handler (tcp_connection_t * tc) n_bytes = tcp_prepare_retransmit_segment (wrk, tc, 0, n_bytes, &b); if (!n_bytes) { - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); + tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT, + tcp_cfg.alloc_err_timeout); return; } @@ -1464,7 +1368,7 @@ tcp_timer_retransmit_handler (tcp_connection_t * tc) tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - tcp_retransmit_timer_force_update (tc); + tcp_retransmit_timer_update (&wrk->timer_wheel, tc); tc->rto_boff += 1; if (tc->rto_boff == 1) @@ -1492,13 +1396,15 @@ tcp_timer_retransmit_handler (tcp_connection_t * tc) tcp_connection_set_state (tc, TCP_STATE_CLOSED); tcp_connection_timers_reset (tc); tcp_program_cleanup (wrk, tc); - tcp_workerp_stats_inc (wrk, tr_abort, 1); + tcp_worker_stats_inc (wrk, tr_abort, 1); return; } if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) { - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); + tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT, + tcp_cfg.alloc_err_timeout); + tcp_worker_stats_inc (wrk, no_buffer, 1); return; } @@ -1506,7 +1412,8 @@ tcp_timer_retransmit_handler (tcp_connection_t * tc) if (tc->rto_boff > TCP_RTO_SYN_RETRIES) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - tcp_retransmit_timer_force_update (tc); + ASSERT (tc->snd_una != tc->snd_nxt); + tcp_retransmit_timer_update (&wrk->timer_wheel, tc); b = vlib_get_buffer (vm, bi); tcp_init_buffer (vm, b); @@ -1554,14 +1461,16 @@ tcp_timer_retransmit_syn_handler (tcp_connection_t * tc) /* Active open establish timeout */ if (tc->rto >= TCP_ESTABLISH_TIME >> 1) { - session_stream_connect_notify (&tc->connection, 1 /* fail */ ); + session_stream_connect_notify (&tc->connection, SESSION_E_TIMEDOUT); tcp_connection_cleanup (tc); return; } if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) { - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, 1); + tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN, + tcp_cfg.alloc_err_timeout); + tcp_worker_stats_inc (wrk, no_buffer, 1); return; } @@ -1577,12 +1486,10 @@ tcp_timer_retransmit_syn_handler (tcp_connection_t * tc) TCP_EVT (TCP_EVT_SYN_RXT, tc, 0); - /* This goes straight to ipx_lookup */ - tcp_push_ip_hdr (wrk, tc, b); - tcp_enqueue_to_ip_lookup (wrk, b, bi, tc->c_is_ip4, tc->c_fib_index); + tcp_enqueue_half_open (wrk, tc, b, bi); - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, - tc->rto * TCP_TO_TIMER_TICK); + tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN, + (u32) tc->rto * TCP_TO_TIMER_TICK); } /** @@ -1612,7 +1519,7 @@ tcp_timer_persist_handler (tcp_connection_t * tc) * next time */ if (!available_bytes) { - tcp_persist_timer_set (tc); + tcp_persist_timer_set (&wrk->timer_wheel, tc); return; } @@ -1628,7 +1535,8 @@ tcp_timer_persist_handler (tcp_connection_t * tc) */ if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1))) { - tcp_persist_timer_set (tc); + tcp_persist_timer_set (&wrk->timer_wheel, tc); + tcp_worker_stats_inc (wrk, no_buffer, 1); return; } @@ -1637,13 +1545,15 @@ tcp_timer_persist_handler (tcp_connection_t * tc) tcp_validate_txf_size (tc, offset); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - max_snd_bytes = clib_min (tc->snd_mss, + max_snd_bytes = clib_min (clib_min (tc->snd_mss, available_bytes), tm->bytes_per_buffer - TRANSPORT_MAX_HDRS_LEN); + if (tc->snd_wnd > 0) + max_snd_bytes = clib_min (tc->snd_wnd, max_snd_bytes); n_bytes = session_tx_fifo_peek_bytes (&tc->connection, data, offset, max_snd_bytes); b->current_length = n_bytes; ASSERT (n_bytes != 0 && (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT) - || tc->snd_nxt == tc->snd_una_max + || tc->snd_una == tc->snd_nxt || tc->rto_boff > 1)); if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE) @@ -1654,12 +1564,11 @@ tcp_timer_persist_handler (tcp_connection_t * tc) tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 0, /* update_snd_nxt */ 1); - tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max); - tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); + tcp_validate_txf_size (tc, tc->snd_nxt - tc->snd_una); tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); /* Just sent new data, enable retransmit */ - tcp_retransmit_timer_update (tc); + tcp_retransmit_timer_update (&wrk->timer_wheel, tc); return; @@ -1721,7 +1630,6 @@ tcp_transmit_unsent (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, tcp_bt_track_tx (tc, n_written); tc->snd_nxt += n_written; - tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max); } done: @@ -1821,7 +1729,7 @@ tcp_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, && tc->rxt_head != tc->snd_una && tcp_retransmit_should_retry_head (tc, sb)) { - max_bytes = clib_min (tc->snd_mss, tc->snd_congestion - tc->snd_una); + max_bytes = clib_min (tc->snd_mss, tc->snd_nxt - tc->snd_una); n_written = tcp_prepare_retransmit_segment (wrk, tc, 0, max_bytes, &b); if (!n_written) { @@ -1853,7 +1761,7 @@ tcp_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, if (!hole) { /* We are out of lost holes to retransmit so send some new data. */ - if (max_deq > tc->snd_mss) + if (max_deq) { u32 n_segs_new; int av_wnd; @@ -1863,7 +1771,10 @@ tcp_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, av_wnd = (int) tc->snd_wnd - (tc->snd_nxt - tc->snd_una); av_wnd = clib_max (av_wnd - tc->snd_mss, 0); snd_space = clib_min (snd_space, av_wnd); - snd_space = clib_min (max_deq, snd_space); + /* Low bound max_deq to mss to be able to send a segment even + * when it is less than mss */ + snd_space = + clib_min (clib_max (max_deq, tc->snd_mss), snd_space); burst_size = clib_min (burst_size - n_segs, snd_space / tc->snd_mss); burst_size = clib_min (burst_size, TCP_RXT_MAX_BURST); @@ -1875,8 +1786,7 @@ tcp_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, goto done; } - if (tcp_in_recovery (tc) || !can_rescue - || scoreboard_rescue_rxt_valid (sb, tc)) + if (!can_rescue || scoreboard_rescue_rxt_valid (sb, tc)) break; /* If rescue rxt undefined or less than snd_una then one segment of @@ -1900,7 +1810,11 @@ tcp_retransmit_sack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, break; } - max_bytes = clib_min (hole->end - sb->high_rxt, snd_space); + max_bytes = hole->end - sb->high_rxt; + /* Avoid retransmitting segment less than mss if possible */ + if (snd_space < tc->snd_mss && max_bytes > snd_space) + break; + max_bytes = clib_min (max_bytes, snd_space); max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes; if (max_bytes == 0) break; @@ -2092,7 +2006,7 @@ tcp_do_retransmit (tcp_connection_t * tc, u32 max_burst_size) } int -tcp_session_custom_tx (void *conn, u32 max_burst_size) +tcp_session_custom_tx (void *conn, transport_send_params_t * sp) { tcp_connection_t *tc = (tcp_connection_t *) conn; u32 n_segs = 0; @@ -2100,8 +2014,7 @@ tcp_session_custom_tx (void *conn, u32 max_burst_size) if (tcp_in_cong_recovery (tc) && (tc->flags & TCP_CONN_RXT_PENDING)) { tc->flags &= ~TCP_CONN_RXT_PENDING; - n_segs = tcp_do_retransmit (tc, max_burst_size); - max_burst_size -= n_segs; + n_segs = tcp_do_retransmit (tc, sp->max_burst_size); } if (!(tc->flags & TCP_CONN_SNDACK)) @@ -2113,13 +2026,13 @@ tcp_session_custom_tx (void *conn, u32 max_burst_size) if (n_segs && !tc->pending_dupacks) return n_segs; - if (!max_burst_size) + if (sp->max_burst_size <= n_segs) { tcp_program_ack (tc); - return max_burst_size; + return n_segs; } - n_segs += tcp_send_acks (tc, max_burst_size); + n_segs += tcp_send_acks (tc, sp->max_burst_size - n_segs); return n_segs; } @@ -2238,7 +2151,7 @@ tcp_output_handle_packet (tcp_connection_t * tc0, vlib_buffer_t * b0, } vnet_buffer (b0)->sw_if_index[VLIB_TX] = tc0->c_fib_index; - vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_RX] = tc0->sw_if_index; if (!is_ip4) { @@ -2254,9 +2167,6 @@ tcp_output_handle_packet (tcp_connection_t * tc0, vlib_buffer_t * b0, } } - if (!TCP_ALWAYS_ACK) - tcp_timer_reset (tc0, TCP_TIMER_DELACK); - tc0->segs_out += 1; } @@ -2267,13 +2177,11 @@ tcp46_output_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_left_from, *from, thread_index = vm->thread_index; vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; u16 nexts[VLIB_FRAME_SIZE], *next; - vlib_node_runtime_t *error_node; - - error_node = vlib_node_get_runtime (vm, tcp_node_index (output, is_ip4)); + u16 err_counters[TCP_N_ERROR] = { 0 }; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; - tcp_set_time_now (tcp_get_worker (thread_index)); + tcp_update_time_now (tcp_get_worker (thread_index)); if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) tcp46_output_trace_frame (vm, node, from, n_left_from); @@ -2307,8 +2215,8 @@ tcp46_output_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_check_if_gso (tc0, b[0]); tcp_check_if_gso (tc1, b[1]); - tcp_output_handle_packet (tc0, b[0], error_node, &next[0], is_ip4); - tcp_output_handle_packet (tc1, b[1], error_node, &next[1], is_ip4); + tcp_output_handle_packet (tc0, b[0], node, &next[0], is_ip4); + tcp_output_handle_packet (tc1, b[1], node, &next[1], is_ip4); } else { @@ -2316,24 +2224,24 @@ tcp46_output_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_output_push_ip (vm, b[0], tc0, is_ip4); tcp_check_if_gso (tc0, b[0]); - tcp_output_handle_packet (tc0, b[0], error_node, &next[0], - is_ip4); + tcp_output_handle_packet (tc0, b[0], node, &next[0], is_ip4); } else { - b[0]->error = error_node->errors[TCP_ERROR_INVALID_CONNECTION]; + tcp_inc_err_counter (err_counters, TCP_ERROR_INVALID_CONNECTION, + 1); next[0] = TCP_OUTPUT_NEXT_DROP; } if (tc1 != 0) { tcp_output_push_ip (vm, b[1], tc1, is_ip4); tcp_check_if_gso (tc1, b[1]); - tcp_output_handle_packet (tc1, b[1], error_node, &next[1], - is_ip4); + tcp_output_handle_packet (tc1, b[1], node, &next[1], is_ip4); } else { - b[1]->error = error_node->errors[TCP_ERROR_INVALID_CONNECTION]; + tcp_inc_err_counter (err_counters, TCP_ERROR_INVALID_CONNECTION, + 1); next[1] = TCP_OUTPUT_NEXT_DROP; } } @@ -2359,11 +2267,11 @@ tcp46_output_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_output_push_ip (vm, b[0], tc0, is_ip4); tcp_check_if_gso (tc0, b[0]); - tcp_output_handle_packet (tc0, b[0], error_node, &next[0], is_ip4); + tcp_output_handle_packet (tc0, b[0], node, &next[0], is_ip4); } else { - b[0]->error = error_node->errors[TCP_ERROR_INVALID_CONNECTION]; + tcp_inc_err_counter (err_counters, TCP_ERROR_INVALID_CONNECTION, 1); next[0] = TCP_OUTPUT_NEXT_DROP; } @@ -2372,6 +2280,7 @@ tcp46_output_inline (vlib_main_t * vm, vlib_node_runtime_t * node, n_left_from -= 1; } + tcp_store_err_counters (output, err_counters); vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors); vlib_node_increment_counter (vm, tcp_node_index (output, is_ip4), TCP_ERROR_PKTS_SENT, frame->n_vectors); @@ -2390,7 +2299,6 @@ VLIB_NODE_FN (tcp6_output_node) (vlib_main_t * vm, vlib_node_runtime_t * node, return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ ); } -/* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp4_output_node) = { .name = "tcp4-output", @@ -2398,7 +2306,7 @@ VLIB_REGISTER_NODE (tcp4_output_node) = .vector_size = sizeof (u32), .n_errors = TCP_N_ERROR, .protocol_hint = VLIB_NODE_PROTO_HINT_TCP, - .error_strings = tcp_error_strings, + .error_counters = tcp_output_error_counters, .n_next_nodes = TCP_OUTPUT_N_NEXT, .next_nodes = { #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, @@ -2408,9 +2316,7 @@ VLIB_REGISTER_NODE (tcp4_output_node) = .format_buffer = format_tcp_header, .format_trace = format_tcp_tx_trace, }; -/* *INDENT-ON* */ -/* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp6_output_node) = { .name = "tcp6-output", @@ -2418,7 +2324,7 @@ VLIB_REGISTER_NODE (tcp6_output_node) = .vector_size = sizeof (u32), .n_errors = TCP_N_ERROR, .protocol_hint = VLIB_NODE_PROTO_HINT_TCP, - .error_strings = tcp_error_strings, + .error_counters = tcp_output_error_counters, .n_next_nodes = TCP_OUTPUT_N_NEXT, .next_nodes = { #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, @@ -2428,7 +2334,6 @@ VLIB_REGISTER_NODE (tcp6_output_node) = .format_buffer = format_tcp_header, .format_trace = format_tcp_tx_trace, }; -/* *INDENT-ON* */ typedef enum _tcp_reset_next { @@ -2445,84 +2350,105 @@ typedef enum _tcp_reset_next _(DROP, "error-drop") \ _(IP_LOOKUP, "ip6-lookup") +static void +tcp_reset_trace_frame (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_buffer_t **bs, u32 n_bufs, u8 is_ip4) +{ + tcp_header_t *tcp; + tcp_tx_trace_t *t; + int i; + + for (i = 0; i < n_bufs; i++) + { + if (bs[i]->flags & VLIB_BUFFER_IS_TRACED) + { + tcp = vlib_buffer_get_current (bs[i]); + t = vlib_add_trace (vm, node, bs[i], sizeof (*t)); + + if (is_ip4) + { + ip4_header_t *ih4 = vlib_buffer_get_current (bs[i]); + tcp = ip4_next_header (ih4); + t->tcp_connection.c_lcl_ip.ip4 = ih4->dst_address; + t->tcp_connection.c_rmt_ip.ip4 = ih4->src_address; + t->tcp_connection.c_is_ip4 = 1; + } + else + { + ip6_header_t *ih6 = vlib_buffer_get_current (bs[i]); + tcp = ip6_next_header (ih6); + t->tcp_connection.c_lcl_ip.ip6 = ih6->dst_address; + t->tcp_connection.c_rmt_ip.ip6 = ih6->src_address; + } + t->tcp_connection.c_lcl_port = tcp->dst_port; + t->tcp_connection.c_rmt_port = tcp->src_port; + t->tcp_connection.c_proto = TRANSPORT_PROTO_TCP; + clib_memcpy_fast (&t->tcp_header, tcp, sizeof (t->tcp_header)); + } + } +} + static uword -tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * from_frame, u8 is_ip4) +tcp46_reset_inline (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame, u8 is_ip4) { - u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; - u32 n_left_from, next_index, *from, *to_next; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b; + u16 nexts[VLIB_FRAME_SIZE], *next; + u32 n_left_from, *from; - from = vlib_frame_vector_args (from_frame); - n_left_from = from_frame->n_vectors; + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + vlib_get_buffers (vm, from, bufs, n_left_from); - next_index = node->cached_next_index; + b = bufs; + next = nexts; while (n_left_from > 0) { - u32 n_left_to_next; + tcp_buffer_make_reset (vm, b[0], is_ip4); - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + /* IP lookup in fib where it was received. Previous value + * was overwritten by tcp-input */ + vnet_buffer (b[0])->sw_if_index[VLIB_TX] = + vec_elt (ip4_main.fib_index_by_sw_if_index, + vnet_buffer (b[0])->sw_if_index[VLIB_RX]); - while (n_left_from > 0 && n_left_to_next > 0) - { - vlib_buffer_t *b0; - tcp_tx_trace_t *t0; - tcp_header_t *th0; - u32 bi0; - - bi0 = from[0]; - to_next[0] = bi0; - from += 1; - to_next += 1; - n_left_from -= 1; - n_left_to_next -= 1; - - b0 = vlib_get_buffer (vm, bi0); - tcp_make_reset_in_place (vm, b0, is_ip4); - - /* Prepare to send to IP lookup */ - vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; - - b0->error = node->errors[error0]; - b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; - if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) - { - th0 = vlib_buffer_get_current (b0); - if (is_ip4) - th0 = ip4_next_header ((ip4_header_t *) th0); - else - th0 = ip6_next_header ((ip6_header_t *) th0); - t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy_fast (&t0->tcp_header, th0, - sizeof (t0->tcp_header)); - } + b[0]->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; + next[0] = TCP_RESET_NEXT_IP_LOOKUP; - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, - n_left_to_next, bi0, next0); - } - vlib_put_next_frame (vm, node, next_index, n_left_to_next); + b += 1; + next += 1; + n_left_from -= 1; } - return from_frame->n_vectors; + + if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) + tcp_reset_trace_frame (vm, node, bufs, frame->n_vectors, is_ip4); + + vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors); + + vlib_node_increment_counter (vm, node->node_index, TCP_ERROR_RST_SENT, + frame->n_vectors); + + return frame->n_vectors; } VLIB_NODE_FN (tcp4_reset_node) (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame) { - return tcp46_send_reset_inline (vm, node, from_frame, 1); + return tcp46_reset_inline (vm, node, from_frame, 1); } VLIB_NODE_FN (tcp6_reset_node) (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame) { - return tcp46_send_reset_inline (vm, node, from_frame, 0); + return tcp46_reset_inline (vm, node, from_frame, 0); } -/* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp4_reset_node) = { .name = "tcp4-reset", .vector_size = sizeof (u32), .n_errors = TCP_N_ERROR, - .error_strings = tcp_error_strings, + .error_counters = tcp_output_error_counters, .n_next_nodes = TCP_RESET_N_NEXT, .next_nodes = { #define _(s,n) [TCP_RESET_NEXT_##s] = n, @@ -2531,14 +2457,12 @@ VLIB_REGISTER_NODE (tcp4_reset_node) = { }, .format_trace = format_tcp_tx_trace, }; -/* *INDENT-ON* */ -/* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp6_reset_node) = { .name = "tcp6-reset", .vector_size = sizeof (u32), .n_errors = TCP_N_ERROR, - .error_strings = tcp_error_strings, + .error_counters = tcp_output_error_counters, .n_next_nodes = TCP_RESET_N_NEXT, .next_nodes = { #define _(s,n) [TCP_RESET_NEXT_##s] = n, @@ -2547,7 +2471,6 @@ VLIB_REGISTER_NODE (tcp6_reset_node) = { }, .format_trace = format_tcp_tx_trace, }; -/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON