X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_output.c;h=97f5b81f9e81daeea2b463dc63c38375efaeab2d;hb=7c03ed4;hp=b15cf9b362b7ef6510e47a880645edd67d7bd7d0;hpb=36ee9f1ca37daf277c2cd8d33bf16eabc15773e5;p=vpp.git diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index b15cf9b362b..97f5b81f9e8 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -126,7 +126,10 @@ tcp_initial_window_to_advertise (tcp_connection_t * tc) * scale to be computed in the same way */ max_fifo = tm->max_rx_fifo ? tm->max_rx_fifo : TCP_MAX_RX_FIFO_SIZE; - tc->rcv_wscale = tcp_window_compute_scale (max_fifo); + /* Compute rcv wscale only if peer advertised support for it */ + if (tc->state != TCP_STATE_SYN_RCVD || tcp_opts_wscale (&tc->rcv_opts)) + tc->rcv_wscale = tcp_window_compute_scale (max_fifo); + tc->rcv_wnd = tcp_initial_wnd_unscaled (tc); return clib_min (tc->rcv_wnd, TCP_WND_MAX); @@ -215,7 +218,7 @@ tcp_options_write (u8 * data, tcp_options_t * opts) *data++ = TCP_OPTION_MSS; *data++ = TCP_OPTION_LEN_MSS; buf = clib_host_to_net_u16 (opts->mss); - clib_memcpy (data, &buf, sizeof (opts->mss)); + clib_memcpy_fast (data, &buf, sizeof (opts->mss)); data += sizeof (opts->mss); opts_len += TCP_OPTION_LEN_MSS; } @@ -240,10 +243,10 @@ tcp_options_write (u8 * data, tcp_options_t * opts) *data++ = TCP_OPTION_TIMESTAMP; *data++ = TCP_OPTION_LEN_TIMESTAMP; buf = clib_host_to_net_u32 (opts->tsval); - clib_memcpy (data, &buf, sizeof (opts->tsval)); + clib_memcpy_fast (data, &buf, sizeof (opts->tsval)); data += sizeof (opts->tsval); buf = clib_host_to_net_u32 (opts->tsecr); - clib_memcpy (data, &buf, sizeof (opts->tsecr)); + clib_memcpy_fast (data, &buf, sizeof (opts->tsecr)); data += sizeof (opts->tsecr); opts_len += TCP_OPTION_LEN_TIMESTAMP; } @@ -261,10 +264,10 @@ tcp_options_write (u8 * data, tcp_options_t * opts) for (i = 0; i < n_sack_blocks; i++) { buf = clib_host_to_net_u32 (opts->sacks[i].start); - clib_memcpy (data, &buf, seq_len); + clib_memcpy_fast (data, &buf, seq_len); data += seq_len; buf = clib_host_to_net_u32 (opts->sacks[i].end); - clib_memcpy (data, &buf, seq_len); + clib_memcpy_fast (data, &buf, seq_len); data += seq_len; } opts_len += 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; @@ -389,9 +392,13 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, switch (state) { case TCP_STATE_ESTABLISHED: + case TCP_STATE_CLOSE_WAIT: case TCP_STATE_FIN_WAIT_1: + case TCP_STATE_LAST_ACK: + case TCP_STATE_CLOSING: + case TCP_STATE_FIN_WAIT_2: + case TCP_STATE_TIME_WAIT: case TCP_STATE_CLOSED: - case TCP_STATE_CLOSE_WAIT: return tcp_make_established_options (tc, opts); case TCP_STATE_SYN_RCVD: return tcp_make_synack_options (tc, opts); @@ -444,7 +451,6 @@ tcp_init_mss (tcp_connection_t * tc) if (tc->snd_mss < 45) { - clib_warning ("snd mss is 0"); /* Assume that at least the min default mss works */ tc->snd_mss = default_min_mss; tc->rcv_opts.mss = default_min_mss; @@ -559,7 +565,6 @@ tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_reuse_buffer (vm, b); tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc); - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; tc->rcv_las = tc->rcv_nxt; } @@ -631,7 +636,6 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) tcp_options_write ((u8 *) (th + 1), snd_opts); vnet_buffer (b)->tcp.connection_index = tc->c_c_index; - vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; /* Init retransmit timer. Use update instead of set because of * retransmissions */ @@ -766,8 +770,8 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, { ih6 = vlib_buffer_get_current (b0); ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60); - clib_memcpy (&src_ip60, &ih6->src_address, sizeof (ip6_address_t)); - clib_memcpy (&dst_ip60, &ih6->dst_address, sizeof (ip6_address_t)); + clib_memcpy_fast (&src_ip60, &ih6->src_address, sizeof (ip6_address_t)); + clib_memcpy_fast (&dst_ip60, &ih6->dst_address, sizeof (ip6_address_t)); } src_port = th0->src_port; @@ -990,7 +994,7 @@ tcp_send_syn (tcp_connection_t * tc) * Setup retransmit and establish timers before requesting buffer * such that we can return if we've ran out. */ - tcp_timer_set (tc, TCP_TIMER_ESTABLISH, TCP_ESTABLISH_TIME); + tcp_timer_set (tc, TCP_TIMER_ESTABLISH_AO, TCP_ESTABLISH_TIME); tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, tc->rto * TCP_TO_TIMER_TICK); @@ -1002,7 +1006,7 @@ tcp_send_syn (tcp_connection_t * tc) tcp_make_syn (tc, b); /* Measure RTT with this */ - tc->rtt_ts = tcp_time_now (); + tc->rtt_ts = tcp_time_now_us (vlib_num_workers ()? 1 : 0); tc->rtt_seq = tc->snd_nxt; tc->rto_boff = 0; @@ -1011,6 +1015,24 @@ tcp_send_syn (tcp_connection_t * tc) TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc); } +void +tcp_send_synack (tcp_connection_t * tc) +{ + tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index); + vlib_main_t *vm = wrk->vm; + vlib_buffer_t *b; + u32 bi; + + /* Get buffer */ + if (PREDICT_FALSE (tcp_get_free_buffer_index (wrk, &bi))) + return; + + tc->rtt_ts = tcp_time_now_us (tc->c_thread_index); + b = vlib_get_buffer (vm, bi); + tcp_make_synack (tc, b); + tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); +} + /** * Flush tx frame populated by retransmits and timer pops */ @@ -1074,7 +1096,9 @@ tcp_send_fin (tcp_connection_t * tc) { /* Out of buffers so program fin retransmit ASAP */ tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1); - goto post_enqueue; + if (fin_snt) + tc->snd_nxt = tc->snd_una_max; + return; } tcp_retransmit_timer_force_update (tc); @@ -1084,7 +1108,6 @@ tcp_send_fin (tcp_connection_t * tc) tcp_enqueue_to_output_now (wrk, b, bi, tc->c_is_ip4); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); -post_enqueue: if (!fin_snt) { tc->flags |= TCP_CONN_FINSNT; @@ -1106,6 +1129,8 @@ tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state) { case TCP_STATE_ESTABLISHED: case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_TIME_WAIT: + case TCP_STATE_FIN_WAIT_2: return TCP_FLAG_ACK; case TCP_STATE_SYN_RCVD: return TCP_FLAG_SYN | TCP_FLAG_ACK; @@ -1113,6 +1138,7 @@ tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state) return TCP_FLAG_SYN; case TCP_STATE_LAST_ACK: case TCP_STATE_FIN_WAIT_1: + case TCP_STATE_CLOSING: if (tc->snd_nxt + 1 < tc->snd_una_max) return TCP_FLAG_ACK; else @@ -1153,16 +1179,21 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, advertise_wnd = tcp_window_to_advertise (tc, next_state); flags = tcp_make_state_flags (tc, next_state); - + if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING)) + { + if (seq_geq (tc->psh_seq, tc->snd_nxt) + && seq_lt (tc->psh_seq, tc->snd_nxt + data_len)) + flags |= TCP_FLAG_PSH; + } th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); if (maybe_burst) { - clib_memcpy ((u8 *) (th + 1), - tm->wrk_ctx[tc->c_thread_index].cached_opts, - tc->snd_opts_len); + clib_memcpy_fast ((u8 *) (th + 1), + tm->wrk_ctx[tc->c_thread_index].cached_opts, + tc->snd_opts_len); } else { @@ -1186,8 +1217,7 @@ tcp_push_header (tcp_connection_t * tc, vlib_buffer_t * b) tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, /* compute opts */ 0, /* burst */ 1); tc->snd_una_max = tc->snd_nxt; - ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd - + tcp_fastrecovery_sent_1_smss (tc) * tc->snd_mss)); + ASSERT (seq_leq (tc->snd_una_max, tc->snd_una + tc->snd_wnd)); tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); /* If not tracking an ACK, start tracking */ if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) @@ -1223,6 +1253,56 @@ tcp_send_ack (tcp_connection_t * tc) tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4); } +void +tcp_program_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) +{ + if (!(tc->flags & TCP_CONN_SNDACK)) + { + vec_add1 (wrk->pending_acks, tc->c_c_index); + tc->flags |= TCP_CONN_SNDACK; + } +} + +void +tcp_program_dupack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) +{ + if (!(tc->flags & TCP_CONN_SNDACK)) + { + vec_add1 (wrk->pending_acks, tc->c_c_index); + tc->flags |= TCP_CONN_SNDACK; + } + if (tc->pending_dupacks < 255) + tc->pending_dupacks += 1; +} + +void +tcp_send_acks (tcp_worker_ctx_t * wrk) +{ + u32 thread_index, *pending_acks; + tcp_connection_t *tc; + int i, j, n_acks; + + if (!vec_len (wrk->pending_acks)) + return; + + thread_index = wrk->vm->thread_index; + pending_acks = wrk->pending_acks; + for (i = 0; i < vec_len (pending_acks); i++) + { + tc = tcp_connection_get (pending_acks[i], thread_index); + tc->flags &= ~TCP_CONN_SNDACK; + n_acks = clib_max (1, tc->pending_dupacks); + /* If we're supposed to send dupacks but have no ooo data + * send only one ack */ + if (tc->pending_dupacks && !vec_len (tc->snd_sacks)) + n_acks = 1; + for (j = 0; j < n_acks; j++) + tcp_send_ack (tc); + tc->pending_dupacks = 0; + } + _vec_len (wrk->pending_acks) = 0; +} + /** * Delayed ack timer handler * @@ -1308,7 +1388,7 @@ tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, } } - tcp_get_free_buffer_index (wrk, &bi); + (void) tcp_get_free_buffer_index (wrk, &bi); ASSERT (bi != (u32) ~ 0); *b = vlib_get_buffer (vm, bi); data = tcp_init_buffer (vm, *b); @@ -1451,7 +1531,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tc = tcp_half_open_connection_get (index); /* Note: the connection may have transitioned to ESTABLISHED... */ - if (PREDICT_FALSE (tc == 0)) + if (PREDICT_FALSE (tc == 0 || tc->state != TCP_STATE_SYN_SENT)) return; tc->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID; } @@ -1459,9 +1539,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tc = tcp_connection_get (index, thread_index); /* Note: the connection may have been closed and pool_put */ - if (PREDICT_FALSE (tc == 0)) + if (PREDICT_FALSE (tc == 0 || tc->state == TCP_STATE_SYN_SENT)) return; tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + /* Wait-close and retransmit could pop at the same time */ + if (tc->state == TCP_STATE_CLOSED) + return; } if (tc->state >= TCP_STATE_ESTABLISHED) @@ -1516,8 +1599,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Send one segment. Note that n_bytes may be zero due to buffer * shortfall */ n_bytes = tcp_prepare_retransmit_segment (wrk, tc, 0, tc->snd_mss, &b); - - if (n_bytes == 0) + if (!n_bytes) { tcp_retransmit_timer_force_update (tc); return; @@ -1541,10 +1623,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (tc->flags & TCP_CONN_HALF_OPEN_DONE) { if (tcp_half_open_connection_cleanup (tc)) - { - clib_warning ("could not remove half-open connection"); - ASSERT (0); - } + TCP_DBG ("could not remove half-open connection"); return; } @@ -1643,7 +1722,7 @@ tcp_timer_persist_handler (u32 index) /* Problem already solved or worse */ if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED - || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + || tc->snd_wnd > tc->snd_mss) return; available_bytes = session_tx_fifo_max_dequeue (&tc->connection); @@ -1908,7 +1987,7 @@ send_unsent: /* RFC 6582: Send a new segment if permitted by the new value of cwnd. */ snd_space = tcp_available_cc_snd_space (tc); - if (snd_space < tc->snd_mss) + if (snd_space < tc->snd_mss || tc->snd_mss == 0) goto done; max_deq = session_tx_fifo_max_dequeue (&tc->connection); @@ -1944,13 +2023,6 @@ tcp_fast_retransmit (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, return tcp_fast_retransmit_no_sack (wrk, tc, burst_size); } -static u32 -tcp_session_has_ooo_data (tcp_connection_t * tc) -{ - stream_session_t *s = session_get (tc->c_s_index, tc->c_thread_index); - return svm_fifo_has_ooo_data (s->server_rx_fifo); -} - static void tcp_output_handle_link_local (tcp_connection_t * tc0, vlib_buffer_t * b0, u16 * next0, u32 * error0) @@ -2001,8 +2073,8 @@ tcp46_output_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node, tc = tcp_connection_get (vnet_buffer (b)->tcp.connection_index, vm->thread_index); t = vlib_add_trace (vm, node, b, sizeof (*t)); - clib_memcpy (&t->tcp_header, th, sizeof (t->tcp_header)); - clib_memcpy (&t->tcp_connection, tc, sizeof (t->tcp_connection)); + clib_memcpy_fast (&t->tcp_header, th, sizeof (t->tcp_header)); + clib_memcpy_fast (&t->tcp_connection, tc, sizeof (t->tcp_connection)); } } @@ -2055,25 +2127,6 @@ tcp_output_handle_packet (tcp_connection_t * tc0, vlib_buffer_t * b0, tcp_output_handle_link_local (tc0, b0, next0, error0); } - /* Filter out DUPACKs if there are no OOO segments left */ - if (PREDICT_FALSE (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) - { - /* N.B. Should not filter burst of dupacks. Two issues: - * 1) dupacks open cwnd on remote peer when congested - * 2) acks leaving should have the latest rcv_wnd since the - * burst may have eaten up all of it, so only the old ones - * could be filtered. - */ - if (!tcp_session_has_ooo_data (tc0)) - { - *error0 = TCP_ERROR_FILTERED_DUPACKS; - *next0 = TCP_OUTPUT_NEXT_DROP; - return; - } - } - - /* Stop DELACK timer and fix flags */ - tc0->flags &= ~(TCP_CONN_SNDACK); if (!TCP_ALWAYS_ACK) tcp_timer_reset (tc0, TCP_TIMER_DELACK); } @@ -2171,19 +2224,21 @@ tcp6_output (vlib_main_t * vm, vlib_node_runtime_t * node, /* *INDENT-OFF* */ VLIB_REGISTER_NODE (tcp4_output_node) = { - .function = tcp4_output,.name = "tcp4-output", - /* Takes a vector of packets. */ - .vector_size = sizeof (u32), - .n_errors = TCP_N_ERROR, - .error_strings = tcp_error_strings, - .n_next_nodes = TCP_OUTPUT_N_NEXT, - .next_nodes = { + .function = tcp4_output, + .name = "tcp4-output", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .protocol_hint = VLIB_NODE_PROTO_HINT_TCP, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_OUTPUT_N_NEXT, + .next_nodes = { #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, foreach_tcp4_output_next #undef _ - }, - .format_buffer = format_tcp_header, - .format_trace = format_tcp_tx_trace, + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_tx_trace, }; /* *INDENT-ON* */ @@ -2197,6 +2252,7 @@ VLIB_REGISTER_NODE (tcp6_output_node) = /* Takes a vector of packets. */ .vector_size = sizeof (u32), .n_errors = TCP_N_ERROR, + .protocol_hint = VLIB_NODE_PROTO_HINT_TCP, .error_strings = tcp_error_strings, .n_next_nodes = TCP_OUTPUT_N_NEXT, .next_nodes = { @@ -2284,7 +2340,8 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, else th0 = ip6_next_header ((ip6_header_t *) th0); t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy_fast (&t0->tcp_header, th0, + sizeof (t0->tcp_header)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,