X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_input.c;h=0e915505d373a53215c9c61ee919dd3744558584;hb=ea41aac320cc47ae00e7dd6e870e6ca32dcdc0b5;hp=0f1ab1ab3b077bb71b98cc664f2b412d1ace646a;hpb=efefc6b4b219e2897e48def83352b4df52bc03a0;p=vpp.git diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 0f1ab1ab3b0..0e915505d37 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -121,10 +121,11 @@ tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) * * @param th TCP header * @param to TCP options data structure to be populated + * @param is_syn set if packet is syn * @return -1 if parsing failed */ -static int -tcp_options_parse (tcp_header_t * th, tcp_options_t * to) +static inline int +tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn) { const u8 *data; u8 opt_len, opts_len, kind; @@ -136,7 +137,7 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) /* Zero out all flags but those set in SYN */ to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE - | TCP_OPTS_FLAG_SACK); + | TCP_OPTS_FLAG_TSTAMP | TCP_OPTION_MSS); for (; opts_len > 0; opts_len -= opt_len, data += opt_len) { @@ -166,6 +167,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) switch (kind) { case TCP_OPTION_MSS: + if (!is_syn) + break; if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th)) { to->flags |= TCP_OPTS_FLAG_MSS; @@ -173,6 +176,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) } break; case TCP_OPTION_WINDOW_SCALE: + if (!is_syn) + break; if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th)) { to->flags |= TCP_OPTS_FLAG_WSCALE; @@ -186,14 +191,18 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) } break; case TCP_OPTION_TIMESTAMP: - if (opt_len == TCP_OPTION_LEN_TIMESTAMP) + if (is_syn) + to->flags |= TCP_OPTS_FLAG_TSTAMP; + if ((to->flags & TCP_OPTS_FLAG_TSTAMP) + && opt_len == TCP_OPTION_LEN_TIMESTAMP) { - to->flags |= TCP_OPTS_FLAG_TSTAMP; to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2)); to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6)); } break; case TCP_OPTION_SACK_PERMITTED: + if (!is_syn) + break; if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th)) to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; break; @@ -236,7 +245,7 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) always_inline int tcp_segment_check_paws (tcp_connection_t * tc) { - return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent + return tcp_opts_tstamp (&tc->rcv_opts) && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent); } @@ -289,9 +298,8 @@ tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0, goto error; } - if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts))) + if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts, 0))) { - clib_warning ("options parse error"); *error0 = TCP_ERROR_OPTIONS; goto error; } @@ -299,8 +307,6 @@ tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0, if (PREDICT_FALSE (tcp_segment_check_paws (tc0))) { *error0 = TCP_ERROR_PAWS; - if (CLIB_DEBUG > 2) - clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2); TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end); @@ -309,8 +315,7 @@ tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0, if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE, tcp_time_now_w_thread (tc0->c_thread_index))) { - /* Age isn't reset until we get a valid tsval (bsd inspired) */ - tc0->tsval_recent = 0; + tc0->tsval_recent = tc0->rcv_opts.tsval; clib_warning ("paws failed - really old segment. REALLY?"); } else @@ -361,6 +366,7 @@ tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0, /* TODO implement RFC 5961 */ if (tc0->state == TCP_STATE_SYN_RCVD) { + tcp_options_parse (th0, &tc0->rcv_opts, 1); tcp_send_synack (tc0); TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); } @@ -501,8 +507,8 @@ tcp_estimate_initial_rtt (tcp_connection_t * tc) else { mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr; + mrtt = clib_max (mrtt, 1); tc->mrtt_us = (f64) mrtt *TCP_TICK; - } if (mrtt > 0 && mrtt < TCP_RTT_MAX) @@ -1131,7 +1137,6 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tc->rtt_ts = 0; tcp_fastrecovery_off (tc); - tcp_fastrecovery_1_smss_off (tc); tcp_fastrecovery_first_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); @@ -1441,7 +1446,7 @@ partial_ack: */ /* XXX limit this only to first partial ack? */ - tcp_retransmit_timer_force_update (tc); + tcp_retransmit_timer_update (tc); /* RFC6675: If the incoming ACK is a cumulative acknowledgment, * reset dupacks to 0. Also needed if in congestion recovery */ @@ -1600,6 +1605,54 @@ process_ack: return 0; } +static void +tcp_program_disconnect (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) +{ + if (!tcp_disconnect_pending (tc)) + { + vec_add1 (wrk->pending_disconnects, tc->c_c_index); + tcp_disconnect_pending_on (tc); + } +} + +static void +tcp_handle_disconnects (tcp_worker_ctx_t * wrk) +{ + u32 thread_index, *pending_disconnects; + tcp_connection_t *tc; + int i; + + if (!vec_len (wrk->pending_disconnects)) + return; + + thread_index = wrk->vm->thread_index; + pending_disconnects = wrk->pending_disconnects; + for (i = 0; i < vec_len (pending_disconnects); i++) + { + tc = tcp_connection_get (pending_disconnects[i], thread_index); + tcp_disconnect_pending_off (tc); + stream_session_disconnect_notify (&tc->connection); + } + _vec_len (wrk->pending_disconnects) = 0; +} + +static void +tcp_rcv_fin (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b, + u32 * error) +{ + /* Enter CLOSE-WAIT and notify session. To avoid lingering + * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ + /* Account for the FIN if nothing else was received */ + if (vnet_buffer (b)->tcp.data_len == 0) + tc->rcv_nxt += 1; + tcp_program_ack (wrk, tc); + tc->state = TCP_STATE_CLOSE_WAIT; + tcp_program_disconnect (wrk, tc); + tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc); + *error = TCP_ERROR_FIN_RCVD; +} + static u8 tcp_sack_vector_is_sane (sack_block_t * sacks) { @@ -1943,13 +1996,14 @@ tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0, { if (tc0) { - clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection)); + clib_memcpy_fast (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } else { th0 = tcp_buffer_hdr (b0); } - clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header)); } static void @@ -2099,19 +2153,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 8: check the FIN bit */ if (PREDICT_FALSE (is_fin)) - { - /* Enter CLOSE-WAIT and notify session. To avoid lingering - * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ - /* Account for the FIN if nothing else was received */ - if (vnet_buffer (b0)->tcp.data_len == 0) - tc0->rcv_nxt += 1; - tcp_program_ack (wrk, tc0); - tc0->state = TCP_STATE_CLOSE_WAIT; - stream_session_disconnect_notify (&tc0->connection); - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); - TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); - error0 = TCP_ERROR_FIN_RCVD; - } + tcp_rcv_fin (wrk, tc0, b0, &error0); done: tcp_inc_err_counter (err_counters, error0, 1); @@ -2122,6 +2164,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, err_counters[TCP_ERROR_EVENT_FIFO_FULL] = errors; tcp_store_err_counters (established, err_counters); tcp_handle_postponed_dequeues (wrk); + tcp_handle_disconnects (wrk); vlib_buffer_free (vm, first_buffer, frame->n_vectors); return frame->n_vectors; @@ -2397,7 +2440,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* Parse options */ - if (tcp_options_parse (tcp0, &tc0->rcv_opts)) + if (tcp_options_parse (tcp0, &tc0->rcv_opts, 1)) { clib_warning ("options parse fail"); error0 = TCP_ERROR_OPTIONS; @@ -2407,7 +2450,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Valid SYN or SYN-ACK. Move connection from half-open pool to * current thread pool. */ pool_get (tm->connections[my_thread_index], new_tc0); - clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); + clib_memcpy_fast (new_tc0, tc0, sizeof (*new_tc0)); new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index]; new_tc0->c_thread_index = my_thread_index; new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; @@ -2459,6 +2502,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } + new_tc0->tx_fifo_size = + transport_tx_fifo_size (&new_tc0->connection); /* Update rtt with the syn-ack sample */ tcp_estimate_initial_rtt (new_tc0); TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0); @@ -2478,8 +2523,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - tc0->rtt_ts = 0; - tcp_init_snd_vars (tc0); + new_tc0->tx_fifo_size = + transport_tx_fifo_size (&new_tc0->connection); + new_tc0->rtt_ts = 0; + tcp_init_snd_vars (new_tc0); tcp_send_synack (new_tc0); error0 = TCP_ERROR_SYNS_RCVD; goto drop; @@ -2504,8 +2551,9 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0)) { t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); - clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection)); + clib_memcpy_fast (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + clib_memcpy_fast (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } } @@ -3045,13 +3093,13 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } else { - clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address, - sizeof (ip6_address_t)); - clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address, - sizeof (ip6_address_t)); + clib_memcpy_fast (&child0->c_lcl_ip6, &ip60->dst_address, + sizeof (ip6_address_t)); + clib_memcpy_fast (&child0->c_rmt_ip6, &ip60->src_address, + sizeof (ip6_address_t)); } - if (tcp_options_parse (th0, &child0->rcv_opts)) + if (tcp_options_parse (th0, &child0->rcv_opts, 1)) { clib_warning ("options parse fail"); goto drop; @@ -3090,6 +3138,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } + child0->tx_fifo_size = transport_tx_fifo_size (&child0->connection); tcp_send_synack (child0); tcp_timer_set (child0, TCP_TIMER_ESTABLISH, TCP_SYN_RCVD_TIME); @@ -3098,8 +3147,9 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); - clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); - clib_memcpy (&t0->tcp_connection, lc0, sizeof (t0->tcp_connection)); + clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy_fast (&t0->tcp_connection, lc0, + sizeof (t0->tcp_connection)); } n_syns += (error0 == TCP_ERROR_NONE); @@ -3258,13 +3308,19 @@ tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error, if (is_ip4) { ip4_header_t *ip4 = vlib_buffer_get_current (b); + int ip_hdr_bytes = ip4_header_bytes (ip4); + if (PREDICT_FALSE (b->current_length < ip_hdr_bytes + sizeof (*tcp))) + { + *error = TCP_ERROR_LENGTH; + return 0; + } tcp = ip4_next_header (ip4); vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip4; - n_advance_bytes = (ip4_header_bytes (ip4) + tcp_header_bytes (tcp)); + n_advance_bytes = (ip_hdr_bytes + tcp_header_bytes (tcp)); n_data_bytes = clib_net_to_host_u16 (ip4->length) - n_advance_bytes; /* Length check. Checksum computed by ipx_local no need to compute again */ - if (PREDICT_FALSE (n_advance_bytes < 0)) + if (PREDICT_FALSE (n_data_bytes < 0)) { *error = TCP_ERROR_LENGTH; return 0; @@ -3278,6 +3334,11 @@ tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error, else { ip6_header_t *ip6 = vlib_buffer_get_current (b); + if (PREDICT_FALSE (b->current_length < sizeof (*ip6) + sizeof (*tcp))) + { + *error = TCP_ERROR_LENGTH; + return 0; + } tcp = ip6_next_header (ip6); vnet_buffer (b)->tcp.hdr_offset = (u8 *) tcp - (u8 *) ip6; n_advance_bytes = tcp_header_bytes (tcp); @@ -3285,7 +3346,7 @@ tcp_input_lookup_buffer (vlib_buffer_t * b, u8 thread_index, u32 * error, - n_advance_bytes; n_advance_bytes += sizeof (ip6[0]); - if (PREDICT_FALSE (n_advance_bytes < 0)) + if (PREDICT_FALSE (n_data_bytes < 0)) { *error = TCP_ERROR_LENGTH; return 0;