X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_input.c;h=1679f81894746109f506764e098255b09c3b9b86;hb=d3c008d108aa2187d1a2afe2833b4de25ca2c2ab;hp=841e72a503e3e14cb9de76a9ffc003c9a5697171;hpb=4eeeaaf5e822718eb222e6c49abd82e1bcb566fd;p=vpp.git diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 841e72a503e..1679f818947 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -248,8 +248,8 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) * then the TSval from the segment is copied to TS.Recent; * otherwise, the TSval is ignored. */ - if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent - && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) + if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las) + && seq_leq (tc->rcv_las, seq_end)) { ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval)); tc->tsval_recent = tc->rcv_opts.tsval; @@ -351,12 +351,17 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (tcp_syn (th0)) { /* TODO implement RFC 5961 */ - if (tc0->state != TCP_STATE_SYN_RCVD) - tcp_make_ack (tc0, b0); + if (tc0->state == TCP_STATE_SYN_RCVD) + { + tcp_make_synack (tc0, b0); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); + } else - tcp_make_synack (tc0, b0); + { + tcp_make_ack (tc0, b0); + TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, tc0); + } *next0 = tcp_next_output (tc0->c_is_ip4); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0, 0); return -1; } @@ -413,51 +418,53 @@ tcp_update_rto (tcp_connection_t * tc) tc->rto = clib_max (tc->rto, TCP_RTO_MIN); } -/** Update RTT estimate and RTO timer +/** + * Update RTT estimate and RTO timer * * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK * timing. Middle boxes are known to fiddle with TCP options so we * should give higher priority to ACK timing. * + * This should be called only if previously sent bytes have been acked. + * * return 1 if valid rtt 0 otherwise */ static int tcp_update_rtt (tcp_connection_t * tc, u32 ack) { u32 mrtt = 0; - u8 rtx_acked; - - /* Determine if only rtx bytes are acked. */ - rtx_acked = tcp_in_cong_recovery (tc) || !tc->bytes_acked; /* Karn's rule, part 1. Don't use retransmitted segments to estimate * RTT because they're ambiguous. */ - if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !rtx_acked) + if (tcp_in_cong_recovery (tc) || tc->sack_sb.sacked_bytes) + goto done; + + if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq)) { mrtt = tcp_time_now () - tc->rtt_ts; } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances * snd_una, i.e., the left side of the send window: - * seq_lt (tc->snd_una, ack). */ - else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr - && tc->bytes_acked) + * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */ + else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr) { mrtt = tcp_time_now () - tc->rcv_opts.tsecr; } - /* Allow measuring of a new RTT */ - tc->rtt_ts = 0; - - /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is - * not valid */ - if (tc->bytes_acked) - tc->rto_boff = 0; - /* Ignore dubious measurements */ if (mrtt == 0 || mrtt > TCP_RTT_MAX) - return 0; + goto done; tcp_estimate_rtt (tc, mrtt); + +done: + + /* Allow measuring of a new RTT */ + tc->rtt_ts = 0; + + /* If we got here something must've been ACKed so make sure boff is 0, + * even if mrrt is not valid since we update the rto lower */ + tc->rto_boff = 0; tcp_update_rto (tc); return 0; @@ -927,10 +934,12 @@ static void tcp_cc_recovery_exit (tcp_connection_t * tc) { /* Deflate rto */ - tcp_update_rto (tc); tc->rto_boff = 0; + tcp_update_rto (tc); tc->snd_rxt_ts = 0; + tc->snd_nxt = tc->snd_una_max; tcp_recovery_off (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } void @@ -939,8 +948,10 @@ tcp_cc_fastrecovery_exit (tcp_connection_t * tc) tc->cc_algo->recovered (tc); tc->snd_rxt_bytes = 0; tc->rcv_dupacks = 0; + tc->snd_nxt = tc->snd_una_max; tcp_fastrecovery_off (tc); tcp_fastrecovery_1_smss_off (tc); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } static void @@ -953,13 +964,14 @@ tcp_cc_congestion_undo (tcp_connection_t * tc) if (tcp_in_recovery (tc)) tcp_cc_recovery_exit (tc); ASSERT (tc->rto_boff == 0); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 5); /* TODO extend for fastrecovery */ } static u8 tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) { - return (tcp_in_recovery (tc) + return (tcp_in_recovery (tc) && tc->rto_boff == 1 && tc->snd_rxt_ts && tcp_opts_tstamp (&tc->rcv_opts) && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); @@ -983,7 +995,6 @@ tcp_cc_recover (tcp_connection_t * tc) ASSERT (tc->rto_boff == 0); ASSERT (!tcp_in_cong_recovery (tc)); ASSERT (tcp_scoreboard_is_sane_post_recovery (tc)); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); return 0; } @@ -1617,7 +1628,7 @@ format_tcp_rx_trace (u8 * s, va_list * args) CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); - uword indent = format_get_indent (s); + u32 indent = format_get_indent (s); s = format (s, "%U\n%U%U", format_tcp_header, &t->tcp_header, 128, @@ -1747,18 +1758,17 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 8: check the FIN bit */ if (PREDICT_FALSE (is_fin)) { - /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead - * wait for session to call close. To avoid lingering + /* Enter CLOSE-WAIT and notify session. To avoid lingering * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ - tc0->state = TCP_STATE_CLOSE_WAIT; - TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); + /* Account for the FIN if nothing else was received */ if (vnet_buffer (b0)->tcp.data_len == 0) - { - tc0->rcv_nxt += 1; - next0 = TCP_ESTABLISHED_NEXT_DROP; - } + tc0->rcv_nxt += 1; + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + tc0->state = TCP_STATE_CLOSE_WAIT; stream_session_disconnect_notify (&tc0->connection); tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); } done: @@ -1973,6 +1983,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, seq0 = vnet_buffer (b0)->tcp.seq_number; tcp0 = tcp_buffer_hdr (b0); + /* Crude check to see if the connection handle does not match + * the packet. Probably connection just switched to established */ + if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port + || tcp0->src_port != tc0->c_rmt_port)) + goto drop; + if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) goto drop; @@ -2105,7 +2121,6 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->flags |= TCP_CONN_SNDACK; /* Update rtt with the syn-ack sample */ - new_tc0->bytes_acked = 1; tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0); } @@ -2148,7 +2163,8 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, drop: b0->error = error0 ? node->errors[error0] : 0; - if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + if (PREDICT_FALSE + ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0)) { t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); @@ -2265,6 +2281,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + u8 is_fin0; bi0 = from[0]; to_next[0] = bi0; @@ -2283,11 +2300,11 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tcp0 = tcp_buffer_hdr (b0); + is_fin0 = tcp_is_fin (tcp0); /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number - + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) - + vnet_buffer (b0)->tcp.data_len; + + tcp_is_syn (tcp0) + is_fin0 + vnet_buffer (b0)->tcp.data_len; if (CLIB_DEBUG) { @@ -2335,13 +2352,12 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) { - clib_warning ("connection not accepted"); + TCP_DBG ("connection not accepted"); tcp_send_reset_w_pkt (tc0, b0, is_ip4); goto drop; } /* Update rtt and rto */ - tc0->bytes_acked = 1; tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number); /* Switch state to ESTABLISHED */ @@ -2384,21 +2400,14 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If FIN is ACKed */ else if (tc0->snd_una == tc0->snd_una_max) { - tc0->rcv_nxt += 1; tc0->state = TCP_STATE_FIN_WAIT_2; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); - if (tcp_fin (tcp0)) - { - /* Stop all timers, 2MSL will be set lower */ - tcp_connection_timers_reset (tc0); - } - else - { - /* Wait for peer to finish sending its data */ - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, - TCP_2MSL_TIME); - } + /* Stop all retransmit timers because we have nothing more + * to send. Enable waitclose though because we're willing to + * wait for peer's FIN but not indefinitely. */ + tcp_connection_timers_reset (tc0); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); } break; case TCP_STATE_FIN_WAIT_2: @@ -2422,7 +2431,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0->state = TCP_STATE_TIME_WAIT; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); goto drop; break; @@ -2432,27 +2441,28 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * delete the TCB, enter the CLOSED state, and return. */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) - goto drop; + { + error0 = TCP_ERROR_ACK_INVALID; + goto drop; + } - /* Apparently our FIN was lost */ - if (tcp_fin (tcp0)) + tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; + /* Apparently our ACK for the peer's FIN was lost */ + if (is_fin0 && tc0->snd_una != tc0->snd_una_max) { - /* Don't "make" fin since that increments snd_nxt */ tcp_send_fin (tc0); goto drop; } tc0->state = TCP_STATE_CLOSED; TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + tcp_connection_timers_reset (tc0); /* Don't delete the connection/session yet. Instead, wait a * reasonable amount of time until the pipes are cleared. In * particular, this makes sure that we won't have dead sessions * when processing events on the tx path */ - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); - - /* Stop retransmit */ - tcp_retransmit_timer_reset (tc0); + tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME); goto drop; @@ -2466,8 +2476,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; tcp_make_ack (tc0, b0); - tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE); - tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + next0 = tcp_next_output (is_ip4); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); goto drop; @@ -2486,6 +2496,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_FIN_WAIT_2: if (vnet_buffer (b0)->tcp.data_len) error0 = tcp_segment_rcv (tm, tc0, b0, &next0); + else if (is_fin0) + tc0->rcv_nxt += 1; break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -2497,7 +2509,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* 8: check the FIN bit */ - if (!tcp_fin (tcp0)) + if (!is_fin0) goto drop; switch (tc0->state) @@ -2527,19 +2539,19 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); break; case TCP_STATE_FIN_WAIT_2: - /* Got FIN, send ACK! */ + /* Got FIN, send ACK! Be more aggressive with resource cleanup */ tc0->state = TCP_STATE_TIME_WAIT; tcp_connection_timers_reset (tc0); - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_TIME_WAIT: - /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait + /* Remain in the TIME-WAIT state. Restart the time-wait * timeout. */ - tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_TIMEWAIT_TIME); break; } TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); @@ -2866,6 +2878,7 @@ typedef enum _tcp_input_next TCP_INPUT_NEXT_SYN_SENT, TCP_INPUT_NEXT_ESTABLISHED, TCP_INPUT_NEXT_RESET, + TCP_INPUT_NEXT_PUNT, TCP_INPUT_N_NEXT } tcp_input_next_t; @@ -2875,7 +2888,8 @@ typedef enum _tcp_input_next _ (RCV_PROCESS, "tcp4-rcv-process") \ _ (SYN_SENT, "tcp4-syn-sent") \ _ (ESTABLISHED, "tcp4-established") \ - _ (RESET, "tcp4-reset") + _ (RESET, "tcp4-reset") \ + _ (PUNT, "error-punt") #define foreach_tcp6_input_next \ _ (DROP, "error-drop") \ @@ -2883,7 +2897,8 @@ typedef enum _tcp_input_next _ (RCV_PROCESS, "tcp6-rcv-process") \ _ (SYN_SENT, "tcp6-syn-sent") \ _ (ESTABLISHED, "tcp6-established") \ - _ (RESET, "tcp6-reset") + _ (RESET, "tcp6-reset") \ + _ (PUNT, "error-punt") #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) @@ -3007,9 +3022,18 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } else { - /* Send reset */ - next0 = TCP_INPUT_NEXT_RESET; - error0 = TCP_ERROR_NO_LISTENER; + if ((is_ip4 && tm->punt_unknown4) || + (!is_ip4 && tm->punt_unknown6)) + { + next0 = TCP_INPUT_NEXT_PUNT; + error0 = TCP_ERROR_PUNT; + } + else + { + /* Send reset */ + next0 = TCP_INPUT_NEXT_RESET; + error0 = TCP_ERROR_NO_LISTENER; + } } done: @@ -3162,9 +3186,9 @@ do { \ TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); - _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); + _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); - _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, + _(CLOSED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ }