X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_input.c;h=3bd53878afc1d98c87ebe5c578348c619fb678d3;hb=a5464817522c7a7dc760af4612f1d6a68ed0afc8;hp=5d11985faa7765b6f0ebd721cee05b66b9dee68f;hpb=6792ec059696a358b6c98d8d86e9740b34c01e24;p=vpp.git diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 5d11985faa7..3bd53878afc 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -211,8 +211,6 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) always_inline int tcp_segment_check_paws (tcp_connection_t * tc) { - /* XXX normally test for timestamp should be lt instead of leq, but for - * local testing this is not enough */ return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent && timestamp_lt (tc->opt.tsval, tc->tsval_recent); } @@ -276,8 +274,7 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (tc0->rcv_wnd == 0 && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number) { - /* Make it look as if there's nothing to dequeue */ - vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number; + /* TODO Should segment be tagged? */ } else { @@ -375,7 +372,6 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) { mrtt = tcp_time_now () - tc->rtt_ts; - tc->rtt_seq = 0; } /* As per RFC7323 TSecr can be used for RTTM only if the segment advances @@ -395,6 +391,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + /* Allow measuring of RTT and make sure boff is 0 */ + tc->rtt_seq = 0; + tc->rto_boff = 0; + return 1; } @@ -408,11 +408,7 @@ tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); /* Update rtt and rto */ - if (tcp_update_rtt (tc, ack)) - { - /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */ - tc->rto_boff = 0; - } + tcp_update_rtt (tc, ack); } /** @@ -672,6 +668,13 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) tc->snd_wl1 = seq; tc->snd_wl2 = ack; TCP_EVT_DBG (TCP_EVT_SND_WND, tc); + + /* Set probe timer if we just got 0 wnd */ + if (tc->snd_wnd < tc->snd_mss + && !tcp_timer_is_active (tc, TCP_TIMER_PERSIST)) + tcp_persist_timer_set (tc); + else + tcp_persist_timer_reset (tc); } } @@ -686,6 +689,10 @@ tcp_cc_congestion (tcp_connection_t * tc) void tcp_cc_recover (tcp_connection_t * tc) { + /* TODO: check if time to recover was small. It might be that RTO popped + * too soon. + */ + tc->cc_algo->recovered (tc); tc->rtx_bytes = 0; @@ -695,8 +702,7 @@ tcp_cc_recover (tcp_connection_t * tc) tc->cc_algo->rcv_ack (tc); tc->tsecr_last_ack = tc->opt.tsecr; - tcp_fastrecovery_1_smss_off (tc); - tcp_fastrecovery_off (tc); + tcp_cong_recovery_off (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); } @@ -706,7 +712,7 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { u8 partial_ack; - if (tcp_in_recovery (tc)) + if (tcp_in_cong_recovery (tc)) { partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); if (!partial_ack) @@ -724,10 +730,10 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) /* In case snd_nxt is still in the past and output tries to * shove some new bytes */ - tc->snd_nxt = tc->snd_una; + tc->snd_nxt = tc->snd_una_max; /* XXX need proper RFC6675 support */ - if (tc->sack_sb.last_sacked_bytes) + if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc)) { tcp_fast_retransmit (tc); } @@ -735,9 +741,6 @@ tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) { /* Retransmit first unacked segment */ tcp_retransmit_first_unacked (tc); - /* If window allows, send 1 SMSS of new data */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tc->snd_nxt = tc->snd_congestion; } } } @@ -814,10 +817,11 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, return -1; } - tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; - *error = TCP_ERROR_ACK_FUTURE; TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2, vnet_buffer (b)->tcp.ack_number); + + tc->snd_nxt = vnet_buffer (b)->tcp.ack_number; + *error = TCP_ERROR_ACK_FUTURE; } /* If old ACK, probably it's an old dupack */ @@ -863,7 +867,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, * timer. */ if (tc->bytes_acked) { - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc, vnet_buffer (b)->tcp.ack_number); + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); /* Updates congestion control (slow start/congestion avoidance) */ tcp_cc_rcv_ack (tc, b); @@ -966,11 +970,14 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, tc->rcv_nxt += written; /* Depending on how fast the app is, all remaining buffers in burst will - * not be enqueued. Should we inform peer of the damage? XXX */ + * not be enqueued. Inform peer */ + tc->flags |= TCP_CONN_SNDACK; + return TCP_ERROR_PARTIALLY_ENQUEUED; } else { + tc->flags |= TCP_CONN_SNDACK; return TCP_ERROR_FIFO_FULL; } @@ -990,7 +997,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, u16 data_len) { stream_session_t *s0; - u32 offset, seq; + u32 offset; int rv; /* Pure ACK. Do nothing */ @@ -1000,11 +1007,12 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, } s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); - seq = vnet_buffer (b)->tcp.seq_number; - offset = seq - tc->rcv_nxt; + offset = vnet_buffer (b)->tcp.seq_number - tc->irs; - rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, - data_len, vlib_buffer_get_current (b)); + clib_warning ("ooo: offset %d len %d", offset, data_len); + + rv = svm_fifo_enqueue_with_offset (s0->server_rx_fifo, offset, data_len, + vlib_buffer_get_current (b)); /* Nothing written */ if (rv) @@ -1023,8 +1031,8 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, /* Get the newest segment from the fifo */ newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); - start = tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest); - end = tc->rcv_nxt + ooo_segment_end_offset (s0->server_rx_fifo, newest); + start = ooo_segment_offset (s0->server_rx_fifo, newest); + end = ooo_segment_end_offset (s0->server_rx_fifo, newest); tcp_update_sack_list (tc, start, end); } @@ -1063,6 +1071,7 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, { /* Old sequence numbers allowed through because they overlapped * the rx window */ + if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt)) { error = TCP_ERROR_SEGMENT_OLD; @@ -1101,29 +1110,58 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, goto done; } - if (PREDICT_FALSE (error == TCP_ERROR_FIFO_FULL)) - *next0 = TCP_NEXT_DROP; - /* Check if ACK can be delayed */ - if (!tcp_can_delack (tc)) - { - /* Nothing to do for pure ACKs XXX */ - if (n_data_bytes == 0) - goto done; - - *next0 = tcp_next_output (tc->c_is_ip4); - tcp_make_ack (tc, b); - } - else + if (tcp_can_delack (tc)) { if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK)) tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME); + goto done; } + *next0 = tcp_next_output (tc->c_is_ip4); + tcp_make_ack (tc, b); + done: return error; } +typedef struct +{ + tcp_header_t tcp_header; + tcp_connection_t tcp_connection; +} tcp_rx_trace_t; + +u8 * +format_tcp_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); + uword indent = format_get_indent (s); + + s = format (s, "%U\n%U%U", + format_tcp_header, &t->tcp_header, 128, + format_white_space, indent, + format_tcp_connection_verbose, &t->tcp_connection); + + return s; +} + +u8 * +format_tcp_rx_trace_short (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); + + s = format (s, "%d -> %d (%U)", + clib_net_to_host_u16 (t->tcp_header.src_port), + clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state, + &t->tcp_connection.state); + + return s; +} + always_inline void tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val) { @@ -1141,8 +1179,9 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; tcp_main_t *tm = vnet_get_tcp_main (); + u8 is_fin = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1159,6 +1198,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *th0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -1204,9 +1244,11 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, n_advance_bytes0 += sizeof (ip60[0]); } + is_fin = (th0->flags & TCP_FLAG_FIN) != 0; + /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number - + tcp_is_syn (th0) + tcp_is_fin (th0) + n_data_bytes0; + + tcp_is_syn (th0) + is_fin + n_data_bytes0; /* TODO header prediction fast path */ @@ -1233,8 +1275,11 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_advance (b0, n_advance_bytes0); error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a + * dangling reference. */ + /* 8: check the FIN bit */ - if (tcp_fin (th0)) + if (is_fin) { /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead * wait for session to call close. To avoid lingering @@ -1249,7 +1294,10 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1295,6 +1343,7 @@ VLIB_REGISTER_NODE (tcp4_established_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1316,6 +1365,7 @@ VLIB_REGISTER_NODE (tcp6_established_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1331,7 +1381,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; from = vlib_frame_vector_args (from_frame); @@ -1349,6 +1399,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0, ack0, seq0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -1544,7 +1595,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, b0->error = error0 ? node->errors[error0] : 0; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1598,6 +1652,7 @@ VLIB_REGISTER_NODE (tcp4_syn_sent_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1618,8 +1673,9 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) = #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, foreach_tcp_state_next #undef _ - } -,}; + }, + .format_trace = format_tcp_rx_trace_short, +}; /* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); @@ -1633,7 +1689,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1650,6 +1706,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -1840,6 +1897,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_ESTABLISHED: case TCP_STATE_FIN_WAIT_1: case TCP_STATE_FIN_WAIT_2: + vlib_buffer_advance (b0, n_advance_bytes0); error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); break; case TCP_STATE_CLOSE_WAIT: @@ -1897,7 +1955,10 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, tc0, + sizeof (t0->tcp_connection)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -1951,6 +2012,7 @@ VLIB_REGISTER_NODE (tcp4_rcv_process_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1972,6 +2034,7 @@ VLIB_REGISTER_NODE (tcp6_rcv_process_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -1988,7 +2051,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; @@ -2007,6 +2070,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *th0 = 0; tcp_connection_t *lc0; ip4_header_t *ip40; @@ -2084,6 +2148,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->irs = vnet_buffer (b0)->tcp.seq_number; child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; + child0->rcv_las = child0->rcv_nxt; child0->state = TCP_STATE_SYN_RCVD; /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} @@ -2113,7 +2178,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, drop: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header)); + clib_memcpy (&t0->tcp_connection, lc0, + sizeof (t0->tcp_connection)); } b0->error = node->errors[error0]; @@ -2157,6 +2225,7 @@ VLIB_REGISTER_NODE (tcp4_listen_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -2178,6 +2247,7 @@ VLIB_REGISTER_NODE (tcp6_listen_node) = foreach_tcp_state_next #undef _ }, + .format_trace = format_tcp_rx_trace_short, }; /* *INDENT-ON* */ @@ -2213,27 +2283,6 @@ typedef enum _tcp_input_next _ (ESTABLISHED, "tcp6-established") \ _ (RESET, "tcp6-reset") -typedef struct -{ - u16 src_port; - u16 dst_port; - u8 state; -} tcp_rx_trace_t; - -u8 * -format_tcp_rx_trace (u8 * s, va_list * args) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); - CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); - tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); - - s = format (s, "TCP: src-port %d dst-port %U%s\n", - clib_net_to_host_u16 (t->src_port), - clib_net_to_host_u16 (t->dst_port), format_tcp_state, t->state); - - return s; -} - #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) always_inline uword @@ -2241,7 +2290,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); @@ -2259,6 +2308,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 bi0; vlib_buffer_t *b0; + tcp_rx_trace_t *t0; tcp_header_t *tcp0 = 0; tcp_connection_t *tc0; ip4_header_t *ip40; @@ -2321,8 +2371,12 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH)) { + tcp_state_t state0 = tc0->state; /* Overload tcp flags to store state */ vnet_buffer (b0)->tcp.flags = tc0->state; + clib_warning ("disp error state %U flags %U", + format_tcp_state, &state0, + format_tcp_flags, flags0); } } else @@ -2336,7 +2390,10 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header)); + if (tc0) + clib_memcpy (&t0->tcp_connection, tc0, sizeof (*tc0)); } vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, @@ -2408,12 +2465,6 @@ VLIB_REGISTER_NODE (tcp6_input_node) = /* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input); -void -tcp_update_time (f64 now, u32 thread_index) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - tw_timer_expire_timers_16t_2w_512sl (&tm->timer_wheels[thread_index], now); -} static void tcp_dispatch_table_init (tcp_main_t * tm)