X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp_output.c;h=ad13493a14c96160a2441985229d5f08ed530db5;hb=fdbc38249a8c672937a74667dcfaafa2cfd292e7;hp=49fd6beffa3f4dc0ac470a2fae86b6b0080f99b9;hpb=bb292f4d3fbecfc6b1bac695f833b0da78369116;p=vpp.git diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 49fd6beffa3..ad13493a14c 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -19,7 +19,7 @@ vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; -typedef enum _tcp_output_nect +typedef enum _tcp_output_next { TCP_OUTPUT_NEXT_DROP, TCP_OUTPUT_NEXT_IP_LOOKUP, @@ -75,12 +75,34 @@ tcp_window_compute_scale (u32 available_space) } /** - * TCP's IW as recommended by RFC6928 + * Update max segment size we're able to process. + * + * The value is constrained by our interface's MTU and IP options. It is + * also what we advertise to our peer. + */ +void +tcp_update_rcv_mss (tcp_connection_t * tc) +{ + /* TODO find our iface MTU */ + tc->mss = dummy_mtu; +} + +/** + * TCP's initial window */ always_inline u32 tcp_initial_wnd_unscaled (tcp_connection_t * tc) { - return TCP_IW_N_SEGMENTS * tc->mss; + /* RFC 6928 recommends the value lower. However at the time our connections + * are initialized, fifos may not be allocated. Therefore, advertise the + * smallest possible unscaled window size and update once fifos are + * assigned to the session. + */ + /* + tcp_update_rcv_mss (tc); + TCP_IW_N_SEGMENTS * tc->mss; + */ + return TCP_MIN_RX_FIFO_SIZE; } /** @@ -136,10 +158,10 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) * Figure out how much space we have available */ available_space = stream_session_max_rx_enqueue (&tc->connection); - max_fifo = stream_session_fifo_size (&tc->connection); + max_fifo = stream_session_rx_fifo_size (&tc->connection); - ASSERT (tc->opt.mss < max_fifo); - if (available_space < tc->opt.mss && available_space < max_fifo >> 3) + ASSERT (tc->rcv_opts.mss < max_fifo); + if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3) available_space = 0; /* @@ -276,8 +298,11 @@ tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) opts->tsecr = 0; len += TCP_OPTION_LEN_TIMESTAMP; - opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; - len += TCP_OPTION_LEN_SACK_PERMITTED; + if (TCP_USE_SACKS) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } /* Align to needed boundary */ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; @@ -293,14 +318,14 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) opts->mss = tc->mss; len += TCP_OPTION_LEN_MSS; - if (tcp_opts_wscale (&tc->opt)) + if (tcp_opts_wscale (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_WSCALE; opts->wscale = tc->rcv_wscale; len += TCP_OPTION_LEN_WINDOW_SCALE; } - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); @@ -308,7 +333,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; len += TCP_OPTION_LEN_SACK_PERMITTED; @@ -326,14 +351,14 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) opts->flags = 0; - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); opts->tsecr = tc->tsval_recent; len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { if (vec_len (tc->snd_sacks)) { @@ -369,19 +394,6 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, } } -/** - * Update max segment size we're able to process. - * - * The value is constrained by our interface's MTU and IP options. It is - * also what we advertise to our peer. - */ -void -tcp_update_rcv_mss (tcp_connection_t * tc) -{ - /* TODO find our iface MTU */ - tc->mss = dummy_mtu; -} - /** * Update snd_mss to reflect the effective segment size that we can send * by taking into account all TCP options, including SACKs @@ -395,7 +407,7 @@ tcp_update_snd_mss (tcp_connection_t * tc) tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); /* XXX check if MTU has been updated */ - tc->snd_mss = clib_min (tc->mss, tc->opt.mss) - tc->snd_opts_len; + tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len; ASSERT (tc->snd_mss > 0); } @@ -406,37 +418,37 @@ tcp_init_mss (tcp_connection_t * tc) tcp_update_rcv_mss (tc); /* TODO cache mss and consider PMTU discovery */ - tc->snd_mss = clib_min (tc->opt.mss, tc->mss); + tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss); if (tc->snd_mss < 45) { clib_warning ("snd mss is 0"); /* Assume that at least the min default mss works */ tc->snd_mss = default_min_mss; - tc->opt.mss = default_min_mss; + tc->rcv_opts.mss = default_min_mss; } /* We should have enough space for 40 bytes of options */ ASSERT (tc->snd_mss > 45); /* If we use timestamp option, account for it */ - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ vec_validate (my_tx_buffers, n_free_buffers - 1); \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - tm->vlib_main, my_tx_buffers, n_free_buffers, \ + vlib_get_main(), my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[thread_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -445,12 +457,12 @@ do { \ _vec_len (my_tx_buffers) -= 1; \ } while (0) -#define tcp_return_buffer(tm) \ -do { \ - u32 *my_tx_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - _vec_len (my_tx_buffers) +=1; \ +#define tcp_return_buffer(tm) \ +do { \ + u32 *my_tx_buffers; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ + _vec_len (my_tx_buffers) +=1; \ } while (0) always_inline void @@ -573,6 +585,7 @@ tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) /* Init retransmit timer */ tcp_retransmit_timer_set (tc); + TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc); } always_inline void @@ -582,7 +595,7 @@ tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u32 *to_next, next_index; vlib_frame_t *f; - b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b->error = 0; /* Default FIB for now */ @@ -662,7 +675,7 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, if (is_ip4) { ih4 = vlib_buffer_push_ip4 (vm, b0, &dst_ip40, &src_ip40, - IP_PROTOCOL_TCP); + IP_PROTOCOL_TCP, 1); th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4); } else @@ -681,7 +694,7 @@ tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, * Send reset without reusing existing buffer */ void -tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) +tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) { vlib_buffer_t *b; u32 bi; @@ -717,7 +730,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) { flags = TCP_FLAG_RST; seq = pkt_th->ack_number; - ack = 0; + ack = (tc && tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0; } else { @@ -734,7 +747,7 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) { ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40); ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address, - &pkt_ih4->src_address, IP_PROTOCOL_TCP); + &pkt_ih4->src_address, IP_PROTOCOL_TCP, 1); th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4); } else @@ -751,29 +764,29 @@ tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) } tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4); + TCP_EVT_DBG (TCP_EVT_RST_SENT, tc); } void tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) { tcp_header_t *th = vlib_buffer_get_current (b); - + vlib_main_t *vm = vlib_get_main (); if (tc->c_is_ip4) { ip4_header_t *ih; - ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4, - &tc->c_rmt_ip4, IP_PROTOCOL_TCP); - th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih); + ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4, + &tc->c_rmt_ip4, IP_PROTOCOL_TCP, 1); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih); } else { ip6_header_t *ih; int bogus = ~0; - ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6, + ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6, &tc->c_rmt_ip6, IP_PROTOCOL_TCP); - th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih, - &bogus); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus); ASSERT (!bogus); } } @@ -837,6 +850,7 @@ tcp_send_syn (tcp_connection_t * tc) tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc); } always_inline void @@ -845,12 +859,19 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) u32 *to_next, next_index; vlib_frame_t *f; - b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; b->error = 0; /* Decide where to send the packet */ next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + /* Initialize the trajectory trace, if configured */ + if (VLIB_BUFFER_TRACE_TRAJECTORY > 0) + { + b->pre_data[0] = 1; + b->pre_data[1] = next_index; + } + /* Enqueue the packet */ f = vlib_get_frame_to_node (vm, next_index); to_next = vlib_frame_vector_args (f); @@ -879,6 +900,7 @@ tcp_send_fin (tcp_connection_t * tc) tcp_make_fin (tc, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } @@ -919,10 +941,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, if (compute_opts) tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - /* Write pre-computed options */ tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); - - /* Get rcv window to advertise */ advertise_wnd = tcp_window_to_advertise (tc, next_state); flags = tcp_make_state_flags (next_state); @@ -930,26 +949,25 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); - opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); ASSERT (opts_write_len == tc->snd_opts_len); - - /* Tag the buffer with the connection index */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + /* + * Update connection variables + */ + tc->snd_nxt += data_len; tc->rcv_las = tc->rcv_nxt; /* TODO this is updated in output as well ... */ - if (tc->snd_nxt > tc->snd_una_max) - tc->snd_una_max = tc->snd_nxt; - - if (tc->rtt_ts == 0) + if (seq_gt (tc->snd_nxt, tc->snd_una_max)) { - tc->rtt_ts = tcp_time_now (); - tc->rtt_seq = tc->snd_nxt; + tc->snd_una_max = tc->snd_nxt; + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); } + TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -987,13 +1005,14 @@ tcp_timer_delack_handler (u32 index) * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit - * */ + */ u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, u32 offset, u32 max_bytes) { vlib_main_t *vm = vlib_get_main (); - u32 n_bytes = 0; + int n_bytes = 0; + u32 start; tcp_reuse_buffer (vm, b); @@ -1001,15 +1020,16 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, ASSERT (max_bytes != 0); max_bytes = clib_min (tc->snd_mss, max_bytes); + start = tc->snd_una + offset; /* Start is beyond snd_congestion */ - if (seq_geq (tc->snd_una + offset, tc->snd_congestion)) + if (seq_geq (start, tc->snd_congestion)) goto done; /* Don't overshoot snd_congestion */ - if (seq_gt (tc->snd_nxt + max_bytes, tc->snd_congestion)) + if (seq_gt (start + max_bytes, tc->snd_congestion)) { - max_bytes = tc->snd_congestion - tc->snd_nxt; + max_bytes = tc->snd_congestion - start; if (max_bytes == 0) goto done; } @@ -1021,15 +1041,12 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), offset, max_bytes); - ASSERT (n_bytes != 0); + ASSERT (n_bytes > 0); b->current_length = n_bytes; tcp_push_hdr_i (tc, b, tc->state, 0); - /* Don't count multiple retransmits of the same segment */ - if (tc->rto_boff > 1) - goto done; - - tc->rtx_bytes += n_bytes; + if (tcp_in_fastrecovery (tc)) + tc->snd_rxt_bytes += n_bytes; done: TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); @@ -1042,20 +1059,18 @@ done: static void tcp_rtx_timeout_cc (tcp_connection_t * tc) { + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; + /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) - { - tcp_cc_recover (tc); - } - else - { - tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); - } + tcp_cc_fastrecovery_exit (tc); /* Start again from the beginning */ - + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; + tcp_recovery_on (tc); } @@ -1072,14 +1087,20 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (is_syn) { tc = tcp_half_open_connection_get (index); + tc->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID; } else { tc = tcp_connection_get (index, thread_index); + tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; } - /* Make sure timer handle is set to invalid */ - tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + if (!tcp_in_recovery (tc) && tc->rto_boff > 0 + && tc->state >= TCP_STATE_ESTABLISHED) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } /* Increment RTO backoff (also equal to number of retries) */ tc->rto_boff += 1; @@ -1087,12 +1108,18 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) { + /* Lost FIN, retransmit and return */ + if (tc->flags & TCP_CONN_FINSNT) + { + tcp_send_fin (tc); + return; + } + /* First retransmit timeout */ if (tc->rto_boff == 1) tcp_rtx_timeout_cc (tc); @@ -1102,23 +1129,43 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - /* Send one segment. No fancy recovery for now! */ + /* Send one segment */ n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + /* TODO be less aggressive about this */ scoreboard_clear (&tc->sack_sb); if (n_bytes == 0) { clib_warning ("could not retransmit anything"); + clib_warning ("%U", format_tcp_connection, tc, 2); + /* Try again eventually */ tcp_retransmit_timer_set (tc); + ASSERT (0 || (tc->rto_boff > 1 + && tc->snd_una == tc->snd_congestion)); return; } + + /* For first retransmit, record timestamp (Eifel detection RFC3522) */ + if (tc->rto_boff == 1) + tc->snd_rxt_ts = tcp_time_now (); } - else + /* Retransmit for SYN/SYNACK */ + else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) { - /* Retransmit for SYN/SYNACK */ - ASSERT (tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_SYN_SENT); + /* Half-open connection actually moved to established but we were + * waiting for syn retransmit to pop to call cleanup from the right + * thread. */ + if (tc->flags & TCP_CONN_HALF_OPEN_DONE) + { + ASSERT (tc->state == TCP_STATE_SYN_SENT); + if (tcp_half_open_connection_cleanup (tc)) + { + clib_warning ("could not remove half-open connection"); + ASSERT (0); + } + return; + } /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ @@ -1126,11 +1173,19 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_push_hdr_i (tc, b, tc->state, 1); /* Account for the SYN */ tc->snd_nxt += 1; + tc->rtt_ts = 0; + TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, + (tc->state == TCP_STATE_SYN_SENT ? 0 : 1)); + } + else + { + ASSERT (tc->state == TCP_STATE_CLOSED); + clib_warning ("connection closed ..."); + return; } if (!is_syn) @@ -1144,8 +1199,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_SYN_SENT); - TCP_EVT_DBG (TCP_EVT_SYN_RTX, tc); - /* This goes straight to ipx_lookup */ tcp_push_ip_hdr (tm, tc, b); tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); @@ -1180,7 +1233,8 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, old_snd_nxt; + int n_bytes = 0; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1191,7 +1245,7 @@ tcp_timer_persist_handler (u32 index) tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; /* Problem already solved or worse */ - if (tc->state == TCP_STATE_CLOSED + if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) return; @@ -1202,21 +1256,29 @@ tcp_timer_persist_handler (u32 index) /* Try to force the first unsent segment */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), tc->snd_una_max - tc->snd_una, tc->snd_mss); /* Nothing to send */ - if (n_bytes == 0) + if (n_bytes <= 0) { - clib_warning ("persist found nothing to send"); + // clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); return; } b->current_length = n_bytes; + ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 + || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + + /* Allow updating of snd_una_max but don't update snd_nxt */ + old_snd_nxt = tc->snd_nxt; tcp_push_hdr_i (tc, b, tc->state, 0); + tc->snd_nxt = old_snd_nxt; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); /* Re-enable persist timer */ @@ -1232,8 +1294,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, n_bytes, old_snd_nxt; + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; /* Get buffer */ @@ -1244,75 +1307,117 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); if (n_bytes == 0) - goto done; + { + tcp_return_buffer (tm); + goto done; + } tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); done: - tc->snd_nxt = tc->snd_una_max; + tc->snd_nxt = old_snd_nxt; } -sack_scoreboard_hole_t * -scoreboard_first_rtx_hole (sack_scoreboard_t * sb) +/** + * Do fast retransmit with SACKs + */ +void +tcp_fast_retransmit_sack (tcp_connection_t * tc) { - sack_scoreboard_hole_t *hole = 0; - -// hole = scoreboard_first_hole (&tc->sack_sb); -// if (hole) -// { -// -// offset = hole->start - tc->snd_una; -// hole_size = hole->end - hole->start; -// -// ASSERT(hole_size); -// -// if (hole_size < max_bytes) -// max_bytes = hole_size; -// } - return hole; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset = 0, max_bytes; + vlib_buffer_t *b; + sack_scoreboard_hole_t *hole; + sack_scoreboard_t *sb; + u32 bi, old_snd_nxt; + int snd_space; + u8 snd_limited = 0, can_rescue = 0; + + ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + snd_space = tcp_available_snd_space (tc); + + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + while (hole && snd_space > 0) + { + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + hole = scoreboard_next_rxt_hole (sb, hole, + tcp_fastrecovery_sent_1_smss (tc), + &can_rescue, &snd_limited); + if (!hole) + { + if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) + || seq_gt (sb->rescue_rxt, + tc->snd_congestion))) + break; + + /* If rescue rxt undefined or less than snd_una then one segment of + * up to SMSS octets that MUST include the highest outstanding + * unSACKed sequence number SHOULD be returned, and RescueRxt set to + * RecoveryPoint. HighRxt MUST NOT be updated. + */ + max_bytes = clib_min (tc->snd_mss, snd_space); + offset = tc->snd_congestion - tc->snd_una - max_bytes; + sb->rescue_rxt = tc->snd_congestion; + tc->snd_nxt = tc->snd_una + offset; + tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + break; + } + + max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; + offset = sb->high_rxt - tc->snd_una; + tc->snd_nxt = tc->snd_una + offset; + n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + + /* Nothing left to retransmit */ + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } + + sb->high_rxt += n_written; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + snd_space -= n_written; + } + + /* If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; } /** - * Do fast retransmit. + * Fast retransmit without SACK info */ void -tcp_fast_retransmit (tcp_connection_t * tc) +tcp_fast_retransmit_no_sack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 bi; + u32 n_written = 0, offset = 0, bi, old_snd_nxt; int snd_space; - u32 n_written = 0, offset = 0; vlib_buffer_t *b; - u8 use_sacks = 0; ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); /* Start resending from first un-acked segment */ + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - - /* If we have SACKs use them */ - if (tcp_opts_sack_permitted (&tc->opt) - && scoreboard_first_hole (&tc->sack_sb)) - use_sacks = 0; while (snd_space > 0) { tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); - if (use_sacks) - { - scoreboard_first_rtx_hole (&tc->sack_sb); - } - else - { - offset += n_written; - } - + offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); /* Nothing left to retransmit */ @@ -1326,9 +1431,21 @@ tcp_fast_retransmit (tcp_connection_t * tc) snd_space -= n_written; } - /* If window allows, send 1 SMSS of new data */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tc->snd_nxt = tc->snd_congestion; + /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Do fast retransmit + */ +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) + tcp_fast_retransmit_sack (tc); + else + tcp_fast_retransmit_no_sack (tc); } always_inline u32 @@ -1391,8 +1508,14 @@ tcp46_output_inline (vlib_main_t * vm, { ip4_header_t *ih0; ih0 = vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, - &tc0->c_rmt_ip4, IP_PROTOCOL_TCP); - th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih0); + &tc0->c_rmt_ip4, IP_PROTOCOL_TCP, + 1); + b0->flags |= + VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_OFFLOAD_IP_CKSUM | + VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; + vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; + th0->checksum = 0; } else { @@ -1401,8 +1524,13 @@ tcp46_output_inline (vlib_main_t * vm, ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6, &tc0->c_rmt_ip6, IP_PROTOCOL_TCP); - th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih0, - &bogus); + + b0->flags |= VNET_BUFFER_F_IS_IP6 | + VNET_BUFFER_F_OFFLOAD_IP_CKSUM | + VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; + vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; + th0->checksum = 0; ASSERT (!bogus); } @@ -1420,10 +1548,7 @@ tcp46_output_inline (vlib_main_t * vm, /* Stop DELACK timer and fix flags */ tc0->flags &= ~(TCP_CONN_SNDACK); - if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) - { - tcp_timer_reset (tc0, TCP_TIMER_DELACK); - } + tcp_timer_reset (tc0, TCP_TIMER_DELACK); /* If not retransmitting * 1) update snd_una_max (SYN, SYNACK, FIN) @@ -1447,12 +1572,33 @@ tcp46_output_inline (vlib_main_t * vm, tc0->rto_boff = 0; } - /* set fib index to default and lookup node */ - /* XXX network virtualization (vrf/vni) */ +#if 0 + /* Make sure we haven't lost route to our peer */ + if (PREDICT_FALSE (tc0->last_fib_check + < tc0->snd_opts.tsval + TCP_FIB_RECHECK_PERIOD)) + { + if (PREDICT_TRUE + (tc0->c_rmt_fei == tcp_lookup_rmt_in_fib (tc0))) + { + tc0->last_fib_check = tc0->snd_opts.tsval; + } + else + { + clib_warning ("lost connection to peer"); + tcp_connection_reset (tc0); + goto done; + } + } + + /* Use pre-computed dpo to set next node */ + next0 = tc0->c_rmt_dpo.dpoi_next_node; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = tc0->c_rmt_dpo.dpoi_index; +#endif + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; - b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; done: b0->error = node->errors[error0]; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -1544,6 +1690,12 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tc = (tcp_connection_t *) tconn; tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + + if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc)) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } return 0; } @@ -1611,7 +1763,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, done: b0->error = node->errors[error0]; - b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { th0 = vlib_buffer_get_current (b0);