From: Florin Coras Date: Tue, 1 Aug 2017 23:56:58 +0000 (-0700) Subject: Fix tcp multi buffer segments retransmission X-Git-Tag: v17.10-rc1~248 X-Git-Url: https://gerrit.fd.io/r/gitweb?a=commitdiff_plain;h=b2215d6b0d8ef7d425d2b9eea524a1c055a9f3b3;p=vpp.git Fix tcp multi buffer segments retransmission - Fix tcp/udp sw checksum computation - Fix allocation of multi buffer tcp segments for retransmits - Send FIN only if/when tx fifo is empty Change-Id: I2e43a14b87a72c9e547b4339b9a51811cf5732c4 Signed-off-by: Florin Coras --- diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 72008dad71e..6a662416b29 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -833,7 +833,12 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, _(current_length); _(flags); #undef _ - ASSERT (dst->total_length_not_including_first_buffer == 0); + /* ASSERT (dst->total_length_not_including_first_buffer == 0); */ + /* total_length_not_including_first_buffer is not in the template anymore + * so it may actually not be zeroed for some buffers. 
One option is to + * uncomment the line below (comes at a cost); the other is to just not + * care */ + /* dst->total_length_not_including_first_buffer = 0; */ ASSERT (dst->n_add_refs == 0); } diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 7a8d7a0cc1b..496df3c7cb5 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1454,7 +1454,7 @@ ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, { ip_csum_t sum0; u32 ip_header_length, payload_length_host_byte_order; - u32 n_this_buffer, n_bytes_left; + u32 n_this_buffer, n_bytes_left, n_ip_bytes_this_buffer; u16 sum16; void *data_this_buffer; @@ -1481,10 +1481,12 @@ ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, n_bytes_left = n_this_buffer = payload_length_host_byte_order; data_this_buffer = (void *) ip0 + ip_header_length; - if (n_this_buffer + ip_header_length > p0->current_length) - n_this_buffer = - p0->current_length > - ip_header_length ? p0->current_length - ip_header_length : 0; + n_ip_bytes_this_buffer = p0->current_length - (((u8 *) ip0 - p0->data) - p0->current_data); + if (n_this_buffer + ip_header_length > n_ip_bytes_this_buffer) + { + n_this_buffer = n_ip_bytes_this_buffer > ip_header_length ? 
+ n_ip_bytes_this_buffer - ip_header_length : 0; + } while (1) { sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 533a6c22ae0..3a3e4dfe588 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -98,9 +98,9 @@ session_enqueue_chain_tail (stream_session_t * s, vlib_buffer_t * b, u32 offset, u8 is_in_order) { vlib_buffer_t *chain_b; - u32 chain_bi = b->next_buffer; + u32 chain_bi = b->next_buffer, len; vlib_main_t *vm = vlib_get_main (); - u8 *data, len; + u8 *data; u16 written = 0; int rv = 0; @@ -226,7 +226,7 @@ u32 stream_session_tx_fifo_max_dequeue (transport_connection_t * tc) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); - if (s->session_state != SESSION_STATE_READY) + if (!s->server_tx_fifo) return 0; return svm_fifo_max_dequeue (s->server_tx_fifo); } diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 8d703b0b302..9c5b17d98e6 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -75,20 +75,25 @@ always_inline void session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, u8 thread_index, svm_fifo_t * fifo, vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg, - u32 * left_to_snd0, u16 * n_bufs, u32 * rx_offset, - u16 deq_per_buf, u8 peek_data) + u32 left_from_seg, u32 * left_to_snd0, + u16 * n_bufs, u32 * rx_offset, u16 deq_per_buf, + u8 peek_data) { vlib_buffer_t *chain_b0, *prev_b0; - u32 chain_bi0; + u32 chain_bi0, to_deq; u16 len_to_deq0, n_bytes_read; u8 *data0, j; + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b0->total_length_not_including_first_buffer = 0; + chain_bi0 = bi0; chain_b0 = b0; + to_deq = left_from_seg; for (j = 1; j < n_bufs_per_seg; j++) { prev_b0 = chain_b0; - len_to_deq0 = clib_min (*left_to_snd0, deq_per_buf); + len_to_deq0 = clib_min (to_deq, deq_per_buf); *n_bufs -= 1; chain_bi0 = 
smm->tx_buffers[thread_index][*n_bufs]; @@ -117,10 +122,12 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, /* update current buffer */ chain_b0->next_buffer = 0; - *left_to_snd0 -= n_bytes_read; - if (*left_to_snd0 == 0) + to_deq -= n_bytes_read; + if (to_deq == 0) break; } + ASSERT (to_deq == 0); + *left_to_snd0 -= left_from_seg; } always_inline int @@ -223,7 +230,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, && ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE))); n_bufs += buffers_allocated; - _vec_len (smm->tx_buffers[thread_index]) = n_bufs; if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) @@ -289,11 +295,15 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, * Fill in the remaining buffers in the chain, if any */ if (PREDICT_FALSE (n_bufs_per_seg > 1)) - session_tx_fifo_chain_tail (smm, vm, thread_index, - s0->server_tx_fifo, b0, bi0, - n_bufs_per_seg, &left_to_snd0, - &n_bufs, &rx_offset, deq_per_buf, - peek_data); + { + u32 left_for_seg; + left_for_seg = clib_min (snd_mss0 - n_bytes_read, left_to_snd0); + session_tx_fifo_chain_tail (smm, vm, thread_index, + s0->server_tx_fifo, b0, bi0, + n_bufs_per_seg, left_for_seg, + &left_to_snd0, &n_bufs, &rx_offset, + deq_per_buf, peek_data); + } /* Ask transport to push header after current_length and * total_length_not_including_first_buffer are updated */ @@ -607,8 +617,9 @@ skip_dequeue: clib_warning ("It's dead, Jim!"); continue; } - - if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED)) + /* Can retransmit for closed sessions but can't do anything if + * session is not ready or closed */ + if (PREDICT_FALSE (s0->session_state < SESSION_STATE_READY)) continue; /* Spray packets in per session type frames, since they go to * different nodes */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 8e2eb9f4e3d..4652618b22c 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -288,18 +288,31 @@ 
tcp_connection_close (tcp_connection_t * tc) { TCP_EVT_DBG (TCP_EVT_CLOSE, tc); - /* Send FIN if needed */ - if (tc->state == TCP_STATE_ESTABLISHED - || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT) - tcp_send_fin (tc); - - /* Switch state */ - if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD) - tc->state = TCP_STATE_FIN_WAIT_1; - else if (tc->state == TCP_STATE_SYN_SENT) - tc->state = TCP_STATE_CLOSED; - else if (tc->state == TCP_STATE_CLOSE_WAIT) - tc->state = TCP_STATE_LAST_ACK; + /* Send/Program FIN if needed and switch state */ + switch (tc->state) + { + case TCP_STATE_SYN_SENT: + tc->state = TCP_STATE_CLOSED; + break; + case TCP_STATE_SYN_RCVD: + tcp_send_fin (tc); + tc->state = TCP_STATE_FIN_WAIT_1; + break; + case TCP_STATE_ESTABLISHED: + if (!stream_session_tx_fifo_max_dequeue (&tc->connection)) + tcp_send_fin (tc); + else + tc->flags |= TCP_CONN_FINPNDG; + tc->state = TCP_STATE_FIN_WAIT_1; + break; + case TCP_STATE_CLOSE_WAIT: + tcp_send_fin (tc); + tc->state = TCP_STATE_LAST_ACK; + break; + default: + clib_warning ("shouldn't be here"); + } + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */ @@ -1284,6 +1297,8 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->tx_frames[0], num_threads - 1); vec_validate (tm->tx_frames[1], num_threads - 1); + tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size + (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); return error; } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 997df76f545..a17262fa7b4 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -116,7 +116,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; _(RECOVERY, "Recovery on") \ _(FAST_RECOVERY, "Fast Recovery on") \ _(FR_1_SMSS, "Sent 1 SMSS") \ - _(HALF_OPEN_DONE, "Half-open completed") + _(HALF_OPEN_DONE, "Half-open completed") \ + _(FINPNDG, "FIN pending") typedef enum _tcp_connection_flag_bits { @@ 
-404,6 +405,9 @@ typedef struct _tcp_main /** Port allocator random number generator seed */ u32 port_allocator_seed; + + /** vlib buffer size */ + u32 bytes_per_buffer; } tcp_main_t; extern tcp_main_t tcp_main; @@ -587,6 +591,14 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } +always_inline u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) + return 1; + return 0; +} + i32 tcp_rcv_wnd_available (tcp_connection_t * tc); u32 tcp_snd_space (tcp_connection_t * tc); void tcp_update_rcv_wnd (tcp_connection_t * tc); @@ -621,8 +633,8 @@ tcp_update_time (f64 now, u32 thread_index) u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); u32 -tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 offset, u32 max_bytes); +tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, + u32 max_bytes, vlib_buffer_t ** b); void tcp_connection_timers_init (tcp_connection_t * tc); void tcp_connection_timers_reset (tcp_connection_t * tc); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 29f4f08d72c..a3b48d83659 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -492,14 +492,6 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, && (prev_snd_wnd == tc->snd_wnd)); } -static u8 -tcp_is_lost_fin (tcp_connection_t * tc) -{ - if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) - return 1; - return 0; -} - /** * Checks if ack is a congestion control event. 
*/ @@ -1162,7 +1154,8 @@ partial_ack: /* Remove retransmitted bytes that have been delivered */ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv - >= tc->sack_sb.last_bytes_delivered); + >= tc->sack_sb.last_bytes_delivered + || (tc->flags & TCP_CONN_FINSNT)); if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) { @@ -1273,6 +1266,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack)) { tcp_cc_handle_event (tc, is_dack); + if (!tcp_in_cong_recovery (tc)) + return 0; *error = TCP_ERROR_ACK_DUP; TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); return vnet_buffer (b)->tcp.data_len ? 0 : -1; @@ -1497,6 +1492,29 @@ tcp_can_delack (tcp_connection_t * tc) return 1; } +static int +tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) +{ + u32 discard; + vlib_main_t *vm = vlib_get_main (); + + /* Handle multi segment packets */ + if (n_bytes_to_drop > b->current_length) + { + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + return -1; + do + { + discard = clib_min (n_bytes_to_drop, b->current_length); + vlib_buffer_advance (b, discard); + b = vlib_get_buffer (vm, b->next_buffer); + n_bytes_to_drop -= discard; + } + while (n_bytes_to_drop); + } + return 0; +} + static int tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, u32 * next0) @@ -1530,7 +1548,8 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; n_data_bytes -= n_bytes_to_drop; vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt; - vlib_buffer_advance (b, n_bytes_to_drop); + if (tcp_buffer_discard_bytes (b, n_bytes_to_drop)) + goto done; goto in_order; } @@ -2252,8 +2271,15 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) goto drop; + /* Still have to send the FIN */ + if (tc0->flags & TCP_CONN_FINPNDG) + { + /* TX fifo finally drained */ + if 
(!stream_session_tx_fifo_max_dequeue (&tc0->connection)) + tcp_send_fin (tc0); + } /* If FIN is ACKed */ - if (tc0->snd_una == tc0->snd_una_max) + else if (tc0->snd_una == tc0->snd_una_max) { ASSERT (tcp_fin (tcp0)); tc0->rcv_nxt += 1; diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index f8fbb8a9e69..4c1add21c27 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -15,6 +15,7 @@ #include #include +#include vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; @@ -84,7 +85,7 @@ void tcp_update_rcv_mss (tcp_connection_t * tc) { /* TODO find our iface MTU */ - tc->mss = dummy_mtu; + tc->mss = dummy_mtu - sizeof (tcp_header_t); } /** @@ -436,28 +437,35 @@ tcp_init_mss (tcp_connection_t * tc) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } +always_inline int +tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) +{ + vec_validate (tm->tx_buffers[thread_index], n_free_buffers - 1); + _vec_len (tm->tx_buffers[thread_index]) = + vlib_buffer_alloc_from_free_list (vlib_get_main (), + tm->tx_buffers[thread_index], + n_free_buffers, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + /* buffer shortage, report failure */ + if (vec_len (tm->tx_buffers[thread_index]) == 0) + { + clib_warning ("out of buffers"); + return -1; + } + return 0; +} + always_inline int tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) { - u32 *my_tx_buffers, n_free_buffers; + u32 *my_tx_buffers; u32 thread_index = vlib_get_thread_index (); - my_tx_buffers = tm->tx_buffers[thread_index]; - if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) + if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0)) { - n_free_buffers = VLIB_FRAME_SIZE; - vec_validate (my_tx_buffers, n_free_buffers - 1); - _vec_len (my_tx_buffers) = - vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers, - n_free_buffers, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); - /* buffer shortage, report failure */ - if (vec_len (my_tx_buffers) == 
0) - { - clib_warning ("out of buffers"); - return -1; - } - tm->tx_buffers[thread_index] = my_tx_buffers; + if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE)) + return -1; } + my_tx_buffers = tm->tx_buffers[thread_index]; *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1]; _vec_len (my_tx_buffers) -= 1; return 0; @@ -476,6 +484,7 @@ always_inline void tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { vlib_buffer_t *it = b; + u32 save_free_list = b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK; do { it->current_data = 0; @@ -485,6 +494,10 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) while ((it->flags & VLIB_BUFFER_NEXT_PRESENT) && (it = vlib_get_buffer (vm, it->next_buffer))); + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + vlib_buffer_free_one (vm, b->next_buffer); + b->flags = save_free_list; + /* Leave enough space for headers */ vlib_buffer_make_headroom (b, MAX_HDRS_LEN); vnet_buffer (b)->tcp.flags = 0; @@ -959,18 +972,16 @@ tcp_send_fin (tcp_connection_t * tc) return; b = vlib_get_buffer (vm, bi); - /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tc->flags &= ~TCP_CONN_FINPNDG; tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } always_inline u8 -tcp_make_state_flags (tcp_state_t next_state) +tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state) { switch (next_state) { @@ -982,7 +993,10 @@ tcp_make_state_flags (tcp_state_t next_state) return TCP_FLAG_SYN; case TCP_STATE_LAST_ACK: case TCP_STATE_FIN_WAIT_1: - return TCP_FLAG_FIN; + if (tc->snd_nxt + 1 < tc->snd_una_max) + return TCP_FLAG_ACK; + else + return TCP_FLAG_FIN; default: clib_warning ("Shouldn't be here!"); } @@ -1008,7 +1022,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); advertise_wnd = tcp_window_to_advertise (tc, 
next_state); - flags = tcp_make_state_flags (next_state); + flags = tcp_make_state_flags (tc, next_state); /* Push header and options */ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, @@ -1055,7 +1069,11 @@ tcp_send_ack (tcp_connection_t * tc) tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } -/* Send delayed ACK when timer expires */ +/** + * Delayed ack timer handler + * + * Sends delayed ACK when timer expires + */ void tcp_timer_delack_handler (u32 index) { @@ -1067,49 +1085,138 @@ tcp_timer_delack_handler (u32 index) tcp_send_ack (tc); } -/** Build a retransmit segment +/** + * Build a retransmit segment * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit */ u32 -tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 offset, u32 max_bytes) +tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, + u32 max_deq_bytes, vlib_buffer_t ** b) { + tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); int n_bytes = 0; - u32 start; - - tcp_reuse_buffer (vm, b); + u32 start, bi, available_bytes; ASSERT (tc->state >= TCP_STATE_ESTABLISHED); - ASSERT (max_bytes != 0); + ASSERT (max_deq_bytes != 0); - max_bytes = clib_min (tc->snd_mss, max_bytes); + /* + * Make sure we can retransmit something + */ + max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + if (!available_bytes) + return 0; + max_deq_bytes = clib_min (available_bytes, max_deq_bytes); start = tc->snd_una + offset; /* Start is beyond snd_congestion */ if (seq_geq (start, tc->snd_congestion)) - goto done; + { + goto done; + } /* Don't overshoot snd_congestion */ - if (seq_gt (start + max_bytes, tc->snd_congestion)) + if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) { - max_bytes = tc->snd_congestion - start; - if (max_bytes == 0) - goto done; + max_deq_bytes = tc->snd_congestion - start; + if (max_deq_bytes == 0) 
+ { + goto done; + } } + /* + * Prepare options + */ tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - ASSERT (max_bytes <= tc->snd_mss); + /* + * Allocate and fill in buffer(s) + */ + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return 0; + *b = vlib_get_buffer (vm, bi); + + /* Easy case, buffer size greater than mss */ + if (PREDICT_TRUE (max_deq_bytes <= tm->bytes_per_buffer)) + { + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (*b), + offset, max_deq_bytes); + ASSERT (n_bytes == max_deq_bytes); + b[0]->current_length = n_bytes; + tcp_push_hdr_i (tc, *b, tc->state, 0); + } + /* Split mss into multiple buffers */ + else + { + u32 chain_bi = ~0, n_bufs_per_seg; + u32 thread_index = vlib_get_thread_index (); + u16 n_peeked, len_to_deq, available_bufs; + vlib_buffer_t *chain_b, *prev_b; + u8 *data0; + int i; + + n_bufs_per_seg = ceil ((double) max_deq_bytes / tm->bytes_per_buffer); + ASSERT (available_bytes >= max_deq_bytes); + + /* Make sure we have enough buffers */ + available_bufs = vec_len (tm->tx_buffers[thread_index]); + if (n_bufs_per_seg > available_bufs) + { + if (tcp_alloc_tx_buffers (tm, thread_index, + VLIB_FRAME_SIZE - available_bufs)) + { + tcp_return_buffer (tm); + return 0; + } + } + + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (*b), + offset, tm->bytes_per_buffer); + b[0]->current_length = n_bytes; + b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b[0]->total_length_not_including_first_buffer = 0; + + tcp_push_hdr_i (tc, *b, tc->state, 0); + max_deq_bytes -= n_bytes; + + chain_b = *b; + for (i = 1; i < n_bufs_per_seg; i++) + { + prev_b = chain_b; + len_to_deq = clib_min (max_deq_bytes, tm->bytes_per_buffer); + tcp_get_free_buffer_index (tm, &chain_bi); + ASSERT (chain_bi != (u32) ~ 0); + chain_b = vlib_get_buffer (vm, chain_bi); + chain_b->current_data = 0; + data0 = vlib_buffer_get_current (chain_b); + n_peeked = stream_session_peek_bytes 
(&tc->connection, data0, + n_bytes, len_to_deq); + n_bytes += n_peeked; + ASSERT (n_peeked == len_to_deq); + chain_b->current_length = n_peeked; + b[0]->total_length_not_including_first_buffer += + chain_b->current_length; + + /* update previous buffer */ + prev_b->next_buffer = chain_bi; + prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + chain_b->next_buffer = 0; + + max_deq_bytes -= n_peeked; + } + } - n_bytes = stream_session_peek_bytes (&tc->connection, - vlib_buffer_get_current (b), offset, - max_bytes); ASSERT (n_bytes > 0); - b->current_length = n_bytes; - tcp_push_hdr_i (tc, b, tc->state, 0); if (tcp_in_fastrecovery (tc)) tc->snd_rxt_bytes += n_bytes; @@ -1147,7 +1254,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) vlib_main_t *vm = vlib_get_main (); u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - vlib_buffer_t *b; + vlib_buffer_t *b = 0; u32 bi, n_bytes; if (is_syn) @@ -1174,17 +1281,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); - if (tc->state >= TCP_STATE_ESTABLISHED) { /* Lost FIN, retransmit and return */ - if (tc->flags & TCP_CONN_FINSNT) + if (tcp_is_lost_fin (tc)) { - tcp_return_buffer (tm); tcp_send_fin (tc); return; } @@ -1199,7 +1300,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); /* Send one segment */ - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); + ASSERT (n_bytes); + bi = vlib_get_buffer_index (vm, b); /* TODO be less aggressive about this */ scoreboard_clear (&tc->sack_sb); @@ -1212,7 +1315,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_retransmit_timer_set (tc); ASSERT (0 || (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion)); - tcp_return_buffer 
(tm); return; } @@ -1234,7 +1336,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) clib_warning ("could not remove half-open connection"); ASSERT (0); } - tcp_return_buffer (tm); return; } @@ -1243,6 +1344,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (tc->rto_boff > TCP_RTO_SYN_RETRIES) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); vlib_buffer_make_headroom (b, MAX_HDRS_LEN); tcp_push_hdr_i (tc, b, tc->state, 1); @@ -1256,7 +1360,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_CLOSED); clib_warning ("connection closed ..."); - tcp_return_buffer (tm); return; } @@ -1305,7 +1408,7 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, old_snd_nxt; + u32 bi, old_snd_nxt, snd_bytes = 0, available_bytes = 0; int n_bytes = 0; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1317,34 +1420,31 @@ tcp_timer_persist_handler (u32 index) tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; /* Problem already solved or worse */ + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED - || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc) + || !available_bytes) return; /* Increment RTO backoff */ tc->rto_boff += 1; tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Try to force the first unsent segment */ + /* + * Try to force the first unsent segment (or buffer) + */ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; - b = vlib_get_buffer (vm, bi); tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer); n_bytes = 
stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), tc->snd_una_max - tc->snd_una, - tc->snd_mss); - /* Nothing to send */ - if (n_bytes <= 0) - { - // clib_warning ("persist found nothing to send"); - tcp_return_buffer (tm); - return; - } - + snd_bytes); + ASSERT (n_bytes != 0); b->current_length = n_bytes; ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); @@ -1365,32 +1465,20 @@ tcp_timer_persist_handler (u32 index) void tcp_retransmit_first_unacked (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi, n_bytes, old_snd_nxt; + u32 bi, old_snd_nxt, n_bytes; old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - /* Get buffer */ - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); - - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); - if (n_bytes == 0) - { - tcp_return_buffer (tm); - goto done; - } - + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); + if (!n_bytes) + return; + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); -done: tc->snd_nxt = old_snd_nxt; } @@ -1400,10 +1488,9 @@ done: void tcp_fast_retransmit_sack (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); u32 n_written = 0, offset = 0, max_bytes; - vlib_buffer_t *b; + vlib_buffer_t *b = 0; sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; u32 bi, old_snd_nxt; @@ -1420,10 +1507,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); while (hole && snd_space > 0) { - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); hole = scoreboard_next_rxt_hole (sb, hole, tcp_fastrecovery_sent_1_smss (tc), &can_rescue, &snd_limited); @@ -1443,7 
+1526,10 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) offset = tc->snd_congestion - tc->snd_una - max_bytes; sb->rescue_rxt = tc->snd_congestion; tc->snd_nxt = tc->snd_una + offset; - tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, + &b); + ASSERT (n_written); + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); break; } @@ -1451,15 +1537,13 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; offset = sb->high_rxt - tc->snd_una; tc->snd_nxt = tc->snd_una + offset; - n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b); /* Nothing left to retransmit */ if (n_written == 0) - { - tcp_return_buffer (tm); - break; - } + break; + bi = vlib_get_buffer_index (vm, b); sb->high_rxt += n_written; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); snd_space -= n_written; @@ -1475,7 +1559,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) void tcp_fast_retransmit_no_sack (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); u32 n_written = 0, offset = 0, bi, old_snd_nxt; int snd_space; @@ -1491,19 +1574,14 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) while (snd_space > 0) { - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - b = vlib_get_buffer (vm, bi); offset += n_written; - n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); + n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b); /* Nothing left to retransmit */ if (n_written == 0) - { - tcp_return_buffer (tm); - break; - } + break; + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); snd_space -= n_written; }