X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Ftcp%2Ftcp.c;h=4b1dd8e5cf5daa34b494fb952e9749c87450f283;hb=48057bd23433a352338358ca1f6cdc6cebd84f08;hp=a4599c2fd9848ce2861ea89220b820b3c5fbe7fe;hpb=23c3d349e52e57600aaaf3ef32e4264fffb2d0db;p=vpp.git diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index a4599c2fd98..4b1dd8e5cf5 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -181,6 +181,13 @@ tcp_session_get_listener (u32 listener_index) return &tc->connection; } +static tcp_connection_t * +tcp_half_open_connection_alloc (void) +{ + ASSERT (vlib_get_thread_index () == 0); + return tcp_connection_alloc (0); +} + /** * Cleanup half-open connection * @@ -188,12 +195,8 @@ tcp_session_get_listener (u32 listener_index) static void tcp_half_open_connection_free (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); - clib_spinlock_lock_if_init (&tm->half_open_lock); - if (CLIB_DEBUG) - clib_memset (tc, 0xFA, sizeof (*tc)); - pool_put (tm->half_open_connections, tc); - clib_spinlock_unlock_if_init (&tm->half_open_lock); + ASSERT (vlib_get_thread_index () == 0); + return tcp_connection_free (tc); } /** @@ -214,25 +217,13 @@ tcp_half_open_connection_cleanup (tcp_connection_t * tc) if (tc->c_thread_index != vlib_get_thread_index ()) return 1; - session_half_open_delete_notify (TRANSPORT_PROTO_TCP, tc->c_s_ho_handle); + session_half_open_delete_notify (&tc->connection); wrk = tcp_get_worker (tc->c_thread_index); tcp_timer_reset (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN); tcp_half_open_connection_free (tc); return 0; } -static tcp_connection_t * -tcp_half_open_connection_new (void) -{ - tcp_main_t *tm = vnet_get_tcp_main (); - tcp_connection_t *tc = 0; - ASSERT (vlib_get_thread_index () == 0); - pool_get (tm->half_open_connections, tc); - clib_memset (tc, 0, sizeof (*tc)); - tc->c_c_index = tc - tm->half_open_connections; - return tc; -} - /** * Cleans up connection state. * @@ -306,13 +297,23 @@ tcp_connection_alloc (u8 thread_index) } tcp_connection_t * -tcp_connection_alloc_w_base (u8 thread_index, tcp_connection_t * base) +tcp_connection_alloc_w_base (u8 thread_index, tcp_connection_t **base) { tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index); tcp_connection_t *tc; - pool_get (wrk->connections, tc); - clib_memcpy_fast (tc, base, sizeof (*tc)); + /* Make sure connection is still valid if pool moves */ + if ((*base)->c_thread_index == thread_index) + { + u32 base_index = (*base)->c_c_index; + pool_get (wrk->connections, tc); + *base = tcp_connection_get (base_index, thread_index); + } + else + { + pool_get (wrk->connections, tc); + } + clib_memcpy_fast (tc, *base, sizeof (*tc)); tc->c_c_index = tc - wrk->connections; tc->c_thread_index = thread_index; return tc; @@ -337,7 +338,7 @@ tcp_program_cleanup (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) tcp_cleanup_req_t *req; clib_time_type_t now; - now = transport_time_now (tc->c_thread_index); + now = tcp_time_now_us (tc->c_thread_index); clib_fifo_add2 (wrk->pending_cleanups, req); req->connection_index = tc->c_c_index; req->free_time = now + tcp_cfg.cleanup_time; @@ -354,7 +355,6 @@ tcp_program_cleanup (tcp_worker_ctx_t * wrk, tcp_connection_t * tc) * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers * and cleanup is called. * - * N.B. 
Half-close connections are not supported */ void tcp_connection_close (tcp_connection_t * tc) @@ -425,6 +425,30 @@ tcp_connection_close (tcp_connection_t * tc) } } +static void +tcp_session_half_close (u32 conn_index, u32 thread_index) +{ + tcp_worker_ctx_t *wrk; + tcp_connection_t *tc; + + tc = tcp_connection_get (conn_index, thread_index); + wrk = tcp_get_worker (tc->c_thread_index); + + /* If the connection is not in ESTABLISHED state, ignore it */ + if (tc->state != TCP_STATE_ESTABLISHED) + return; + if (!transport_max_tx_dequeue (&tc->connection)) + tcp_send_fin (tc); + else + tc->flags |= TCP_CONN_FINPNDG; + tcp_connection_set_state (tc, TCP_STATE_FIN_WAIT_1); + /* Set a timer in case the peer stops responding. Otherwise the + * connection will be stuck here forever. */ + ASSERT (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID); + tcp_timer_set (&wrk->timer_wheel, tc, TCP_TIMER_WAITCLOSE, + tcp_cfg.finwait1_time); +} + static void tcp_session_close (u32 conn_index, u32 thread_index) { @@ -657,9 +681,10 @@ tcp_init_mss (tcp_connection_t * tc) /* We should have enough space for 40 bytes of options */ ASSERT (tc->snd_mss > 45); - /* If we use timestamp option, account for it */ + /* If we use timestamp option, account for it and make sure + * the options are 4-byte aligned */ if (tcp_opts_tstamp (&tc->rcv_opts)) - tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; + tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP + 2 /* alignment */; } /** @@ -674,13 +699,12 @@ tcp_init_snd_vars (tcp_connection_t * tc) * handshake may make it look as if time has flown in the opposite * direction for us. */ - tcp_set_time_now (tcp_get_worker (vlib_get_thread_index ())); + tcp_update_time_now (tcp_get_worker (vlib_get_thread_index ())); tcp_init_rcv_mss (tc); tc->iss = tcp_generate_random_iss (tc); tc->snd_una = tc->iss; tc->snd_nxt = tc->iss + 1; - tc->snd_una_max = tc->snd_nxt; tc->srtt = 0.1 * THZ; /* 100 ms */ if (!tcp_cfg.csum_offload) @@ -782,21 +806,20 @@ tcp_session_open (transport_endpoint_cfg_t * rmt) return rv; if (session_lookup_connection (rmt->fib_index, &lcl_addr, &rmt->ip, - lcl_port, rmt->port, TRANSPORT_PROTO_UDP, + lcl_port, rmt->port, TRANSPORT_PROTO_TCP, rmt->is_ip4)) return SESSION_E_PORTINUSE; /* 5-tuple is available so increase lcl endpoint refcount and proceed * with connection allocation */ - transport_share_local_endpoint (TRANSPORT_PROTO_UDP, &lcl_addr, + transport_share_local_endpoint (TRANSPORT_PROTO_TCP, &lcl_addr, lcl_port); } /* * Create connection and send SYN */ - clib_spinlock_lock_if_init (&tm->half_open_lock); - tc = tcp_half_open_connection_new (); + tc = tcp_half_open_connection_alloc (); ip_copy (&tc->c_rmt_ip, &rmt->ip, rmt->is_ip4); ip_copy (&tc->c_lcl_ip, &lcl_addr, rmt->is_ip4); tc->c_rmt_port = rmt->port; @@ -808,12 +831,13 @@ tcp_session_open (transport_endpoint_cfg_t * rmt) /* The other connection vars will be initialized after SYN ACK */ tcp_connection_timers_init (tc); tc->mss = rmt->mss; + tc->next_node_index = rmt->next_node_index; + tc->next_node_opaque = rmt->next_node_opaque; TCP_EVT (TCP_EVT_OPEN, tc); tc->state = TCP_STATE_SYN_SENT; tcp_init_snd_vars (tc); tcp_send_syn (tc); - clib_spinlock_unlock_if_init (&tm->half_open_lock); return tc->c_c_index; } @@ -841,9 +865,10 @@ format_tcp_listener_session (u8 * s, va_list * args) u32 __clib_unused thread_index = va_arg (*args, u32); u32 verbose = va_arg (*args, u32); tcp_connection_t *tc = tcp_listener_get (tci); - s = format (s, "%-50U", format_tcp_connection_id, tc); + s = format (s, "%-" SESSION_CLI_ID_LEN 
"U", format_tcp_connection_id, tc); if (verbose) - s = format (s, "%-15U", format_tcp_state, tc->state); + s = format (s, "%-" SESSION_CLI_STATE_LEN "U", format_tcp_state, + tc->state); return s; } @@ -852,8 +877,20 @@ format_tcp_half_open_session (u8 * s, va_list * args) { u32 tci = va_arg (*args, u32); u32 __clib_unused thread_index = va_arg (*args, u32); - tcp_connection_t *tc = tcp_half_open_connection_get (tci); - return format (s, "%U", format_tcp_connection_id, tc); + u32 verbose = va_arg (*args, u32); + tcp_connection_t *tc; + u8 *state = 0; + + tc = tcp_half_open_connection_get (tci); + if (tc->flags & TCP_CONN_HALF_OPEN_DONE) + state = format (state, "%s", "CLOSED"); + else + state = format (state, "%U", format_tcp_state, tc->state); + s = format (s, "%-" SESSION_CLI_ID_LEN "U", format_tcp_connection_id, tc); + if (verbose) + s = format (s, "%-" SESSION_CLI_STATE_LEN "v", state); + vec_free (state); + return s; } static transport_connection_t * @@ -872,6 +909,115 @@ tcp_half_open_session_get_transport (u32 conn_index) return &tc->connection; } +static int +tcp_set_attribute (tcp_connection_t *tc, transport_endpt_attr_t *attr) +{ + int rv = 0; + + switch (attr->type) + { + case TRANSPORT_ENDPT_ATTR_NEXT_OUTPUT_NODE: + tc->next_node_index = attr->next_output_node & 0xffffffff; + tc->next_node_opaque = attr->next_output_node >> 32; + break; + case TRANSPORT_ENDPT_ATTR_MSS: + tc->mss = attr->mss; + tc->snd_mss = clib_min (tc->snd_mss, tc->mss); + break; + case TRANSPORT_ENDPT_ATTR_FLAGS: + if (attr->flags & TRANSPORT_ENDPT_ATTR_F_CSUM_OFFLOAD) + tc->cfg_flags |= TCP_CFG_F_NO_CSUM_OFFLOAD; + else + tc->cfg_flags &= ~TCP_CFG_F_NO_CSUM_OFFLOAD; + if (attr->flags & TRANSPORT_ENDPT_ATTR_F_GSO) + { + if (!(tc->cfg_flags & TCP_CFG_F_TSO)) + tcp_check_gso (tc); + tc->cfg_flags &= ~TCP_CFG_F_NO_TSO; + } + else + { + tc->cfg_flags |= TCP_CFG_F_NO_TSO; + tc->cfg_flags &= ~TCP_CFG_F_TSO; + } + if (attr->flags & TRANSPORT_ENDPT_ATTR_F_RATE_SAMPLING) + { + if (!(tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)) + tcp_bt_init (tc); + tc->cfg_flags |= TCP_CFG_F_RATE_SAMPLE; + } + else + { + if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE) + tcp_bt_cleanup (tc); + tc->cfg_flags &= ~TCP_CFG_F_RATE_SAMPLE; + } + break; + case TRANSPORT_ENDPT_ATTR_CC_ALGO: + if (tc->cc_algo == tcp_cc_algo_get (attr->cc_algo)) + break; + tcp_cc_cleanup (tc); + tc->cc_algo = tcp_cc_algo_get (attr->cc_algo); + tcp_cc_init (tc); + break; + default: + rv = -1; + break; + } + + return rv; +} + +static int +tcp_get_attribute (tcp_connection_t *tc, transport_endpt_attr_t *attr) +{ + int rv = 0; + u64 non; + + switch (attr->type) + { + case TRANSPORT_ENDPT_ATTR_NEXT_OUTPUT_NODE: + non = (u64) tc->next_node_opaque << 32 | tc->next_node_index; + attr->next_output_node = non; + break; + case TRANSPORT_ENDPT_ATTR_MSS: + attr->mss = tc->snd_mss; + break; + case TRANSPORT_ENDPT_ATTR_FLAGS: + attr->flags = 0; + if (!(tc->cfg_flags & TCP_CFG_F_NO_CSUM_OFFLOAD)) + attr->flags |= TRANSPORT_ENDPT_ATTR_F_CSUM_OFFLOAD; + if (tc->cfg_flags & TCP_CFG_F_TSO) + attr->flags |= TRANSPORT_ENDPT_ATTR_F_GSO; + if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE) + attr->flags |= TRANSPORT_ENDPT_ATTR_F_RATE_SAMPLING; + break; + case TRANSPORT_ENDPT_ATTR_CC_ALGO: + attr->cc_algo = tc->cc_algo - tcp_main.cc_algos; + break; + default: + rv = -1; + break; + } + + return rv; +} + +static int +tcp_session_attribute (u32 conn_index, u32 thread_index, u8 is_get, + transport_endpt_attr_t *attr) +{ + tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index); + + if 
(PREDICT_FALSE (!tc)) + return -1; + + if (is_get) + return tcp_get_attribute (tc, attr); + else + return tcp_set_attribute (tc, attr); +} + static u16 tcp_session_cal_goal_size (tcp_connection_t * tc) { @@ -915,7 +1061,10 @@ tcp_snd_space_inline (tcp_connection_t * tc) { int snd_space; - if (PREDICT_FALSE (tcp_in_fastrecovery (tc) + /* Fast path is disabled when recovery is on. @ref tcp_session_custom_tx + * controls both retransmits and the sending of new data while congested + */ + if (PREDICT_FALSE (tcp_in_cong_recovery (tc) || tc->state == TCP_STATE_CLOSED)) return 0; @@ -1100,7 +1249,7 @@ tcp_dispatch_pending_timers (tcp_worker_ctx_t * wrk) continue; /* Skip if the timer is not pending. Probably it was reset while - * wating for dispatch */ + * waiting for dispatch */ if (PREDICT_FALSE (!(tc->pending_timers & (1 << timer_id)))) continue; @@ -1143,9 +1292,9 @@ tcp_update_time (f64 now, u8 thread_index) { tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index); - tcp_set_time_now (wrk); + tcp_set_time_now (wrk, now); tcp_handle_cleanups (wrk, now); - tw_timer_expire_timers_16t_2w_512sl (&wrk->timer_wheel, now); + tcp_timer_expire_timers (&wrk->timer_wheel, now); tcp_dispatch_pending_timers (wrk); } @@ -1159,6 +1308,27 @@ tcp_session_flush_data (transport_connection_t * tconn) tc->psh_seq = tc->snd_una + transport_max_tx_dequeue (tconn) - 1; } +static int +tcp_session_app_rx_evt (transport_connection_t *conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) conn; + u32 min_free, lo = 4 << 10, hi = 128 << 10; + + if (!(tc->flags & TCP_CONN_ZERO_RWND_SENT)) + return 0; + + min_free = clib_clamp (transport_rx_fifo_size (conn) >> 3, lo, hi); + if (transport_max_rx_enqueue (conn) < min_free) + { + transport_rx_fifo_req_deq_ntf (conn); + return 0; + } + + tcp_send_ack (tc); + + return 0; +} + /* *INDENT-OFF* */ const static transport_proto_vft_t tcp_proto = { .enable = vnet_tcp_enable_disable, @@ -1168,7 +1338,9 @@ const static transport_proto_vft_t tcp_proto = { .get_connection = tcp_session_get_transport, .get_listener = tcp_session_get_listener, .get_half_open = tcp_half_open_session_get_transport, + .attribute = tcp_session_attribute, .connect = tcp_session_open, + .half_close = tcp_session_half_close, .close = tcp_session_close, .cleanup = tcp_session_cleanup, .cleanup_ho = tcp_session_cleanup_ho, @@ -1177,6 +1349,7 @@ const static transport_proto_vft_t tcp_proto = { .update_time = tcp_update_time, .flush_data = tcp_session_flush_data, .custom_tx = tcp_session_custom_tx, + .app_rx_evt = tcp_session_app_rx_evt, .format_connection = format_tcp_session, .format_listener = format_tcp_listener_session, .format_half_open = format_tcp_half_open_session, @@ -1266,21 +1439,6 @@ tcp_expired_timers_dispatch (u32 * expired_timers) session_queue_run_on_main_thread (wrk->vm); } -static void -tcp_initialize_timer_wheels (tcp_main_t * tm) -{ - vlib_main_t *vm = vlib_get_main (); - tw_timer_wheel_16t_2w_512sl_t *tw; - /* *INDENT-OFF* */ - foreach_vlib_main (({ - tw = &tm->wrk_ctx[ii].timer_wheel; - tw_timer_wheel_init_16t_2w_512sl (tw, tcp_expired_timers_dispatch, - TCP_TIMER_TICK, ~0); - tw->last_run_time = vlib_time_now (vm); - })); - /* *INDENT-ON* */ -} - static void tcp_initialize_iss_seed (tcp_main_t * tm) { @@ -1341,7 +1499,7 @@ tcp_main_enable (vlib_main_t * vm) vec_reset_length (wrk->pending_deq_acked); vec_reset_length (wrk->pending_disconnects); vec_reset_length (wrk->pending_resets); - wrk->vm = vlib_mains[thread]; + wrk->vm = vlib_get_main_by_index (thread); wrk->max_timers_per_loop = 10; if 
(thread > 0) @@ -1356,21 +1514,12 @@ tcp_main_enable (vlib_main_t * vm) */ if ((thread > 0 || num_threads == 1) && prealloc_conn_per_wrk) pool_init_fixed (wrk->connections, prealloc_conn_per_wrk); - } - /* - * Use a preallocated half-open connection pool? - */ - if (tcp_cfg.preallocated_half_open_connections) - pool_init_fixed (tm->half_open_connections, - tcp_cfg.preallocated_half_open_connections); - - if (num_threads > 1) - { - clib_spinlock_init (&tm->half_open_lock); + tcp_timer_initialize_wheel (&wrk->timer_wheel, + tcp_expired_timers_dispatch, + vlib_time_now (vm)); } - tcp_initialize_timer_wheels (tm); tcp_initialize_iss_seed (tm); tm->bytes_per_buffer = vlib_buffer_get_default_data_size (vm); @@ -1432,14 +1581,16 @@ tcp_configuration_init (void) tcp_cfg.rwnd_min_update_ack = 1; tcp_cfg.max_gso_size = TCP_MAX_GSO_SZ; - /* Time constants defined as timer tick (100ms) multiples */ - tcp_cfg.delack_time = 1; /* 0.1s */ - tcp_cfg.closewait_time = 20; /* 2s */ - tcp_cfg.timewait_time = 100; /* 10s */ - tcp_cfg.finwait1_time = 600; /* 60s */ - tcp_cfg.lastack_time = 300; /* 30s */ - tcp_cfg.finwait2_time = 300; /* 30s */ - tcp_cfg.closing_time = 300; /* 30s */ + /* Time constants defined as timer tick (100us) multiples */ + tcp_cfg.closewait_time = 20000; /* 2s */ + tcp_cfg.timewait_time = 100000; /* 10s */ + tcp_cfg.finwait1_time = 600000; /* 60s */ + tcp_cfg.lastack_time = 300000; /* 30s */ + tcp_cfg.finwait2_time = 300000; /* 30s */ + tcp_cfg.closing_time = 300000; /* 30s */ + tcp_cfg.alloc_err_timeout = 1000; /* 100ms */ + + /* This value is seconds */ tcp_cfg.cleanup_time = 0.1; /* 100ms */ }
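
Notes on selected hunks follow; the short C sketches are standalone illustrations of the logic in the diff, not VPP code.

The reworked tcp_connection_alloc_w_base takes its base connection by reference because pool_get may grow and therefore move wrk->connections, leaving any previously obtained tcp_connection_t pointer dangling; the diff saves the base's index, allocates, then re-fetches the pointer before copying. A minimal sketch of the same hazard and fix, using a plain realloc-backed array instead of VPP's pool.h (the names conn_pool_t, pool_grow_and_alloc and alloc_w_base are illustrative only, and the real code's per-thread check is omitted):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-in for a VPP pool: a realloc-backed array of connections.
 * Growing it may move the whole array, just as pool_get can. */
typedef struct { int index; int state; } conn_t;
typedef struct { conn_t *elts; size_t n, cap; } conn_pool_t;

static conn_t *
pool_grow_and_alloc (conn_pool_t *p)
{
  if (p->n == p->cap)
    {
      p->cap = p->cap ? 2 * p->cap : 4;
      /* May return a different address: old conn_t pointers now dangle */
      p->elts = realloc (p->elts, p->cap * sizeof (conn_t));
    }
  return &p->elts[p->n++];
}

/* Mirrors the fixed tcp_connection_alloc_w_base: remember the base's
 * index, allocate, then revalidate the base pointer from the (possibly
 * moved) pool before copying it into the new element. */
static conn_t *
alloc_w_base (conn_pool_t *p, conn_t **base)
{
  int base_index = (*base)->index;
  conn_t *tc = pool_grow_and_alloc (p);
  *base = &p->elts[base_index];	/* revalidate after potential move */
  memcpy (tc, *base, sizeof (*tc));
  tc->index = (int) (tc - p->elts);
  return tc;
}

int
main (void)
{
  conn_pool_t pool = { 0 };
  conn_t *base = pool_grow_and_alloc (&pool);
  base->index = 0;
  base->state = 42;
  conn_t *tc = alloc_w_base (&pool, &base);
  printf ("base state %d copied into new connection %d\n",
	  base->state, tc->index);
  free (pool.elts);
  return 0;
}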
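
The tcp_init_mss hunk subtracts TCP_OPTION_LEN_TIMESTAMP plus 2 bytes of alignment: the timestamp option is 10 bytes and TCP options are padded to a 4-byte boundary, so it effectively costs 12 bytes of MSS. A small sketch of that arithmetic (the 10-byte option length follows the timestamp option format; the alignment computation here is illustrative):

#include <stdio.h>

/* Timestamp option: kind (1) + length (1) + two 4-byte timestamps = 10
 * bytes, padded to a 4-byte boundary, i.e. 12 bytes of MSS. */
#define TCP_OPTION_LEN_TIMESTAMP 10

int
main (void)
{
  unsigned mss = 1460;
  unsigned aligned_ts = (TCP_OPTION_LEN_TIMESTAMP + 3) & ~3u;	/* 12 */
  printf ("snd_mss with timestamps: %u\n", mss - aligned_ts);	/* 1448 */
  return 0;
}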
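
In the new tcp_set_attribute / tcp_get_attribute pair, TRANSPORT_ENDPT_ATTR_NEXT_OUTPUT_NODE carries both the next node index and an opaque value in one 64-bit field: the low 32 bits hold the node index and the high 32 bits the opaque. A standalone sketch of the same packing (function names here are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* Pack a next-node index (low 32 bits) and an opaque value (high 32
 * bits) into one u64, mirroring the shifts in the attribute handlers. */
static uint64_t
pack_next_node (uint32_t node_index, uint32_t opaque)
{
  return ((uint64_t) opaque << 32) | node_index;
}

static void
unpack_next_node (uint64_t v, uint32_t *node_index, uint32_t *opaque)
{
  *node_index = (uint32_t) (v & 0xffffffff);
  *opaque = (uint32_t) (v >> 32);
}

int
main (void)
{
  uint32_t ni, op;
  uint64_t v = pack_next_node (123, 0xabcd);
  unpack_next_node (v, &ni, &op);
  assert (ni == 123 && op == 0xabcd);
  printf ("node %u opaque 0x%x\n", ni, op);
  return 0;
}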
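
The new tcp_session_app_rx_evt hook only re-opens an advertised zero window once enough fifo space has drained: the threshold is an eighth of the rx fifo, clamped to [4KB, 128KB]; below it, a dequeue notification is requested instead of sending the ACK. The clamp, expressed without clib helpers (values chosen to match the diff):

#include <stdio.h>

/* Minimum free rx-fifo space required before a window-update ACK is
 * sent after a zero window: fifo_size / 8, clamped to [4KB, 128KB]. */
static unsigned
min_free_to_reopen (unsigned fifo_size)
{
  unsigned lo = 4 << 10, hi = 128 << 10, v = fifo_size >> 3;
  return v < lo ? lo : (v > hi ? hi : v);
}

int
main (void)
{
  printf ("16KB fifo  -> %u bytes\n", min_free_to_reopen (16 << 10));	/* 4096 */
  printf ("512KB fifo -> %u bytes\n", min_free_to_reopen (512 << 10));	/* 65536 */
  printf ("4MB fifo   -> %u bytes\n", min_free_to_reopen (4 << 20));	/* 131072 */
  return 0;
}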
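
The tcp_configuration_init hunk rescales the wait-close constants because the timer tick changes from 100 ms to 100 us, so every value grows by a factor of 1000 while the wall-clock timeouts in the comments stay the same. A quick check of that arithmetic (TICK_US is an illustrative constant taken from the updated comment, not a VPP macro):

#include <stdio.h>

#define TICK_US 100	/* one timer tick, per the 100us comment */

static double
ticks_to_seconds (unsigned ticks)
{
  return (double) ticks * TICK_US / 1e6;
}

int
main (void)
{
  printf ("closewait %.1fs\n", ticks_to_seconds (20000));	/* 2.0s */
  printf ("finwait1  %.1fs\n", ticks_to_seconds (600000));	/* 60.0s */
  printf ("alloc_err %.3fs\n", ticks_to_seconds (1000));	/* 0.100s */
  return 0;
}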