+/**
+ * Look up an established tcp connection and hand its transport
+ * connection to the session layer.
+ *
+ * @param conn_index   connection index passed to tcp_connection_get
+ * @param thread_index thread index passed to tcp_connection_get
+ * @return pointer to the embedded transport connection, or 0 if the
+ *         lookup returns no connection
+ */
+static transport_connection_t *
+tcp_session_get_transport (u32 conn_index, u32 thread_index)
+{
+  tcp_connection_t *tc;
+
+  tc = tcp_connection_get (conn_index, thread_index);
+  return PREDICT_FALSE (!tc) ? 0 : &tc->connection;
+}
+
+/**
+ * Look up a half-open tcp connection and hand its transport connection
+ * to the session layer.
+ *
+ * @param conn_index index passed to tcp_half_open_connection_get
+ * @return pointer to the embedded transport connection, or 0 if the
+ *         lookup returns no connection
+ */
+static transport_connection_t *
+tcp_half_open_session_get_transport (u32 conn_index)
+{
+  tcp_connection_t *tc = tcp_half_open_connection_get (conn_index);
+
+  /* The lookup may come back empty; do not form a member address from a
+   * null connection, mirror what the established lookup does instead */
+  if (!tc)
+    return 0;
+  return &tc->connection;
+}
+
+/**
+ * Compute the segment size goal reported when TSO is configured.
+ *
+ * Starts from TCP_MAX_GSO_SZ minus (snd_mss modulo TCP_MAX_GSO_SZ), caps
+ * it at half of the send window and never returns less than snd_mss.
+ * Intermediate values are stored in a u16, so they are truncated to 16
+ * bits exactly as the assignments dictate.
+ *
+ * @param tc tcp connection
+ * @return goal size, at least tc->snd_mss
+ */
+static u16
+tcp_session_cal_goal_size (tcp_connection_t * tc)
+{
+  u16 goal_size;
+
+  goal_size = TCP_MAX_GSO_SZ - tc->snd_mss % TCP_MAX_GSO_SZ;
+  goal_size = clib_min (goal_size, tc->snd_wnd / 2);
+
+  if (goal_size > tc->snd_mss)
+    return goal_size;
+
+  return tc->snd_mss;
+}
+
+/**
+ * Report the maximum segment size to the session layer.
+ *
+ * The burst send variables are refreshed first so that snd_mss reflects
+ * the amount of data that can actually be pushed in a segment, i.e., the
+ * options to be used in the next burst are accounted for.
+ *
+ * @param trans_conn transport connection, cast to its tcp connection
+ * @return the TSO goal size if TCP_CFG_F_TSO is set on the connection,
+ *         otherwise the connection's snd_mss
+ */
+static u16
+tcp_session_send_mss (transport_connection_t * trans_conn)
+{
+  tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
+  u16 mss;
+
+  /* Must run before snd_mss is read: it brings snd_mss and the options
+   * in line with the current state of the connection */
+  tcp_update_burst_snd_vars (tc);
+
+  if (PREDICT_FALSE (tc->cfg_flags & TCP_CFG_F_TSO))
+    mss = tcp_session_cal_goal_size (tc);
+  else
+    mss = tc->snd_mss;
+
+  return mss;
+}
+
+/**
+ * Round the available send space to something worth sending.
+ *
+ * @param tc        tcp connection
+ * @param snd_space candidate send space in bytes
+ * @return - if snd_wnd is below snd_mss: snd_wnd when it fits in
+ *           snd_space, else 0
+ *         - if snd_space is below snd_mss: 0 when snd_space is below
+ *           cwnd, else snd_space
+ *         - otherwise snd_space rounded down to a multiple of snd_mss
+ */
+always_inline u32
+tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
+{
+  /* Window constrained: send the whole window or nothing */
+  if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
+    {
+      if (tc->snd_wnd <= snd_space)
+        return tc->snd_wnd;
+      return 0;
+    }
+
+  /* Not window constrained but less than a full segment available:
+   * don't try at all, unless the space is not below cwnd */
+  if (PREDICT_FALSE (snd_space < tc->snd_mss))
+    {
+      if (snd_space < tc->cwnd)
+        return 0;
+      return snd_space;
+    }
+
+  /* Drop the partial trailing segment */
+  return snd_space - (snd_space % tc->snd_mss);
+}
+
+/**
+ * Compute the tx window the session is allowed to fill.
+ *
+ * Combines the available output send space, snd_mss and the congestion
+ * state of the connection. Where possible the result is a multiple of
+ * snd_mss (see the rounding helper called on return).
+ *
+ * @param tc tcp connection
+ * @return number of bytes the session is allowed to write; 0 while in
+ *         fast recovery or when the connection is closed
+ */
+static inline u32
+tcp_snd_space_inline (tcp_connection_t * tc)
+{
+  int snd_space, snt_limited, two_mss;
+
+  if (PREDICT_FALSE (tcp_in_fastrecovery (tc)
+                     || tc->state == TCP_STATE_CLOSED))
+    return 0;
+
+  snd_space = tcp_available_output_snd_space (tc);
+
+  /* Dupacks or sacked bytes seen but recovery not yet entered: allow up
+   * to two segments beyond the limited transmit mark, so the peer sends
+   * enough dupacks to trigger retransmission, as per Limited Transmit
+   * (RFC3042) */
+  if (PREDICT_FALSE (tc->rcv_dupacks != 0 || tc->sack_sb.sacked_bytes))
+    {
+      two_mss = 2 * tc->snd_mss;
+
+      /* Re-anchor the mark if it fell outside [snd_nxt - 2*mss, snd_nxt] */
+      if (tc->limited_transmit != tc->snd_nxt
+          && (seq_lt (tc->limited_transmit, tc->snd_nxt - two_mss)
+              || seq_gt (tc->limited_transmit, tc->snd_nxt)))
+        tc->limited_transmit = tc->snd_nxt;
+
+      ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt));
+
+      snt_limited = tc->snd_nxt - tc->limited_transmit;
+      snd_space = clib_max (two_mss - snt_limited, 0);
+    }
+  return tcp_round_snd_space (tc, snd_space);
+}
+
+/**
+ * Non-inline entry point for the send space computation.
+ *
+ * @param tc tcp connection
+ * @return number of bytes the session is allowed to write
+ */
+u32
+tcp_snd_space (tcp_connection_t * tc)
+{
+  u32 snd_space = tcp_snd_space_inline (tc);
+
+  return snd_space;
+}
+
+/**
+ * Send space reported to the session layer.
+ *
+ * @param trans_conn transport connection, cast to its tcp connection
+ * @return the smaller of the computed send space and what is left of
+ *         snd_wnd after subtracting the bytes between snd_una and snd_nxt
+ */
+static u32
+tcp_session_send_space (transport_connection_t * trans_conn)
+{
+  tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
+  u32 snd_space;
+
+  /* Evaluate the send space first, then compare against the window */
+  snd_space = tcp_snd_space_inline (tc);
+
+  return clib_min (snd_space, tc->snd_wnd - (tc->snd_nxt - tc->snd_una));
+}
+
+/**
+ * Offset into the tx fifo the session layer should read from.
+ *
+ * @param trans_conn transport connection, cast to its tcp connection
+ * @return distance between snd_una and snd_nxt
+ */
+static u32
+tcp_session_tx_fifo_offset (transport_connection_t * trans_conn)
+{
+  tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
+  u32 offset;
+
+  ASSERT (seq_geq (tc->snd_nxt, tc->snd_una));
+
+  /* Remains valid when fast retransmit is on */
+  offset = tc->snd_nxt - tc->snd_una;
+
+  return offset;
+}
+
+/**
+ * Waitclose timer expiration handler.
+ *
+ * Acts on the connection's current state. Every handled state except
+ * CLOSE_WAIT with a pending FIN ends with the connection moved to CLOSED
+ * and a cleanup programmed on the owning worker; the CLOSE_WAIT/FIN
+ * pending path sends the FIN, moves to LAST_ACK and rearms this timer.
+ * Unhandled states only log a warning.
+ *
+ * @param tc tcp connection whose waitclose timer fired
+ */
+static void
+tcp_timer_waitclose_handler (tcp_connection_t * tc)
+{
+ tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
+
+ switch (tc->state)
+ {
+ case TCP_STATE_CLOSE_WAIT:
+ tcp_connection_timers_reset (tc);
+ /* App never returned with a close */
+ if (!(tc->flags & TCP_CONN_FINPNDG))
+ {
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ session_transport_closed_notify (&tc->connection);
+ tcp_program_cleanup (wrk, tc);
+ tcp_workerp_stats_inc (wrk, to_closewait, 1);
+ break;
+ }
+
+ /* Send FIN either way and switch to LAST_ACK. */
+ tcp_cong_recovery_off (tc);
+ /* Make sure we don't try to send unsent data */
+ tc->snd_nxt = tc->snd_una;
+ tcp_send_fin (tc);
+ tcp_connection_set_state (tc, TCP_STATE_LAST_ACK);
+ session_transport_closed_notify (&tc->connection);
+
+ /* Make sure we don't wait in LAST ACK forever */
+ tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, tcp_cfg.lastack_time);
+ tcp_workerp_stats_inc (wrk, to_closewait2, 1);
+
+ /* Don't delete the connection yet */
+ break;
+ case TCP_STATE_FIN_WAIT_1:
+ tcp_connection_timers_reset (tc);
+ if (tc->flags & TCP_CONN_FINPNDG)
+ {
+ /* If FIN pending, we haven't sent everything, but we did try.
+ * Notify session layer that transport is closed. */
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ tcp_send_reset (tc);
+ tcp_program_cleanup (wrk, tc);
+ }
+ else
+ {
+ /* We've sent the fin but no progress. Close the connection and
+ * to make sure everything is flushed, setup a cleanup timer */
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ tcp_program_cleanup (wrk, tc);
+ }
+ /* Both FIN_WAIT_1 branches notify the session layer and bump the
+ * same counter */
+ session_transport_closed_notify (&tc->connection);
+ tcp_workerp_stats_inc (wrk, to_finwait1, 1);
+ break;
+ case TCP_STATE_LAST_ACK:
+ tcp_connection_timers_reset (tc);
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ session_transport_closed_notify (&tc->connection);
+ tcp_program_cleanup (wrk, tc);
+ tcp_workerp_stats_inc (wrk, to_lastack, 1);
+ break;
+ case TCP_STATE_CLOSING:
+ tcp_connection_timers_reset (tc);
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ session_transport_closed_notify (&tc->connection);
+ tcp_program_cleanup (wrk, tc);
+ tcp_workerp_stats_inc (wrk, to_closing, 1);
+ break;
+ case TCP_STATE_FIN_WAIT_2:
+ /* Reset is sent before the timers are reset and the state changes */
+ tcp_send_reset (tc);
+ tcp_connection_timers_reset (tc);
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ session_transport_closed_notify (&tc->connection);
+ tcp_program_cleanup (wrk, tc);
+ tcp_workerp_stats_inc (wrk, to_finwait2, 1);
+ break;
+ case TCP_STATE_TIME_WAIT:
+ /* No closed notification, timer reset or counter on this path */
+ tcp_connection_set_state (tc, TCP_STATE_CLOSED);
+ tcp_program_cleanup (wrk, tc);
+ break;
+ default:
+ clib_warning ("waitclose in state: %U", format_tcp_state, tc->state);
+ break;
+ }
+}
+
+/* *INDENT-OFF* */
+/* Expiration handlers, looked up by timer id when pending timers are
+ * dispatched. The table is positional: presumably the entry order must
+ * mirror the timer id enumeration - confirm before adding or reordering
+ * entries. */
+static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] =
+{
+ tcp_timer_retransmit_handler,
+ tcp_timer_delack_handler,
+ tcp_timer_persist_handler,
+ tcp_timer_waitclose_handler,
+ tcp_timer_retransmit_syn_handler,
+};
+/* *INDENT-ON* */
+
+/**
+ * Run expiration handlers for timers queued on the worker.
+ *
+ * Pops at most min (pending, wrk->max_timers_per_loop) handles from the
+ * worker's pending_timers fifo and calls the handler registered for each
+ * timer id. Handles whose connection lookup fails, or whose timer slot is
+ * no longer TCP_TIMER_HANDLE_INVALID, are dropped without dispatch.
+ *
+ * @param wrk tcp worker context
+ */
+static void
+tcp_dispatch_pending_timers (tcp_worker_ctx_t * wrk)
+{
+ u32 n_timers, connection_index, timer_id, thread_index, timer_handle;
+ tcp_connection_t *tc;
+ int i;
+
+ if (!(n_timers = clib_fifo_elts (wrk->pending_timers)))
+ return;
+
+ thread_index = wrk->vm->thread_index;
+ for (i = 0; i < clib_min (n_timers, wrk->max_timers_per_loop); i++)
+ {
+ clib_fifo_sub1 (wrk->pending_timers, timer_handle);
+ /* Handle layout: low 28 bits connection index, high 4 bits timer id */
+ connection_index = timer_handle & 0x0FFFFFFF;
+ timer_id = timer_handle >> 28;
+
+ /* SYN retransmit timers refer to half-open connections */
+ if (PREDICT_TRUE (timer_id != TCP_TIMER_RETRANSMIT_SYN))
+ tc = tcp_connection_get (connection_index, thread_index);
+ else
+ tc = tcp_half_open_connection_get (connection_index);
+
+ if (PREDICT_FALSE (!tc))
+ continue;
+
+ /* Skip timer if it was rearmed while pending dispatch */
+ if (PREDICT_FALSE (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID))
+ continue;
+
+ (*timer_expiration_handlers[timer_id]) (tc);
+ }
+
+ /* On thread 0, if timers are still queued, signal the session queue
+ * process node with a flush frames event */
+ if (thread_index == 0 && clib_fifo_elts (wrk->pending_timers))
+ vlib_process_signal_event_mt (wrk->vm, session_queue_process_node.index,
+ SESSION_Q_PROCESS_FLUSH_FRAMES, 0);
+}
+
+/**
+ * Hand any ip lookup tx frames populated by timer pops to the ip4/ip6
+ * lookup nodes and clear the worker's references to them.
+ *
+ * @param wrk tcp worker context; slot 0 holds the ip4 frame, slot 1 the
+ *            ip6 frame
+ */
+static void
+tcp_flush_frames_to_output (tcp_worker_ctx_t * wrk)
+{
+  u32 lookup_node_index[2] = { ip4_lookup_node.index,
+                               ip6_lookup_node.index };
+  int is_ip6;
+
+  for (is_ip6 = 0; is_ip6 < 2; is_ip6++)
+    {
+      if (!wrk->ip_lookup_tx_frames[is_ip6])
+        continue;
+
+      vlib_put_frame_to_node (wrk->vm, lookup_node_index[is_ip6],
+                              wrk->ip_lookup_tx_frames[is_ip6]);
+      wrk->ip_lookup_tx_frames[is_ip6] = 0;
+    }
+}
+
+/**
+ * Process cleanup requests whose free time has been reached.
+ *
+ * Requests are consumed from the head of the worker's pending_cleanups
+ * fifo until it is empty or the head request's free_time is still in the
+ * future. For each due request the session layer is notified of the
+ * deletion and the connection is cleaned up.
+ *
+ * @param wrk tcp worker context
+ * @param now current time, compared against each request's free_time
+ */
+static void
+tcp_handle_cleanups (tcp_worker_ctx_t * wrk, clib_time_type_t now)
+{
+  u32 thread_index = wrk->vm->thread_index;
+  tcp_cleanup_req_t *req;
+  tcp_connection_t *tc;
+
+  while (clib_fifo_elts (wrk->pending_cleanups))
+    {
+      req = clib_fifo_head (wrk->pending_cleanups);
+      if (req->free_time > now)
+        break;
+      clib_fifo_sub2 (wrk->pending_cleanups, req);
+      tc = tcp_connection_get (req->connection_index, thread_index);
+      /* The connection may no longer exist by the time the request is
+       * due; the request is already consumed, so just move on */
+      if (!tc)
+        continue;
+      session_transport_delete_notify (&tc->connection);
+      tcp_connection_cleanup (tc);
+    }
+}
+
+/**
+ * Per-thread time update callback registered with the session layer.
+ *
+ * In order: refreshes the worker's notion of time, processes due
+ * cleanups, expires timers on the worker's timer wheel, dispatches
+ * pending timers and flushes the frames they produced.
+ *
+ * @param now          current time
+ * @param thread_index thread whose worker context is updated
+ */
+static void
+tcp_update_time (f64 now, u8 thread_index)
+{
+  tcp_worker_ctx_t *wrk;
+
+  wrk = tcp_get_worker (thread_index);
+
+  tcp_set_time_now (wrk);
+  tcp_handle_cleanups (wrk, now);
+
+  /* Expire, then dispatch, then push out whatever the handlers queued */
+  tw_timer_expire_timers_16t_2w_512sl (&wrk->timer_wheel, now);
+  tcp_dispatch_pending_timers (wrk);
+  tcp_flush_frames_to_output (wrk);
+}
+
+/**
+ * Mark the connection as having a push pending.
+ *
+ * If TCP_CONN_PSH_PENDING is not already set, sets it and records
+ * psh_seq as snd_una plus the maximum tx dequeue size minus one.
+ * Does nothing when the flag is already set.
+ *
+ * @param tconn transport connection, cast to its tcp connection
+ */
+static void
+tcp_session_flush_data (transport_connection_t * tconn)
+{
+  tcp_connection_t *tc = (tcp_connection_t *) tconn;
+
+  if (!(tc->flags & TCP_CONN_PSH_PENDING))
+    {
+      tc->flags |= TCP_CONN_PSH_PENDING;
+      tc->psh_seq = tc->snd_una + transport_max_tx_dequeue (tconn) - 1;
+    }
+}
+
+/* *INDENT-OFF* */
+/* TCP's transport virtual function table, registered with the session
+ * layer. Designated initializers, so member order here is irrelevant. */
+static const transport_proto_vft_t tcp_proto = {
+  /* Protocol and listener lifecycle */
+  .enable = vnet_tcp_enable_disable,
+  .start_listen = tcp_session_bind,
+  .stop_listen = tcp_session_unbind,
+  /* Connection lifecycle */
+  .connect = tcp_session_open,
+  .close = tcp_session_close,
+  .cleanup = tcp_session_cleanup,
+  .reset = tcp_session_reset,
+  /* Lookups */
+  .get_connection = tcp_session_get_transport,
+  .get_listener = tcp_session_get_listener,
+  .get_half_open = tcp_half_open_session_get_transport,
+  /* Tx path */
+  .push_header = tcp_session_push_header,
+  .send_mss = tcp_session_send_mss,
+  .send_space = tcp_session_send_space,
+  .tx_fifo_offset = tcp_session_tx_fifo_offset,
+  .flush_data = tcp_session_flush_data,
+  .custom_tx = tcp_session_custom_tx,
+  /* Time keeping */
+  .update_time = tcp_update_time,
+  /* Formatting */
+  .format_connection = format_tcp_session,
+  .format_listener = format_tcp_listener_session,
+  .format_half_open = format_tcp_half_open_session,
+  .transport_options = {
+    .tx_type = TRANSPORT_TX_PEEK,
+    .service_type = TRANSPORT_SERVICE_VC,
+  },
+};
+/* *INDENT-ON* */
+
+/**
+ * Update the connection's tx pacer with the current pacing rate and rtt.
+ *
+ * No-op if the transport connection is not tx paced. The rtt handed to
+ * the pacer is the smaller of (srtt * TCP_TICK) and mrtt_us, scaled by
+ * CLIB_US_TIME_FREQ.
+ *
+ * @param tc tcp connection
+ */
+void
+tcp_connection_tx_pacer_update (tcp_connection_t * tc)
+{
+  f64 srtt;
+
+  if (transport_connection_is_tx_paced (&tc->connection))
+    {
+      srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us);
+
+      transport_connection_tx_pacer_update (&tc->connection,
+                                            tcp_cc_get_pacing_rate (tc),
+                                            srtt * CLIB_US_TIME_FREQ);
+    }
+}
+
+/**
+ * Reset the connection's tx pacer.
+ *
+ * The rtt handed to the pacer is the smaller of (srtt * TCP_TICK) and
+ * mrtt_us, scaled by CLIB_US_TIME_FREQ.
+ *
+ * @param tc           tcp connection
+ * @param window       not used by this function
+ * @param start_bucket initial bucket value forwarded to the pacer
+ */
+void
+tcp_connection_tx_pacer_reset (tcp_connection_t * tc, u32 window,
+                               u32 start_bucket)
+{
+  f64 srtt;
+
+  srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us);
+
+  transport_connection_tx_pacer_reset (&tc->connection,
+                                       tcp_cc_get_pacing_rate (tc),
+                                       start_bucket,
+                                       srtt * CLIB_US_TIME_FREQ);
+}
+
+/**
+ * Timer wheel expiration callback.
+ *
+ * Invalidates the per-connection handle of every expired timer, appends
+ * the expired handles to the current thread's pending_timers fifo and
+ * recomputes how many pending timers may be dispatched per loop.
+ *
+ * @param expired_timers vector of expired timer handles (low 28 bits
+ *                       connection index, high 4 bits timer id)
+ */
+static void
+tcp_expired_timers_dispatch (u32 * expired_timers)
+{
+ u32 thread_index = vlib_get_thread_index (), n_left, max_per_loop;
+ u32 connection_index, timer_id, n_expired, max_loops;
+ tcp_worker_ctx_t *wrk;
+ tcp_connection_t *tc;
+ int i;
+
+ wrk = tcp_get_worker (thread_index);
+ n_expired = vec_len (expired_timers);
+ tcp_workerp_stats_inc (wrk, timer_expirations, n_expired);
+ /* Timers still queued from earlier expirations */
+ n_left = clib_fifo_elts (wrk->pending_timers);
+
+ /*
+ * Invalidate all timer handles before dispatching. This avoids dangling
+ * index references to timer wheel pool entries that have been freed.
+ */
+ for (i = 0; i < n_expired; i++)
+ {
+ connection_index = expired_timers[i] & 0x0FFFFFFF;
+ timer_id = expired_timers[i] >> 28;
+
+ if (timer_id != TCP_TIMER_RETRANSMIT_SYN)
+ tc = tcp_connection_get (connection_index, thread_index);
+ else
+ tc = tcp_half_open_connection_get (connection_index);
+
+ TCP_EVT (TCP_EVT_TIMER_POP, connection_index, timer_id);
+
+ /* NOTE(review): tc is dereferenced without a null check here, whereas
+ * the pending timer dispatch does check its lookup result - confirm
+ * a connection always exists while one of its timers is armed */
+ tc->timers[timer_id] = TCP_TIMER_HANDLE_INVALID;
+ }
+
+ clib_fifo_add (wrk->pending_timers, expired_timers, n_expired);
+
+ /* Per-loop budget: total queued timers divided over max_loops, clamped
+ * to [10, VLIB_FRAME_SIZE]. If timers were already queued, the budget
+ * is never lowered below its previous value. */
+ max_loops = clib_max (1, 0.5 * TCP_TIMER_TICK * wrk->vm->loops_per_second);
+ max_per_loop = clib_max ((n_left + n_expired) / max_loops, 10);
+ max_per_loop = clib_min (max_per_loop, VLIB_FRAME_SIZE);
+ wrk->max_timers_per_loop = clib_max (n_left ? wrk->max_timers_per_loop : 0,
+ max_per_loop);
+
+ /* On thread 0, signal the session queue process node with a flush
+ * frames event */
+ if (thread_index == 0)
+ vlib_process_signal_event_mt (wrk->vm, session_queue_process_node.index,
+ SESSION_Q_PROCESS_FLUSH_FRAMES, 0);
+}
+
+/**
+ * Initialize the timer wheel of every worker context.
+ *
+ * For each vlib main, the matching worker's wheel is initialized with
+ * tcp_expired_timers_dispatch as expiration callback and TCP_TIMER_TICK
+ * as timer interval, and its last run time is set to that vlib main's
+ * current time.
+ *
+ * @param tm tcp main
+ */
+static void
+tcp_initialize_timer_wheels (tcp_main_t * tm)
+{
+  tw_timer_wheel_16t_2w_512sl_t *wheel;
+  /* *INDENT-OFF* */
+  foreach_vlib_main (({
+    wheel = &tm->wrk_ctx[ii].timer_wheel;
+    tw_timer_wheel_init_16t_2w_512sl (wheel, tcp_expired_timers_dispatch,
+                                      TCP_TIMER_TICK, ~0);
+    wheel->last_run_time = vlib_time_now (this_vlib_main);
+  }));
+  /* *INDENT-ON* */
+}
+
+/**
+ * Seed the initial sequence number generator.
+ *
+ * iss_seed.first gets a random u32 placed in its upper 32 bits (the
+ * lower 32 bits are left zero by the shift); iss_seed.second gets a
+ * random u64 derived from the cpu time.
+ *
+ * @param tm tcp main
+ */
+static void
+tcp_initialize_iss_seed (tcp_main_t * tm)
+{
+  u32 seed32 = random_default_seed ();
+  u64 seed64 = clib_cpu_time_now ();
+  u64 upper;
+
+  upper = random_u32 (&seed32);
+  tm->iss_seed.first = upper << 32;
+  tm->iss_seed.second = random_u64 (&seed64);
+}
+
+/**
+ * One-time initialization performed when tcp is enabled.
+ *
+ * Makes sure the ip init functions have run, registers the tcp input
+ * nodes with ip4/ip6, sets up one worker context per thread (including
+ * optional connection preallocation), then initializes the timer wheels
+ * and the iss seed.
+ *
+ * @param vm vlib main
+ * @return 0 on success, or the error returned by one of the ip init
+ *         functions
+ */
+static clib_error_t *
+tcp_main_enable (vlib_main_t * vm)
+{
+ vlib_thread_main_t *vtm = vlib_get_thread_main ();
+ u32 num_threads, n_workers, prealloc_conn_per_wrk;
+ tcp_connection_t *tc __attribute__ ((unused));
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ tcp_worker_ctx_t *wrk;
+ clib_error_t *error = 0;
+ int thread;
+
+ if ((error = vlib_call_init_function (vm, ip_main_init)))
+ return error;
+ if ((error = vlib_call_init_function (vm, ip4_lookup_init)))
+ return error;
+ if ((error = vlib_call_init_function (vm, ip6_lookup_init)))
+ return error;
+
+ /*
+ * Registrations
+ */
+
+ ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index);
+ ip6_register_protocol (IP_PROTOCOL_TCP, tcp6_input_node.index);
+
+ /*
+ * Initialize data structures
+ */
+
+ num_threads = 1 /* main thread */ + vtm->n_threads;
+ vec_validate (tm->wrk_ctx, num_threads - 1);
+ /* With only the main thread it counts as the single worker, so the
+ * divisor below is never zero in that case */
+ n_workers = num_threads == 1 ? 1 : vtm->n_threads;
+ prealloc_conn_per_wrk = tcp_cfg.preallocated_connections / n_workers;
+
+ for (thread = 0; thread < num_threads; thread++)
+ {
+ wrk = &tm->wrk_ctx[thread];
+
+ /* Grow each vector to 256 elements, then reset its length to 0 */
+ vec_validate (wrk->pending_deq_acked, 255);
+ vec_validate (wrk->pending_disconnects, 255);
+ vec_validate (wrk->pending_resets, 255);
+ vec_reset_length (wrk->pending_deq_acked);
+ vec_reset_length (wrk->pending_disconnects);
+ vec_reset_length (wrk->pending_resets);
+ wrk->vm = vlib_mains[thread];
+ wrk->max_timers_per_loop = 10;
+
+ /*
+ * Preallocate connections. Assume that thread 0 won't
+ * use preallocated connections when running multi-core
+ */
+ if ((thread > 0 || num_threads == 1) && prealloc_conn_per_wrk)
+ pool_init_fixed (wrk->connections, prealloc_conn_per_wrk);
+ }
+
+ /*
+ * Use a preallocated half-open connection pool?
+ */
+ if (tcp_cfg.preallocated_half_open_connections)
+ pool_init_fixed (tm->half_open_connections,
+ tcp_cfg.preallocated_half_open_connections);
+
+ /* Initialize clocks per tick for TCP timestamp. Used to compute
+ * monotonically increasing timestamps. */
+ tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock
+ / TCP_TSTAMP_RESOLUTION;
+
+ /* The half-open lock is only initialized when there is more than one
+ * thread */
+ if (num_threads > 1)
+ {
+ clib_spinlock_init (&tm->half_open_lock);
+ }
+
+ tcp_initialize_timer_wheels (tm);
+ tcp_initialize_iss_seed (tm);
+
+ tm->bytes_per_buffer = vlib_buffer_get_default_data_size (vm);
+ tm->cc_last_type = TCP_CC_LAST;
+ return error;
+}
+
+/**
+ * Enable or disable tcp.
+ *
+ * Enabling runs the one-time initialization unless tcp is already marked
+ * enabled. The enabled flag is recorded only after initialization
+ * succeeds, so that a repeated enable request does not re-run it (and
+ * re-register nodes or re-initialize timer wheels and pools). Disabling
+ * only clears the flag.
+ *
+ * @param vm    vlib main
+ * @param is_en non-zero to enable, zero to disable
+ * @return 0 on success or when nothing had to be done, otherwise the
+ *         error reported by the initialization
+ */
+clib_error_t *
+vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en)
+{
+  clib_error_t *error;
+
+  if (!is_en)
+    {
+      tcp_main.is_enabled = 0;
+      return 0;
+    }
+
+  if (tcp_main.is_enabled)
+    return 0;
+
+  error = tcp_main_enable (vm);
+  if (error)
+    return error;
+
+  /* Without this the "already enabled" check above can never trigger */
+  tcp_main.is_enabled = 1;
+  return 0;
+}
+
+/**
+ * Set the punt-unknown flag for one address family.
+ *
+ * @param vm     not used by this function
+ * @param is_ip4 non-zero selects the ip4 flag, zero the ip6 flag
+ * @param is_add value stored in the selected flag
+ */
+void
+tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add)
+{
+  if (is_ip4)
+    {
+      tcp_main.punt_unknown4 = is_add;
+      return;
+    }
+
+  tcp_main.punt_unknown6 = is_add;
+}
+
+/**
+ * Load the default values of the tcp configuration parameters.
+ */
+static void
+tcp_configuration_init (void)
+{
+  /* Rx fifo bounds. The initial wnd for SYN cannot rely on fifos, as they
+   * are not allocated at that point, so predefined values are used. For
+   * SYN-ACK the scale should still be computed in the same way */
+  tcp_cfg.min_rx_fifo = 4 << 10;
+  tcp_cfg.max_rx_fifo = 32 << 20;
+
+  /* Path and offload defaults */
+  tcp_cfg.default_mtu = 1500;
+  tcp_cfg.allow_tso = 0;
+  tcp_cfg.csum_offload = 1;
+
+  /* Congestion control and pacing defaults */
+  tcp_cfg.cc_algo = TCP_CC_NEWRENO;
+  tcp_cfg.initial_cwnd_multiplier = 0;
+  tcp_cfg.enable_tx_pacing = 1;
+  tcp_cfg.rwnd_min_update_ack = 1;
+
+  /* Timeouts expressed as multiples of the 100ms timer tick */
+  tcp_cfg.delack_time = 1;      /* 0.1s */
+  tcp_cfg.closewait_time = 20;  /* 2s */
+  tcp_cfg.timewait_time = 100;  /* 10s */
+  tcp_cfg.finwait1_time = 600;  /* 60s */
+  tcp_cfg.finwait2_time = 300;  /* 30s */
+  tcp_cfg.lastack_time = 300;   /* 30s */
+  tcp_cfg.closing_time = 300;   /* 30s */
+
+  /* 100ms; given the value, presumably in seconds rather than ticks,
+   * unlike the timeouts above - confirm against where it is consumed */
+  tcp_cfg.cleanup_time = 0.1;
+}
+
+static clib_error_t *
+tcp_init (vlib_main_t * vm)
+{
+ tcp_main_t *tm = vnet_get_tcp_main ();
+ ip_main_t *im = &ip_main;
+ ip_protocol_info_t *pi;
+
+ /* Session layer, and by implication tcp, are disabled by default */
+ tm->is_enabled = 0;
+
+ /* Register with IP for header parsing */
+ pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP);
+ if (pi == 0)
+ return clib_error_return (0, "TCP protocol info AWOL");
+ pi->format_header = format_tcp_header;
+ pi->unformat_pg_edit = unformat_pg_tcp_header;
+
+ /* Register as transport with session layer */
+ transport_register_protocol (TRANSPORT_PROTO_TCP, &tcp_proto,
+ FIB_PROTOCOL_IP4, tcp4_output_node.index);
+ transport_register_protocol (TRANSPORT_PROTO_TCP, &tcp_proto,
+ FIB_PROTOCOL_IP6, tcp6_output_node.index);
+
+ tcp_api_reference ();
+ tcp_configuration_init ();
+
+ tm->cc_algo_by_name = hash_create_string (0, sizeof (uword));
+
+ return 0;