#define TCP_FIB_RECHECK_PERIOD 1 * THZ /**< Recheck every 1s */
#define TCP_MAX_OPTION_SPACE 40
#define TCP_CC_DATA_SZ 24
+#define TCP_MAX_GSO_SZ 65536
#define TCP_DUPACK_THRESHOLD 3
-#define TCP_MAX_RX_FIFO_SIZE 32 << 20
-#define TCP_MIN_RX_FIFO_SIZE 4 << 10
#define TCP_IW_N_SEGMENTS 10
#define TCP_ALWAYS_ACK 1 /**< On/off delayed acks */
#define TCP_USE_SACKS 1 /**< Disable only for testing */
#define TCP_TIMER_HANDLE_INVALID ((u32) ~0)
-/* Timer delays as multiples of 100ms */
-#define TCP_TO_TIMER_TICK TCP_TICK*10 /* Period for converting from TCP
- * ticks to timer units */
-#define TCP_DELACK_TIME 1 /* 0.1s */
-#define TCP_SYN_RCVD_TIME 600 /* 60s */
-#define TCP_2MSL_TIME 300 /* 30s */
-#define TCP_CLOSEWAIT_TIME 20 /* 2s */
-#define TCP_TIMEWAIT_TIME 100 /* 10s */
-#define TCP_FINWAIT1_TIME 600 /* 60s */
-#define TCP_CLEANUP_TIME 1 /* 0.1s */
-#define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */
+#define TCP_TIMER_TICK 0.1 /**< Timer tick in seconds */
+#define TCP_TO_TIMER_TICK TCP_TICK*10 /**< Factor for converting
+ ticks to timer ticks */
#define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */
#define TCP_RTO_MIN 0.2 * THZ /* Min RTO (200ms) - lower than standard */
u32 sacked_bytes; /**< Number of bytes sacked in sb */
u32 last_sacked_bytes; /**< Number of bytes last sacked */
u32 last_bytes_delivered; /**< Sack bytes delivered to app */
- u32 snd_una_adv; /**< Bytes to add to snd_una */
u32 high_sacked; /**< Highest byte sacked (fack) */
u32 high_rxt; /**< Highest retransmitted sequence */
u32 rescue_rxt; /**< Rescue sequence number */
u32 lost_bytes; /**< Bytes lost as per RFC6675 */
u32 last_lost_bytes; /**< Number of bytes last lost */
u32 cur_rxt_hole; /**< Retransmitting from this hole */
+ u8 is_reneging;
#if TCP_SCOREBOARD_TRACE
scoreboard_trace_elt_t *trace;
u64 delivered; /**< Total delivered bytes for sample */
f64 delivered_time; /**< Delivered time when sample taken */
f64 tx_time; /**< Transmit time for the burst */
- u64 tx_rate; /**< Tx pacing rate */
+ f64 first_tx_time; /**< Connection first tx time at tx */
tcp_bts_flags_t flags; /**< Sample flag */
} tcp_bt_sample_t;
f64 prior_time; /**< Delivered time of sample used for rate */
f64 interval_time; /**< Time to ack the bytes delivered */
f64 rtt_time; /**< RTT for sample */
- u64 tx_rate; /**< Tx pacing rate */
u32 delivered; /**< Bytes delivered in interval_time */
u32 acked_and_sacked; /**< Bytes acked + sacked now */
u32 lost; /**< Bytes lost now */
transport_connection_t connection; /**< Common transport data. First! */
u8 state; /**< TCP state as per tcp_state_t */
+ u8 is_tso; /** is connection could use tso */
u16 flags; /**< Connection flags (see tcp_conn_flags_e) */
u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */
u64 delivered; /**< Total bytes delivered to peer */
u64 app_limited; /**< Delivered when app-limited detected */
f64 delivered_time; /**< Time last bytes were acked */
+ f64 first_tx_time; /**< Send time for recently delivered/sent */
tcp_byte_tracker_t *bt; /**< Tx byte tracker */
tcp_errors_t errors; /**< Soft connection errors */
void (*recovered) (tcp_connection_t * tc);
void (*undo_recovery) (tcp_connection_t * tc);
void (*event) (tcp_connection_t *tc, tcp_cc_event_t evt);
+ u64 (*get_pacing_rate) (tcp_connection_t *tc);
};
/* *INDENT-ON* */
u64 second;
} tcp_iss_seed_t;
+typedef struct tcp_configuration_
+{
+ /** Max rx fifo size for a session (in bytes). It is used in to compute the
+ * rfc 7323 window scaling factor */
+ u32 max_rx_fifo;
+
+ /** Min rx fifo for a session (in bytes) */
+ u32 min_rx_fifo;
+
+ /** Default MTU to be used when establishing connections */
+ u16 default_mtu;
+
+ /** Initial CWND multiplier, which multiplies MSS to determine initial CWND.
+ * Set 0 to determine the initial CWND by another way */
+ u16 initial_cwnd_multiplier;
+
+ /** Enable tx pacing for new connections */
+ u8 enable_tx_pacing;
+
+ /** Default congestion control algorithm type */
+ tcp_cc_algorithm_type_e cc_algo;
+
+ /** Min rwnd, as number of snd_mss segments, for update ack to be sent after
+ * a zero rwnd advertisement */
+ u32 rwnd_min_update_ack;
+
+ /** Delayed ack time (disabled) */
+ u16 delack_time;
+
+ /** Timer ticks to wait for close from app */
+ u16 closewait_time;
+
+ /** Timer ticks to wait in time-wait. Also known as 2MSL */
+ u16 timewait_time;
+
+ /** Timer ticks to wait in fin-wait1 to send fin and rcv fin-ack */
+ u16 finwait1_time;
+
+ /** Timer ticks to wait in last ack for ack */
+ u16 lastack_time;
+
+ /** Timer ticks to wait in fin-wait2 for fin */
+ u16 finwait2_time;
+
+ /** Timer ticks to wait in closing for fin ack */
+ u16 closing_time;
+
+ /** Timer ticks to wait before cleaning up the connection */
+ u16 cleanup_time;
+
+ /** Number of preallocated connections */
+ u32 preallocated_connections;
+
+ /** Number of preallocated half-open connections */
+ u32 preallocated_half_open_connections;
+
+ /** Vectors of src addresses. Optional unless one needs > 63K active-opens */
+ ip4_address_t *ip4_src_addrs;
+ ip6_address_t *ip6_src_addrs;
+
+ /** Fault-injection. Debug only */
+ f64 buffer_fail_fraction;
+} tcp_configuration_t;
+
typedef struct _tcp_main
{
/* Per-worker thread tcp connection pools */
/** per-worker context */
tcp_worker_ctx_t *wrk_ctx;
- /* Pool of half-open connections on which we've sent a SYN */
+ /** Pool of half-open connections on which we've sent a SYN */
tcp_connection_t *half_open_connections;
clib_spinlock_t half_open_lock;
/** vlib buffer size */
u32 bytes_per_buffer;
- /* Seed used to generate random iss */
+ /** Seed used to generate random iss */
tcp_iss_seed_t iss_seed;
- /* Congestion control algorithms registered */
+ /** Congestion control algorithms registered */
tcp_cc_algorithm_t *cc_algos;
/** Hash table of cc algorithms by name */
/** Last cc algo registered */
tcp_cc_algorithm_type_e cc_last_type;
- /*
- * Configuration
- */
-
- /* Flag that indicates if stack is on or off */
+ /** Flag that indicates if stack is on or off */
u8 is_enabled;
- /** Max rx fifo size for a session. It is used in to compute the
- * rfc 7323 window scaling factor */
- u32 max_rx_fifo;
-
- /** Default MTU to be used when establishing connections */
- u16 default_mtu;
-
- /** Initial CWND multiplier, which multiplies MSS to determine initial CWND.
- * Set 0 to determine the initial CWND by another way */
- u16 initial_cwnd_multiplier;
-
- /** Number of preallocated connections */
- u32 preallocated_connections;
- u32 preallocated_half_open_connections;
-
- /** Vectors of src addresses. Optional unless one needs > 63K active-opens */
- ip4_address_t *ip4_src_addresses;
- u32 last_v4_address_rotor;
- u32 last_v6_address_rotor;
- ip6_address_t *ip6_src_addresses;
-
- /** Enable tx pacing for new connections */
- u8 tx_pacing;
-
+ /** Flag that indicates if v4 punting is enabled */
u8 punt_unknown4;
+
+ /** Flag that indicates if v6 punting is enabled */
u8 punt_unknown6;
- /** fault-injection */
- f64 buffer_fail_fraction;
+ /** Rotor for v4 source addresses */
+ u32 last_v4_addr_rotor;
- /** Default congestion control algorithm type */
- tcp_cc_algorithm_type_e cc_algo;
+ /** Rotor for v6 source addresses */
+ u32 last_v6_addr_rotor;
+
+ /** Protocol configuration */
+ tcp_configuration_t cfg;
} tcp_main_t;
extern tcp_main_t tcp_main;
extern vlib_node_registration_t tcp4_listen_node;
extern vlib_node_registration_t tcp6_listen_node;
+#define tcp_cfg tcp_main.cfg
#define tcp_node_index(node_id, is_ip4) \
((is_ip4) ? tcp4_##node_id##_node.index : tcp6_##node_id##_node.index)
tcp_connection_set_state (tcp_connection_t * tc, tcp_state_t state)
{
tc->state = state;
- TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
+ TCP_EVT (TCP_EVT_STATE_CHANGE, tc);
}
void tcp_connection_close (tcp_connection_t * tc);
void tcp_connection_del (tcp_connection_t * tc);
int tcp_half_open_connection_cleanup (tcp_connection_t * tc);
tcp_connection_t *tcp_connection_alloc (u8 thread_index);
+tcp_connection_t *tcp_connection_alloc_w_base (u8 thread_index,
+ tcp_connection_t * base);
void tcp_connection_free (tcp_connection_t * tc);
void tcp_connection_reset (tcp_connection_t * tc);
int tcp_configure_v4_source_address_range (vlib_main_t * vm,
void tcp_send_syn (tcp_connection_t * tc);
void tcp_send_synack (tcp_connection_t * tc);
void tcp_send_fin (tcp_connection_t * tc);
-void tcp_init_mss (tcp_connection_t * tc);
void tcp_update_burst_snd_vars (tcp_connection_t * tc);
void tcp_update_rto (tcp_connection_t * tc);
void tcp_flush_frame_to_output (tcp_worker_ctx_t * wrk, u8 is_ip4);
always_inline u32
tcp_initial_cwnd (const tcp_connection_t * tc)
{
- if (tcp_main.initial_cwnd_multiplier > 0)
- return tcp_main.initial_cwnd_multiplier * tc->snd_mss;
+ if (tcp_cfg.initial_cwnd_multiplier > 0)
+ return tcp_cfg.initial_cwnd_multiplier * tc->snd_mss;
if (tc->snd_mss > 2190)
return 2 * tc->snd_mss;
tc->cc_algo->event (tc, evt);
}
+static inline u64
+tcp_cc_get_pacing_rate (tcp_connection_t * tc)
+{
+ if (tc->cc_algo->get_pacing_rate)
+ return tc->cc_algo->get_pacing_rate (tc);
+
+ f64 srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us);
+ /* TODO should constrain to interface's max throughput but
+ * we don't have link speeds for sw ifs ..*/
+ return (tc->cwnd / srtt);
+}
+
always_inline void
tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval)
{
{
/* Reuse RTO. It's backed off in handler */
tcp_timer_set (tc, TCP_TIMER_PERSIST,
- clib_max (tc->rto * TCP_TO_TIMER_TICK,
- TCP_TIMER_PERSIST_MIN));
+ clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
}
always_inline void
tcp_persist_timer_update (tcp_connection_t * tc)
{
tcp_timer_update (tc, TCP_TIMER_PERSIST,
- clib_max (tc->rto * TCP_TO_TIMER_TICK,
- TCP_TIMER_PERSIST_MIN));
+ clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
}
always_inline void