2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
16 #include <vnet/tcp/tcp.h>
17 #include <vnet/session/session.h>
18 #include <vnet/fib/fib.h>
19 #include <vnet/dpo/load_balance.h>
25 tcp_connection_bind (u32 session_index, transport_endpoint_t * lcl)
27 tcp_main_t *tm = &tcp_main;
28 tcp_connection_t *listener;
30 pool_get (tm->listener_pool, listener);
31 memset (listener, 0, sizeof (*listener));
33 listener->c_c_index = listener - tm->listener_pool;
34 listener->c_lcl_port = clib_host_to_net_u16 (lcl->port);
38 listener->c_lcl_ip4.as_u32 = lcl->ip.ip4.as_u32;
39 listener->c_is_ip4 = 1;
40 listener->c_proto = SESSION_TYPE_IP4_TCP;
44 clib_memcpy (&listener->c_lcl_ip6, &lcl->ip.ip6,
45 sizeof (ip6_address_t));
46 listener->c_proto = SESSION_TYPE_IP6_TCP;
49 listener->c_s_index = session_index;
50 listener->state = TCP_STATE_LISTEN;
52 tcp_connection_timers_init (listener);
54 TCP_EVT_DBG (TCP_EVT_BIND, listener);
56 return listener->c_c_index;
60 tcp_session_bind (u32 session_index, transport_endpoint_t * tep)
62 return tcp_connection_bind (session_index, tep);
66 tcp_connection_unbind (u32 listener_index)
68 tcp_main_t *tm = vnet_get_tcp_main ();
71 tc = pool_elt_at_index (tm->listener_pool, listener_index);
73 TCP_EVT_DBG (TCP_EVT_UNBIND, tc);
75 /* Poison the entry */
77 memset (tc, 0xFA, sizeof (*tc));
79 pool_put_index (tm->listener_pool, listener_index);
83 tcp_session_unbind (u32 listener_index)
85 tcp_connection_unbind (listener_index);
89 transport_connection_t *
90 tcp_session_get_listener (u32 listener_index)
92 tcp_main_t *tm = vnet_get_tcp_main ();
94 tc = pool_elt_at_index (tm->listener_pool, listener_index);
95 return &tc->connection;
99 * Cleans up connection state.
104 tcp_connection_cleanup (tcp_connection_t * tc)
106 tcp_main_t *tm = &tcp_main;
108 transport_endpoint_t *tep;
110 /* Cleanup local endpoint if this was an active connect */
111 tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip,
115 if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX)
117 tep = pool_elt_at_index (tm->local_endpoints, tepi);
118 transport_endpoint_table_del (&tm->local_endpoints_table, tep);
119 pool_put (tm->local_endpoints, tep);
122 /* Make sure all timers are cleared */
123 tcp_connection_timers_reset (tc);
125 /* Check if half-open */
126 if (tc->state == TCP_STATE_SYN_SENT)
128 tcp_half_open_connection_del (tc);
132 int thread_index = tc->c_thread_index;
133 /* Poison the entry */
135 memset (tc, 0xFA, sizeof (*tc));
136 pool_put (tm->connections[thread_index], tc);
141 * Connection removal.
143 * This should be called only once connection enters CLOSED state. Note
144 * that it notifies the session of the removal event, so if the goal is to
145 * just remove the connection, call tcp_connection_cleanup instead.
148 tcp_connection_del (tcp_connection_t * tc)
150 TCP_EVT_DBG (TCP_EVT_DELETE, tc);
151 stream_session_delete_notify (&tc->connection);
152 tcp_connection_cleanup (tc);
156 * Cleanup half-open connection
159 tcp_half_open_connection_del (tcp_connection_t * tc)
161 tcp_main_t *tm = vnet_get_tcp_main ();
163 memset (tc, 0xFA, sizeof (*tc));
164 clib_spinlock_lock_if_init (&tm->half_open_lock);
165 pool_put (tm->half_open_connections, tc);
166 clib_spinlock_unlock_if_init (&tm->half_open_lock);
170 tcp_half_open_connection_new ()
172 tcp_main_t *tm = vnet_get_tcp_main ();
173 tcp_connection_t *tc = 0;
174 clib_spinlock_lock_if_init (&tm->half_open_lock);
175 pool_get (tm->half_open_connections, tc);
176 clib_spinlock_unlock_if_init (&tm->half_open_lock);
177 memset (tc, 0, sizeof (*tc));
182 tcp_connection_new (u8 thread_index)
184 tcp_main_t *tm = vnet_get_tcp_main ();
185 tcp_connection_t *tc;
187 pool_get (tm->connections[thread_index], tc);
188 memset (tc, 0, sizeof (*tc));
189 tc->c_c_index = tc - tm->connections[thread_index];
190 tc->c_thread_index = thread_index;
194 /** Notify session that connection has been reset.
196 * Switch state to closed and wait for session to call cleanup.
199 tcp_connection_reset (tcp_connection_t * tc)
201 TCP_EVT_DBG (TCP_EVT_RST_RCVD, tc);
204 case TCP_STATE_SYN_RCVD:
205 /* Cleanup everything. App wasn't notified yet */
206 stream_session_delete_notify (&tc->connection);
207 tcp_connection_cleanup (tc);
209 case TCP_STATE_SYN_SENT:
210 /* XXX remove sst from call */
211 stream_session_connect_notify (&tc->connection, tc->connection.proto,
213 tcp_connection_cleanup (tc);
215 case TCP_STATE_ESTABLISHED:
216 case TCP_STATE_CLOSE_WAIT:
217 case TCP_STATE_FIN_WAIT_1:
218 case TCP_STATE_FIN_WAIT_2:
219 case TCP_STATE_CLOSING:
220 tc->state = TCP_STATE_CLOSED;
221 TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
223 /* Make sure all timers are cleared */
224 tcp_connection_timers_reset (tc);
225 stream_session_reset_notify (&tc->connection);
227 /* Wait for cleanup from session layer but not forever */
228 tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
230 case TCP_STATE_CLOSED:
236 * Begin connection closing procedure.
238 * If at the end the connection is not in CLOSED state, it is not removed.
239 * Instead, we rely on TCP to advance through state machine to either
240 * 1) LAST_ACK (passive close) whereby when the last ACK is received
241 * tcp_connection_del is called. This notifies session of the delete and
243 * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers
244 * and cleanup is called.
246 * N.B. Half-close connections are not supported
249 tcp_connection_close (tcp_connection_t * tc)
251 TCP_EVT_DBG (TCP_EVT_CLOSE, tc);
253 /* Send FIN if needed */
254 if (tc->state == TCP_STATE_ESTABLISHED
255 || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT)
259 if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD)
260 tc->state = TCP_STATE_FIN_WAIT_1;
261 else if (tc->state == TCP_STATE_SYN_SENT)
262 tc->state = TCP_STATE_CLOSED;
263 else if (tc->state == TCP_STATE_CLOSE_WAIT)
264 tc->state = TCP_STATE_LAST_ACK;
265 TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
267 /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */
268 if (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID
269 && tc->state == TCP_STATE_CLOSED)
270 tcp_connection_del (tc);
274 tcp_session_close (u32 conn_index, u32 thread_index)
276 tcp_connection_t *tc;
277 tc = tcp_connection_get (conn_index, thread_index);
278 tcp_connection_close (tc);
282 tcp_session_cleanup (u32 conn_index, u32 thread_index)
284 tcp_connection_t *tc;
285 tc = tcp_connection_get (conn_index, thread_index);
287 /* Wait for the session tx events to clear */
288 tc->state = TCP_STATE_CLOSED;
289 TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc);
290 tcp_timer_update (tc, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
294 ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4)
296 ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
297 ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
298 ip_interface_address_t *ia = 0;
303 foreach_ip_interface_address (lm4, ia, sw_if_index, 1 /* unnumbered */ ,
305 return ip_interface_address_get_address (lm4, ia);
312 foreach_ip_interface_address (lm6, ia, sw_if_index, 1 /* unnumbered */ ,
314 return ip_interface_address_get_address (lm6, ia);
322 #define PORT_MASK ((1 << 16)- 1)
324 * Allocate local port and, if successful, add entry to local endpoint
325 * table to mark the pair as used.
328 tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip)
330 transport_endpoint_t *tep;
332 u16 min = 1024, max = 65535; /* XXX configurable ? */
336 time_now = tcp_time_now ();
338 /* Only support active opens from thread 0 */
339 ASSERT (vlib_get_thread_index () == 0);
341 /* Start at random point or max */
342 pool_get (tm->local_endpoints, tep);
343 clib_memcpy (&tep->ip, ip, sizeof (*ip));
345 /* Search for first free slot */
346 for (; tries >= 0; tries--)
350 /* Find a port in the specified range */
353 port = random_u32 (&time_now) & PORT_MASK;
354 if (PREDICT_TRUE (port >= min && port < max))
361 tei = transport_endpoint_lookup (&tm->local_endpoints_table, &tep->ip,
363 /* If not found, we're done */
364 if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX)
366 transport_endpoint_table_add (&tm->local_endpoints_table, tep,
367 tep - tm->local_endpoints);
372 pool_put (tm->local_endpoints, tep);
377 * Initialize all connection timers as invalid
380 tcp_connection_timers_init (tcp_connection_t * tc)
384 /* Set all to invalid */
385 for (i = 0; i < TCP_N_TIMERS; i++)
387 tc->timers[i] = TCP_TIMER_HANDLE_INVALID;
390 tc->rto = TCP_RTO_INIT;
394 * Stop all connection timers
397 tcp_connection_timers_reset (tcp_connection_t * tc)
400 for (i = 0; i < TCP_N_TIMERS; i++)
402 tcp_timer_reset (tc, i);
407 typedef struct ip4_tcp_hdr
413 typedef struct ip6_tcp_hdr
420 tcp_connection_select_lb_bucket (tcp_connection_t * tc, const dpo_id_t * dpo,
423 const dpo_id_t *choice;
427 lb = load_balance_get (dpo->dpoi_index);
431 memset (&hdr, 0, sizeof (hdr));
432 hdr.ip.protocol = IP_PROTOCOL_TCP;
433 hdr.ip.address_pair.src.as_u32 = tc->c_lcl_ip.ip4.as_u32;
434 hdr.ip.address_pair.dst.as_u32 = tc->c_rmt_ip.ip4.as_u32;
435 hdr.tcp.src_port = tc->c_lcl_port;
436 hdr.tcp.dst_port = tc->c_rmt_port;
437 hash = ip4_compute_flow_hash (&hdr.ip, lb->lb_hash_config);
442 memset (&hdr, 0, sizeof (hdr));
443 hdr.ip.protocol = IP_PROTOCOL_TCP;
444 clib_memcpy (&hdr.ip.src_address, &tc->c_lcl_ip.ip6,
445 sizeof (ip6_address_t));
446 clib_memcpy (&hdr.ip.dst_address, &tc->c_rmt_ip.ip6,
447 sizeof (ip6_address_t));
448 hdr.tcp.src_port = tc->c_lcl_port;
449 hdr.tcp.dst_port = tc->c_rmt_port;
450 hash = ip6_compute_flow_hash (&hdr.ip, lb->lb_hash_config);
452 choice = load_balance_get_bucket_i (lb, hash & lb->lb_n_buckets_minus_1);
453 dpo_copy (result, choice);
457 tcp_lookup_rmt_in_fib (tcp_connection_t * tc)
462 clib_memcpy (&prefix.fp_addr, &tc->c_rmt_ip, sizeof (prefix.fp_addr));
463 prefix.fp_proto = tc->c_is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6;
464 prefix.fp_len = tc->c_is_ip4 ? 32 : 128;
465 fib_index = fib_table_find (prefix.fp_proto, tc->c_vrf);
466 return fib_table_lookup (fib_index, &prefix);
470 tcp_connection_stack_on_fib_entry (tcp_connection_t * tc)
472 dpo_id_t choice = DPO_INVALID;
473 u32 output_node_index;
476 fe = fib_entry_get (tc->c_rmt_fei);
477 if (fe->fe_lb.dpoi_type != DPO_LOAD_BALANCE)
480 tcp_connection_select_lb_bucket (tc, &fe->fe_lb, &choice);
483 tc->c_is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
484 dpo_stack_from_node (output_node_index, &tc->c_rmt_dpo, &choice);
488 /** Stack tcp connection on peer's fib entry.
490 * This ultimately populates the dpo the connection will use to send packets.
493 tcp_connection_fib_attach (tcp_connection_t * tc)
495 tc->c_rmt_fei = tcp_lookup_rmt_in_fib (tc);
497 ASSERT (tc->c_rmt_fei != FIB_NODE_INDEX_INVALID);
499 tcp_connection_stack_on_fib_entry (tc);
503 /** Initialize tcp connection variables
505 * Should be called after having received a msg from the peer, i.e., a SYN or
506 * a SYNACK, such that connection options have already been exchanged. */
508 tcp_connection_init_vars (tcp_connection_t * tc)
510 tcp_connection_timers_init (tc);
512 scoreboard_init (&tc->sack_sb);
514 // tcp_connection_fib_attach (tc);
518 tcp_connection_open (transport_endpoint_t * rmt)
520 tcp_main_t *tm = vnet_get_tcp_main ();
521 tcp_connection_t *tc;
523 fib_node_index_t fei;
524 u32 sw_if_index, fib_index;
525 ip46_address_t lcl_addr;
529 * Find the local address and allocate port
531 memset (&lcl_addr, 0, sizeof (lcl_addr));
533 /* Find a FIB path to the destination */
534 clib_memcpy (&prefix.fp_addr, &rmt->ip, sizeof (rmt->ip));
535 prefix.fp_proto = rmt->is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6;
536 prefix.fp_len = rmt->is_ip4 ? 32 : 128;
538 fib_index = fib_table_find (prefix.fp_proto, rmt->vrf);
539 fei = fib_table_lookup (fib_index, &prefix);
541 /* Couldn't find route to destination. Bail out. */
542 if (fei == FIB_NODE_INDEX_INVALID)
544 clib_warning ("no route to destination");
548 sw_if_index = fib_entry_get_resolving_interface (fei);
550 if (sw_if_index == (u32) ~ 0)
552 clib_warning ("no resolving interface for %U", format_ip46_address,
553 &rmt->ip, IP46_TYPE_IP4);
561 if (vec_len (tm->ip4_src_addresses))
563 index = tm->last_v4_address_rotor++;
564 if (tm->last_v4_address_rotor >= vec_len (tm->ip4_src_addresses))
565 tm->last_v4_address_rotor = 0;
566 lcl_addr.ip4.as_u32 = tm->ip4_src_addresses[index].as_u32;
570 ip4 = ip_interface_get_first_ip (sw_if_index, 1);
571 lcl_addr.ip4.as_u32 = ip4->as_u32;
579 if (vec_len (tm->ip6_src_addresses))
581 index = tm->last_v6_address_rotor++;
582 if (tm->last_v6_address_rotor >= vec_len (tm->ip6_src_addresses))
583 tm->last_v6_address_rotor = 0;
584 clib_memcpy (&lcl_addr.ip6, &tm->ip6_src_addresses[index],
589 ip6 = ip_interface_get_first_ip (sw_if_index, 0);
590 clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6));
594 /* Allocate source port */
595 lcl_port = tcp_allocate_local_port (tm, &lcl_addr);
598 clib_warning ("Failed to allocate src port");
603 * Create connection and send SYN
606 tc = tcp_half_open_connection_new ();
608 clib_memcpy (&tc->c_rmt_ip, &rmt->ip, sizeof (ip46_address_t));
609 clib_memcpy (&tc->c_lcl_ip, &lcl_addr, sizeof (ip46_address_t));
610 tc->c_rmt_port = clib_host_to_net_u16 (rmt->port);
611 tc->c_lcl_port = clib_host_to_net_u16 (lcl_port);
612 tc->c_c_index = tc - tm->half_open_connections;
613 tc->c_is_ip4 = rmt->is_ip4;
614 tc->c_proto = rmt->is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
615 tc->c_vrf = rmt->vrf;
616 /* The other connection vars will be initialized after SYN ACK */
617 tcp_connection_timers_init (tc);
619 TCP_EVT_DBG (TCP_EVT_OPEN, tc);
620 tc->state = TCP_STATE_SYN_SENT;
623 return tc->c_c_index;
627 tcp_session_open (transport_endpoint_t * tep)
629 return tcp_connection_open (tep);
632 const char *tcp_dbg_evt_str[] = {
633 #define _(sym, str) str,
638 const char *tcp_fsm_states[] = {
639 #define _(sym, str) str,
640 foreach_tcp_fsm_state
645 format_tcp_state (u8 * s, va_list * args)
647 u32 state = va_arg (*args, u32);
649 if (state < TCP_N_STATES)
650 s = format (s, "%s", tcp_fsm_states[state]);
652 s = format (s, "UNKNOWN (%d (0x%x))", state, state);
656 const char *tcp_conn_timers[] = {
657 #define _(sym, str) str,
663 format_tcp_timers (u8 * s, va_list * args)
665 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
668 for (i = 0; i < TCP_N_TIMERS; i++)
669 if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID)
673 for (i = 0; i < last; i++)
675 if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID)
676 s = format (s, "%s,", tcp_conn_timers[i]);
680 s = format (s, "%s]", tcp_conn_timers[i]);
688 format_tcp_congestion_status (u8 * s, va_list * args)
690 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
691 if (tcp_in_recovery (tc))
692 s = format (s, "recovery");
693 else if (tcp_in_fastrecovery (tc))
694 s = format (s, "fastrecovery");
696 s = format (s, "none");
701 format_tcp_vars (u8 * s, va_list * args)
703 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
704 s = format (s, " snd_una %u snd_nxt %u snd_una_max %u",
705 tc->snd_una - tc->iss, tc->snd_nxt - tc->iss,
706 tc->snd_una_max - tc->iss);
707 s = format (s, " rcv_nxt %u rcv_las %u\n",
708 tc->rcv_nxt - tc->irs, tc->rcv_las - tc->irs);
709 s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n",
710 tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs,
711 tc->snd_wl2 - tc->iss);
712 s = format (s, " flight size %u send space %u rcv_wnd_av %d\n",
713 tcp_flight_size (tc), tcp_available_snd_space (tc),
714 tcp_rcv_wnd_available (tc));
715 s = format (s, " cong %U ", format_tcp_congestion_status, tc);
716 s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n",
717 tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked);
718 s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u",
719 tc->prev_ssthresh, tc->snd_congestion - tc->iss,
721 s = format (s, " limited_transmit %u\n", tc->limited_transmit - tc->iss);
722 s = format (s, " tsecr %u tsecr_last_ack %u\n", tc->rcv_opts.tsecr,
724 s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto,
725 tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts);
726 s = format (s, "rtt_seq %u\n", tc->rtt_seq);
727 s = format (s, " tsval_recent %u tsval_recent_age %u\n", tc->tsval_recent,
728 tcp_time_now () - tc->tsval_recent_age);
729 s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb);
730 if (vec_len (tc->snd_sacks))
731 s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc);
737 format_tcp_connection_id (u8 * s, va_list * args)
739 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
744 s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T",
745 format_ip4_address, &tc->c_lcl_ip4,
746 clib_net_to_host_u16 (tc->c_lcl_port), format_ip4_address,
747 &tc->c_rmt_ip4, clib_net_to_host_u16 (tc->c_rmt_port));
751 s = format (s, "[#%d][%s] %U:%d->%U:%d", tc->c_thread_index, "T",
752 format_ip6_address, &tc->c_lcl_ip6,
753 clib_net_to_host_u16 (tc->c_lcl_port), format_ip6_address,
754 &tc->c_rmt_ip6, clib_net_to_host_u16 (tc->c_rmt_port));
761 format_tcp_connection (u8 * s, va_list * args)
763 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
764 u32 verbose = va_arg (*args, u32);
766 s = format (s, "%-50U", format_tcp_connection_id, tc);
769 s = format (s, "%-15U", format_tcp_state, tc->state);
771 s = format (s, " %U\n%U", format_tcp_timers, tc, format_tcp_vars, tc);
778 format_tcp_session (u8 * s, va_list * args)
780 u32 tci = va_arg (*args, u32);
781 u32 thread_index = va_arg (*args, u32);
782 u32 verbose = va_arg (*args, u32);
783 tcp_connection_t *tc;
785 tc = tcp_connection_get (tci, thread_index);
787 s = format (s, "%U", format_tcp_connection, tc, verbose);
789 s = format (s, "empty");
794 format_tcp_listener_session (u8 * s, va_list * args)
796 u32 tci = va_arg (*args, u32);
797 tcp_connection_t *tc = tcp_listener_get (tci);
798 return format (s, "%U", format_tcp_connection_id, tc);
802 format_tcp_half_open_session (u8 * s, va_list * args)
804 u32 tci = va_arg (*args, u32);
805 tcp_connection_t *tc = tcp_half_open_connection_get (tci);
806 return format (s, "%U", format_tcp_connection_id, tc);
810 format_tcp_sacks (u8 * s, va_list * args)
812 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
813 sack_block_t *sacks = tc->snd_sacks;
817 len = vec_len (sacks);
818 for (i = 0; i < len - 1; i++)
821 s = format (s, " start %u end %u\n", block->start - tc->irs,
822 block->end - tc->irs);
826 block = &sacks[len - 1];
827 s = format (s, " start %u end %u", block->start - tc->irs,
828 block->end - tc->irs);
834 format_tcp_rcv_sacks (u8 * s, va_list * args)
836 tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
837 sack_block_t *sacks = tc->rcv_opts.sacks;
841 len = vec_len (sacks);
842 for (i = 0; i < len - 1; i++)
845 s = format (s, " start %u end %u\n", block->start - tc->iss,
846 block->end - tc->iss);
850 block = &sacks[len - 1];
851 s = format (s, " start %u end %u", block->start - tc->iss,
852 block->end - tc->iss);
858 format_tcp_sack_hole (u8 * s, va_list * args)
860 sack_scoreboard_hole_t *hole = va_arg (*args, sack_scoreboard_hole_t *);
861 s = format (s, "[%u, %u]", hole->start, hole->end);
866 format_tcp_scoreboard (u8 * s, va_list * args)
868 sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *);
869 sack_scoreboard_hole_t *hole;
870 s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n",
871 sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes);
872 s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n",
873 sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv);
874 s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u",
875 sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt);
877 hole = scoreboard_first_hole (sb);
879 s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail);
883 s = format (s, "%U", format_tcp_sack_hole, hole);
884 hole = scoreboard_next_hole (sb, hole);
890 transport_connection_t *
891 tcp_session_get_transport (u32 conn_index, u32 thread_index)
893 tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index);
894 return &tc->connection;
897 transport_connection_t *
898 tcp_half_open_session_get_transport (u32 conn_index)
900 tcp_connection_t *tc = tcp_half_open_connection_get (conn_index);
901 return &tc->connection;
905 * Compute maximum segment size for session layer.
907 * Since the result needs to be the actual data length, it first computes
908 * the tcp options to be used in the next burst and subtracts their
909 * length from the connection's snd_mss.
912 tcp_session_send_mss (transport_connection_t * trans_conn)
914 tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
916 /* Ensure snd_mss does accurately reflect the amount of data we can push
917 * in a segment. This also makes sure that options are updated according to
918 * the current state of the connection. */
919 tcp_update_snd_mss (tc);
925 tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
927 if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
929 return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0;
932 /* If we can't write at least a segment, don't try at all */
933 if (PREDICT_FALSE (snd_space < tc->snd_mss))
935 if (snd_space > clib_min (tc->mss, tc->rcv_opts.mss) - TCP_HDR_LEN_MAX)
940 /* round down to mss multiple */
941 return snd_space - (snd_space % tc->snd_mss);
945 * Compute tx window session is allowed to fill.
947 * Takes into account available send space, snd_mss and the congestion
948 * state of the connection. If possible, the value returned is a multiple
951 * @param tc tcp connection
952 * @return number of bytes session is allowed to write
955 tcp_snd_space (tcp_connection_t * tc)
957 int snd_space, snt_limited;
959 if (PREDICT_TRUE (tcp_in_cong_recovery (tc) == 0))
961 snd_space = tcp_available_snd_space (tc);
963 /* If we haven't gotten dupacks or if we did and have gotten sacked
964 * bytes then we can still send as per Limited Transmit (RFC3042) */
965 if (PREDICT_FALSE (tc->rcv_dupacks != 0
966 && (tcp_opts_sack_permitted (tc)
967 && tc->sack_sb.last_sacked_bytes == 0)))
969 if (tc->rcv_dupacks == 1 && tc->limited_transmit != tc->snd_nxt)
970 tc->limited_transmit = tc->snd_nxt;
971 ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt));
973 snt_limited = tc->snd_nxt - tc->limited_transmit;
974 snd_space = clib_max (2 * tc->snd_mss - snt_limited, 0);
976 return tcp_round_snd_space (tc, snd_space);
979 if (tcp_in_recovery (tc))
981 tc->snd_nxt = tc->snd_una_max;
982 snd_space = tcp_available_wnd (tc) - tc->snd_rxt_bytes
983 - (tc->snd_una_max - tc->snd_congestion);
984 if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd)
986 return tcp_round_snd_space (tc, snd_space);
989 /* If in fast recovery, send 1 SMSS if wnd allows */
990 if (tcp_in_fastrecovery (tc)
991 && tcp_available_snd_space (tc) && !tcp_fastrecovery_sent_1_smss (tc))
993 tcp_fastrecovery_1_smss_on (tc);
1001 tcp_session_send_space (transport_connection_t * trans_conn)
1003 tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
1004 return tcp_snd_space (tc);
1008 tcp_rcv_wnd_available (tcp_connection_t * tc)
1010 return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);
1014 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn)
1016 tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
1018 ASSERT (seq_geq (tc->snd_nxt, tc->snd_una));
1020 /* This still works if fast retransmit is on */
1021 return (tc->snd_nxt - tc->snd_una);
1025 const static transport_proto_vft_t tcp_proto = {
1026 .bind = tcp_session_bind,
1027 .unbind = tcp_session_unbind,
1028 .push_header = tcp_push_header,
1029 .get_connection = tcp_session_get_transport,
1030 .get_listener = tcp_session_get_listener,
1031 .get_half_open = tcp_half_open_session_get_transport,
1032 .open = tcp_session_open,
1033 .close = tcp_session_close,
1034 .cleanup = tcp_session_cleanup,
1035 .send_mss = tcp_session_send_mss,
1036 .send_space = tcp_session_send_space,
1037 .tx_fifo_offset = tcp_session_tx_fifo_offset,
1038 .format_connection = format_tcp_session,
1039 .format_listener = format_tcp_listener_session,
1040 .format_half_open = format_tcp_half_open_session,
1045 tcp_timer_keep_handler (u32 conn_index)
1047 u32 thread_index = vlib_get_thread_index ();
1048 tcp_connection_t *tc;
1050 tc = tcp_connection_get (conn_index, thread_index);
1051 tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID;
1053 tcp_connection_close (tc);
1057 tcp_timer_establish_handler (u32 conn_index)
1059 tcp_connection_t *tc;
1062 tc = tcp_half_open_connection_get (conn_index);
1063 tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID;
1065 ASSERT (tc->state == TCP_STATE_SYN_SENT);
1067 sst = tc->c_is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
1068 stream_session_connect_notify (&tc->connection, sst, 1 /* fail */ );
1070 tcp_connection_cleanup (tc);
1074 tcp_timer_waitclose_handler (u32 conn_index)
1076 u32 thread_index = vlib_get_thread_index ();
1077 tcp_connection_t *tc;
1079 tc = tcp_connection_get (conn_index, thread_index);
1080 tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID;
1082 /* Session didn't come back with a close(). Send FIN either way
1083 * and switch to LAST_ACK. */
1084 if (tc->state == TCP_STATE_CLOSE_WAIT)
1086 if (tc->flags & TCP_CONN_FINSNT)
1088 clib_warning ("FIN was sent and still in CLOSE WAIT. Weird!");
1092 tc->state = TCP_STATE_LAST_ACK;
1094 /* Make sure we don't wait in LAST ACK forever */
1095 tcp_timer_set (tc, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
1097 /* Don't delete the connection yet */
1101 tcp_connection_del (tc);
1105 static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] =
1107 tcp_timer_retransmit_handler,
1108 tcp_timer_delack_handler,
1109 tcp_timer_persist_handler,
1110 tcp_timer_keep_handler,
1111 tcp_timer_waitclose_handler,
1112 tcp_timer_retransmit_syn_handler,
1113 tcp_timer_establish_handler
1118 tcp_expired_timers_dispatch (u32 * expired_timers)
1121 u32 connection_index, timer_id;
1123 for (i = 0; i < vec_len (expired_timers); i++)
1125 /* Get session index and timer id */
1126 connection_index = expired_timers[i] & 0x0FFFFFFF;
1127 timer_id = expired_timers[i] >> 28;
1129 TCP_EVT_DBG (TCP_EVT_TIMER_POP, connection_index, timer_id);
1131 /* Handle expiration */
1132 (*timer_expiration_handlers[timer_id]) (connection_index);
1137 tcp_initialize_timer_wheels (tcp_main_t * tm)
1139 tw_timer_wheel_16t_2w_512sl_t *tw;
1141 foreach_vlib_main (({
1142 tw = &tm->timer_wheels[ii];
1143 tw_timer_wheel_init_16t_2w_512sl (tw, tcp_expired_timers_dispatch,
1144 100e-3 /* timer period 100ms */ , ~0);
1145 tw->last_run_time = vlib_time_now (this_vlib_main);
1151 tcp_main_enable (vlib_main_t * vm)
1153 tcp_main_t *tm = vnet_get_tcp_main ();
1154 ip_protocol_info_t *pi;
1155 ip_main_t *im = &ip_main;
1156 vlib_thread_main_t *vtm = vlib_get_thread_main ();
1157 clib_error_t *error = 0;
1160 tcp_connection_t *tc __attribute__ ((unused));
1162 if ((error = vlib_call_init_function (vm, ip_main_init)))
1164 if ((error = vlib_call_init_function (vm, ip4_lookup_init)))
1166 if ((error = vlib_call_init_function (vm, ip6_lookup_init)))
1173 /* Register with IP */
1174 pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP);
1176 return clib_error_return (0, "TCP protocol info AWOL");
1177 pi->format_header = format_tcp_header;
1178 pi->unformat_pg_edit = unformat_pg_tcp_header;
1180 ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index);
1182 /* Register as transport with session layer */
1183 session_register_transport (SESSION_TYPE_IP4_TCP, &tcp_proto);
1184 session_register_transport (SESSION_TYPE_IP6_TCP, &tcp_proto);
1187 * Initialize data structures
1190 num_threads = 1 /* main thread */ + vtm->n_threads;
1191 vec_validate (tm->connections, num_threads - 1);
1194 * Preallocate connections
1196 for (thread = 0; thread < num_threads; thread++)
1198 for (i = 0; i < tm->preallocated_connections; i++)
1199 pool_get (tm->connections[thread], tc);
1201 for (i = 0; i < tm->preallocated_connections; i++)
1202 pool_put_index (tm->connections[thread], i);
1206 * Preallocate half-open connections
1208 for (i = 0; i < tm->preallocated_half_open_connections; i++)
1209 pool_get (tm->half_open_connections, tc);
1211 for (i = 0; i < tm->preallocated_half_open_connections; i++)
1212 pool_put_index (tm->half_open_connections, i);
1214 /* Initialize per worker thread tx buffers (used for control messages) */
1215 vec_validate (tm->tx_buffers, num_threads - 1);
1217 /* Initialize timer wheels */
1218 vec_validate (tm->timer_wheels, num_threads - 1);
1219 tcp_initialize_timer_wheels (tm);
1221 /* Initialize clocks per tick for TCP timestamp. Used to compute
1222 * monotonically increasing timestamps. */
1223 tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock
1224 / TCP_TSTAMP_RESOLUTION;
1226 clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table",
1227 200000 /* $$$$ config parameter nbuckets */ ,
1228 (64 << 20) /*$$$ config parameter table size */ );
1229 if (num_threads > 1)
1230 clib_spinlock_init (&tm->half_open_lock);
1235 vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en)
1239 if (tcp_main.is_enabled)
1242 return tcp_main_enable (vm);
1246 tcp_main.is_enabled = 0;
1253 tcp_init (vlib_main_t * vm)
1255 tcp_main_t *tm = vnet_get_tcp_main ();
1257 tm->vnet_main = vnet_get_main ();
1263 VLIB_INIT_FUNCTION (tcp_init);
1266 static clib_error_t *
1267 tcp_config_fn (vlib_main_t * vm, unformat_input_t * input)
1269 tcp_main_t *tm = vnet_get_tcp_main ();
1271 while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1274 (input, "preallocated-connections %d",
1275 &tm->preallocated_connections))
1277 else if (unformat (input, "preallocated-half-open-connections %d",
1278 &tm->preallocated_half_open_connections))
1281 return clib_error_return (0, "unknown input `%U'",
1282 format_unformat_error, input);
1287 VLIB_CONFIG_FUNCTION (tcp_config_fn, "tcp");
1289 static clib_error_t *
1290 tcp_src_address (vlib_main_t * vm,
1291 unformat_input_t * input, vlib_cli_command_t * cmd_arg)
1293 tcp_main_t *tm = vnet_get_tcp_main ();
1294 ip4_address_t v4start, v4end;
1295 ip6_address_t v6start, v6end;
1299 while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1301 if (unformat (input, "%U - %U", unformat_ip4_address, &v4start,
1302 unformat_ip4_address, &v4end))
1304 else if (unformat (input, "%U", unformat_ip4_address, &v4start))
1306 memcpy (&v4end, &v4start, sizeof (v4start));
1309 else if (unformat (input, "%U - %U", unformat_ip6_address, &v6start,
1310 unformat_ip4_address, &v6end))
1312 else if (unformat (input, "%U", unformat_ip6_address, &v6start))
1314 memcpy (&v6end, &v6start, sizeof (v4start));
1321 if (!v4set && !v6set)
1322 return clib_error_return (0, "at least one v4 or v6 address required");
1330 vec_add1 (tm->ip4_src_addresses, v4start);
1331 tmp = clib_net_to_host_u32 (v4start.as_u32);
1333 v4start.as_u32 = clib_host_to_net_u32 (tmp);
1335 while (clib_host_to_net_u32 (v4start.as_u32) <=
1336 clib_host_to_net_u32 (v4end.as_u32));
1340 clib_warning ("v6 src address list unimplemented...");
1346 VLIB_CLI_COMMAND (tcp_src_address_command, static) =
1348 .path = "tcp src-address",
1349 .short_help = "tcp src-address <ip-addr> [- <ip-addr>] add src address range",
1350 .function = tcp_src_address,
1355 tcp_scoreboard_dump_trace (u8 * s, sack_scoreboard_t * sb)
1357 #if TCP_SCOREBOARD_TRACE
1359 scoreboard_trace_elt_t *block;
1365 s = format (s, "scoreboard trace:");
1366 vec_foreach (block, sb->trace)
1368 s = format (s, "{%u, %u, %u, %u, %u}, ", block->start, block->end,
1369 block->ack, block->snd_una_max, block->group);
1371 s = format (s, "\n");
1379 static clib_error_t *
1380 tcp_show_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input,
1381 vlib_cli_command_t * cmd_arg)
1383 transport_connection_t *tconn = 0;
1384 tcp_connection_t *tc;
1386 while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1388 if (unformat (input, "%U", unformat_transport_connection, &tconn,
1389 TRANSPORT_PROTO_TCP))
1392 return clib_error_return (0, "unknown input `%U'",
1393 format_unformat_error, input);
1396 if (!TCP_SCOREBOARD_TRACE)
1398 vlib_cli_output (vm, "scoreboard tracing not enabled");
1402 tc = tcp_get_connection_from_transport (tconn);
1403 s = tcp_scoreboard_dump_trace (s, &tc->sack_sb);
1404 vlib_cli_output (vm, "%v", s);
1409 VLIB_CLI_COMMAND (tcp_show_scoreboard_trace_command, static) =
1411 .path = "show tcp scoreboard trace",
1412 .short_help = "show tcp scoreboard trace <connection>",
1413 .function = tcp_show_scoreboard_trace_fn,
1418 tcp_scoreboard_replay (u8 * s, tcp_connection_t * tc, u8 verbose)
1421 scoreboard_trace_elt_t *trace;
1422 u32 next_ack, left, group, has_new_ack = 0;
1423 tcp_connection_t _dummy_tc, *dummy_tc = &_dummy_tc;
1424 sack_block_t *block;
1429 memset (dummy_tc, 0, sizeof (*dummy_tc));
1430 tcp_connection_timers_init (dummy_tc);
1431 scoreboard_init (&dummy_tc->sack_sb);
1432 dummy_tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK;
1434 #if TCP_SCOREBOARD_TRACE
1435 trace = tc->sack_sb.trace;
1436 trace_len = vec_len (tc->sack_sb.trace);
1442 for (i = 0; i < trace_len; i++)
1444 if (trace[i].ack != 0)
1446 dummy_tc->snd_una = trace[i].ack - 1448;
1447 dummy_tc->snd_una_max = trace[i].ack;
1452 while (left < trace_len)
1454 group = trace[left].group;
1455 vec_reset_length (dummy_tc->rcv_opts.sacks);
1457 while (trace[left].group == group)
1459 if (trace[left].ack != 0)
1462 s = format (s, "Adding ack %u, snd_una_max %u, segs: ",
1463 trace[left].ack, trace[left].snd_una_max);
1464 dummy_tc->snd_una_max = trace[left].snd_una_max;
1465 next_ack = trace[left].ack;
1471 s = format (s, "[%u, %u], ", trace[left].start,
1473 vec_add2 (dummy_tc->rcv_opts.sacks, block, 1);
1474 block->start = trace[left].start;
1475 block->end = trace[left].end;
1481 tcp_rcv_sacks (dummy_tc, next_ack);
1483 dummy_tc->snd_una = next_ack + dummy_tc->sack_sb.snd_una_adv;
1486 s = format (s, "result: %U", format_tcp_scoreboard,
1487 &dummy_tc->sack_sb);
1490 s = format (s, "result: %U", format_tcp_scoreboard, &dummy_tc->sack_sb);
1495 static clib_error_t *
1496 tcp_scoreboard_trace_fn (vlib_main_t * vm, unformat_input_t * input,
1497 vlib_cli_command_t * cmd_arg)
1499 transport_connection_t *tconn = 0;
1500 tcp_connection_t *tc = 0;
1502 while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1504 if (unformat (input, "%U", unformat_transport_connection, &tconn,
1505 TRANSPORT_PROTO_TCP))
1508 return clib_error_return (0, "unknown input `%U'",
1509 format_unformat_error, input);
1512 if (!TCP_SCOREBOARD_TRACE)
1514 vlib_cli_output (vm, "scoreboard tracing not enabled");
1518 tc = tcp_get_connection_from_transport (tconn);
1521 vlib_cli_output (vm, "connection not found");
1524 str = tcp_scoreboard_replay (str, tc, 1);
1525 vlib_cli_output (vm, "%v", str);
1530 VLIB_CLI_COMMAND (tcp_replay_scoreboard_command, static) =
1532 .path = "tcp replay scoreboard",
1533 .short_help = "tcp replay scoreboard <connection>",
1534 .function = tcp_scoreboard_trace_fn,
1539 * fd.io coding-style-patch-verification: ON
1542 * eval: (c-set-style "gnu")