2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
16 #include <vppinfra/sparse_vec.h>
17 #include <vnet/tcp/tcp_packet.h>
18 #include <vnet/tcp/tcp.h>
19 #include <vnet/session/session.h>
/* NOTE(review): this extract is lossy -- the original file's line numbers are
 * embedded at the start of each line and several lines (braces, enum
 * terminators) are absent. Comments are added without altering any token. */
/* Per-error counter strings, expanded from tcp_error.def via x-macro. */
22 static char *tcp_error_strings[] = {
23 #define tcp_error(n,s) s,
24 #include <vnet/tcp/tcp_error.def>
28 /* All TCP nodes have the same outgoing arcs */
29 #define foreach_tcp_state_next \
30 _ (DROP, "error-drop") \
31 _ (TCP4_OUTPUT, "tcp4-output") \
32 _ (TCP6_OUTPUT, "tcp6-output")
/* Per-state next-node index enums, all generated from foreach_tcp_state_next
 * so every TCP input node shares the same arc ordering. */
34 typedef enum _tcp_established_next
36 #define _(s,n) TCP_ESTABLISHED_NEXT_##s,
37 foreach_tcp_state_next
39 TCP_ESTABLISHED_N_NEXT,
40 } tcp_established_next_t;
42 typedef enum _tcp_rcv_process_next
44 #define _(s,n) TCP_RCV_PROCESS_NEXT_##s,
45 foreach_tcp_state_next
47 TCP_RCV_PROCESS_N_NEXT,
48 } tcp_rcv_process_next_t;
50 typedef enum _tcp_syn_sent_next
52 #define _(s,n) TCP_SYN_SENT_NEXT_##s,
53 foreach_tcp_state_next
56 } tcp_syn_sent_next_t;
58 typedef enum _tcp_listen_next
60 #define _(s,n) TCP_LISTEN_NEXT_##s,
61 foreach_tcp_state_next
66 /* Generic, state independent indices */
67 typedef enum _tcp_state_next
69 #define _(s,n) TCP_NEXT_##s,
70 foreach_tcp_state_next
/* Pick the v4 or v6 output node index for a connection. */
75 #define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \
76 : TCP_NEXT_TCP6_OUTPUT)
/* Forward declarations for the node registrations at the bottom of the file. */
78 vlib_node_registration_t tcp4_established_node;
79 vlib_node_registration_t tcp6_established_node;
82 * Validate segment sequence number. As per RFC793:
84 * Segment Receive Test
86 * ------- ------- -------------------------------------------
87 * 0 0 SEG.SEQ = RCV.NXT
88 * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
90 * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
91 * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
93 * This ultimately consists in checking if segment falls within the window.
94 * The one important difference compared to RFC793 is that we use rcv_las,
95 * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the
96 * peer's reference when computing our receive window.
99 * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las)
100 * however, is too strict when we have retransmits. Instead we just check that
101 * the seq is not beyond the right edge and that the end of the segment is not
102 * less than the left edge.
104 * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so
105 * use rcv_nxt in the right edge window test instead of rcv_las.
/* Nonzero when the segment overlaps the window: end not left of rcv_las and
 * start not right of rcv_nxt + rcv_wnd (wrap-safe seq_* comparisons). */
109 tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq)
111 return (seq_geq (end_seq, tc->rcv_las)
112 && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd));
116 * Parse TCP header options.
118 * @param th TCP header
119 * @param to TCP options data structure to be populated
120 * @return -1 if parsing failed
/* NOTE(review): the switch header and several case/brace lines are missing
 * from this extract; commentary below only covers what is visible. */
123 tcp_options_parse (tcp_header_t * th, tcp_options_t * to)
126 u8 opt_len, opts_len, kind;
/* Options length = data offset (in 32-bit words) minus fixed header. */
130 opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
131 data = (const u8 *) (th + 1);
133 /* Zero out all flags but those set in SYN */
134 to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE);
136 for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
140 /* Get options length */
141 if (kind == TCP_OPTION_EOL)
143 else if (kind == TCP_OPTION_NOOP)
155 /* weird option length */
156 if (opt_len < 2 || opt_len > opts_len)
/* MSS, WSCALE and SACK_PERMITTED are only legal on SYN segments. */
164 if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
166 to->flags |= TCP_OPTS_FLAG_MSS;
167 to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
170 case TCP_OPTION_WINDOW_SCALE:
171 if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
173 to->flags |= TCP_OPTS_FLAG_WSCALE;
174 to->wscale = data[2];
/* RFC7323: clamp shift count to the maximum of 14. */
175 if (to->wscale > TCP_MAX_WND_SCALE)
177 clib_warning ("Illegal window scaling value: %d",
179 to->wscale = TCP_MAX_WND_SCALE;
183 case TCP_OPTION_TIMESTAMP:
184 if (opt_len == TCP_OPTION_LEN_TIMESTAMP)
186 to->flags |= TCP_OPTS_FLAG_TSTAMP;
187 to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
188 to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
191 case TCP_OPTION_SACK_PERMITTED:
192 if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
193 to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
195 case TCP_OPTION_SACK_BLOCK:
196 /* If SACK permitted was not advertised or a SYN, break */
197 if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
200 /* If too short or not correctly formatted, break */
201 if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
204 to->flags |= TCP_OPTS_FLAG_SACK;
205 to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
/* Rebuild the SACK block vector from scratch for this segment. */
206 vec_reset_length (to->sacks);
207 for (j = 0; j < to->n_sack_blocks; j++)
209 b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 4 * j));
210 b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 4 * j));
211 vec_add1 (to->sacks, b);
215 /* Nothing to see here */
223 * RFC1323: Check against wrapped sequence numbers (PAWS). If we have
224 * timestamp to echo and it's less than tsval_recent, drop segment
225 * but still send an ACK in order to retain TCP's mechanism for detecting
226 * and recovering from half-open connections
228 * Or at least that's what the theory says. It seems that this might not work
229 * very well with packet reordering and fast retransmit. XXX
/* Nonzero (PAWS failure) when a timestamp is present, tsval_recent is set,
 * and the segment's tsval is older than tsval_recent. */
232 tcp_segment_check_paws (tcp_connection_t * tc)
234 return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent
235 && timestamp_lt (tc->opt.tsval, tc->tsval_recent);
239 * Update tsval recent
/* Copies the segment's TSval into tc->tsval_recent when the segment covers
 * rcv_las (last ACK sent), per the RFC1323 rule quoted below. */
242 tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end)
245 * RFC1323: If Last.ACK.sent falls within the range of sequence numbers
246 * of an incoming segment:
247 * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN
248 * then the TSval from the segment is copied to TS.Recent;
249 * otherwise, the TSval is ignored.
251 if (tcp_opts_tstamp (&tc->opt) && tc->tsval_recent
252 && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end))
254 tc->tsval_recent = tc->opt.tsval;
/* Age stamp used by the 24-day PAWS invalidation in tcp_segment_validate. */
255 tc->tsval_recent_age = tcp_time_now ();
260 * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
262 * It first verifies if segment has a wrapped sequence number (PAWS) and then
263 * does the processing associated to the first four steps (ignoring security
264 * and precedence): sequence number, rst bit and syn bit checks.
266 * @return 0 if segments passes validation.
/* NOTE(review): several control-flow lines (braces, returns, rst checks) are
 * missing from this extract; the visible order is PAWS -> seq check ->
 * RST -> SYN -> timestamp update. */
269 tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
270 vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0)
/* Segments with none of ACK/RST/SYN are malformed. */
272 if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
275 if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->opt)))
280 if (tcp_segment_check_paws (tc0))
282 clib_warning ("paws failed");
283 TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
284 vnet_buffer (b0)->tcp.seq_end);
286 /* If it just so happens that a segment updates tsval_recent for a
287 * segment over 24 days old, invalidate tsval_recent. */
288 if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
291 /* Age isn't reset until we get a valid tsval (bsd inspired) */
292 tc0->tsval_recent = 0;
293 clib_warning ("paws failed - really old segment. REALLY?");
297 /* Drop after ack if not rst */
300 tcp_make_ack (tc0, b0);
301 *next0 = tcp_next_output (tc0->c_is_ip4);
302 TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
308 /* 1st: check sequence number */
309 if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number,
310 vnet_buffer (b0)->tcp.seq_end))
312 /* If our window is 0 and the packet is in sequence, let it pass
313 * through for ack processing. It should be dropped later.*/
314 if (tc0->rcv_wnd == 0
315 && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
317 /* TODO Should segment be tagged? */
321 /* If not RST, send dup ack */
324 tcp_make_ack (tc0, b0);
325 *next0 = tcp_next_output (tc0->c_is_ip4);
326 TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc0);
332 /* 2nd: check the RST bit */
335 tcp_connection_reset (tc0);
339 /* 3rd: check security and precedence (skip) */
341 /* 4th: check the SYN bit */
/* In-window SYN on an established connection -> send RST (RFC793 reset
 * processing; presumably guarded by a tcp_syn() test on a missing line). */
344 tcp_send_reset (b0, tc0->c_is_ip4);
348 /* If segment in window, save timestamp */
349 tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
350 vnet_buffer (b0)->tcp.seq_end);
/* RFC793 acceptability test for an ACK: it must not ack data before snd_una
 * nor data beyond snd_nxt (wrap-safe comparisons). */
356 tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0)
358 /* SND.UNA =< SEG.ACK =< SND.NXT */
359 return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number)
360 && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_nxt));
364 * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
366 * Note that although the original article, srtt and rttvar are scaled
367 * to minimize round-off errors, here we don't. Instead, we rely on
368 * better precision time measurements.
370 * TODO support us rtt resolution
373 tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
/* RFC6298: srtt += (mrtt - srtt)/8, rttvar += (|err| - rttvar)/4.
 * NOTE(review): the branch condition selecting first-vs-subsequent
 * measurement is missing from this extract. */
379 err = mrtt - tc->srtt;
380 tc->srtt += err >> 3;
382 /* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
383 * The increase should be bound */
384 tc->rttvar += ((int) clib_abs (err) - (int) tc->rttvar) >> 2;
388 /* First measurement. */
390 tc->rttvar = mrtt >> 1;
394 /** Update RTT estimate and RTO timer
396 * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
397 * timing. Middle boxes are known to fiddle with TCP options so we
398 * should give higher priority to ACK timing.
400 * return 1 if valid rtt 0 otherwise
403 tcp_update_rtt (tcp_connection_t * tc, u32 ack)
408 /* Determine if only rtx bytes are acked. TODO fast retransmit */
409 rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss);
411 /* Karn's rule, part 1. Don't use retransmitted segments to estimate
412 * RTT because they're ambiguous. */
413 if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq) && !rtx_acked)
415 mrtt = tcp_time_now () - tc->rtt_ts;
417 /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
418 * snd_una, i.e., the left side of the send window:
419 * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't
420 * try to update rtt for dupacks */
421 else if (tcp_opts_tstamp (&tc->opt) && tc->opt.tsecr && tc->bytes_acked)
423 mrtt = tcp_time_now () - tc->opt.tsecr;
426 /* Allow measuring of a new RTT */
429 /* If ACK moves left side of the wnd make sure boff is 0, even if mrtt is
434 /* Ignore dubious measurements */
435 if (mrtt == 0 || mrtt > TCP_RTT_MAX)
/* Valid sample: update srtt/rttvar, then RTO = srtt + 4*rttvar, capped. */
438 tcp_estimate_rtt (tc, mrtt);
439 tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
445 * Dequeue bytes that have been acked and while at it update RTT estimates.
448 tcp_dequeue_acked (tcp_connection_t * tc, u32 ack)
450 /* Dequeue the newly ACKed bytes */
451 stream_session_dequeue_drop (&tc->connection, tc->bytes_acked);
453 /* Update rtt and rto */
454 tcp_update_rtt (tc, ack);
458 * Check if dupack as per RFC5681 Sec. 2
460 * This works only if called before updating snd_wnd.
/* Dupack = acks snd_una again, with outstanding data, no payload, and an
 * unchanged advertised window. */
463 tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd)
465 return ((vnet_buffer (b)->tcp.ack_number == tc->snd_una)
466 && seq_gt (tc->snd_una_max, tc->snd_una)
467 && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
468 && (new_snd_wnd == tc->snd_wnd));
/* Unlink a hole from the scoreboard's doubly-linked list (indices into the
 * holes pool, TCP_INVALID_SACK_HOLE_INDEX as sentinel) and free it. */
472 scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
474 sack_scoreboard_hole_t *next, *prev;
476 if (hole->next != TCP_INVALID_SACK_HOLE_INDEX)
478 next = pool_elt_at_index (sb->holes, hole->next);
479 next->prev = hole->prev;
482 if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
484 prev = pool_elt_at_index (sb->holes, hole->prev);
485 prev->next = hole->next;
/* Removed hole was the head (else branch on a missing line). */
489 sb->head = hole->next;
492 pool_put (sb->holes, hole);
/* Allocate a new hole and link it after prev_index (or at the head when
 * prev_index is invalid). Returns the new hole; note pool_get may move the
 * pool, so callers must re-fetch held hole pointers by index. */
495 sack_scoreboard_hole_t *
496 scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
499 sack_scoreboard_hole_t *hole, *next, *prev;
502 pool_get (sb->holes, hole);
503 memset (hole, 0, sizeof (*hole));
507 hole_index = hole - sb->holes;
509 prev = scoreboard_get_hole (sb, prev_index);
512 hole->prev = prev - sb->holes;
513 hole->next = prev->next;
515 if ((next = scoreboard_next_hole (sb, hole)))
516 next->prev = hole_index;
518 prev->next = hole_index;
/* No prev: new hole becomes the list head. */
522 sb->head = hole_index;
523 hole->prev = TCP_INVALID_SACK_HOLE_INDEX;
524 hole->next = TCP_INVALID_SACK_HOLE_INDEX;
/* Process the SACK blocks in tc->opt against the scoreboard: drop invalid
 * blocks, sort them, then walk the hole list removing/shrinking/splitting
 * holes covered by SACKed ranges while accounting sacked/delivered bytes.
 * NOTE(review): many brace/flow lines are missing from this extract; the
 * byte accounting below should be re-verified against the full source. */
531 tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
533 sack_scoreboard_t *sb = &tc->sack_sb;
534 sack_block_t *blk, tmp;
535 sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole;
536 u32 blk_index = 0, old_sacked_bytes, delivered_bytes, hole_index;
539 sb->last_sacked_bytes = 0;
541 old_sacked_bytes = sb->sacked_bytes;
544 if (!tcp_opts_sack (&tc->opt) && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
547 /* Remove invalid blocks */
549 while (blk < vec_end (tc->opt.sacks))
/* Keep only well-formed blocks strictly above snd_una/ack and within
 * snd_nxt; everything else is deleted below. */
551 if (seq_lt (blk->start, blk->end)
552 && seq_gt (blk->start, tc->snd_una)
553 && seq_gt (blk->start, ack) && seq_leq (blk->end, tc->snd_nxt))
558 vec_del1 (tc->opt.sacks, blk - tc->opt.sacks);
561 /* Add block for cumulative ack */
562 if (seq_gt (ack, tc->snd_una))
564 tmp.start = tc->snd_una;
566 vec_add1 (tc->opt.sacks, tmp);
569 if (vec_len (tc->opt.sacks) == 0)
572 /* Make sure blocks are ordered */
/* O(n^2) sort is fine: at most TCP_MAX_SACK_BLOCKS entries. */
573 for (i = 0; i < vec_len (tc->opt.sacks); i++)
574 for (j = i + 1; j < vec_len (tc->opt.sacks); j++)
575 if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start))
577 tmp = tc->opt.sacks[i];
578 tc->opt.sacks[i] = tc->opt.sacks[j];
579 tc->opt.sacks[j] = tmp;
582 if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
584 /* If no holes, insert the first that covers all outstanding bytes */
585 last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
586 tc->snd_una, tc->snd_una_max);
587 sb->tail = scoreboard_hole_index (sb, last_hole);
588 tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1];
589 sb->max_byte_sacked = tmp.end;
593 /* If we have holes but snd_una_max is beyond the last hole, update
595 tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1];
596 last_hole = scoreboard_last_hole (sb);
597 if (seq_gt (tc->snd_una_max, sb->max_byte_sacked)
598 && seq_gt (tc->snd_una_max, last_hole->end))
599 last_hole->end = tc->snd_una_max;
602 /* Walk the holes with the SACK blocks */
603 hole = pool_elt_at_index (sb->holes, sb->head);
604 while (hole && blk_index < vec_len (tc->opt.sacks))
606 blk = &tc->opt.sacks[blk_index];
608 if (seq_leq (blk->start, hole->start))
610 /* Block covers hole. Remove hole */
611 if (seq_geq (blk->end, hole->end))
613 next_hole = scoreboard_next_hole (sb, hole);
615 /* Byte accounting */
616 if (seq_leq (hole->end, ack))
618 /* Bytes lost because snd_wnd left edge advances */
619 if (next_hole && seq_leq (next_hole->start, ack))
620 delivered_bytes += next_hole->start - hole->end;
622 delivered_bytes += ack - hole->end;
626 sb->sacked_bytes += scoreboard_hole_bytes (hole);
629 /* About to remove last hole */
630 if (hole == last_hole)
632 sb->tail = hole->prev;
633 last_hole = scoreboard_last_hole (sb);
634 /* keep track of max byte sacked for when the last hole
636 if (seq_gt (hole->end, sb->max_byte_sacked))
637 sb->max_byte_sacked = hole->end;
640 /* snd_una needs to be advanced */
641 if (blk->end == ack && seq_geq (ack, hole->end))
643 if (next_hole && seq_lt (ack, next_hole->start))
645 sb->snd_una_adv = next_hole->start - ack;
647 /* all these can be delivered */
648 delivered_bytes += sb->snd_una_adv;
652 sb->snd_una_adv = sb->max_byte_sacked - ack;
653 delivered_bytes += sb->snd_una_adv;
657 scoreboard_remove_hole (sb, hole);
660 /* Partial 'head' overlap */
663 if (seq_gt (blk->end, hole->start))
665 sb->sacked_bytes += blk->end - hole->start;
666 hole->start = blk->end;
673 /* Hole must be split */
674 if (seq_lt (blk->end, hole->end))
676 sb->sacked_bytes += blk->end - blk->start;
677 hole_index = scoreboard_hole_index (sb, hole);
678 new_hole = scoreboard_insert_hole (sb, hole_index, blk->end,
681 /* Pool might've moved */
682 hole = scoreboard_get_hole (sb, hole_index);
683 hole->end = blk->start;
685 /* New or split of tail */
686 if ((last_hole->end == new_hole->end)
687 || seq_lt (last_hole->end, new_hole->start))
689 last_hole = new_hole;
690 sb->tail = scoreboard_hole_index (sb, new_hole);
694 hole = scoreboard_next_hole (sb, hole);
/* Partial 'tail' overlap: shrink hole from the right. */
698 sb->sacked_bytes += hole->end - blk->start;
699 hole->end = blk->start;
700 hole = scoreboard_next_hole (sb, hole);
705 sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes;
706 sb->sacked_bytes -= delivered_bytes;
711 * If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
712 * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
714 tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
/* RFC793 send-window update rule; only accept windows from segments that
 * are not older than what we last used (prevents stale window updates). */
716 if (seq_lt (tc->snd_wl1, seq)
717 || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
719 tc->snd_wnd = snd_wnd;
722 TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
724 /* Set probe timer if we just got 0 wnd */
725 if (tc->snd_wnd < tc->snd_mss
726 && !tcp_timer_is_active (tc, TCP_TIMER_PERSIST))
727 tcp_persist_timer_set (tc);
/* Window opened again: stop zero-window probing (else branch). */
729 tcp_persist_timer_reset (tc);
/* Enter congestion: remember the recovery high-water mark (snd_nxt) and let
 * the pluggable CC algorithm react (e.g. cut ssthresh/cwnd). */
734 tcp_cc_congestion (tcp_connection_t * tc)
736 tc->snd_congestion = tc->snd_nxt;
737 tc->cc_algo->congestion (tc);
738 TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
/* Leave fast recovery: notify the CC algorithm, rewind snd_nxt to snd_una
 * and clear recovery state. NOTE(review): several lines are missing here. */
742 tcp_cc_recover (tcp_connection_t * tc)
744 /* TODO: check if time to recover was small. It might be that RTO popped
748 tc->cc_algo->recovered (tc);
752 tc->snd_nxt = tc->snd_una;
754 tc->cc_algo->rcv_ack (tc);
/* Remember the echoed timestamp for the NewReno multi-retransmit check. */
755 tc->tsecr_last_ack = tc->opt.tsecr;
757 tcp_cong_recovery_off (tc);
759 TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
/* Congestion-control hook for a (non-dup) ACK: handle full vs partial ACKs
 * while in fast recovery, otherwise the normal slow-start/cong-avoid path. */
763 tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b)
767 if (tcp_in_fastrecovery (tc))
/* Partial ack: does not cover snd_congestion (RFC6582 NewReno). */
769 partial_ack = seq_lt (tc->snd_una, tc->snd_congestion);
772 /* Clear retransmitted bytes. */
777 TCP_EVT_DBG (TCP_EVT_CC_PACK, tc);
779 /* Clear retransmitted bytes. XXX should we clear all? */
782 tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
784 /* In case snd_nxt is still in the past and output tries to
785 * shove some new bytes */
786 tc->snd_nxt = tc->snd_una_max;
788 /* XXX need proper RFC6675 support */
789 if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc))
791 tcp_fast_retransmit (tc);
795 /* Retransmit first unacked segment */
796 tcp_retransmit_first_unacked (tc);
/* Not in fast recovery: regular ACK processing. */
802 tc->cc_algo->rcv_ack (tc);
803 tc->tsecr_last_ack = tc->opt.tsecr;
805 if (tcp_in_recovery (tc))
807 tc->rtx_bytes -= clib_min (tc->bytes_acked, tc->rtx_bytes);
808 tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
/* Recovery point fully acked: RTO recovery is over. */
809 if (seq_geq (tc->snd_una, tc->snd_congestion))
810 tcp_recovery_off (tc);
/* Duplicate-ACK handling per RFC5681: on the 3rd dupack enter fast
 * retransmit/fast recovery; further dupacks just feed the CC algorithm. */
816 tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack)
818 // ASSERT (seq_geq(tc->snd_una, ack));
821 if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
823 /* RFC6582 NewReno heuristic to avoid multiple fast retransmits */
824 if (tc->opt.tsecr != tc->tsecr_last_ack)
830 tcp_fastrecovery_on (tc);
832 /* Handle congestion and dupack */
833 tcp_cc_congestion (tc);
834 tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
836 tcp_fast_retransmit (tc);
838 /* Post retransmit update cwnd to ssthresh and account for the
839 * three segments that have left the network and should've been
840 * buffered at the receiver */
841 tc->cwnd = tc->ssthresh + TCP_DUPACK_THRESHOLD * tc->snd_mss;
843 else if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD)
845 ASSERT (tcp_in_fastrecovery (tc));
847 tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
/* Bind the connection to the NewReno CC algorithm and initialize it. */
852 tcp_cc_init (tcp_connection_t * tc)
854 tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO);
855 tc->cc_algo->init (tc);
/* Step 5 of RFC793 segment processing: validate the ACK number, detect
 * dupacks, advance snd_una, dequeue acked data, update snd_wnd and drive
 * congestion control and the retransmit timer. Returns nonzero on error
 * (sets *error, may redirect *next to an output node). */
859 tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
860 tcp_header_t * th, u32 * next, u32 * error)
864 /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
865 if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))
867 /* If we have outstanding data and this is within the window, accept it,
868 * probably retransmit has timed out. Otherwise ACK segment and then
870 if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max))
872 tcp_make_ack (tc, b);
873 *next = tcp_next_output (tc->c_is_ip4);
874 *error = TCP_ERROR_ACK_INVALID;
875 TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 0,
876 vnet_buffer (b)->tcp.ack_number);
880 TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 2,
881 vnet_buffer (b)->tcp.ack_number);
/* Accept the future ACK: snap snd_nxt forward to it. */
883 tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
884 *error = TCP_ERROR_ACK_FUTURE;
887 /* If old ACK, probably it's an old dupack */
888 if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
890 *error = TCP_ERROR_ACK_OLD;
891 TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
892 vnet_buffer (b)->tcp.ack_number);
893 if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
895 TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc);
896 tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number);
/* Process SACK info before the window update. */
902 if (tcp_opts_sack_permitted (&tc->opt))
903 tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
905 new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale;
907 if (tcp_ack_is_dupack (tc, b, new_snd_wnd))
909 TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
910 tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number);
911 *error = TCP_ERROR_ACK_DUP;
/* Valid cumulative ACK: advance snd_una (plus any SACK-driven advance). */
919 tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
920 tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv;
922 /* Dequeue ACKed data and update RTT */
923 tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
924 tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
925 vnet_buffer (b)->tcp.ack_number, new_snd_wnd);
927 /* If some of our sent bytes have been acked, update cc and retransmit
931 TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
933 /* Updates congestion control (slow start/congestion avoidance) */
934 tcp_cc_rcv_ack (tc, b);
936 /* If everything has been acked, stop retransmit timer
937 * otherwise update. */
938 if (tc->snd_una == tc->snd_una_max)
939 tcp_retransmit_timer_reset (tc);
941 tcp_retransmit_timer_update (tc);
948 * Build SACK list as per RFC2018.
950 * Makes sure the first block contains the segment that generated the current
951 * ACK and the following ones are the ones most recently reported in SACK
954 * @param tc TCP connection for which the SACK list is updated
955 * @param start Start sequence number of the newest SACK block
956 * @param end End sequence of the newest SACK block
959 tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end)
961 sack_block_t *new_list = 0, *block = 0;
964 /* If the first segment is ooo add it to the list. Last write might've moved
965 * rcv_nxt over the first segment. */
966 if (seq_lt (tc->rcv_nxt, start))
968 vec_add2 (new_list, block, 1);
969 block->start = start;
973 /* Find the blocks still worth keeping. */
974 for (i = 0; i < vec_len (tc->snd_sacks); i++)
976 /* Discard if rcv_nxt advanced beyond current block */
977 if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt))
980 /* Merge or drop if segment overlapped by the new segment */
981 if (block && (seq_geq (tc->snd_sacks[i].end, new_list[0].start)
982 && seq_leq (tc->snd_sacks[i].start, new_list[0].end)))
984 if (seq_lt (tc->snd_sacks[i].start, new_list[0].start))
985 new_list[0].start = tc->snd_sacks[i].start;
986 if (seq_lt (new_list[0].end, tc->snd_sacks[i].end))
987 new_list[0].end = tc->snd_sacks[i].end;
991 /* Save to new SACK list if we have space. */
992 if (vec_len (new_list) < TCP_MAX_SACK_BLOCKS)
994 vec_add1 (new_list, tc->snd_sacks[i]);
998 ASSERT (vec_len (new_list) <= TCP_MAX_SACK_BLOCKS);
1000 /* Replace old vector with new one */
1001 vec_free (tc->snd_sacks);
1002 tc->snd_sacks = new_list;
1005 /** Enqueue data for delivery to application */
/* In-order data path: push the payload into the session FIFO, advance
 * rcv_nxt by what was actually accepted, and flag an ACK when the peer
 * must be told (partial enqueue, FIFO full, or ooo bytes drained). */
1007 tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
1012 /* Pure ACK. Update rcv_nxt and be done. */
1013 if (PREDICT_FALSE (data_len == 0))
1015 tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end;
1016 return TCP_ERROR_PURE_ACK;
1019 written = stream_session_enqueue_data (&tc->connection, b, 0,
1020 1 /* queue event */ , 1);
1022 TCP_EVT_DBG (TCP_EVT_INPUT, tc, 0, data_len, written);
1024 /* Update rcv_nxt */
1025 if (PREDICT_TRUE (written == data_len))
1027 tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end;
1029 /* If more data written than expected, account for out-of-order bytes. */
1030 else if (written > data_len)
1032 tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len;
1034 /* Send ACK confirming the update */
1035 tc->flags |= TCP_CONN_SNDACK;
1037 else if (written > 0)
1039 /* We've written something but FIFO is probably full now */
1040 tc->rcv_nxt += written;
1042 /* Depending on how fast the app is, all remaining buffers in burst will
1043 * not be enqueued. Inform peer */
1044 tc->flags |= TCP_CONN_SNDACK;
1046 return TCP_ERROR_PARTIALLY_ENQUEUED;
/* written <= 0: nothing fit, rcv_nxt unchanged. */
1050 tc->flags |= TCP_CONN_SNDACK;
1051 return TCP_ERROR_FIFO_FULL;
1054 /* Update SACK list if need be */
1055 if (tcp_opts_sack_permitted (&tc->opt))
1057 /* Remove SACK blocks that have been delivered */
1058 tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
1061 return TCP_ERROR_ENQUEUED;
1064 /** Enqueue out-of-order data */
/* OOO data path: enqueue at the segment's absolute sequence offset without
 * raising an app event, then refresh the SACK list from the FIFO's newest
 * out-of-order segment. */
1066 tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
1069 stream_session_t *s0;
1072 /* Pure ACK. Do nothing */
1073 if (PREDICT_FALSE (data_len == 0))
1075 return TCP_ERROR_PURE_ACK;
1078 /* Enqueue out-of-order data with absolute offset */
1079 rv = stream_session_enqueue_data (&tc->connection, b,
1080 vnet_buffer (b)->tcp.seq_number,
1081 0 /* queue event */ , 0);
1083 /* Nothing written */
1086 TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, 0);
1087 return TCP_ERROR_FIFO_FULL;
1090 TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len);
1092 /* Update SACK list if in use */
1093 if (tcp_opts_sack_permitted (&tc->opt))
1095 ooo_segment_t *newest;
1098 s0 = stream_session_get (tc->c_s_index, tc->c_thread_index);
1100 /* Get the newest segment from the fifo */
1101 newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo);
1102 start = ooo_segment_offset (s0->server_rx_fifo, newest);
1103 end = ooo_segment_end_offset (s0->server_rx_fifo, newest);
1105 tcp_update_sack_list (tc, start, end);
1108 return TCP_ERROR_ENQUEUED;
1112 * Check if ACK could be delayed. If ack can be delayed, it should return
1113 * true for a full frame. If we're always acking return 0.
1116 tcp_can_delack (tcp_connection_t * tc)
1118 /* Send ack if ... */
/* Any of the conditions below forces an immediate ACK (returns 0). */
1120 /* just sent a rcv wnd 0 */
1121 || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0
1122 /* constrained to send ack */
1123 || (tc->flags & TCP_CONN_SNDACK) != 0
1124 /* we're almost out of tx wnd */
1125 || tcp_available_snd_space (tc) < 2 * tc->snd_mss)
/* Step 7 of segment processing: deliver payload. Out-of-order segments go to
 * the ooo enqueue path and trigger a dupack; in-order data is enqueued and
 * an ACK is sent immediately or delayed via TCP_TIMER_DELACK. */
1132 tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b,
1133 u16 n_data_bytes, u32 * next0)
1135 u32 error = 0, n_bytes_to_drop;
1137 /* Handle out-of-order data */
1138 if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
1140 /* Old sequence numbers allowed through because they overlapped
1142 if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
1144 error = TCP_ERROR_SEGMENT_OLD;
1145 *next0 = TCP_NEXT_DROP;
1147 /* Completely in the past (possible retransmit) */
1148 if (seq_lt (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
1151 /* Chop off the bytes in the past */
1152 n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
1153 n_data_bytes -= n_bytes_to_drop;
1154 vlib_buffer_advance (b, n_bytes_to_drop);
1159 error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
1161 /* N.B. Should not filter burst of dupacks. Two issues 1) dupacks open
1162 * cwnd on remote peer when congested 2) acks leaving should have the
1163 * latest rcv_wnd since the burst may eaten up all of it, so only the
1164 * old ones could be filtered.
1167 /* RFC2581: Send DUPACK for fast retransmit */
1168 tcp_make_ack (tc, b);
1169 *next0 = tcp_next_output (tc->c_is_ip4);
1171 /* Mark as DUPACK. We may filter these in output if
1172 * the burst fills the holes. */
1174 vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK;
1176 TCP_EVT_DBG (TCP_EVT_DUPACK_SENT, tc);
1182 /* In order data, enqueue. Fifo figures out by itself if any out-of-order
1183 * segments can be enqueued after fifo tail offset changes. */
1184 error = tcp_session_enqueue_data (tc, b, n_data_bytes);
1186 if (n_data_bytes == 0)
1188 *next0 = TCP_NEXT_DROP;
1192 /* Check if ACK can be delayed */
1193 if (tcp_can_delack (tc))
1195 if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
1196 tcp_timer_set (tc, TCP_TIMER_DELACK, TCP_DELACK_TIME);
/* Cannot delay: ack this segment now. */
1200 *next0 = tcp_next_output (tc->c_is_ip4);
1201 tcp_make_ack (tc, b);
/* Per-packet trace record: copies of the TCP header and connection state. */
1209 tcp_header_t tcp_header;
1210 tcp_connection_t tcp_connection;
/* Verbose trace formatter: header plus full connection state. */
1214 format_tcp_rx_trace (u8 * s, va_list * args)
1216 CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1217 CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1218 tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
1219 uword indent = format_get_indent (s);
1221 s = format (s, "%U\n%U%U",
1222 format_tcp_header, &t->tcp_header, 128,
1223 format_white_space, indent,
1224 format_tcp_connection_verbose, &t->tcp_connection);
/* Short trace formatter: "src_port -> dst_port (state)". */
1230 format_tcp_rx_trace_short (u8 * s, va_list * args)
1232 CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1233 CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1234 tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
1236 s = format (s, "%d -> %d (%U)",
1237 clib_net_to_host_u16 (t->tcp_header.src_port),
1238 clib_net_to_host_u16 (t->tcp_header.dst_port), format_tcp_state,
1239 &t->tcp_connection.state);
/* Fill a trace record from the connection and (possibly re-fetched) header. */
1245 tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0,
1246 tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
1250 clib_memcpy (&t0->tcp_connection, tc0, sizeof (t0->tcp_connection));
1254 th0 = tcp_buffer_hdr (b0);
1256 clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
/* Bump an error counter on the v4 or v6 established node; no-op when val==0. */
1260 tcp_established_inc_counter (vlib_main_t * vm, u8 is_ip4, u8 evt, u8 val)
1262 if (PREDICT_TRUE (!val))
1266 vlib_node_increment_counter (vm, tcp4_established_node.index, evt, val);
1268 vlib_node_increment_counter (vm, tcp6_established_node.index, evt, val);
/* Main dispatch loop for the established state, shared by the v4/v6 nodes.
 * For each buffer: look up the connection, compute seq_end, run segment
 * validation, ACK processing, payload delivery and FIN handling, then
 * enqueue to the chosen next node and flush session events. */
1272 tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
1273 vlib_frame_t * from_frame, int is_ip4)
1275 u32 n_left_from, next_index, *from, *to_next;
1276 u32 my_thread_index = vm->thread_index, errors = 0;
1277 tcp_main_t *tm = vnet_get_tcp_main ();
1280 from = vlib_frame_vector_args (from_frame);
1281 n_left_from = from_frame->n_vectors;
1283 next_index = node->cached_next_index;
1285 while (n_left_from > 0)
1289 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1291 while (n_left_from > 0 && n_left_to_next > 0)
1295 tcp_header_t *th0 = 0;
1296 tcp_connection_t *tc0;
1297 u32 next0 = TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
1304 n_left_to_next -= 1;
1306 b0 = vlib_get_buffer (vm, bi0);
1307 tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
1310 if (PREDICT_FALSE (tc0 == 0))
1312 error0 = TCP_ERROR_INVALID_CONNECTION;
1316 th0 = tcp_buffer_hdr (b0);
1318 is_fin = (th0->flags & TCP_FLAG_FIN) != 0;
1320 /* SYNs, FINs and data consume sequence numbers */
1321 vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
1322 + tcp_is_syn (th0) + is_fin + vnet_buffer (b0)->tcp.data_len;
1324 /* TODO header prediction fast path */
1326 /* 1-4: check SEQ, RST, SYN */
1327 if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0)))
1329 error0 = TCP_ERROR_SEGMENT_INVALID;
1330 TCP_EVT_DBG (TCP_EVT_SEG_INVALID, tc0,
1331 vnet_buffer (b0)->tcp.seq_number,
1332 vnet_buffer (b0)->tcp.seq_end);
1336 /* 5: check the ACK field */
1337 if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0))
1342 /* 6: check the URG bit TODO */
1344 /* 7: process the segment text */
1346 vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
1347 error0 = tcp_segment_rcv (tm, tc0, b0,
1348 vnet_buffer (b0)->tcp.data_len, &next0);
1350 /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a
1351 * dangling reference. */
1353 /* 8: check the FIN bit */
1356 /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead
1357 * wait for session to call close. To avoid lingering
1358 * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
1359 tc0->state = TCP_STATE_CLOSE_WAIT;
1360 TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
1361 stream_session_disconnect_notify (&tc0->connection);
1362 tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
1366 b0->error = node->errors[error0];
1367 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
1369 tcp_rx_trace_t *t0 =
1370 vlib_add_trace (vm, node, b0, sizeof (*t0));
1371 tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4);
1374 vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1375 n_left_to_next, bi0, next0);
1378 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
/* Flush queued session events; count FIFO-full errors against the node. */
1381 errors = session_manager_flush_enqueue_events (my_thread_index);
1382 tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors);
1384 return from_frame->n_vectors;
/* IPv4 entry point: thin wrapper over the shared established-state
 * inline with is_ip4 = 1. */
1388 tcp4_established (vlib_main_t * vm, vlib_node_runtime_t * node,
1389 vlib_frame_t * from_frame)
1391 return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ );
/* IPv6 entry point: thin wrapper over the shared established-state
 * inline with is_ip4 = 0. */
1395 tcp6_established (vlib_main_t * vm, vlib_node_runtime_t * node,
1396 vlib_frame_t * from_frame)
1398 return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ );
/* Graph node registration for tcp4-established; next nodes come from
 * foreach_tcp_state_next (drop / tcp4-output / tcp6-output). */
1402 VLIB_REGISTER_NODE (tcp4_established_node) =
1404 .function = tcp4_established,
1405 .name = "tcp4-established",
1406 /* Takes a vector of packets. */
1407 .vector_size = sizeof (u32),
1408 .n_errors = TCP_N_ERROR,
1409 .error_strings = tcp_error_strings,
1410 .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
1413 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
1414 foreach_tcp_state_next
1417 .format_trace = format_tcp_rx_trace_short,
1421 VLIB_NODE_FUNCTION_MULTIARCH (tcp4_established_node, tcp4_established);
/* Graph node registration for tcp6-established (mirror of the v4
 * registration above). */
1424 VLIB_REGISTER_NODE (tcp6_established_node) =
1426 .function = tcp6_established,
1427 .name = "tcp6-established",
1428 /* Takes a vector of packets. */
1429 .vector_size = sizeof (u32),
1430 .n_errors = TCP_N_ERROR,
1431 .error_strings = tcp_error_strings,
1432 .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
1435 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
1436 foreach_tcp_state_next
1439 .format_trace = format_tcp_rx_trace_short,
1444 VLIB_NODE_FUNCTION_MULTIARCH (tcp6_established_node, tcp6_established);
/* Forward declarations so the syn-sent handlers below can reference
 * their own node registrations (e.g. for counters). */
1446 vlib_node_registration_t tcp4_syn_sent_node;
1447 vlib_node_registration_t tcp6_syn_sent_node;
/* Active-open (SYN-SENT) RX processing, per RFC 793 p. 66: validate
 * the peer's ACK against iss/snd_nxt, handle RST, then on a valid SYN
 * or SYN-ACK move the half-open connection into the per-thread
 * connection pool, parse options, and either complete the handshake
 * (ESTABLISHED) or fall back to simultaneous open (SYN-RCVD).
 * NOTE(review): braces and several intermediate lines are elided in
 * this extraction -- comments describe only the visible code. */
1450 tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
1451 vlib_frame_t * from_frame, int is_ip4)
1453 tcp_main_t *tm = vnet_get_tcp_main ();
1454 u32 n_left_from, next_index, *from, *to_next;
1455 u32 my_thread_index = vm->thread_index, errors = 0;
1456 u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
1458 from = vlib_frame_vector_args (from_frame);
1459 n_left_from = from_frame->n_vectors;
1461 next_index = node->cached_next_index;
1463 while (n_left_from > 0)
1467 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1469 while (n_left_from > 0 && n_left_to_next > 0)
1471 u32 bi0, ack0, seq0;
1474 tcp_header_t *tcp0 = 0;
1475 tcp_connection_t *tc0;
1476 tcp_connection_t *new_tc0;
1477 u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
1484 n_left_to_next -= 1;
1486 b0 = vlib_get_buffer (vm, bi0);
/* Half-open connections live in a separate (main-thread) pool. */
1488 tcp_half_open_connection_get (vnet_buffer (b0)->
1489 tcp.connection_index);
1491 ack0 = vnet_buffer (b0)->tcp.ack_number;
1492 seq0 = vnet_buffer (b0)->tcp.seq_number;
1493 tcp0 = tcp_buffer_hdr (b0);
/* Segments without ACK, RST or SYN carry nothing useful here. */
1496 (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0)))
1499 /* SYNs, FINs and data consume sequence numbers */
1500 vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0)
1501 + tcp_is_fin (tcp0) + vnet_buffer (b0)->tcp.data_len;
1504 * 1. check the ACK bit
1508 * If the ACK bit is set
1509 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
1510 * the RST bit is set, if so drop the segment and return)
1511 * <SEQ=SEG.ACK><CTL=RST>
1512 * and discard the segment. Return.
1513 * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
1517 if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt)
1519 if (!tcp_rst (tcp0))
1520 tcp_send_reset (b0, is_ip4);
1525 /* Make sure ACK is valid */
1526 if (tc0->snd_una > ack0)
1531 * 2. check the RST bit
1536 /* If ACK is acceptable, signal client that peer is not
1537 * willing to accept connection and drop connection*/
1540 stream_session_connect_notify (&tc0->connection, sst,
1542 tcp_connection_cleanup (tc0);
1548 * 3. check the security and precedence (skipped)
1552 * 4. check the SYN bit
1555 /* No SYN flag. Drop. */
1556 if (!tcp_syn (tcp0))
1559 /* Stop connection establishment and retransmit timers */
1560 tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH);
1561 tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN);
1563 /* Valid SYN or SYN-ACK. Move connection from half-open pool to
1564 * current thread pool. */
1565 pool_get (tm->connections[my_thread_index], new_tc0);
1566 clib_memcpy (new_tc0, tc0, sizeof (*new_tc0));
1568 new_tc0->c_thread_index = my_thread_index;
1569 new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index];
1571 /* Cleanup half-open connection XXX lock */
1572 pool_put (tm->half_open_connections, tc0);
1574 new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
1575 new_tc0->irs = seq0;
1578 if (tcp_options_parse (tcp0, &new_tc0->opt))
/* RFC 1323: timestamps on SYN/SYN-ACK seed PAWS state. */
1581 if (tcp_opts_tstamp (&new_tc0->opt))
1583 new_tc0->tsval_recent = new_tc0->opt.tsval;
1584 new_tc0->tsval_recent_age = tcp_time_now ();
1587 if (tcp_opts_wscale (&new_tc0->opt))
1588 new_tc0->snd_wscale = new_tc0->opt.wscale;
1591 new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
1592 new_tc0->snd_wl1 = seq0;
1593 new_tc0->snd_wl2 = ack0;
1595 tcp_connection_init_vars (new_tc0);
1597 /* SYN-ACK: See if we can switch to ESTABLISHED state */
1600 /* Our SYN is ACKed: we have iss < ack = snd_una */
1602 /* TODO Dequeue acknowledged segments if we support Fast Open */
1603 new_tc0->snd_una = ack0;
1604 new_tc0->state = TCP_STATE_ESTABLISHED;
1606 /* Make sure las is initialized for the wnd computation */
1607 new_tc0->rcv_las = new_tc0->rcv_nxt;
1609 /* Notify app that we have connection */
1610 stream_session_connect_notify (&new_tc0->connection, sst, 0);
1612 stream_session_init_fifos_pointers (&new_tc0->connection,
1615 /* Make sure after data segment processing ACK is sent */
1616 new_tc0->flags |= TCP_CONN_SNDACK;
1618 /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
1621 new_tc0->state = TCP_STATE_SYN_RCVD;
1623 /* Notify app that we have connection */
1624 stream_session_connect_notify (&new_tc0->connection, sst, 0);
1625 stream_session_init_fifos_pointers (&new_tc0->connection,
1628 tcp_make_synack (new_tc0, b0);
1629 next0 = tcp_next_output (is_ip4);
1634 /* Read data, if any */
1635 if (vnet_buffer (b0)->tcp.data_len)
1637 vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
1638 error0 = tcp_segment_rcv (tm, new_tc0, b0,
1639 vnet_buffer (b0)->tcp.data_len,
1641 if (error0 == TCP_ERROR_PURE_ACK)
1642 error0 = TCP_ERROR_SYN_ACKS_RCVD;
/* No payload: just acknowledge the SYN-ACK. */
1646 tcp_make_ack (new_tc0, b0);
1647 next0 = tcp_next_output (new_tc0->c_is_ip4);
1652 b0->error = error0 ? node->errors[error0] : 0;
1653 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
1655 t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
1656 clib_memcpy (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
1657 clib_memcpy (&t0->tcp_connection, tc0,
1658 sizeof (t0->tcp_connection));
1661 vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1662 n_left_to_next, bi0, next0);
1665 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
/* Flush session events; fifo-full errors are charged to the
 * established node of the matching IP version. */
1668 errors = session_manager_flush_enqueue_events (my_thread_index);
1672 vlib_node_increment_counter (vm, tcp4_established_node.index,
1673 TCP_ERROR_EVENT_FIFO_FULL, errors);
1675 vlib_node_increment_counter (vm, tcp6_established_node.index,
1676 TCP_ERROR_EVENT_FIFO_FULL, errors);
1679 return from_frame->n_vectors;
/* IPv4 entry point: wrapper over the shared syn-sent inline. */
1683 tcp4_syn_sent (vlib_main_t * vm, vlib_node_runtime_t * node,
1684 vlib_frame_t * from_frame)
1686 return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ );
/* IPv6 entry point: wrapper over the shared syn-sent inline. */
1690 tcp6_syn_sent_rcv (vlib_main_t * vm, vlib_node_runtime_t * node,
1691 vlib_frame_t * from_frame)
1693 return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ );
/* Graph node registration for tcp4-syn-sent. */
1697 VLIB_REGISTER_NODE (tcp4_syn_sent_node) =
1699 .function = tcp4_syn_sent,
1700 .name = "tcp4-syn-sent",
1701 /* Takes a vector of packets. */
1702 .vector_size = sizeof (u32),
1703 .n_errors = TCP_N_ERROR,
1704 .error_strings = tcp_error_strings,
1705 .n_next_nodes = TCP_SYN_SENT_N_NEXT,
1708 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
1709 foreach_tcp_state_next
1712 .format_trace = format_tcp_rx_trace_short,
1716 VLIB_NODE_FUNCTION_MULTIARCH (tcp4_syn_sent_node, tcp4_syn_sent);
/* Graph node registration for tcp6-syn-sent. */
1719 VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
1721 .function = tcp6_syn_sent_rcv,
1722 .name = "tcp6-syn-sent",
1723 /* Takes a vector of packets. */
1724 .vector_size = sizeof (u32),
1725 .n_errors = TCP_N_ERROR,
1726 .error_strings = tcp_error_strings,
1727 .n_next_nodes = TCP_SYN_SENT_N_NEXT,
1730 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
1731 foreach_tcp_state_next
1734 .format_trace = format_tcp_rx_trace_short,
1738 VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv);
1740 * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
1741 * as per RFC793 p. 64
/* RX processing for every state except LISTEN, SYN-SENT and
 * ESTABLISHED (RFC 793 p. 64): validates the segment, then switches
 * per-state on the ACK, segment text, and FIN checks, driving the
 * close-side state machine (SYN-RCVD -> ESTABLISHED, FIN-WAIT-1/2,
 * CLOSING, LAST-ACK, TIME-WAIT).
 * NOTE(review): braces and some case/break lines are elided in this
 * extraction -- comments describe only the visible code. */
1744 tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
1745 vlib_frame_t * from_frame, int is_ip4)
1747 tcp_main_t *tm = vnet_get_tcp_main ();
1748 u32 n_left_from, next_index, *from, *to_next;
1749 u32 my_thread_index = vm->thread_index, errors = 0;
1751 from = vlib_frame_vector_args (from_frame);
1752 n_left_from = from_frame->n_vectors;
1754 next_index = node->cached_next_index;
1756 while (n_left_from > 0)
1760 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1762 while (n_left_from > 0 && n_left_to_next > 0)
1766 tcp_header_t *tcp0 = 0;
1767 tcp_connection_t *tc0;
1768 u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED;
1775 n_left_to_next -= 1;
1777 b0 = vlib_get_buffer (vm, bi0);
1778 tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
1780 if (PREDICT_FALSE (tc0 == 0))
1782 error0 = TCP_ERROR_INVALID_CONNECTION;
1786 tcp0 = tcp_buffer_hdr (b0);
1788 /* SYNs, FINs and data consume sequence numbers */
1789 vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number
1790 + tcp_is_syn (tcp0) + tcp_is_fin (tcp0)
1791 + vnet_buffer (b0)->tcp.data_len;
1794 * Special treatment for CLOSED
1798 case TCP_STATE_CLOSED:
1804 * For all other states (except LISTEN)
1807 /* 1-4: check SEQ, RST, SYN */
1809 (tcp_segment_validate (vm, tc0, b0, tcp0, &next0)))
1811 error0 = TCP_ERROR_SEGMENT_INVALID;
1815 /* 5: check the ACK field */
1818 case TCP_STATE_SYN_RCVD:
1820 * If the segment acknowledgment is not acceptable, form a
1822 * <SEQ=SEG.ACK><CTL=RST>
1825 if (!tcp_rcv_ack_is_acceptable (tc0, b0))
1827 tcp_send_reset (b0, is_ip4);
1831 /* Update rtt and rto */
1832 tc0->bytes_acked = 1;
1833 tcp_update_rtt (tc0, vnet_buffer (b0)->tcp.ack_number);
1835 /* Switch state to ESTABLISHED */
1836 tc0->state = TCP_STATE_ESTABLISHED;
1838 /* Initialize session variables */
1839 tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
1840 tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
1842 tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
1843 tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
1845 /* Shoulder tap the server */
1846 stream_session_accept_notify (&tc0->connection);
1848 /* Reset SYN-ACK retransmit timer */
1849 tcp_retransmit_timer_reset (tc0);
1851 case TCP_STATE_ESTABLISHED:
1852 /* We can get packets in established state here because they
1853 * were enqueued before state change */
1854 if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
1858 case TCP_STATE_FIN_WAIT_1:
1859 /* In addition to the processing for the ESTABLISHED state, if
1860 * our FIN is now acknowledged then enter FIN-WAIT-2 and
1861 * continue processing in that state. */
1862 if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
1865 /* If FIN is ACKed */
1866 if (tc0->snd_una == tc0->snd_una_max)
1868 tc0->state = TCP_STATE_FIN_WAIT_2;
1869 /* Stop all timers, 2MSL will be set lower */
1870 tcp_connection_timers_reset (tc0);
1873 case TCP_STATE_FIN_WAIT_2:
1874 /* In addition to the processing for the ESTABLISHED state, if
1875 * the retransmission queue is empty, the user's CLOSE can be
1876 * acknowledged ("ok") but do not delete the TCB. */
1877 if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
1879 /* check if rtx queue is empty and ack CLOSE TODO */
1881 case TCP_STATE_CLOSE_WAIT:
1882 /* Do the same processing as for the ESTABLISHED state. */
1883 if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
1886 case TCP_STATE_CLOSING:
1887 /* In addition to the processing for the ESTABLISHED state, if
1888 * the ACK acknowledges our FIN then enter the TIME-WAIT state,
1889 * otherwise ignore the segment. */
1890 if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
1893 /* XXX test that send queue empty */
1894 tc0->state = TCP_STATE_TIME_WAIT;
1898 case TCP_STATE_LAST_ACK:
1899 /* The only thing that can arrive in this state is an
1900 * acknowledgment of our FIN. If our FIN is now acknowledged,
1901 * delete the TCB, enter the CLOSED state, and return. */
1903 if (!tcp_rcv_ack_is_acceptable (tc0, b0))
1906 tc0->state = TCP_STATE_CLOSED;
1908 /* Don't delete the connection/session yet. Instead, wait a
1909 * reasonable amount of time until the pipes are cleared. In
1910 * particular, this makes sure that we won't have dead sessions
1911 * when processing events on the tx path */
1912 tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_CLEANUP_TIME);
1914 /* Stop retransmit */
1915 tcp_retransmit_timer_reset (tc0);
1920 case TCP_STATE_TIME_WAIT:
1921 /* The only thing that can arrive in this state is a
1922 * retransmission of the remote FIN. Acknowledge it, and restart
1923 * the 2 MSL timeout. */
1932 /* 6: check the URG bit TODO */
1934 /* 7: process the segment text */
1937 case TCP_STATE_ESTABLISHED:
1938 case TCP_STATE_FIN_WAIT_1:
1939 case TCP_STATE_FIN_WAIT_2:
1940 vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset);
1941 error0 = tcp_segment_rcv (tm, tc0, b0,
1942 vnet_buffer (b0)->tcp.data_len,
1945 case TCP_STATE_CLOSE_WAIT:
1946 case TCP_STATE_CLOSING:
1947 case TCP_STATE_LAST_ACK:
1948 case TCP_STATE_TIME_WAIT:
1949 /* This should not occur, since a FIN has been received from the
1950 * remote side. Ignore the segment text. */
1954 /* 8: check the FIN bit */
1955 if (!tcp_fin (tcp0))
1960 case TCP_STATE_ESTABLISHED:
1961 case TCP_STATE_SYN_RCVD:
1962 /* Send FIN-ACK notify app and enter CLOSE-WAIT */
1963 tcp_connection_timers_reset (tc0);
1964 tcp_make_fin (tc0, b0);
1965 next0 = tcp_next_output (tc0->c_is_ip4);
1966 stream_session_disconnect_notify (&tc0->connection);
1967 tc0->state = TCP_STATE_CLOSE_WAIT;
1969 case TCP_STATE_CLOSE_WAIT:
1970 case TCP_STATE_CLOSING:
1971 case TCP_STATE_LAST_ACK:
1974 case TCP_STATE_FIN_WAIT_1:
1975 tc0->state = TCP_STATE_TIME_WAIT;
1976 tcp_connection_timers_reset (tc0);
1977 tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
1979 case TCP_STATE_FIN_WAIT_2:
1980 /* Got FIN, send ACK! */
1981 tc0->state = TCP_STATE_TIME_WAIT;
1982 tcp_connection_timers_reset (tc0);
1983 tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME);
1984 tcp_make_ack (tc0, b0);
1985 next0 = tcp_next_output (is_ip4);
1987 case TCP_STATE_TIME_WAIT:
1988 /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait
1991 tcp_timer_update (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
1994 TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0);
1997 b0->error = error0 ? node->errors[error0] : 0;
1999 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
2001 tcp_rx_trace_t *t0 =
2002 vlib_add_trace (vm, node, b0, sizeof (*t0));
2003 tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
2006 vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
2007 n_left_to_next, bi0, next0);
2010 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
/* Flush session events; fifo-full errors are charged to the
 * established node of the matching IP version. */
2013 errors = session_manager_flush_enqueue_events (my_thread_index);
2017 vlib_node_increment_counter (vm, tcp4_established_node.index,
2018 TCP_ERROR_EVENT_FIFO_FULL, errors);
2020 vlib_node_increment_counter (vm, tcp6_established_node.index,
2021 TCP_ERROR_EVENT_FIFO_FULL, errors);
2024 return from_frame->n_vectors;
/* IPv4 entry point: wrapper over the shared rcv-process inline. */
2028 tcp4_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node,
2029 vlib_frame_t * from_frame)
2031 return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ );
/* IPv6 entry point: wrapper over the shared rcv-process inline. */
2035 tcp6_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node,
2036 vlib_frame_t * from_frame)
2038 return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ );
/* Graph node registration for tcp4-rcv-process. */
2042 VLIB_REGISTER_NODE (tcp4_rcv_process_node) =
2044 .function = tcp4_rcv_process,
2045 .name = "tcp4-rcv-process",
2046 /* Takes a vector of packets. */
2047 .vector_size = sizeof (u32),
2048 .n_errors = TCP_N_ERROR,
2049 .error_strings = tcp_error_strings,
2050 .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
2053 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
2054 foreach_tcp_state_next
2057 .format_trace = format_tcp_rx_trace_short,
2061 VLIB_NODE_FUNCTION_MULTIARCH (tcp4_rcv_process_node, tcp4_rcv_process);
/* Graph node registration for tcp6-rcv-process. */
2064 VLIB_REGISTER_NODE (tcp6_rcv_process_node) =
2066 .function = tcp6_rcv_process,
2067 .name = "tcp6-rcv-process",
2068 /* Takes a vector of packets. */
2069 .vector_size = sizeof (u32),
2070 .n_errors = TCP_N_ERROR,
2071 .error_strings = tcp_error_strings,
2072 .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
2075 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
2076 foreach_tcp_state_next
2079 .format_trace = format_tcp_rx_trace_short,
2083 VLIB_NODE_FUNCTION_MULTIARCH (tcp6_rcv_process_node, tcp6_rcv_process);
/* Forward declarations for the listen node registrations below. */
2085 vlib_node_registration_t tcp4_listen_node;
2086 vlib_node_registration_t tcp6_listen_node;
2089 * LISTEN state processing as per RFC 793 p. 65
/* LISTEN state processing (RFC 793 p. 65): for each incoming SYN,
 * allocate a child connection in the per-thread pool, copy the 4-tuple
 * from the packet, accept the session with the app layer, parse
 * options, seed PAWS/window state, and reuse the buffer to send a
 * SYN-ACK.  RST/ACK segments are handled earlier in dispatch.
 * NOTE(review): braces and some lines (e.g. the is_ip4 branch
 * conditions) are elided in this extraction. */
2092 tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
2093 vlib_frame_t * from_frame, int is_ip4)
2095 u32 n_left_from, next_index, *from, *to_next;
2096 u32 my_thread_index = vm->thread_index;
2097 tcp_main_t *tm = vnet_get_tcp_main ();
2098 u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
2100 from = vlib_frame_vector_args (from_frame);
2101 n_left_from = from_frame->n_vectors;
2103 next_index = node->cached_next_index;
2105 while (n_left_from > 0)
2109 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2111 while (n_left_from > 0 && n_left_to_next > 0)
2116 tcp_header_t *th0 = 0;
2117 tcp_connection_t *lc0;
2120 tcp_connection_t *child0;
2121 u32 error0 = TCP_ERROR_SYNS_RCVD, next0 = TCP_LISTEN_NEXT_DROP;
2128 n_left_to_next -= 1;
2130 b0 = vlib_get_buffer (vm, bi0);
2131 lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index);
/* Locate the TCP header past the ip4/ip6 header. */
2135 ip40 = vlib_buffer_get_current (b0);
2136 th0 = ip4_next_header (ip40);
2140 ip60 = vlib_buffer_get_current (b0);
2141 th0 = ip6_next_header (ip60);
2144 /* Create child session. For syn-flood protection use filter */
2146 /* 1. first check for an RST: handled in dispatch */
2147 /* if (tcp_rst (th0))
2150 /* 2. second check for an ACK: handled in dispatch */
2151 /* if (tcp_ack (th0))
2153 tcp_send_reset (b0, is_ip4);
2157 /* 3. check for a SYN (did that already) */
2159 /* Create child session and send SYN-ACK */
2160 pool_get (tm->connections[my_thread_index], child0);
2161 memset (child0, 0, sizeof (*child0));
2163 child0->c_c_index = child0 - tm->connections[my_thread_index];
2164 child0->c_lcl_port = lc0->c_lcl_port;
2165 child0->c_rmt_port = th0->src_port;
2166 child0->c_is_ip4 = is_ip4;
2167 child0->c_thread_index = my_thread_index;
/* Copy the 4-tuple addresses from the packet headers. */
2171 child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32;
2172 child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32;
2176 clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address,
2177 sizeof (ip6_address_t));
2178 clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address,
2179 sizeof (ip6_address_t));
2182 if (stream_session_accept (&child0->connection, lc0->c_s_index, sst,
2185 error0 = TCP_ERROR_CREATE_SESSION_FAIL;
2189 if (tcp_options_parse (th0, &child0->opt))
2194 child0->irs = vnet_buffer (b0)->tcp.seq_number;
2195 child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1;
2196 child0->rcv_las = child0->rcv_nxt;
2197 child0->state = TCP_STATE_SYN_RCVD;
2199 /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
2200 * segments are used to initialize PAWS. */
2201 if (tcp_opts_tstamp (&child0->opt))
2203 child0->tsval_recent = child0->opt.tsval;
2204 child0->tsval_recent_age = tcp_time_now ();
2207 if (tcp_opts_wscale (&child0->opt))
2208 child0->snd_wscale = child0->opt.wscale;
2211 child0->snd_wnd = clib_net_to_host_u16 (th0->window);
2212 child0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
2213 child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
2215 tcp_connection_init_vars (child0);
2217 TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0);
2219 /* Reuse buffer to make syn-ack and send */
2220 tcp_make_synack (child0, b0);
2221 next0 = tcp_next_output (is_ip4);
2223 /* Init fifo pointers after we have iss */
2224 stream_session_init_fifos_pointers (&child0->connection,
2228 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
2230 t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2231 clib_memcpy (&t0->tcp_header, th0, sizeof (t0->tcp_header));
2232 clib_memcpy (&t0->tcp_connection, lc0,
2233 sizeof (t0->tcp_connection));
2236 b0->error = node->errors[error0];
2238 vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
2239 n_left_to_next, bi0, next0);
2242 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2244 return from_frame->n_vectors;
/* IPv4 entry point: wrapper over the shared listen inline. */
2248 tcp4_listen (vlib_main_t * vm, vlib_node_runtime_t * node,
2249 vlib_frame_t * from_frame)
2251 return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ );
/* IPv6 entry point: wrapper over the shared listen inline. */
2255 tcp6_listen (vlib_main_t * vm, vlib_node_runtime_t * node,
2256 vlib_frame_t * from_frame)
2258 return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ );
/* Graph node registration for tcp4-listen. */
2262 VLIB_REGISTER_NODE (tcp4_listen_node) =
2264 .function = tcp4_listen,
2265 .name = "tcp4-listen",
2266 /* Takes a vector of packets. */
2267 .vector_size = sizeof (u32),
2268 .n_errors = TCP_N_ERROR,
2269 .error_strings = tcp_error_strings,
2270 .n_next_nodes = TCP_LISTEN_N_NEXT,
2273 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
2274 foreach_tcp_state_next
2277 .format_trace = format_tcp_rx_trace_short,
2281 VLIB_NODE_FUNCTION_MULTIARCH (tcp4_listen_node, tcp4_listen);
/* Graph node registration for tcp6-listen. */
2284 VLIB_REGISTER_NODE (tcp6_listen_node) =
2286 .function = tcp6_listen,
2287 .name = "tcp6-listen",
2288 /* Takes a vector of packets. */
2289 .vector_size = sizeof (u32),
2290 .n_errors = TCP_N_ERROR,
2291 .error_strings = tcp_error_strings,
2292 .n_next_nodes = TCP_LISTEN_N_NEXT,
2295 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
2296 foreach_tcp_state_next
2299 .format_trace = format_tcp_rx_trace_short,
2303 VLIB_NODE_FUNCTION_MULTIARCH (tcp6_listen_node, tcp6_listen);
/* Forward declarations for the input node registrations below. */
2305 vlib_node_registration_t tcp4_input_node;
2306 vlib_node_registration_t tcp6_input_node;
/* Next-node indices for the tcp-input dispatcher; one arc per
 * state-machine node plus drop and reset. */
2308 typedef enum _tcp_input_next
2310 TCP_INPUT_NEXT_DROP,
2311 TCP_INPUT_NEXT_LISTEN,
2312 TCP_INPUT_NEXT_RCV_PROCESS,
2313 TCP_INPUT_NEXT_SYN_SENT,
2314 TCP_INPUT_NEXT_ESTABLISHED,
2315 TCP_INPUT_NEXT_RESET,
/* Map tcp-input next indices to concrete v4/v6 node names, and define
 * the set of TCP flags the dispatch table keys on. */
2319 #define foreach_tcp4_input_next \
2320 _ (DROP, "error-drop") \
2321 _ (LISTEN, "tcp4-listen") \
2322 _ (RCV_PROCESS, "tcp4-rcv-process") \
2323 _ (SYN_SENT, "tcp4-syn-sent") \
2324 _ (ESTABLISHED, "tcp4-established") \
2325 _ (RESET, "tcp4-reset")
2327 #define foreach_tcp6_input_next \
2328 _ (DROP, "error-drop") \
2329 _ (LISTEN, "tcp6-listen") \
2330 _ (RCV_PROCESS, "tcp6-rcv-process") \
2331 _ (SYN_SENT, "tcp6-syn-sent") \
2332 _ (ESTABLISHED, "tcp6-established") \
2333 _ (RESET, "tcp6-reset")
/* Only these flags participate in dispatch-table lookup. */
2335 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
/* First-stage TCP dispatcher: parses the ip4/ip6 + TCP headers, looks
 * up the transport connection for the 4-tuple, caches seq/ack numbers
 * and offsets in vnet_buffer(b)->tcp opaque data, then routes the
 * packet to the per-state node via tm->dispatch_table[state][flags].
 * Packets with no matching session are sent a reset.
 * NOTE(review): braces and some lookup-argument lines are elided in
 * this extraction -- comments describe only the visible code. */
2338 tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
2339 vlib_frame_t * from_frame, int is_ip4)
2341 u32 n_left_from, next_index, *from, *to_next;
2342 u32 my_thread_index = vm->thread_index;
2343 tcp_main_t *tm = vnet_get_tcp_main ();
2345 from = vlib_frame_vector_args (from_frame);
2346 n_left_from = from_frame->n_vectors;
2348 next_index = node->cached_next_index;
2350 while (n_left_from > 0)
2354 vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2356 while (n_left_from > 0 && n_left_to_next > 0)
2358 int n_advance_bytes0, n_data_bytes0;
2361 tcp_header_t *tcp0 = 0;
2362 tcp_connection_t *tc0;
2365 u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP;
2373 n_left_to_next -= 1;
2375 b0 = vlib_get_buffer (vm, bi0);
2376 vnet_buffer (b0)->tcp.flags = 0;
2378 /* Checksum computed by ipx_local no need to compute again */
2382 ip40 = vlib_buffer_get_current (b0);
2383 tcp0 = ip4_next_header (ip40);
2384 n_advance_bytes0 = (ip4_header_bytes (ip40)
2385 + tcp_header_bytes (tcp0));
2386 n_data_bytes0 = clib_net_to_host_u16 (ip40->length)
2389 /* lookup session */
2391 (tcp_connection_t *)
2392 stream_session_lookup_transport4 (&ip40->dst_address,
2396 SESSION_TYPE_IP4_TCP,
2401 ip60 = vlib_buffer_get_current (b0);
2402 tcp0 = ip6_next_header (ip60);
2403 n_advance_bytes0 = tcp_header_bytes (tcp0);
2404 n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length)
2406 n_advance_bytes0 += sizeof (ip60[0]);
2409 (tcp_connection_t *)
2410 stream_session_lookup_transport6 (&ip60->src_address,
2414 SESSION_TYPE_IP6_TCP,
/* Malformed header lengths produce a negative advance. */
2419 if (PREDICT_FALSE (n_advance_bytes0 < 0))
2421 error0 = TCP_ERROR_LENGTH;
2425 /* Session exists */
2426 if (PREDICT_TRUE (0 != tc0))
2428 /* Save connection index */
2429 vnet_buffer (b0)->tcp.connection_index = tc0->c_c_index;
2430 vnet_buffer (b0)->tcp.seq_number =
2431 clib_net_to_host_u32 (tcp0->seq_number);
2432 vnet_buffer (b0)->tcp.ack_number =
2433 clib_net_to_host_u32 (tcp0->ack_number);
2435 vnet_buffer (b0)->tcp.hdr_offset = (u8 *) tcp0
2436 - (u8 *) vlib_buffer_get_current (b0);
2437 vnet_buffer (b0)->tcp.data_offset = n_advance_bytes0;
2438 vnet_buffer (b0)->tcp.data_len = n_data_bytes0;
/* State x flags -> (next node, error) table lookup. */
2440 flags0 = tcp0->flags & filter_flags;
2441 next0 = tm->dispatch_table[tc0->state][flags0].next;
2442 error0 = tm->dispatch_table[tc0->state][flags0].error;
2444 if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH
2445 || next0 == TCP_INPUT_NEXT_RESET))
2447 /* Overload tcp flags to store state */
2448 tcp_state_t state0 = tc0->state;
2449 vnet_buffer (b0)->tcp.flags = tc0->state;
2451 if (error0 == TCP_ERROR_DISPATCH)
2452 clib_warning ("disp error state %U flags %U",
2453 format_tcp_state, &state0, format_tcp_flags,
/* No session: answer with a reset. */
2460 next0 = TCP_INPUT_NEXT_RESET;
2461 error0 = TCP_ERROR_NO_LISTENER;
2465 b0->error = error0 ? node->errors[error0] : 0;
2467 if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
2469 tcp_rx_trace_t *t0 =
2470 vlib_add_trace (vm, node, b0, sizeof (*t0));
2471 tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
2474 vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
2475 n_left_to_next, bi0, next0);
2478 vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2481 return from_frame->n_vectors;
/* IPv4 entry point: wrapper over the shared input inline. */
2485 tcp4_input (vlib_main_t * vm, vlib_node_runtime_t * node,
2486 vlib_frame_t * from_frame)
2488 return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ );
/* IPv6 entry point: wrapper over the shared input inline. */
2492 tcp6_input (vlib_main_t * vm, vlib_node_runtime_t * node,
2493 vlib_frame_t * from_frame)
2495 return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ );
/* Graph node registration for tcp4-input; next nodes come from
 * foreach_tcp4_input_next. */
2499 VLIB_REGISTER_NODE (tcp4_input_node) =
2501 .function = tcp4_input,
2502 .name = "tcp4-input",
2503 /* Takes a vector of packets. */
2504 .vector_size = sizeof (u32),
2505 .n_errors = TCP_N_ERROR,
2506 .error_strings = tcp_error_strings,
2507 .n_next_nodes = TCP_INPUT_N_NEXT,
2510 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
2511 foreach_tcp4_input_next
2514 .format_buffer = format_tcp_header,
2515 .format_trace = format_tcp_rx_trace,
2519 VLIB_NODE_FUNCTION_MULTIARCH (tcp4_input_node, tcp4_input);
/* Graph node registration for tcp6-input; next nodes come from
 * foreach_tcp6_input_next. */
2522 VLIB_REGISTER_NODE (tcp6_input_node) =
2524 .function = tcp6_input,
2525 .name = "tcp6-input",
2526 /* Takes a vector of packets. */
2527 .vector_size = sizeof (u32),
2528 .n_errors = TCP_N_ERROR,
2529 .error_strings = tcp_error_strings,
2530 .n_next_nodes = TCP_INPUT_N_NEXT,
2533 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
2534 foreach_tcp6_input_next
2537 .format_buffer = format_tcp_header,
2538 .format_trace = format_tcp_rx_trace,
2542 VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input);
/* Populate tm->dispatch_table[state][flags]: default every entry to
 * (drop, TCP_ERROR_DISPATCH), then whitelist the legal state/flags
 * combinations with their target node and error code.  Any combination
 * not listed is a protocol violation and is dropped with a warning in
 * tcp46_input_inline. */
2545 tcp_dispatch_table_init (tcp_main_t * tm)
2548 for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
2549 for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
2551 tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
2552 tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
2555 #define _(t,f,n,e) \
2557 tm->dispatch_table[TCP_STATE_##t][f].next = (n); \
2558 tm->dispatch_table[TCP_STATE_##t][f].error = (e); \
2561 /* SYNs for new connections -> tcp-listen. */
2562 _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
2563 _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_NONE);
2564 _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_NONE);
2565 /* ACK for for a SYN-ACK -> tcp-rcv-process. */
2566 _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2567 _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2568 /* SYN-ACK for a SYN */
2569 _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
2571 _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
2572 _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
2573 _(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT,
2575 /* ACK for for established connection -> tcp-established. */
2576 _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
2577 /* FIN for for established connection -> tcp-established. */
2578 _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
2579 _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
2581 _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
2582 _(ESTABLISHED, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED,
2584 /* ACK or FIN-ACK to our FIN */
2585 _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2586 _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS,
2588 /* FIN in reply to our FIN from the other side */
2589 _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2590 /* FIN confirming that the peer (app) has closed */
2591 _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2592 _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2593 _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
2595 _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2596 _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
2597 _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
2598 _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
/* Init-time hook: make sure tcp_init has run, then build the
 * state/flags dispatch table used by tcp-input. */
2603 tcp_input_init (vlib_main_t * vm)
2605 clib_error_t *error = 0;
2606 tcp_main_t *tm = vnet_get_tcp_main ();
2608 if ((error = vlib_call_init_function (vm, tcp_init)))
2611 /* Initialize dispatch table. */
2612 tcp_dispatch_table_init (tm);
2617 VLIB_INIT_FUNCTION (tcp_input_init);
2620 * fd.io coding-style-patch-verification: ON
2623 * eval: (c-set-style "gnu")