Mtrie optimisations
[vpp.git] / src / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
43 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
44 #include <vnet/ppp/ppp.h>
45 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
46 #include <vnet/api_errno.h>     /* for API error numbers */
47 #include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
48 #include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
49 #include <vnet/fib/fib_urpf_list.h>     /* for FIB uRPF check */
50 #include <vnet/fib/ip4_fib.h>
51 #include <vnet/dpo/load_balance.h>
52 #include <vnet/dpo/classify_dpo.h>
53 #include <vnet/mfib/mfib_table.h>       /* for mFIB table and entry creation */
54
55 /**
56  * @file
57  * @brief IPv4 Forwarding.
58  *
59  * This file contains the source code for IPv4 forwarding.
60  */
61
62 void
63 ip4_forward_next_trace (vlib_main_t * vm,
64                         vlib_node_runtime_t * node,
65                         vlib_frame_t * frame,
66                         vlib_rx_or_tx_t which_adj_index);
67
68 always_inline uword
69 ip4_lookup_inline (vlib_main_t * vm,
70                    vlib_node_runtime_t * node,
71                    vlib_frame_t * frame,
72                    int lookup_for_responses_to_locally_received_packets)
73 {
74   ip4_main_t *im = &ip4_main;
75   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters;
76   u32 n_left_from, n_left_to_next, *from, *to_next;
77   ip_lookup_next_t next;
78   u32 cpu_index = os_get_cpu_number ();
79
80   from = vlib_frame_vector_args (frame);
81   n_left_from = frame->n_vectors;
82   next = node->cached_next_index;
83
84   while (n_left_from > 0)
85     {
86       vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
87
88       while (n_left_from >= 8 && n_left_to_next >= 4)
89         {
90           vlib_buffer_t *p0, *p1, *p2, *p3;
91           ip4_header_t *ip0, *ip1, *ip2, *ip3;
92           __attribute__ ((unused)) tcp_header_t *tcp0, *tcp1, *tcp2, *tcp3;
93           ip_lookup_next_t next0, next1, next2, next3;
94           const load_balance_t *lb0, *lb1, *lb2, *lb3;
95           ip4_fib_mtrie_t *mtrie0, *mtrie1, *mtrie2, *mtrie3;
96           ip4_fib_mtrie_leaf_t leaf0, leaf1, leaf2, leaf3;
97           ip4_address_t *dst_addr0, *dst_addr1, *dst_addr2, *dst_addr3;
98           __attribute__ ((unused)) u32 pi0, fib_index0, lb_index0,
99             is_tcp_udp0;
100           __attribute__ ((unused)) u32 pi1, fib_index1, lb_index1,
101             is_tcp_udp1;
102           __attribute__ ((unused)) u32 pi2, fib_index2, lb_index2,
103             is_tcp_udp2;
104           __attribute__ ((unused)) u32 pi3, fib_index3, lb_index3,
105             is_tcp_udp3;
106           flow_hash_config_t flow_hash_config0, flow_hash_config1;
107           flow_hash_config_t flow_hash_config2, flow_hash_config3;
108           u32 hash_c0, hash_c1, hash_c2, hash_c3;
109           const dpo_id_t *dpo0, *dpo1, *dpo2, *dpo3;
110
111           /* Prefetch next iteration. */
112           {
113             vlib_buffer_t *p4, *p5, *p6, *p7;
114
115             p4 = vlib_get_buffer (vm, from[4]);
116             p5 = vlib_get_buffer (vm, from[5]);
117             p6 = vlib_get_buffer (vm, from[6]);
118             p7 = vlib_get_buffer (vm, from[7]);
119
120             vlib_prefetch_buffer_header (p4, LOAD);
121             vlib_prefetch_buffer_header (p5, LOAD);
122             vlib_prefetch_buffer_header (p6, LOAD);
123             vlib_prefetch_buffer_header (p7, LOAD);
124
125             CLIB_PREFETCH (p4->data, sizeof (ip0[0]), LOAD);
126             CLIB_PREFETCH (p5->data, sizeof (ip0[0]), LOAD);
127             CLIB_PREFETCH (p6->data, sizeof (ip0[0]), LOAD);
128             CLIB_PREFETCH (p7->data, sizeof (ip0[0]), LOAD);
129           }
130
131           pi0 = to_next[0] = from[0];
132           pi1 = to_next[1] = from[1];
133           pi2 = to_next[2] = from[2];
134           pi3 = to_next[3] = from[3];
135
136           from += 4;
137           to_next += 4;
138           n_left_to_next -= 4;
139           n_left_from -= 4;
140
141           p0 = vlib_get_buffer (vm, pi0);
142           p1 = vlib_get_buffer (vm, pi1);
143           p2 = vlib_get_buffer (vm, pi2);
144           p3 = vlib_get_buffer (vm, pi3);
145
146           ip0 = vlib_buffer_get_current (p0);
147           ip1 = vlib_buffer_get_current (p1);
148           ip2 = vlib_buffer_get_current (p2);
149           ip3 = vlib_buffer_get_current (p3);
150
151           dst_addr0 = &ip0->dst_address;
152           dst_addr1 = &ip1->dst_address;
153           dst_addr2 = &ip2->dst_address;
154           dst_addr3 = &ip3->dst_address;
155
156           fib_index0 =
157             vec_elt (im->fib_index_by_sw_if_index,
158                      vnet_buffer (p0)->sw_if_index[VLIB_RX]);
159           fib_index1 =
160             vec_elt (im->fib_index_by_sw_if_index,
161                      vnet_buffer (p1)->sw_if_index[VLIB_RX]);
162           fib_index2 =
163             vec_elt (im->fib_index_by_sw_if_index,
164                      vnet_buffer (p2)->sw_if_index[VLIB_RX]);
165           fib_index3 =
166             vec_elt (im->fib_index_by_sw_if_index,
167                      vnet_buffer (p3)->sw_if_index[VLIB_RX]);
168           fib_index0 =
169             (vnet_buffer (p0)->sw_if_index[VLIB_TX] ==
170              (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX];
171           fib_index1 =
172             (vnet_buffer (p1)->sw_if_index[VLIB_TX] ==
173              (u32) ~ 0) ? fib_index1 : vnet_buffer (p1)->sw_if_index[VLIB_TX];
174           fib_index2 =
175             (vnet_buffer (p2)->sw_if_index[VLIB_TX] ==
176              (u32) ~ 0) ? fib_index2 : vnet_buffer (p2)->sw_if_index[VLIB_TX];
177           fib_index3 =
178             (vnet_buffer (p3)->sw_if_index[VLIB_TX] ==
179              (u32) ~ 0) ? fib_index3 : vnet_buffer (p3)->sw_if_index[VLIB_TX];
180
181
182           if (!lookup_for_responses_to_locally_received_packets)
183             {
184               mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
185               mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
186               mtrie2 = &ip4_fib_get (fib_index2)->mtrie;
187               mtrie3 = &ip4_fib_get (fib_index3)->mtrie;
188
189
190               leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
191               leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, dst_addr1);
192               leaf2 = ip4_fib_mtrie_lookup_step_one (mtrie2, dst_addr2);
193               leaf3 = ip4_fib_mtrie_lookup_step_one (mtrie3, dst_addr3);
194             }
195
196           tcp0 = (void *) (ip0 + 1);
197           tcp1 = (void *) (ip1 + 1);
198           tcp2 = (void *) (ip2 + 1);
199           tcp3 = (void *) (ip3 + 1);
200
201           is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
202                          || ip0->protocol == IP_PROTOCOL_UDP);
203           is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP
204                          || ip1->protocol == IP_PROTOCOL_UDP);
205           is_tcp_udp2 = (ip2->protocol == IP_PROTOCOL_TCP
206                          || ip2->protocol == IP_PROTOCOL_UDP);
207           is_tcp_udp3 = (ip1->protocol == IP_PROTOCOL_TCP
208                          || ip1->protocol == IP_PROTOCOL_UDP);
209
210           if (!lookup_for_responses_to_locally_received_packets)
211             {
212               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
213               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 1);
214               leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 1);
215               leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 1);
216             }
217
218           if (!lookup_for_responses_to_locally_received_packets)
219             {
220               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
221               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
222               leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 2);
223               leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 2);
224             }
225
226           if (!lookup_for_responses_to_locally_received_packets)
227             {
228               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
229               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
230               leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 3);
231               leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 3);
232             }
233
234           if (lookup_for_responses_to_locally_received_packets)
235             {
236               lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
237               lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
238               lb_index2 = vnet_buffer (p2)->ip.adj_index[VLIB_RX];
239               lb_index3 = vnet_buffer (p3)->ip.adj_index[VLIB_RX];
240             }
241           else
242             {
243               lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
244               lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
245               lb_index2 = ip4_fib_mtrie_leaf_get_adj_index (leaf2);
246               lb_index3 = ip4_fib_mtrie_leaf_get_adj_index (leaf3);
247             }
248
249           ASSERT (lb_index0 && lb_index1 && lb_index2 && lb_index3);
250           lb0 = load_balance_get (lb_index0);
251           lb1 = load_balance_get (lb_index1);
252           lb2 = load_balance_get (lb_index2);
253           lb3 = load_balance_get (lb_index3);
254
255           /* Use flow hash to compute multipath adjacency. */
256           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
257           hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
258           hash_c2 = vnet_buffer (p2)->ip.flow_hash = 0;
259           hash_c3 = vnet_buffer (p3)->ip.flow_hash = 0;
260           if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
261             {
262               flow_hash_config0 = lb0->lb_hash_config;
263               hash_c0 = vnet_buffer (p0)->ip.flow_hash =
264                 ip4_compute_flow_hash (ip0, flow_hash_config0);
265             }
266           if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
267             {
268               flow_hash_config1 = lb1->lb_hash_config;
269               hash_c1 = vnet_buffer (p1)->ip.flow_hash =
270                 ip4_compute_flow_hash (ip1, flow_hash_config1);
271             }
272           if (PREDICT_FALSE (lb2->lb_n_buckets > 1))
273             {
274               flow_hash_config2 = lb2->lb_hash_config;
275               hash_c2 = vnet_buffer (p2)->ip.flow_hash =
276                 ip4_compute_flow_hash (ip2, flow_hash_config2);
277             }
278           if (PREDICT_FALSE (lb3->lb_n_buckets > 1))
279             {
280               flow_hash_config3 = lb3->lb_hash_config;
281               hash_c3 = vnet_buffer (p3)->ip.flow_hash =
282                 ip4_compute_flow_hash (ip3, flow_hash_config3);
283             }
284
285           ASSERT (lb0->lb_n_buckets > 0);
286           ASSERT (is_pow2 (lb0->lb_n_buckets));
287           ASSERT (lb1->lb_n_buckets > 0);
288           ASSERT (is_pow2 (lb1->lb_n_buckets));
289           ASSERT (lb2->lb_n_buckets > 0);
290           ASSERT (is_pow2 (lb2->lb_n_buckets));
291           ASSERT (lb3->lb_n_buckets > 0);
292           ASSERT (is_pow2 (lb3->lb_n_buckets));
293
294           dpo0 = load_balance_get_bucket_i (lb0,
295                                             (hash_c0 &
296                                              (lb0->lb_n_buckets_minus_1)));
297           dpo1 = load_balance_get_bucket_i (lb1,
298                                             (hash_c1 &
299                                              (lb1->lb_n_buckets_minus_1)));
300           dpo2 = load_balance_get_bucket_i (lb2,
301                                             (hash_c2 &
302                                              (lb2->lb_n_buckets_minus_1)));
303           dpo3 = load_balance_get_bucket_i (lb3,
304                                             (hash_c3 &
305                                              (lb3->lb_n_buckets_minus_1)));
306
307           next0 = dpo0->dpoi_next_node;
308           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
309           next1 = dpo1->dpoi_next_node;
310           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
311           next2 = dpo2->dpoi_next_node;
312           vnet_buffer (p2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index;
313           next3 = dpo3->dpoi_next_node;
314           vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;
315
316           vlib_increment_combined_counter
317             (cm, cpu_index, lb_index0, 1,
318              vlib_buffer_length_in_chain (vm, p0)
319              + sizeof (ethernet_header_t));
320           vlib_increment_combined_counter
321             (cm, cpu_index, lb_index1, 1,
322              vlib_buffer_length_in_chain (vm, p1)
323              + sizeof (ethernet_header_t));
324           vlib_increment_combined_counter
325             (cm, cpu_index, lb_index2, 1,
326              vlib_buffer_length_in_chain (vm, p2)
327              + sizeof (ethernet_header_t));
328           vlib_increment_combined_counter
329             (cm, cpu_index, lb_index3, 1,
330              vlib_buffer_length_in_chain (vm, p3)
331              + sizeof (ethernet_header_t));
332
333           vlib_validate_buffer_enqueue_x4 (vm, node, next,
334                                            to_next, n_left_to_next,
335                                            pi0, pi1, pi2, pi3,
336                                            next0, next1, next2, next3);
337         }
338
339       while (n_left_from > 0 && n_left_to_next > 0)
340         {
341           vlib_buffer_t *p0;
342           ip4_header_t *ip0;
343           __attribute__ ((unused)) tcp_header_t *tcp0;
344           ip_lookup_next_t next0;
345           const load_balance_t *lb0;
346           ip4_fib_mtrie_t *mtrie0;
347           ip4_fib_mtrie_leaf_t leaf0;
348           ip4_address_t *dst_addr0;
349           __attribute__ ((unused)) u32 pi0, fib_index0, is_tcp_udp0, lbi0;
350           flow_hash_config_t flow_hash_config0;
351           const dpo_id_t *dpo0;
352           u32 hash_c0;
353
354           pi0 = from[0];
355           to_next[0] = pi0;
356
357           p0 = vlib_get_buffer (vm, pi0);
358
359           ip0 = vlib_buffer_get_current (p0);
360
361           dst_addr0 = &ip0->dst_address;
362
363           fib_index0 =
364             vec_elt (im->fib_index_by_sw_if_index,
365                      vnet_buffer (p0)->sw_if_index[VLIB_RX]);
366           fib_index0 =
367             (vnet_buffer (p0)->sw_if_index[VLIB_TX] ==
368              (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX];
369
370           if (!lookup_for_responses_to_locally_received_packets)
371             {
372               mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
373
374               leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, dst_addr0);
375             }
376
377           tcp0 = (void *) (ip0 + 1);
378
379           is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
380                          || ip0->protocol == IP_PROTOCOL_UDP);
381
382           if (!lookup_for_responses_to_locally_received_packets)
383             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
384
385           if (!lookup_for_responses_to_locally_received_packets)
386             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
387
388           if (!lookup_for_responses_to_locally_received_packets)
389             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
390
391           if (lookup_for_responses_to_locally_received_packets)
392             lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
393           else
394             {
395               /* Handle default route. */
396               lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
397             }
398
399           ASSERT (lbi0);
400           lb0 = load_balance_get (lbi0);
401
402           /* Use flow hash to compute multipath adjacency. */
403           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
404           if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
405             {
406               flow_hash_config0 = lb0->lb_hash_config;
407
408               hash_c0 = vnet_buffer (p0)->ip.flow_hash =
409                 ip4_compute_flow_hash (ip0, flow_hash_config0);
410             }
411
412           ASSERT (lb0->lb_n_buckets > 0);
413           ASSERT (is_pow2 (lb0->lb_n_buckets));
414
415           dpo0 = load_balance_get_bucket_i (lb0,
416                                             (hash_c0 &
417                                              (lb0->lb_n_buckets_minus_1)));
418
419           next0 = dpo0->dpoi_next_node;
420           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
421
422           vlib_increment_combined_counter
423             (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
424
425           from += 1;
426           to_next += 1;
427           n_left_to_next -= 1;
428           n_left_from -= 1;
429
430           if (PREDICT_FALSE (next0 != next))
431             {
432               n_left_to_next += 1;
433               vlib_put_next_frame (vm, node, next, n_left_to_next);
434               next = next0;
435               vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
436               to_next[0] = pi0;
437               to_next += 1;
438               n_left_to_next -= 1;
439             }
440         }
441
442       vlib_put_next_frame (vm, node, next, n_left_to_next);
443     }
444
445   if (node->flags & VLIB_NODE_FLAG_TRACE)
446     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
447
448   return frame->n_vectors;
449 }
450
451 /** @brief IPv4 lookup node.
452     @node ip4-lookup
453
454     This is the main IPv4 lookup dispatch node.
455
456     @param vm vlib_main_t corresponding to the current thread
457     @param node vlib_node_runtime_t
458     @param frame vlib_frame_t whose contents should be dispatched
459
460     @par Graph mechanics: buffer metadata, next index usage
461
462     @em Uses:
463     - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
464         - Indicates the @c sw_if_index value of the interface that the
465           packet was received on.
466     - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
467         - When the value is @c ~0 then the node performs a longest prefix
468           match (LPM) for the packet destination address in the FIB attached
469           to the receive interface.
470         - Otherwise perform LPM for the packet destination address in the
471           indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
472           value (0, 1, ...) and not a VRF id.
473
474     @em Sets:
475     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
476         - The lookup result adjacency index.
477
478     <em>Next Index:</em>
479     - Dispatches the packet to the node index found in
480       ip_adjacency_t @c adj->lookup_next_index
481       (where @c adj is the lookup result adjacency).
482 */
483 static uword
484 ip4_lookup (vlib_main_t * vm,
485             vlib_node_runtime_t * node, vlib_frame_t * frame)
486 {
487   return ip4_lookup_inline (vm, node, frame,
488                             /* lookup_for_responses_to_locally_received_packets */
489                             0);
490
491 }
492
493 static u8 *format_ip4_lookup_trace (u8 * s, va_list * args);
494
495 VLIB_REGISTER_NODE (ip4_lookup_node) =
496 {
497 .function = ip4_lookup,.name = "ip4-lookup",.vector_size =
498     sizeof (u32),.format_trace = format_ip4_lookup_trace,.n_next_nodes =
499     IP_LOOKUP_N_NEXT,.next_nodes = IP4_LOOKUP_NEXT_NODES,};
500
501 VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_node, ip4_lookup);
502
503 always_inline uword
504 ip4_load_balance (vlib_main_t * vm,
505                   vlib_node_runtime_t * node, vlib_frame_t * frame)
506 {
507   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
508   u32 n_left_from, n_left_to_next, *from, *to_next;
509   ip_lookup_next_t next;
510   u32 cpu_index = os_get_cpu_number ();
511
512   from = vlib_frame_vector_args (frame);
513   n_left_from = frame->n_vectors;
514   next = node->cached_next_index;
515
516   if (node->flags & VLIB_NODE_FLAG_TRACE)
517     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
518
519   while (n_left_from > 0)
520     {
521       vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
522
523
524       while (n_left_from >= 4 && n_left_to_next >= 2)
525         {
526           ip_lookup_next_t next0, next1;
527           const load_balance_t *lb0, *lb1;
528           vlib_buffer_t *p0, *p1;
529           u32 pi0, lbi0, hc0, pi1, lbi1, hc1;
530           const ip4_header_t *ip0, *ip1;
531           const dpo_id_t *dpo0, *dpo1;
532
533           /* Prefetch next iteration. */
534           {
535             vlib_buffer_t *p2, *p3;
536
537             p2 = vlib_get_buffer (vm, from[2]);
538             p3 = vlib_get_buffer (vm, from[3]);
539
540             vlib_prefetch_buffer_header (p2, STORE);
541             vlib_prefetch_buffer_header (p3, STORE);
542
543             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
544             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
545           }
546
547           pi0 = to_next[0] = from[0];
548           pi1 = to_next[1] = from[1];
549
550           from += 2;
551           n_left_from -= 2;
552           to_next += 2;
553           n_left_to_next -= 2;
554
555           p0 = vlib_get_buffer (vm, pi0);
556           p1 = vlib_get_buffer (vm, pi1);
557
558           ip0 = vlib_buffer_get_current (p0);
559           ip1 = vlib_buffer_get_current (p1);
560           lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
561           lbi1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX];
562
563           lb0 = load_balance_get (lbi0);
564           lb1 = load_balance_get (lbi1);
565
566           /*
567            * this node is for via FIBs we can re-use the hash value from the
568            * to node if present.
569            * We don't want to use the same hash value at each level in the recursion
570            * graph as that would lead to polarisation
571            */
572           hc0 = hc1 = 0;
573
574           if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
575             {
576               if (PREDICT_TRUE (vnet_buffer (p0)->ip.flow_hash))
577                 {
578                   hc0 = vnet_buffer (p0)->ip.flow_hash =
579                     vnet_buffer (p0)->ip.flow_hash >> 1;
580                 }
581               else
582                 {
583                   hc0 = vnet_buffer (p0)->ip.flow_hash =
584                     ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
585                 }
586             }
587           if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
588             {
589               if (PREDICT_TRUE (vnet_buffer (p1)->ip.flow_hash))
590                 {
591                   hc1 = vnet_buffer (p1)->ip.flow_hash =
592                     vnet_buffer (p1)->ip.flow_hash >> 1;
593                 }
594               else
595                 {
596                   hc1 = vnet_buffer (p1)->ip.flow_hash =
597                     ip4_compute_flow_hash (ip1, lb1->lb_hash_config);
598                 }
599             }
600
601           dpo0 =
602             load_balance_get_bucket_i (lb0,
603                                        hc0 & (lb0->lb_n_buckets_minus_1));
604           dpo1 =
605             load_balance_get_bucket_i (lb1,
606                                        hc1 & (lb1->lb_n_buckets_minus_1));
607
608           next0 = dpo0->dpoi_next_node;
609           next1 = dpo1->dpoi_next_node;
610
611           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
612           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
613
614           vlib_increment_combined_counter
615             (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
616           vlib_increment_combined_counter
617             (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));
618
619           vlib_validate_buffer_enqueue_x2 (vm, node, next,
620                                            to_next, n_left_to_next,
621                                            pi0, pi1, next0, next1);
622         }
623
624       while (n_left_from > 0 && n_left_to_next > 0)
625         {
626           ip_lookup_next_t next0;
627           const load_balance_t *lb0;
628           vlib_buffer_t *p0;
629           u32 pi0, lbi0, hc0;
630           const ip4_header_t *ip0;
631           const dpo_id_t *dpo0;
632
633           pi0 = from[0];
634           to_next[0] = pi0;
635           from += 1;
636           to_next += 1;
637           n_left_to_next -= 1;
638           n_left_from -= 1;
639
640           p0 = vlib_get_buffer (vm, pi0);
641
642           ip0 = vlib_buffer_get_current (p0);
643           lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
644
645           lb0 = load_balance_get (lbi0);
646
647           hc0 = 0;
648           if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
649             {
650               if (PREDICT_TRUE (vnet_buffer (p0)->ip.flow_hash))
651                 {
652                   hc0 = vnet_buffer (p0)->ip.flow_hash =
653                     vnet_buffer (p0)->ip.flow_hash >> 1;
654                 }
655               else
656                 {
657                   hc0 = vnet_buffer (p0)->ip.flow_hash =
658                     ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
659                 }
660             }
661
662           dpo0 =
663             load_balance_get_bucket_i (lb0,
664                                        hc0 & (lb0->lb_n_buckets_minus_1));
665
666           next0 = dpo0->dpoi_next_node;
667           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
668
669           vlib_increment_combined_counter
670             (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
671
672           vlib_validate_buffer_enqueue_x1 (vm, node, next,
673                                            to_next, n_left_to_next,
674                                            pi0, next0);
675         }
676
677       vlib_put_next_frame (vm, node, next, n_left_to_next);
678     }
679
680   return frame->n_vectors;
681 }
682
683 VLIB_REGISTER_NODE (ip4_load_balance_node) =
684 {
685 .function = ip4_load_balance,.name = "ip4-load-balance",.vector_size =
686     sizeof (u32),.sibling_of = "ip4-lookup",.format_trace =
687     format_ip4_lookup_trace,};
688
689 VLIB_NODE_FUNCTION_MULTIARCH (ip4_load_balance_node, ip4_load_balance);
690
691 /* get first interface address */
692 ip4_address_t *
693 ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
694                              ip_interface_address_t ** result_ia)
695 {
696   ip_lookup_main_t *lm = &im->lookup_main;
697   ip_interface_address_t *ia = 0;
698   ip4_address_t *result = 0;
699
700   /* *INDENT-OFF* */
701   foreach_ip_interface_address
702     (lm, ia, sw_if_index,
703      1 /* honor unnumbered */ ,
704      ({
705        ip4_address_t * a =
706          ip_interface_address_get_address (lm, ia);
707        result = a;
708        break;
709      }));
710   /* *INDENT-OFF* */
711   if (result_ia)
712     *result_ia = result ? ia : 0;
713   return result;
714 }
715
716 static void
717 ip4_add_interface_routes (u32 sw_if_index,
718                           ip4_main_t * im, u32 fib_index,
719                           ip_interface_address_t * a)
720 {
721   ip_lookup_main_t *lm = &im->lookup_main;
722   ip4_address_t *address = ip_interface_address_get_address (lm, a);
723   fib_prefix_t pfx = {
724     .fp_len = a->address_length,
725     .fp_proto = FIB_PROTOCOL_IP4,
726     .fp_addr.ip4 = *address,
727   };
728
729   a->neighbor_probe_adj_index = ~0;
730
731   if (pfx.fp_len <= 30)
732     {
733       /* a /30 or shorter - add a glean for the network address */
734       fib_node_index_t fei;
735
736       fei = fib_table_entry_update_one_path (fib_index, &pfx,
737                                              FIB_SOURCE_INTERFACE,
738                                              (FIB_ENTRY_FLAG_CONNECTED |
739                                               FIB_ENTRY_FLAG_ATTACHED),
740                                              FIB_PROTOCOL_IP4,
741                                              /* No next-hop address */
742                                              NULL,
743                                              sw_if_index,
744                                              // invalid FIB index
745                                              ~0,
746                                              1,
747                                              // no out-label stack
748                                              NULL,
749                                              FIB_ROUTE_PATH_FLAG_NONE);
750       a->neighbor_probe_adj_index = fib_entry_get_adj (fei);
751
752       /* Add the two broadcast addresses as drop */
753       fib_prefix_t net_pfx = {
754         .fp_len = 32,
755         .fp_proto = FIB_PROTOCOL_IP4,
756         .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[pfx.fp_len],
757       };
758       if (net_pfx.fp_addr.ip4.as_u32 != pfx.fp_addr.ip4.as_u32)
759         fib_table_entry_special_add(fib_index,
760                                     &net_pfx,
761                                     FIB_SOURCE_INTERFACE,
762                                     (FIB_ENTRY_FLAG_DROP |
763                                      FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT),
764                                     ADJ_INDEX_INVALID);
765       net_pfx.fp_addr.ip4.as_u32 |= ~im->fib_masks[pfx.fp_len];
766       if (net_pfx.fp_addr.ip4.as_u32 != pfx.fp_addr.ip4.as_u32)
767         fib_table_entry_special_add(fib_index,
768                                     &net_pfx,
769                                     FIB_SOURCE_INTERFACE,
770                                     (FIB_ENTRY_FLAG_DROP |
771                                      FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT),
772                                     ADJ_INDEX_INVALID);
773     }
774   else if (pfx.fp_len == 31)
775     {
776       u32 mask = clib_host_to_net_u32(1);
777       fib_prefix_t net_pfx = pfx;
778
779       net_pfx.fp_len = 32;
780       net_pfx.fp_addr.ip4.as_u32 ^= mask;
781
782       /* a /31 - add the other end as an attached host */
783       fib_table_entry_update_one_path (fib_index, &net_pfx,
784                                        FIB_SOURCE_INTERFACE,
785                                        (FIB_ENTRY_FLAG_ATTACHED),
786                                        FIB_PROTOCOL_IP4,
787                                        &net_pfx.fp_addr,
788                                        sw_if_index,
789                                        // invalid FIB index
790                                        ~0,
791                                        1,
792                                        NULL,
793                                        FIB_ROUTE_PATH_FLAG_NONE);
794     }
795   pfx.fp_len = 32;
796
797   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
798     {
799       u32 classify_table_index =
800         lm->classify_table_index_by_sw_if_index[sw_if_index];
801       if (classify_table_index != (u32) ~ 0)
802         {
803           dpo_id_t dpo = DPO_INVALID;
804
805           dpo_set (&dpo,
806                    DPO_CLASSIFY,
807                    DPO_PROTO_IP4,
808                    classify_dpo_create (DPO_PROTO_IP4, classify_table_index));
809
810           fib_table_entry_special_dpo_add (fib_index,
811                                            &pfx,
812                                            FIB_SOURCE_CLASSIFY,
813                                            FIB_ENTRY_FLAG_NONE, &dpo);
814           dpo_reset (&dpo);
815         }
816     }
817
818   fib_table_entry_update_one_path (fib_index, &pfx,
819                                    FIB_SOURCE_INTERFACE,
820                                    (FIB_ENTRY_FLAG_CONNECTED |
821                                     FIB_ENTRY_FLAG_LOCAL),
822                                    FIB_PROTOCOL_IP4,
823                                    &pfx.fp_addr,
824                                    sw_if_index,
825                                    // invalid FIB index
826                                    ~0,
827                                    1, NULL,
828                                    FIB_ROUTE_PATH_FLAG_NONE);
829 }
830
831 static void
832 ip4_del_interface_routes (ip4_main_t * im,
833                           u32 fib_index,
834                           ip4_address_t * address, u32 address_length)
835 {
836   fib_prefix_t pfx = {
837     .fp_len = address_length,
838     .fp_proto = FIB_PROTOCOL_IP4,
839     .fp_addr.ip4 = *address,
840   };
841
842   if (pfx.fp_len <= 30)
843     {
844       fib_prefix_t net_pfx = {
845         .fp_len = 32,
846         .fp_proto = FIB_PROTOCOL_IP4,
847         .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[pfx.fp_len],
848       };
849       if (net_pfx.fp_addr.ip4.as_u32 != pfx.fp_addr.ip4.as_u32)
850         fib_table_entry_special_remove(fib_index,
851                                        &net_pfx,
852                                        FIB_SOURCE_INTERFACE);
853       net_pfx.fp_addr.ip4.as_u32 |= ~im->fib_masks[pfx.fp_len];
854       if (net_pfx.fp_addr.ip4.as_u32 != pfx.fp_addr.ip4.as_u32)
855         fib_table_entry_special_remove(fib_index,
856                                        &net_pfx,
857                                        FIB_SOURCE_INTERFACE);
858       fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
859     }
860     else if (pfx.fp_len == 31)
861     {
862       u32 mask = clib_host_to_net_u32(1);
863       fib_prefix_t net_pfx = pfx;
864
865       net_pfx.fp_len = 32;
866       net_pfx.fp_addr.ip4.as_u32 ^= mask;
867
868       fib_table_entry_delete (fib_index, &net_pfx, FIB_SOURCE_INTERFACE);
869     }
870
871   pfx.fp_len = 32;
872   fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
873 }
874
875 void
876 ip4_sw_interface_enable_disable (u32 sw_if_index, u32 is_enable)
877 {
878   ip4_main_t *im = &ip4_main;
879
880   vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
881
882   /*
883    * enable/disable only on the 1<->0 transition
884    */
885   if (is_enable)
886     {
887       if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
888         return;
889     }
890   else
891     {
892       ASSERT (im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
893       if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
894         return;
895     }
896   vnet_feature_enable_disable ("ip4-unicast", "ip4-drop", sw_if_index,
897                                !is_enable, 0, 0);
898
899
900   vnet_feature_enable_disable ("ip4-multicast", "ip4-drop",
901                                sw_if_index, !is_enable, 0, 0);
902 }
903
904 static clib_error_t *
905 ip4_add_del_interface_address_internal (vlib_main_t * vm,
906                                         u32 sw_if_index,
907                                         ip4_address_t * address,
908                                         u32 address_length, u32 is_del)
909 {
910   vnet_main_t *vnm = vnet_get_main ();
911   ip4_main_t *im = &ip4_main;
912   ip_lookup_main_t *lm = &im->lookup_main;
913   clib_error_t *error = 0;
914   u32 if_address_index, elts_before;
915   ip4_address_fib_t ip4_af, *addr_fib = 0;
916
917   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
918   ip4_addr_fib_init (&ip4_af, address,
919                      vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
920   vec_add1 (addr_fib, ip4_af);
921
922   /* FIXME-LATER
923    * there is no support for adj-fib handling in the presence of overlapping
924    * subnets on interfaces. Easy fix - disallow overlapping subnets, like
925    * most routers do.
926    */
927   /* *INDENT-OFF* */
928   if (!is_del)
929     {
930       /* When adding an address check that it does not conflict
931          with an existing address. */
932       ip_interface_address_t *ia;
933       foreach_ip_interface_address
934         (&im->lookup_main, ia, sw_if_index,
935          0 /* honor unnumbered */ ,
936          ({
937            ip4_address_t * x =
938              ip_interface_address_get_address
939              (&im->lookup_main, ia);
940            if (ip4_destination_matches_route
941                (im, address, x, ia->address_length) ||
942                ip4_destination_matches_route (im,
943                                               x,
944                                               address,
945                                               address_length))
946              return
947                clib_error_create
948                ("failed to add %U which conflicts with %U for interface %U",
949                 format_ip4_address_and_length, address,
950                 address_length,
951                 format_ip4_address_and_length, x,
952                 ia->address_length,
953                 format_vnet_sw_if_index_name, vnm,
954                 sw_if_index);
955          }));
956     }
957   /* *INDENT-ON* */
958
959   elts_before = pool_elts (lm->if_address_pool);
960
961   error = ip_interface_address_add_del
962     (lm, sw_if_index, addr_fib, address_length, is_del, &if_address_index);
963   if (error)
964     goto done;
965
966   ip4_sw_interface_enable_disable (sw_if_index, !is_del);
967
968   if (is_del)
969     ip4_del_interface_routes (im, ip4_af.fib_index, address, address_length);
970   else
971     ip4_add_interface_routes (sw_if_index,
972                               im, ip4_af.fib_index,
973                               pool_elt_at_index
974                               (lm->if_address_pool, if_address_index));
975
976   /* If pool did not grow/shrink: add duplicate address. */
977   if (elts_before != pool_elts (lm->if_address_pool))
978     {
979       ip4_add_del_interface_address_callback_t *cb;
980       vec_foreach (cb, im->add_del_interface_address_callbacks)
981         cb->function (im, cb->function_opaque, sw_if_index,
982                       address, address_length, if_address_index, is_del);
983     }
984
985 done:
986   vec_free (addr_fib);
987   return error;
988 }
989
990 clib_error_t *
991 ip4_add_del_interface_address (vlib_main_t * vm,
992                                u32 sw_if_index,
993                                ip4_address_t * address,
994                                u32 address_length, u32 is_del)
995 {
996   return ip4_add_del_interface_address_internal
997     (vm, sw_if_index, address, address_length, is_del);
998 }
999
1000 /* Built-in ip4 unicast rx feature path definition */
1001 /* *INDENT-OFF* */
1002 VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
1003 {
1004   .arc_name = "ip4-unicast",
1005   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
1006   .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
1007 };
1008
1009 VNET_FEATURE_INIT (ip4_flow_classify, static) =
1010 {
1011   .arc_name = "ip4-unicast",
1012   .node_name = "ip4-flow-classify",
1013   .runs_before = VNET_FEATURES ("ip4-inacl"),
1014 };
1015
1016 VNET_FEATURE_INIT (ip4_inacl, static) =
1017 {
1018   .arc_name = "ip4-unicast",
1019   .node_name = "ip4-inacl",
1020   .runs_before = VNET_FEATURES ("ip4-source-check-via-rx"),
1021 };
1022
1023 VNET_FEATURE_INIT (ip4_source_check_1, static) =
1024 {
1025   .arc_name = "ip4-unicast",
1026   .node_name = "ip4-source-check-via-rx",
1027   .runs_before = VNET_FEATURES ("ip4-source-check-via-any"),
1028 };
1029
1030 VNET_FEATURE_INIT (ip4_source_check_2, static) =
1031 {
1032   .arc_name = "ip4-unicast",
1033   .node_name = "ip4-source-check-via-any",
1034   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
1035 };
1036
1037 VNET_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) =
1038 {
1039   .arc_name = "ip4-unicast",
1040   .node_name = "ip4-source-and-port-range-check-rx",
1041   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
1042 };
1043
1044 VNET_FEATURE_INIT (ip4_policer_classify, static) =
1045 {
1046   .arc_name = "ip4-unicast",
1047   .node_name = "ip4-policer-classify",
1048   .runs_before = VNET_FEATURES ("ipsec-input-ip4"),
1049 };
1050
1051 VNET_FEATURE_INIT (ip4_ipsec, static) =
1052 {
1053   .arc_name = "ip4-unicast",
1054   .node_name = "ipsec-input-ip4",
1055   .runs_before = VNET_FEATURES ("vpath-input-ip4"),
1056 };
1057
1058 VNET_FEATURE_INIT (ip4_vpath, static) =
1059 {
1060   .arc_name = "ip4-unicast",
1061   .node_name = "vpath-input-ip4",
1062   .runs_before = VNET_FEATURES ("ip4-vxlan-bypass"),
1063 };
1064
1065 VNET_FEATURE_INIT (ip4_vxlan_bypass, static) =
1066 {
1067   .arc_name = "ip4-unicast",
1068   .node_name = "ip4-vxlan-bypass",
1069   .runs_before = VNET_FEATURES ("ip4-lookup"),
1070 };
1071
1072 VNET_FEATURE_INIT (ip4_drop, static) =
1073 {
1074   .arc_name = "ip4-unicast",
1075   .node_name = "ip4-drop",
1076   .runs_before = VNET_FEATURES ("ip4-lookup"),
1077 };
1078
1079 VNET_FEATURE_INIT (ip4_lookup, static) =
1080 {
1081   .arc_name = "ip4-unicast",
1082   .node_name = "ip4-lookup",
1083   .runs_before = 0,     /* not before any other features */
1084 };
1085
1086 /* Built-in ip4 multicast rx feature path definition */
1087 VNET_FEATURE_ARC_INIT (ip4_multicast, static) =
1088 {
1089   .arc_name = "ip4-multicast",
1090   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
1091   .arc_index_ptr = &ip4_main.lookup_main.mcast_feature_arc_index,
1092 };
1093
1094 VNET_FEATURE_INIT (ip4_vpath_mc, static) =
1095 {
1096   .arc_name = "ip4-multicast",
1097   .node_name = "vpath-input-ip4",
1098   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
1099 };
1100
1101 VNET_FEATURE_INIT (ip4_mc_drop, static) =
1102 {
1103   .arc_name = "ip4-multicast",
1104   .node_name = "ip4-drop",
1105   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
1106 };
1107
1108 VNET_FEATURE_INIT (ip4_lookup_mc, static) =
1109 {
1110   .arc_name = "ip4-multicast",
1111   .node_name = "ip4-mfib-forward-lookup",
1112   .runs_before = 0,     /* last feature */
1113 };
1114
1115 /* Source and port-range check ip4 tx feature path definition */
1116 VNET_FEATURE_ARC_INIT (ip4_output, static) =
1117 {
1118   .arc_name = "ip4-output",
1119   .start_nodes = VNET_FEATURES ("ip4-rewrite", "ip4-midchain"),
1120   .arc_index_ptr = &ip4_main.lookup_main.output_feature_arc_index,
1121 };
1122
1123 VNET_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) =
1124 {
1125   .arc_name = "ip4-output",
1126   .node_name = "ip4-source-and-port-range-check-tx",
1127   .runs_before = VNET_FEATURES ("ipsec-output-ip4"),
1128 };
1129
1130 VNET_FEATURE_INIT (ip4_ipsec_output, static) =
1131 {
1132   .arc_name = "ip4-output",
1133   .node_name = "ipsec-output-ip4",
1134   .runs_before = VNET_FEATURES ("interface-output"),
1135 };
1136
1137 /* Built-in ip4 tx feature path definition */
1138 VNET_FEATURE_INIT (ip4_interface_output, static) =
1139 {
1140   .arc_name = "ip4-output",
1141   .node_name = "interface-output",
1142   .runs_before = 0,     /* not before any other features */
1143 };
1144 /* *INDENT-ON* */
1145
1146 static clib_error_t *
1147 ip4_sw_interface_add_del (vnet_main_t * vnm, u32 sw_if_index, u32 is_add)
1148 {
1149   ip4_main_t *im = &ip4_main;
1150
1151   /* Fill in lookup tables with default table (0). */
1152   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
1153   vec_validate (im->mfib_index_by_sw_if_index, sw_if_index);
1154
1155   vnet_feature_enable_disable ("ip4-unicast", "ip4-drop", sw_if_index,
1156                                is_add, 0, 0);
1157
1158   vnet_feature_enable_disable ("ip4-multicast", "ip4-drop", sw_if_index,
1159                                is_add, 0, 0);
1160
1161   return /* no error */ 0;
1162 }
1163
1164 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
1165
1166 /* Global IP4 main. */
1167 ip4_main_t ip4_main;
1168
1169 clib_error_t *
1170 ip4_lookup_init (vlib_main_t * vm)
1171 {
1172   ip4_main_t *im = &ip4_main;
1173   clib_error_t *error;
1174   uword i;
1175
1176   if ((error = vlib_call_init_function (vm, vnet_feature_init)))
1177     return error;
1178
1179   for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
1180     {
1181       u32 m;
1182
1183       if (i < 32)
1184         m = pow2_mask (i) << (32 - i);
1185       else
1186         m = ~0;
1187       im->fib_masks[i] = clib_host_to_net_u32 (m);
1188     }
1189
1190   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
1191
1192   /* Create FIB with index 0 and table id of 0. */
1193   fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0);
1194   mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0);
1195
1196   {
1197     pg_node_t *pn;
1198     pn = pg_get_node (ip4_lookup_node.index);
1199     pn->unformat_edit = unformat_pg_ip4_header;
1200   }
1201
1202   {
1203     ethernet_arp_header_t h;
1204
1205     memset (&h, 0, sizeof (h));
1206
1207     /* Set target ethernet address to all zeros. */
1208     memset (h.ip4_over_ethernet[1].ethernet, 0,
1209             sizeof (h.ip4_over_ethernet[1].ethernet));
1210
1211 #define _16(f,v) h.f = clib_host_to_net_u16 (v);
1212 #define _8(f,v) h.f = v;
1213     _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
1214     _16 (l3_type, ETHERNET_TYPE_IP4);
1215     _8 (n_l2_address_bytes, 6);
1216     _8 (n_l3_address_bytes, 4);
1217     _16 (opcode, ETHERNET_ARP_OPCODE_request);
1218 #undef _16
1219 #undef _8
1220
1221     vlib_packet_template_init (vm, &im->ip4_arp_request_packet_template,
1222                                /* data */ &h,
1223                                sizeof (h),
1224                                /* alloc chunk size */ 8,
1225                                "ip4 arp");
1226   }
1227
1228   return error;
1229 }
1230
1231 VLIB_INIT_FUNCTION (ip4_lookup_init);
1232
1233 typedef struct
1234 {
1235   /* Adjacency taken. */
1236   u32 dpo_index;
1237   u32 flow_hash;
1238   u32 fib_index;
1239
1240   /* Packet data, possibly *after* rewrite. */
1241   u8 packet_data[64 - 1 * sizeof (u32)];
1242 }
1243 ip4_forward_next_trace_t;
1244
1245 u8 *
1246 format_ip4_forward_next_trace (u8 * s, va_list * args)
1247 {
1248   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1249   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1250   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1251   uword indent = format_get_indent (s);
1252   s = format (s, "%U%U",
1253               format_white_space, indent,
1254               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1255   return s;
1256 }
1257
1258 static u8 *
1259 format_ip4_lookup_trace (u8 * s, va_list * args)
1260 {
1261   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1262   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1263   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1264   uword indent = format_get_indent (s);
1265
1266   s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
1267               t->fib_index, t->dpo_index, t->flow_hash);
1268   s = format (s, "\n%U%U",
1269               format_white_space, indent,
1270               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1271   return s;
1272 }
1273
1274 static u8 *
1275 format_ip4_rewrite_trace (u8 * s, va_list * args)
1276 {
1277   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1278   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1279   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1280   uword indent = format_get_indent (s);
1281
1282   s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
1283               t->fib_index, t->dpo_index, format_ip_adjacency,
1284               t->dpo_index, FORMAT_IP_ADJACENCY_NONE, t->flow_hash);
1285   s = format (s, "\n%U%U",
1286               format_white_space, indent,
1287               format_ip_adjacency_packet_data,
1288               t->dpo_index, t->packet_data, sizeof (t->packet_data));
1289   return s;
1290 }
1291
1292 /* Common trace function for all ip4-forward next nodes. */
1293 void
1294 ip4_forward_next_trace (vlib_main_t * vm,
1295                         vlib_node_runtime_t * node,
1296                         vlib_frame_t * frame, vlib_rx_or_tx_t which_adj_index)
1297 {
1298   u32 *from, n_left;
1299   ip4_main_t *im = &ip4_main;
1300
1301   n_left = frame->n_vectors;
1302   from = vlib_frame_vector_args (frame);
1303
1304   while (n_left >= 4)
1305     {
1306       u32 bi0, bi1;
1307       vlib_buffer_t *b0, *b1;
1308       ip4_forward_next_trace_t *t0, *t1;
1309
1310       /* Prefetch next iteration. */
1311       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1312       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1313
1314       bi0 = from[0];
1315       bi1 = from[1];
1316
1317       b0 = vlib_get_buffer (vm, bi0);
1318       b1 = vlib_get_buffer (vm, bi1);
1319
1320       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1321         {
1322           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1323           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1324           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1325           t0->fib_index =
1326             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1327              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1328             vec_elt (im->fib_index_by_sw_if_index,
1329                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1330
1331           clib_memcpy (t0->packet_data,
1332                        vlib_buffer_get_current (b0),
1333                        sizeof (t0->packet_data));
1334         }
1335       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1336         {
1337           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1338           t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1339           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1340           t1->fib_index =
1341             (vnet_buffer (b1)->sw_if_index[VLIB_TX] !=
1342              (u32) ~ 0) ? vnet_buffer (b1)->sw_if_index[VLIB_TX] :
1343             vec_elt (im->fib_index_by_sw_if_index,
1344                      vnet_buffer (b1)->sw_if_index[VLIB_RX]);
1345           clib_memcpy (t1->packet_data, vlib_buffer_get_current (b1),
1346                        sizeof (t1->packet_data));
1347         }
1348       from += 2;
1349       n_left -= 2;
1350     }
1351
1352   while (n_left >= 1)
1353     {
1354       u32 bi0;
1355       vlib_buffer_t *b0;
1356       ip4_forward_next_trace_t *t0;
1357
1358       bi0 = from[0];
1359
1360       b0 = vlib_get_buffer (vm, bi0);
1361
1362       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1363         {
1364           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1365           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1366           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1367           t0->fib_index =
1368             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1369              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1370             vec_elt (im->fib_index_by_sw_if_index,
1371                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1372           clib_memcpy (t0->packet_data, vlib_buffer_get_current (b0),
1373                        sizeof (t0->packet_data));
1374         }
1375       from += 1;
1376       n_left -= 1;
1377     }
1378 }
1379
1380 static uword
1381 ip4_drop_or_punt (vlib_main_t * vm,
1382                   vlib_node_runtime_t * node,
1383                   vlib_frame_t * frame, ip4_error_t error_code)
1384 {
1385   u32 *buffers = vlib_frame_vector_args (frame);
1386   uword n_packets = frame->n_vectors;
1387
1388   vlib_error_drop_buffers (vm, node, buffers,
1389                            /* stride */ 1,
1390                            n_packets,
1391                            /* next */ 0,
1392                            ip4_input_node.index, error_code);
1393
1394   if (node->flags & VLIB_NODE_FLAG_TRACE)
1395     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1396
1397   return n_packets;
1398 }
1399
1400 static uword
1401 ip4_drop (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
1402 {
1403   return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_DROP);
1404 }
1405
1406 static uword
1407 ip4_punt (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
1408 {
1409   return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_PUNT);
1410 }
1411
1412 /* *INDENT-OFF* */
1413 VLIB_REGISTER_NODE (ip4_drop_node, static) =
1414 {
1415   .function = ip4_drop,.
1416   name = "ip4-drop",
1417   .vector_size = sizeof (u32),
1418   .format_trace = format_ip4_forward_next_trace,
1419   .n_next_nodes = 1,
1420   .next_nodes = {
1421     [0] = "error-drop",
1422   },
1423 };
1424
1425 VLIB_NODE_FUNCTION_MULTIARCH (ip4_drop_node, ip4_drop);
1426
1427 VLIB_REGISTER_NODE (ip4_punt_node, static) =
1428 {
1429   .function = ip4_punt,
1430   .name = "ip4-punt",
1431   .vector_size = sizeof (u32),
1432   .format_trace = format_ip4_forward_next_trace,
1433   .n_next_nodes = 1,
1434   .next_nodes = {
1435     [0] = "error-punt",
1436   },
1437 };
1438
1439 VLIB_NODE_FUNCTION_MULTIARCH (ip4_punt_node, ip4_punt);
1440 /* *INDENT-ON */
1441
1442 /* Compute TCP/UDP/ICMP4 checksum in software. */
1443 u16
1444 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
1445                               ip4_header_t * ip0)
1446 {
1447   ip_csum_t sum0;
1448   u32 ip_header_length, payload_length_host_byte_order;
1449   u32 n_this_buffer, n_bytes_left;
1450   u16 sum16;
1451   void *data_this_buffer;
1452
1453   /* Initialize checksum with ip header. */
1454   ip_header_length = ip4_header_bytes (ip0);
1455   payload_length_host_byte_order =
1456     clib_net_to_host_u16 (ip0->length) - ip_header_length;
1457   sum0 =
1458     clib_host_to_net_u32 (payload_length_host_byte_order +
1459                           (ip0->protocol << 16));
1460
1461   if (BITS (uword) == 32)
1462     {
1463       sum0 =
1464         ip_csum_with_carry (sum0,
1465                             clib_mem_unaligned (&ip0->src_address, u32));
1466       sum0 =
1467         ip_csum_with_carry (sum0,
1468                             clib_mem_unaligned (&ip0->dst_address, u32));
1469     }
1470   else
1471     sum0 =
1472       ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
1473
1474   n_bytes_left = n_this_buffer = payload_length_host_byte_order;
1475   data_this_buffer = (void *) ip0 + ip_header_length;
1476   if (n_this_buffer + ip_header_length > p0->current_length)
1477     n_this_buffer =
1478       p0->current_length >
1479       ip_header_length ? p0->current_length - ip_header_length : 0;
1480   while (1)
1481     {
1482       sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
1483       n_bytes_left -= n_this_buffer;
1484       if (n_bytes_left == 0)
1485         break;
1486
1487       ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT);
1488       p0 = vlib_get_buffer (vm, p0->next_buffer);
1489       data_this_buffer = vlib_buffer_get_current (p0);
1490       n_this_buffer = p0->current_length;
1491     }
1492
1493   sum16 = ~ip_csum_fold (sum0);
1494
1495   return sum16;
1496 }
1497
1498 u32
1499 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1500 {
1501   ip4_header_t *ip0 = vlib_buffer_get_current (p0);
1502   udp_header_t *udp0;
1503   u16 sum16;
1504
1505   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1506           || ip0->protocol == IP_PROTOCOL_UDP);
1507
1508   udp0 = (void *) (ip0 + 1);
1509   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1510     {
1511       p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
1512                     | IP_BUFFER_L4_CHECKSUM_CORRECT);
1513       return p0->flags;
1514     }
1515
1516   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1517
1518   p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
1519                 | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT));
1520
1521   return p0->flags;
1522 }
1523
1524 /* *INDENT-OFF* */
1525 VNET_FEATURE_ARC_INIT (ip4_local) =
1526 {
1527   .arc_name  = "ip4-local",
1528   .start_nodes = VNET_FEATURES ("ip4-local"),
1529 };
1530 /* *INDENT-ON* */
1531
1532 static inline uword
1533 ip4_local_inline (vlib_main_t * vm,
1534                   vlib_node_runtime_t * node,
1535                   vlib_frame_t * frame, int head_of_feature_arc)
1536 {
1537   ip4_main_t *im = &ip4_main;
1538   ip_lookup_main_t *lm = &im->lookup_main;
1539   ip_local_next_t next_index;
1540   u32 *from, *to_next, n_left_from, n_left_to_next;
1541   vlib_node_runtime_t *error_node =
1542     vlib_node_get_runtime (vm, ip4_input_node.index);
1543   u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index;
1544
1545   from = vlib_frame_vector_args (frame);
1546   n_left_from = frame->n_vectors;
1547   next_index = node->cached_next_index;
1548
1549   if (node->flags & VLIB_NODE_FLAG_TRACE)
1550     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1551
1552   while (n_left_from > 0)
1553     {
1554       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1555
1556       while (n_left_from >= 4 && n_left_to_next >= 2)
1557         {
1558           vlib_buffer_t *p0, *p1;
1559           ip4_header_t *ip0, *ip1;
1560           udp_header_t *udp0, *udp1;
1561           ip4_fib_mtrie_t *mtrie0, *mtrie1;
1562           ip4_fib_mtrie_leaf_t leaf0, leaf1;
1563           const dpo_id_t *dpo0, *dpo1;
1564           const load_balance_t *lb0, *lb1;
1565           u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, lbi0;
1566           u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, lbi1;
1567           i32 len_diff0, len_diff1;
1568           u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
1569           u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1;
1570           u32 sw_if_index0, sw_if_index1;
1571
1572           pi0 = to_next[0] = from[0];
1573           pi1 = to_next[1] = from[1];
1574           from += 2;
1575           n_left_from -= 2;
1576           to_next += 2;
1577           n_left_to_next -= 2;
1578
1579           next0 = next1 = IP_LOCAL_NEXT_DROP;
1580
1581           p0 = vlib_get_buffer (vm, pi0);
1582           p1 = vlib_get_buffer (vm, pi1);
1583
1584           ip0 = vlib_buffer_get_current (p0);
1585           ip1 = vlib_buffer_get_current (p1);
1586
1587           vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data;
1588           vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data;
1589
1590           sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
1591           sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX];
1592
1593           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0);
1594           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index1);
1595
1596           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0);
1597           fib_index0 =
1598             (vnet_buffer (p0)->sw_if_index[VLIB_TX] ==
1599              (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX];
1600
1601           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index1);
1602           fib_index1 =
1603             (vnet_buffer (p1)->sw_if_index[VLIB_TX] ==
1604              (u32) ~ 0) ? fib_index1 : vnet_buffer (p1)->sw_if_index[VLIB_TX];
1605
1606           mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
1607           mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
1608
1609           leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, &ip0->src_address);
1610           leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, &ip1->src_address);
1611
1612           /* Treat IP frag packets as "experimental" protocol for now
1613              until support of IP frag reassembly is implemented */
1614           proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol;
1615           proto1 = ip4_is_fragment (ip1) ? 0xfe : ip1->protocol;
1616
1617           if (head_of_feature_arc == 0)
1618             {
1619               error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL;
1620               goto skip_checks;
1621             }
1622
1623           is_udp0 = proto0 == IP_PROTOCOL_UDP;
1624           is_udp1 = proto1 == IP_PROTOCOL_UDP;
1625           is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
1626           is_tcp_udp1 = is_udp1 || proto1 == IP_PROTOCOL_TCP;
1627
1628           flags0 = p0->flags;
1629           flags1 = p1->flags;
1630
1631           good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1632           good_tcp_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1633
1634           udp0 = ip4_next_header (ip0);
1635           udp1 = ip4_next_header (ip1);
1636
1637           /* Don't verify UDP checksum for packets with explicit zero checksum. */
1638           good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1639           good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
1640
1641           leaf0 =
1642             ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
1643           leaf1 =
1644             ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1);
1645
1646           /* Verify UDP length. */
1647           ip_len0 = clib_net_to_host_u16 (ip0->length);
1648           ip_len1 = clib_net_to_host_u16 (ip1->length);
1649           udp_len0 = clib_net_to_host_u16 (udp0->length);
1650           udp_len1 = clib_net_to_host_u16 (udp1->length);
1651
1652           len_diff0 = ip_len0 - udp_len0;
1653           len_diff1 = ip_len1 - udp_len1;
1654
1655           len_diff0 = is_udp0 ? len_diff0 : 0;
1656           len_diff1 = is_udp1 ? len_diff1 : 0;
1657
1658           if (PREDICT_FALSE (!(is_tcp_udp0 & is_tcp_udp1
1659                                & good_tcp_udp0 & good_tcp_udp1)))
1660             {
1661               if (is_tcp_udp0)
1662                 {
1663                   if (is_tcp_udp0
1664                       && !(flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1665                     flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
1666                   good_tcp_udp0 =
1667                     (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1668                   good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1669                 }
1670               if (is_tcp_udp1)
1671                 {
1672                   if (is_tcp_udp1
1673                       && !(flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1674                     flags1 = ip4_tcp_udp_validate_checksum (vm, p1);
1675                   good_tcp_udp1 =
1676                     (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1677                   good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
1678                 }
1679             }
1680
1681           good_tcp_udp0 &= len_diff0 >= 0;
1682           good_tcp_udp1 &= len_diff1 >= 0;
1683
1684           leaf0 =
1685             ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1686           leaf1 =
1687             ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2);
1688
1689           error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL;
1690
1691           error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
1692           error1 = len_diff1 < 0 ? IP4_ERROR_UDP_LENGTH : error1;
1693
1694           ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1695           error0 = (is_tcp_udp0 && !good_tcp_udp0
1696                     ? IP4_ERROR_TCP_CHECKSUM + is_udp0 : error0);
1697           error1 = (is_tcp_udp1 && !good_tcp_udp1
1698                     ? IP4_ERROR_TCP_CHECKSUM + is_udp1 : error1);
1699
1700           leaf0 =
1701             ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1702           leaf1 =
1703             ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
1704
1705           vnet_buffer (p0)->ip.adj_index[VLIB_RX] = lbi0 =
1706             ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1707           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;
1708
1709           vnet_buffer (p1)->ip.adj_index[VLIB_RX] = lbi1 =
1710             ip4_fib_mtrie_leaf_get_adj_index (leaf1);
1711           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = lbi1;
1712
1713           lb0 = load_balance_get (lbi0);
1714           lb1 = load_balance_get (lbi1);
1715           dpo0 = load_balance_get_bucket_i (lb0, 0);
1716           dpo1 = load_balance_get_bucket_i (lb1, 0);
1717
1718           /*
1719            * Must have a route to source otherwise we drop the packet.
1720            * ip4 broadcasts are accepted, e.g. to make dhcp client work
1721            *
1722            * The checks are:
1723            *  - the source is a recieve => it's from us => bogus, do this
1724            *    first since it sets a different error code.
1725            *  - uRPF check for any route to source - accept if passes.
1726            *  - allow packets destined to the broadcast address from unknown sources
1727            */
1728           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1729                      dpo0->dpoi_type == DPO_RECEIVE) ?
1730                     IP4_ERROR_SPOOFED_LOCAL_PACKETS : error0);
1731           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1732                      !fib_urpf_check_size (lb0->lb_urpf) &&
1733                      ip0->dst_address.as_u32 != 0xFFFFFFFF)
1734                     ? IP4_ERROR_SRC_LOOKUP_MISS : error0);
1735           error1 = ((error1 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1736                      dpo1->dpoi_type == DPO_RECEIVE) ?
1737                     IP4_ERROR_SPOOFED_LOCAL_PACKETS : error1);
1738           error1 = ((error1 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1739                      !fib_urpf_check_size (lb1->lb_urpf) &&
1740                      ip1->dst_address.as_u32 != 0xFFFFFFFF)
1741                     ? IP4_ERROR_SRC_LOOKUP_MISS : error1);
1742
1743         skip_checks:
1744
1745           next0 = lm->local_next_by_ip_protocol[proto0];
1746           next1 = lm->local_next_by_ip_protocol[proto1];
1747
1748           next0 =
1749             error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
1750           next1 =
1751             error1 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next1;
1752
1753           p0->error = error0 ? error_node->errors[error0] : 0;
1754           p1->error = error1 ? error_node->errors[error1] : 0;
1755
1756           if (head_of_feature_arc)
1757             {
1758               if (PREDICT_TRUE (error0 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
1759                 vnet_feature_arc_start (arc_index, sw_if_index0, &next0, p0);
1760               if (PREDICT_TRUE (error1 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
1761                 vnet_feature_arc_start (arc_index, sw_if_index1, &next1, p1);
1762             }
1763
1764           vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
1765                                            n_left_to_next, pi0, pi1,
1766                                            next0, next1);
1767         }
1768
1769       while (n_left_from > 0 && n_left_to_next > 0)
1770         {
1771           vlib_buffer_t *p0;
1772           ip4_header_t *ip0;
1773           udp_header_t *udp0;
1774           ip4_fib_mtrie_t *mtrie0;
1775           ip4_fib_mtrie_leaf_t leaf0;
1776           u32 pi0, next0, ip_len0, udp_len0, flags0, fib_index0, lbi0;
1777           i32 len_diff0;
1778           u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
1779           load_balance_t *lb0;
1780           const dpo_id_t *dpo0;
1781           u32 sw_if_index0;
1782
1783           pi0 = to_next[0] = from[0];
1784           from += 1;
1785           n_left_from -= 1;
1786           to_next += 1;
1787           n_left_to_next -= 1;
1788
1789           next0 = IP_LOCAL_NEXT_DROP;
1790
1791           p0 = vlib_get_buffer (vm, pi0);
1792
1793           ip0 = vlib_buffer_get_current (p0);
1794
1795           vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data;
1796
1797           sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
1798
1799           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0);
1800
1801           fib_index0 =
1802             (vnet_buffer (p0)->sw_if_index[VLIB_TX] ==
1803              (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX];
1804
1805           mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
1806
1807           leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, &ip0->src_address);
1808
1809           /* Treat IP frag packets as "experimental" protocol for now
1810              until support of IP frag reassembly is implemented */
1811           proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol;
1812
1813           if (head_of_feature_arc == 0)
1814             {
1815               error0 = IP4_ERROR_UNKNOWN_PROTOCOL;
1816               goto skip_check;
1817             }
1818
1819           is_udp0 = proto0 == IP_PROTOCOL_UDP;
1820           is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
1821
1822           flags0 = p0->flags;
1823
1824           good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1825
1826           udp0 = ip4_next_header (ip0);
1827
1828           /* Don't verify UDP checksum for packets with explicit zero checksum. */
1829           good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1830
1831           leaf0 =
1832             ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
1833
1834           /* Verify UDP length. */
1835           ip_len0 = clib_net_to_host_u16 (ip0->length);
1836           udp_len0 = clib_net_to_host_u16 (udp0->length);
1837
1838           len_diff0 = ip_len0 - udp_len0;
1839
1840           len_diff0 = is_udp0 ? len_diff0 : 0;
1841
1842           if (PREDICT_FALSE (!(is_tcp_udp0 & good_tcp_udp0)))
1843             {
1844               if (is_tcp_udp0)
1845                 {
1846                   if (is_tcp_udp0
1847                       && !(flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1848                     flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
1849                   good_tcp_udp0 =
1850                     (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1851                   good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1852                 }
1853             }
1854
1855           good_tcp_udp0 &= len_diff0 >= 0;
1856
1857           leaf0 =
1858             ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1859
1860           error0 = IP4_ERROR_UNKNOWN_PROTOCOL;
1861
1862           error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
1863
1864           ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1865           error0 = (is_tcp_udp0 && !good_tcp_udp0
1866                     ? IP4_ERROR_TCP_CHECKSUM + is_udp0 : error0);
1867
1868           leaf0 =
1869             ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1870
1871           lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1872           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;
1873
1874           lb0 = load_balance_get (lbi0);
1875           dpo0 = load_balance_get_bucket_i (lb0, 0);
1876
1877           vnet_buffer (p0)->ip.adj_index[VLIB_TX] =
1878             vnet_buffer (p0)->ip.adj_index[VLIB_RX] = lbi0;
1879
1880           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1881                      dpo0->dpoi_type == DPO_RECEIVE) ?
1882                     IP4_ERROR_SPOOFED_LOCAL_PACKETS : error0);
1883           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1884                      !fib_urpf_check_size (lb0->lb_urpf) &&
1885                      ip0->dst_address.as_u32 != 0xFFFFFFFF)
1886                     ? IP4_ERROR_SRC_LOOKUP_MISS : error0);
1887
1888         skip_check:
1889
1890           next0 = lm->local_next_by_ip_protocol[proto0];
1891
1892           next0 =
1893             error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
1894
1895           p0->error = error0 ? error_node->errors[error0] : 0;
1896
1897           if (head_of_feature_arc)
1898             {
1899               if (PREDICT_TRUE (error0 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
1900                 vnet_feature_arc_start (arc_index, sw_if_index0, &next0, p0);
1901             }
1902
1903           vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
1904                                            n_left_to_next, pi0, next0);
1905
1906         }
1907
1908       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1909     }
1910
1911   return frame->n_vectors;
1912 }
1913
1914 static uword
1915 ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
1916 {
1917   return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ );
1918 }
1919
1920 /* *INDENT-OFF* */
1921 VLIB_REGISTER_NODE (ip4_local_node) =
1922 {
1923   .function = ip4_local,
1924   .name = "ip4-local",
1925   .vector_size = sizeof (u32),
1926   .format_trace = format_ip4_forward_next_trace,
1927   .n_next_nodes = IP_LOCAL_N_NEXT,
1928   .next_nodes =
1929   {
1930     [IP_LOCAL_NEXT_DROP] = "error-drop",
1931     [IP_LOCAL_NEXT_PUNT] = "error-punt",
1932     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
1933     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",},
1934 };
1935 /* *INDENT-ON* */
1936
1937 VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local);
1938
1939 static uword
1940 ip4_local_end_of_arc (vlib_main_t * vm,
1941                       vlib_node_runtime_t * node, vlib_frame_t * frame)
1942 {
1943   return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ );
1944 }
1945
1946 /* *INDENT-OFF* */
1947 VLIB_REGISTER_NODE (ip4_local_end_of_arc_node,static) = {
1948   .function = ip4_local_end_of_arc,
1949   .name = "ip4-local-end-of-arc",
1950   .vector_size = sizeof (u32),
1951
1952   .format_trace = format_ip4_forward_next_trace,
1953   .sibling_of = "ip4-local",
1954 };
1955
1956 VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_end_of_arc_node, ip4_local_end_of_arc)
1957
1958 VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = {
1959   .arc_name = "ip4-local",
1960   .node_name = "ip4-local-end-of-arc",
1961   .runs_before = 0, /* not before any other features */
1962 };
1963 /* *INDENT-ON* */
1964
1965 void
1966 ip4_register_protocol (u32 protocol, u32 node_index)
1967 {
1968   vlib_main_t *vm = vlib_get_main ();
1969   ip4_main_t *im = &ip4_main;
1970   ip_lookup_main_t *lm = &im->lookup_main;
1971
1972   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1973   lm->local_next_by_ip_protocol[protocol] =
1974     vlib_node_add_next (vm, ip4_local_node.index, node_index);
1975 }
1976
1977 static clib_error_t *
1978 show_ip_local_command_fn (vlib_main_t * vm,
1979                           unformat_input_t * input, vlib_cli_command_t * cmd)
1980 {
1981   ip4_main_t *im = &ip4_main;
1982   ip_lookup_main_t *lm = &im->lookup_main;
1983   int i;
1984
1985   vlib_cli_output (vm, "Protocols handled by ip4_local");
1986   for (i = 0; i < ARRAY_LEN (lm->local_next_by_ip_protocol); i++)
1987     {
1988       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
1989         vlib_cli_output (vm, "%d", i);
1990     }
1991   return 0;
1992 }
1993
1994
1995
1996 /*?
1997  * Display the set of protocols handled by the local IPv4 stack.
1998  *
1999  * @cliexpar
2000  * Example of how to display local protocol table:
2001  * @cliexstart{show ip local}
2002  * Protocols handled by ip4_local
2003  * 1
2004  * 17
2005  * 47
2006  * @cliexend
2007 ?*/
2008 /* *INDENT-OFF* */
2009 VLIB_CLI_COMMAND (show_ip_local, static) =
2010 {
2011   .path = "show ip local",
2012   .function = show_ip_local_command_fn,
2013   .short_help = "show ip local",
2014 };
2015 /* *INDENT-ON* */
2016
2017 always_inline uword
2018 ip4_arp_inline (vlib_main_t * vm,
2019                 vlib_node_runtime_t * node,
2020                 vlib_frame_t * frame, int is_glean)
2021 {
2022   vnet_main_t *vnm = vnet_get_main ();
2023   ip4_main_t *im = &ip4_main;
2024   ip_lookup_main_t *lm = &im->lookup_main;
2025   u32 *from, *to_next_drop;
2026   uword n_left_from, n_left_to_next_drop, next_index;
2027   static f64 time_last_seed_change = -1e100;
2028   static u32 hash_seeds[3];
2029   static uword hash_bitmap[256 / BITS (uword)];
2030   f64 time_now;
2031
2032   if (node->flags & VLIB_NODE_FLAG_TRACE)
2033     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2034
2035   time_now = vlib_time_now (vm);
2036   if (time_now - time_last_seed_change > 1e-3)
2037     {
2038       uword i;
2039       u32 *r = clib_random_buffer_get_data (&vm->random_buffer,
2040                                             sizeof (hash_seeds));
2041       for (i = 0; i < ARRAY_LEN (hash_seeds); i++)
2042         hash_seeds[i] = r[i];
2043
2044       /* Mark all hash keys as been no-seen before. */
2045       for (i = 0; i < ARRAY_LEN (hash_bitmap); i++)
2046         hash_bitmap[i] = 0;
2047
2048       time_last_seed_change = time_now;
2049     }
2050
2051   from = vlib_frame_vector_args (frame);
2052   n_left_from = frame->n_vectors;
2053   next_index = node->cached_next_index;
2054   if (next_index == IP4_ARP_NEXT_DROP)
2055     next_index = IP4_ARP_N_NEXT;        /* point to first interface */
2056
2057   while (n_left_from > 0)
2058     {
2059       vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
2060                            to_next_drop, n_left_to_next_drop);
2061
2062       while (n_left_from > 0 && n_left_to_next_drop > 0)
2063         {
2064           u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
2065           ip_adjacency_t *adj0;
2066           vlib_buffer_t *p0;
2067           ip4_header_t *ip0;
2068           uword bm0;
2069
2070           pi0 = from[0];
2071
2072           p0 = vlib_get_buffer (vm, pi0);
2073
2074           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
2075           adj0 = ip_get_adjacency (lm, adj_index0);
2076           ip0 = vlib_buffer_get_current (p0);
2077
2078           a0 = hash_seeds[0];
2079           b0 = hash_seeds[1];
2080           c0 = hash_seeds[2];
2081
2082           sw_if_index0 = adj0->rewrite_header.sw_if_index;
2083           vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
2084
2085           if (is_glean)
2086             {
2087               /*
2088                * this is the Glean case, so we are ARPing for the
2089                * packet's destination
2090                */
2091               a0 ^= ip0->dst_address.data_u32;
2092             }
2093           else
2094             {
2095               a0 ^= adj0->sub_type.nbr.next_hop.ip4.data_u32;
2096             }
2097           b0 ^= sw_if_index0;
2098
2099           hash_v3_finalize32 (a0, b0, c0);
2100
2101           c0 &= BITS (hash_bitmap) - 1;
2102           c0 = c0 / BITS (uword);
2103           m0 = (uword) 1 << (c0 % BITS (uword));
2104
2105           bm0 = hash_bitmap[c0];
2106           drop0 = (bm0 & m0) != 0;
2107
2108           /* Mark it as seen. */
2109           hash_bitmap[c0] = bm0 | m0;
2110
2111           from += 1;
2112           n_left_from -= 1;
2113           to_next_drop[0] = pi0;
2114           to_next_drop += 1;
2115           n_left_to_next_drop -= 1;
2116
2117           p0->error =
2118             node->errors[drop0 ? IP4_ARP_ERROR_DROP :
2119                          IP4_ARP_ERROR_REQUEST_SENT];
2120
2121           /*
2122            * the adj has been updated to a rewrite but the node the DPO that got
2123            * us here hasn't - yet. no big deal. we'll drop while we wait.
2124            */
2125           if (IP_LOOKUP_NEXT_REWRITE == adj0->lookup_next_index)
2126             continue;
2127
2128           if (drop0)
2129             continue;
2130
2131           /*
2132            * Can happen if the control-plane is programming tables
2133            * with traffic flowing; at least that's today's lame excuse.
2134            */
2135           if ((is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN)
2136               || (!is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP))
2137             {
2138               p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
2139             }
2140           else
2141             /* Send ARP request. */
2142             {
2143               u32 bi0 = 0;
2144               vlib_buffer_t *b0;
2145               ethernet_arp_header_t *h0;
2146               vnet_hw_interface_t *hw_if0;
2147
2148               h0 =
2149                 vlib_packet_template_get_packet (vm,
2150                                                  &im->ip4_arp_request_packet_template,
2151                                                  &bi0);
2152
2153               /* Add rewrite/encap string for ARP packet. */
2154               vnet_rewrite_one_header (adj0[0], h0,
2155                                        sizeof (ethernet_header_t));
2156
2157               hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
2158
2159               /* Src ethernet address in ARP header. */
2160               clib_memcpy (h0->ip4_over_ethernet[0].ethernet,
2161                            hw_if0->hw_address,
2162                            sizeof (h0->ip4_over_ethernet[0].ethernet));
2163
2164               if (is_glean)
2165                 {
2166                   /* The interface's source address is stashed in the Glean Adj */
2167                   h0->ip4_over_ethernet[0].ip4 =
2168                     adj0->sub_type.glean.receive_addr.ip4;
2169
2170                   /* Copy in destination address we are requesting. This is the
2171                    * glean case, so it's the packet's destination.*/
2172                   h0->ip4_over_ethernet[1].ip4.data_u32 =
2173                     ip0->dst_address.data_u32;
2174                 }
2175               else
2176                 {
2177                   /* Src IP address in ARP header. */
2178                   if (ip4_src_address_for_packet (lm, sw_if_index0,
2179                                                   &h0->
2180                                                   ip4_over_ethernet[0].ip4))
2181                     {
2182                       /* No source address available */
2183                       p0->error =
2184                         node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
2185                       vlib_buffer_free (vm, &bi0, 1);
2186                       continue;
2187                     }
2188
2189                   /* Copy in destination address we are requesting from the
2190                      incomplete adj */
2191                   h0->ip4_over_ethernet[1].ip4.data_u32 =
2192                     adj0->sub_type.nbr.next_hop.ip4.as_u32;
2193                 }
2194
2195               vlib_buffer_copy_trace_flag (vm, p0, bi0);
2196               b0 = vlib_get_buffer (vm, bi0);
2197               vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
2198
2199               vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
2200
2201               vlib_set_next_frame_buffer (vm, node,
2202                                           adj0->rewrite_header.next_index,
2203                                           bi0);
2204             }
2205         }
2206
2207       vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
2208     }
2209
2210   return frame->n_vectors;
2211 }
2212
2213 static uword
2214 ip4_arp (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
2215 {
2216   return (ip4_arp_inline (vm, node, frame, 0));
2217 }
2218
2219 static uword
2220 ip4_glean (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
2221 {
2222   return (ip4_arp_inline (vm, node, frame, 1));
2223 }
2224
2225 static char *ip4_arp_error_strings[] = {
2226   [IP4_ARP_ERROR_DROP] = "address overflow drops",
2227   [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
2228   [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
2229   [IP4_ARP_ERROR_REPLICATE_DROP] = "ARP replication completed",
2230   [IP4_ARP_ERROR_REPLICATE_FAIL] = "ARP replication failed",
2231   [IP4_ARP_ERROR_NO_SOURCE_ADDRESS] = "no source address for ARP request",
2232 };
2233
2234 VLIB_REGISTER_NODE (ip4_arp_node) =
2235 {
2236   .function = ip4_arp,.name = "ip4-arp",.vector_size =
2237     sizeof (u32),.format_trace = format_ip4_forward_next_trace,.n_errors =
2238     ARRAY_LEN (ip4_arp_error_strings),.error_strings =
2239     ip4_arp_error_strings,.n_next_nodes = IP4_ARP_N_NEXT,.next_nodes =
2240   {
2241   [IP4_ARP_NEXT_DROP] = "error-drop",}
2242 ,};
2243
2244 VLIB_REGISTER_NODE (ip4_glean_node) =
2245 {
2246   .function = ip4_glean,.name = "ip4-glean",.vector_size =
2247     sizeof (u32),.format_trace = format_ip4_forward_next_trace,.n_errors =
2248     ARRAY_LEN (ip4_arp_error_strings),.error_strings =
2249     ip4_arp_error_strings,.n_next_nodes = IP4_ARP_N_NEXT,.next_nodes =
2250   {
2251   [IP4_ARP_NEXT_DROP] = "error-drop",}
2252 ,};
2253
2254 #define foreach_notrace_ip4_arp_error           \
2255 _(DROP)                                         \
2256 _(REQUEST_SENT)                                 \
2257 _(REPLICATE_DROP)                               \
2258 _(REPLICATE_FAIL)
2259
2260 clib_error_t *
2261 arp_notrace_init (vlib_main_t * vm)
2262 {
2263   vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, ip4_arp_node.index);
2264
2265   /* don't trace ARP request packets */
2266 #define _(a)                                    \
2267     vnet_pcap_drop_trace_filter_add_del         \
2268         (rt->errors[IP4_ARP_ERROR_##a],         \
2269          1 /* is_add */);
2270   foreach_notrace_ip4_arp_error;
2271 #undef _
2272   return 0;
2273 }
2274
2275 VLIB_INIT_FUNCTION (arp_notrace_init);
2276
2277
2278 /* Send an ARP request to see if given destination is reachable on given interface. */
2279 clib_error_t *
2280 ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index)
2281 {
2282   vnet_main_t *vnm = vnet_get_main ();
2283   ip4_main_t *im = &ip4_main;
2284   ethernet_arp_header_t *h;
2285   ip4_address_t *src;
2286   ip_interface_address_t *ia;
2287   ip_adjacency_t *adj;
2288   vnet_hw_interface_t *hi;
2289   vnet_sw_interface_t *si;
2290   vlib_buffer_t *b;
2291   u32 bi = 0;
2292
2293   si = vnet_get_sw_interface (vnm, sw_if_index);
2294
2295   if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
2296     {
2297       return clib_error_return (0, "%U: interface %U down",
2298                                 format_ip4_address, dst,
2299                                 format_vnet_sw_if_index_name, vnm,
2300                                 sw_if_index);
2301     }
2302
2303   src =
2304     ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
2305   if (!src)
2306     {
2307       vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
2308       return clib_error_return
2309         (0,
2310          "no matching interface address for destination %U (interface %U)",
2311          format_ip4_address, dst, format_vnet_sw_if_index_name, vnm,
2312          sw_if_index);
2313     }
2314
2315   adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index);
2316
2317   h =
2318     vlib_packet_template_get_packet (vm,
2319                                      &im->ip4_arp_request_packet_template,
2320                                      &bi);
2321
2322   hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
2323
2324   clib_memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address,
2325                sizeof (h->ip4_over_ethernet[0].ethernet));
2326
2327   h->ip4_over_ethernet[0].ip4 = src[0];
2328   h->ip4_over_ethernet[1].ip4 = dst[0];
2329
2330   b = vlib_get_buffer (vm, bi);
2331   vnet_buffer (b)->sw_if_index[VLIB_RX] =
2332     vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
2333
2334   /* Add encapsulation string for software interface (e.g. ethernet header). */
2335   vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
2336   vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
2337
2338   {
2339     vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index);
2340     u32 *to_next = vlib_frame_vector_args (f);
2341     to_next[0] = bi;
2342     f->n_vectors = 1;
2343     vlib_put_frame_to_node (vm, hi->output_node_index, f);
2344   }
2345
2346   return /* no error */ 0;
2347 }
2348
/* Next-node slots used by the ip4 rewrite worker. */
typedef enum
{
  /* Initial next for every packet in the rewrite worker. */
  IP4_REWRITE_NEXT_DROP = 0,
  /* Selected when the TTL expires, after the ICMP error is set up. */
  IP4_REWRITE_NEXT_ICMP_ERROR = 1,
} ip4_rewrite_next_t;
2354
2355 always_inline uword
2356 ip4_rewrite_inline (vlib_main_t * vm,
2357                     vlib_node_runtime_t * node,
2358                     vlib_frame_t * frame,
2359                     int do_counters, int is_midchain, int is_mcast)
2360 {
2361   ip_lookup_main_t *lm = &ip4_main.lookup_main;
2362   u32 *from = vlib_frame_vector_args (frame);
2363   u32 n_left_from, n_left_to_next, *to_next, next_index;
2364   vlib_node_runtime_t *error_node =
2365     vlib_node_get_runtime (vm, ip4_input_node.index);
2366
2367   n_left_from = frame->n_vectors;
2368   next_index = node->cached_next_index;
2369   u32 cpu_index = os_get_cpu_number ();
2370
2371   while (n_left_from > 0)
2372     {
2373       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2374
2375       while (n_left_from >= 4 && n_left_to_next >= 2)
2376         {
2377           ip_adjacency_t *adj0, *adj1;
2378           vlib_buffer_t *p0, *p1;
2379           ip4_header_t *ip0, *ip1;
2380           u32 pi0, rw_len0, next0, error0, checksum0, adj_index0;
2381           u32 pi1, rw_len1, next1, error1, checksum1, adj_index1;
2382           u32 tx_sw_if_index0, tx_sw_if_index1;
2383
2384           /* Prefetch next iteration. */
2385           {
2386             vlib_buffer_t *p2, *p3;
2387
2388             p2 = vlib_get_buffer (vm, from[2]);
2389             p3 = vlib_get_buffer (vm, from[3]);
2390
2391             vlib_prefetch_buffer_header (p2, STORE);
2392             vlib_prefetch_buffer_header (p3, STORE);
2393
2394             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
2395             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
2396           }
2397
2398           pi0 = to_next[0] = from[0];
2399           pi1 = to_next[1] = from[1];
2400
2401           from += 2;
2402           n_left_from -= 2;
2403           to_next += 2;
2404           n_left_to_next -= 2;
2405
2406           p0 = vlib_get_buffer (vm, pi0);
2407           p1 = vlib_get_buffer (vm, pi1);
2408
2409           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
2410           adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX];
2411
2412           /*
2413            * pre-fetch the per-adjacency counters
2414            */
2415           if (do_counters)
2416             {
2417               vlib_prefetch_combined_counter (&adjacency_counters,
2418                                               cpu_index, adj_index0);
2419               vlib_prefetch_combined_counter (&adjacency_counters,
2420                                               cpu_index, adj_index1);
2421             }
2422
2423           ip0 = vlib_buffer_get_current (p0);
2424           ip1 = vlib_buffer_get_current (p1);
2425
2426           error0 = error1 = IP4_ERROR_NONE;
2427           next0 = next1 = IP4_REWRITE_NEXT_DROP;
2428
2429           /* Decrement TTL & update checksum.
2430              Works either endian, so no need for byte swap. */
2431           if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
2432             {
2433               i32 ttl0 = ip0->ttl;
2434
2435               /* Input node should have reject packets with ttl 0. */
2436               ASSERT (ip0->ttl > 0);
2437
2438               checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
2439               checksum0 += checksum0 >= 0xffff;
2440
2441               ip0->checksum = checksum0;
2442               ttl0 -= 1;
2443               ip0->ttl = ttl0;
2444
2445               /*
2446                * If the ttl drops below 1 when forwarding, generate
2447                * an ICMP response.
2448                */
2449               if (PREDICT_FALSE (ttl0 <= 0))
2450                 {
2451                   error0 = IP4_ERROR_TIME_EXPIRED;
2452                   vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
2453                   icmp4_error_set_vnet_buffer (p0, ICMP4_time_exceeded,
2454                                                ICMP4_time_exceeded_ttl_exceeded_in_transit,
2455                                                0);
2456                   next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
2457                 }
2458
2459               /* Verify checksum. */
2460               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2461             }
2462           else
2463             {
2464               p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
2465             }
2466           if (PREDICT_TRUE (!(p1->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
2467             {
2468               i32 ttl1 = ip1->ttl;
2469
2470               /* Input node should have reject packets with ttl 0. */
2471               ASSERT (ip1->ttl > 0);
2472
2473               checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100);
2474               checksum1 += checksum1 >= 0xffff;
2475
2476               ip1->checksum = checksum1;
2477               ttl1 -= 1;
2478               ip1->ttl = ttl1;
2479
2480               /*
2481                * If the ttl drops below 1 when forwarding, generate
2482                * an ICMP response.
2483                */
2484               if (PREDICT_FALSE (ttl1 <= 0))
2485                 {
2486                   error1 = IP4_ERROR_TIME_EXPIRED;
2487                   vnet_buffer (p1)->sw_if_index[VLIB_TX] = (u32) ~ 0;
2488                   icmp4_error_set_vnet_buffer (p1, ICMP4_time_exceeded,
2489                                                ICMP4_time_exceeded_ttl_exceeded_in_transit,
2490                                                0);
2491                   next1 = IP4_REWRITE_NEXT_ICMP_ERROR;
2492                 }
2493
2494               /* Verify checksum. */
2495               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2496               ASSERT (ip1->checksum == ip4_header_checksum (ip1));
2497             }
2498           else
2499             {
2500               p1->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
2501             }
2502
2503           /* Rewrite packet header and updates lengths. */
2504           adj0 = ip_get_adjacency (lm, adj_index0);
2505           adj1 = ip_get_adjacency (lm, adj_index1);
2506
2507           /* Worth pipelining. No guarantee that adj0,1 are hot... */
2508           rw_len0 = adj0[0].rewrite_header.data_bytes;
2509           rw_len1 = adj1[0].rewrite_header.data_bytes;
2510           vnet_buffer (p0)->ip.save_rewrite_length = rw_len0;
2511           vnet_buffer (p1)->ip.save_rewrite_length = rw_len1;
2512
2513           /* Check MTU of outgoing interface. */
2514           error0 =
2515             (vlib_buffer_length_in_chain (vm, p0) >
2516              adj0[0].
2517              rewrite_header.max_l3_packet_bytes ? IP4_ERROR_MTU_EXCEEDED :
2518              error0);
2519           error1 =
2520             (vlib_buffer_length_in_chain (vm, p1) >
2521              adj1[0].
2522              rewrite_header.max_l3_packet_bytes ? IP4_ERROR_MTU_EXCEEDED :
2523              error1);
2524
2525           /* Don't adjust the buffer for ttl issue; icmp-error node wants
2526            * to see the IP headerr */
2527           if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2528             {
2529               next0 = adj0[0].rewrite_header.next_index;
2530               p0->current_data -= rw_len0;
2531               p0->current_length += rw_len0;
2532               tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2533               vnet_buffer (p0)->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2534
2535               if (PREDICT_FALSE
2536                   (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2537                 vnet_feature_arc_start (lm->output_feature_arc_index,
2538                                         tx_sw_if_index0, &next0, p0);
2539             }
2540           if (PREDICT_TRUE (error1 == IP4_ERROR_NONE))
2541             {
2542               next1 = adj1[0].rewrite_header.next_index;
2543               p1->current_data -= rw_len1;
2544               p1->current_length += rw_len1;
2545
2546               tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
2547               vnet_buffer (p1)->sw_if_index[VLIB_TX] = tx_sw_if_index1;
2548
2549               if (PREDICT_FALSE
2550                   (adj1[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2551                 vnet_feature_arc_start (lm->output_feature_arc_index,
2552                                         tx_sw_if_index1, &next1, p1);
2553             }
2554
2555           /* Guess we are only writing on simple Ethernet header. */
2556           vnet_rewrite_two_headers (adj0[0], adj1[0],
2557                                     ip0, ip1, sizeof (ethernet_header_t));
2558
2559           /*
2560            * Bump the per-adjacency counters
2561            */
2562           if (do_counters)
2563             {
2564               vlib_increment_combined_counter
2565                 (&adjacency_counters,
2566                  cpu_index,
2567                  adj_index0, 1,
2568                  vlib_buffer_length_in_chain (vm, p0) + rw_len0);
2569
2570               vlib_increment_combined_counter
2571                 (&adjacency_counters,
2572                  cpu_index,
2573                  adj_index1, 1,
2574                  vlib_buffer_length_in_chain (vm, p1) + rw_len1);
2575             }
2576
2577           if (is_midchain)
2578             {
2579               adj0->sub_type.midchain.fixup_func (vm, adj0, p0);
2580               adj1->sub_type.midchain.fixup_func (vm, adj1, p1);
2581             }
2582           if (is_mcast)
2583             {
2584               /*
2585                * copy bytes from the IP address into the MAC rewrite
2586                */
2587               vnet_fixup_one_header (adj0[0], &ip0->dst_address, ip0);
2588               vnet_fixup_one_header (adj1[0], &ip1->dst_address, ip1);
2589             }
2590
2591           vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
2592                                            to_next, n_left_to_next,
2593                                            pi0, pi1, next0, next1);
2594         }
2595
2596       while (n_left_from > 0 && n_left_to_next > 0)
2597         {
2598           ip_adjacency_t *adj0;
2599           vlib_buffer_t *p0;
2600           ip4_header_t *ip0;
2601           u32 pi0, rw_len0, adj_index0, next0, error0, checksum0;
2602           u32 tx_sw_if_index0;
2603
2604           pi0 = to_next[0] = from[0];
2605
2606           p0 = vlib_get_buffer (vm, pi0);
2607
2608           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
2609
2610           adj0 = ip_get_adjacency (lm, adj_index0);
2611
2612           ip0 = vlib_buffer_get_current (p0);
2613
2614           error0 = IP4_ERROR_NONE;
2615           next0 = IP4_REWRITE_NEXT_DROP;        /* drop on error */
2616
2617           /* Decrement TTL & update checksum. */
2618           if (PREDICT_TRUE (!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
2619             {
2620               i32 ttl0 = ip0->ttl;
2621
2622               checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
2623
2624               checksum0 += checksum0 >= 0xffff;
2625
2626               ip0->checksum = checksum0;
2627
2628               ASSERT (ip0->ttl > 0);
2629
2630               ttl0 -= 1;
2631
2632               ip0->ttl = ttl0;
2633
2634               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2635
2636               if (PREDICT_FALSE (ttl0 <= 0))
2637                 {
2638                   /*
2639                    * If the ttl drops below 1 when forwarding, generate
2640                    * an ICMP response.
2641                    */
2642                   error0 = IP4_ERROR_TIME_EXPIRED;
2643                   next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
2644                   vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
2645                   icmp4_error_set_vnet_buffer (p0, ICMP4_time_exceeded,
2646                                                ICMP4_time_exceeded_ttl_exceeded_in_transit,
2647                                                0);
2648                 }
2649             }
2650           else
2651             {
2652               p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
2653             }
2654
2655           if (do_counters)
2656             vlib_prefetch_combined_counter (&adjacency_counters,
2657                                             cpu_index, adj_index0);
2658
2659           /* Guess we are only writing on simple Ethernet header. */
2660           vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
2661           if (is_mcast)
2662             {
2663               /*
2664                * copy bytes from the IP address into the MAC rewrite
2665                */
2666               vnet_fixup_one_header (adj0[0], &ip0->dst_address, ip0);
2667             }
2668
2669           /* Update packet buffer attributes/set output interface. */
2670           rw_len0 = adj0[0].rewrite_header.data_bytes;
2671           vnet_buffer (p0)->ip.save_rewrite_length = rw_len0;
2672
2673           if (do_counters)
2674             vlib_increment_combined_counter
2675               (&adjacency_counters,
2676                cpu_index, adj_index0, 1,
2677                vlib_buffer_length_in_chain (vm, p0) + rw_len0);
2678
2679           /* Check MTU of outgoing interface. */
2680           error0 = (vlib_buffer_length_in_chain (vm, p0)
2681                     > adj0[0].rewrite_header.max_l3_packet_bytes
2682                     ? IP4_ERROR_MTU_EXCEEDED : error0);
2683
2684           p0->error = error_node->errors[error0];
2685
2686           /* Don't adjust the buffer for ttl issue; icmp-error node wants
2687            * to see the IP headerr */
2688           if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2689             {
2690               p0->current_data -= rw_len0;
2691               p0->current_length += rw_len0;
2692               tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2693
2694               vnet_buffer (p0)->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2695               next0 = adj0[0].rewrite_header.next_index;
2696
2697               if (is_midchain)
2698                 {
2699                   adj0->sub_type.midchain.fixup_func (vm, adj0, p0);
2700                 }
2701
2702               if (PREDICT_FALSE
2703                   (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2704                 vnet_feature_arc_start (lm->output_feature_arc_index,
2705                                         tx_sw_if_index0, &next0, p0);
2706
2707             }
2708
2709           from += 1;
2710           n_left_from -= 1;
2711           to_next += 1;
2712           n_left_to_next -= 1;
2713
2714           vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
2715                                            to_next, n_left_to_next,
2716                                            pi0, next0);
2717         }
2718
2719       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2720     }
2721
2722   /* Need to do trace after rewrites to pick up new packet data. */
2723   if (node->flags & VLIB_NODE_FLAG_TRACE)
2724     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2725
2726   return frame->n_vectors;
2727 }
2728
2729
2730 /** @brief IPv4 rewrite node.
2731     @node ip4-rewrite
2732
2733     This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
2734     header checksum, fetch the ip adjacency, check the outbound mtu,
2735     apply the adjacency rewrite, and send pkts to the adjacency
2736     rewrite header's next_index.
2737
2738     @param vm vlib_main_t corresponding to the current thread
2739     @param node vlib_node_runtime_t
2740     @param frame vlib_frame_t whose contents should be dispatched
2741
2742     @par Graph mechanics: buffer metadata, next index usage
2743
2744     @em Uses:
2745     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
2746         - the rewrite adjacency index
2747     - <code>adj->lookup_next_index</code>
2748         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2749           the packet will be dropped.
2750     - <code>adj->rewrite_header</code>
2751         - Rewrite string length, rewrite string, next_index
2752
2753     @em Sets:
2754     - <code>b->current_data, b->current_length</code>
2755         - Updated net of applying the rewrite string
2756
2757     <em>Next Indices:</em>
2758     - <code> adj->rewrite_header.next_index </code>
2759       or @c error-drop
2760 */
2761 static uword
2762 ip4_rewrite (vlib_main_t * vm,
2763              vlib_node_runtime_t * node, vlib_frame_t * frame)
2764 {
2765   if (adj_are_counters_enabled ())
2766     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2767   else
2768     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2769 }
2770
2771 static uword
2772 ip4_midchain (vlib_main_t * vm,
2773               vlib_node_runtime_t * node, vlib_frame_t * frame)
2774 {
2775   if (adj_are_counters_enabled ())
2776     return ip4_rewrite_inline (vm, node, frame, 1, 1, 0);
2777   else
2778     return ip4_rewrite_inline (vm, node, frame, 0, 1, 0);
2779 }
2780
2781 static uword
2782 ip4_rewrite_mcast (vlib_main_t * vm,
2783                    vlib_node_runtime_t * node, vlib_frame_t * frame)
2784 {
2785   if (adj_are_counters_enabled ())
2786     return ip4_rewrite_inline (vm, node, frame, 1, 0, 1);
2787   else
2788     return ip4_rewrite_inline (vm, node, frame, 0, 0, 1);
2789 }
2790
2791 /* *INDENT-OFF* */
2792 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2793   .function = ip4_rewrite,
2794   .name = "ip4-rewrite",
2795   .vector_size = sizeof (u32),
2796
2797   .format_trace = format_ip4_rewrite_trace,
2798
2799   .n_next_nodes = 2,
2800   .next_nodes = {
2801     [IP4_REWRITE_NEXT_DROP] = "error-drop",
2802     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2803   },
2804 };
2805 VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_node, ip4_rewrite)
2806
2807 VLIB_REGISTER_NODE (ip4_rewrite_mcast_node) = {
2808   .function = ip4_rewrite_mcast,
2809   .name = "ip4-rewrite-mcast",
2810   .vector_size = sizeof (u32),
2811
2812   .format_trace = format_ip4_rewrite_trace,
2813   .sibling_of = "ip4-rewrite",
2814 };
2815 VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_mcast_node, ip4_rewrite_mcast)
2816
2817 VLIB_REGISTER_NODE (ip4_midchain_node) = {
2818   .function = ip4_midchain,
2819   .name = "ip4-midchain",
2820   .vector_size = sizeof (u32),
2821   .format_trace = format_ip4_forward_next_trace,
2822   .sibling_of =  "ip4-rewrite",
2823 };
2824 VLIB_NODE_FUNCTION_MULTIARCH (ip4_midchain_node, ip4_midchain);
2825 /* *INDENT-ON* */
2826
2827 static clib_error_t *
2828 add_del_interface_table (vlib_main_t * vm,
2829                          unformat_input_t * input, vlib_cli_command_t * cmd)
2830 {
2831   vnet_main_t *vnm = vnet_get_main ();
2832   ip_interface_address_t *ia;
2833   clib_error_t *error = 0;
2834   u32 sw_if_index, table_id;
2835
2836   sw_if_index = ~0;
2837
2838   if (!unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
2839     {
2840       error = clib_error_return (0, "unknown interface `%U'",
2841                                  format_unformat_error, input);
2842       goto done;
2843     }
2844
2845   if (unformat (input, "%d", &table_id))
2846     ;
2847   else
2848     {
2849       error = clib_error_return (0, "expected table id `%U'",
2850                                  format_unformat_error, input);
2851       goto done;
2852     }
2853
2854   /*
2855    * If the interface already has in IP address, then a change int
2856    * VRF is not allowed. The IP address applied must first be removed.
2857    * We do not do that automatically here, since VPP has no knowledge
2858    * of whether thoses subnets are valid in the destination VRF.
2859    */
2860   /* *INDENT-OFF* */
2861   foreach_ip_interface_address (&ip4_main.lookup_main,
2862                                 ia, sw_if_index,
2863                                 1 /* honor unnumbered */,
2864   ({
2865       ip4_address_t * a;
2866
2867       a = ip_interface_address_get_address (&ip4_main.lookup_main, ia);
2868       error = clib_error_return (0, "interface %U has address %U",
2869                                  format_vnet_sw_if_index_name, vnm,
2870                                  sw_if_index,
2871                                  format_ip4_address, a);
2872       goto done;
2873    }));
2874    /* *INDENT-ON* */
2875
2876 {
2877   ip4_main_t *im = &ip4_main;
2878   u32 fib_index;
2879
2880   fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, table_id);
2881
2882   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
2883   im->fib_index_by_sw_if_index[sw_if_index] = fib_index;
2884
2885   fib_index = mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, table_id);
2886   vec_validate (im->mfib_index_by_sw_if_index, sw_if_index);
2887   im->mfib_index_by_sw_if_index[sw_if_index] = fib_index;
2888 }
2889
2890 done:
2891 return error;
2892 }
2893
2894 /*?
2895  * Place the indicated interface into the supplied IPv4 FIB table (also known
2896  * as a VRF). If the FIB table does not exist, this command creates it. To
2897  * display the current IPv4 FIB table, use the command '<em>show ip fib</em>'.
2898  * FIB table will only be displayed if a route has been added to the table, or
2899  * an IP Address is assigned to an interface in the table (which adds a route
2900  * automatically).
2901  *
2902  * @note IP addresses added after setting the interface IP table are added to
2903  * the indicated FIB table. If an IP address is added prior to changing the
2904  * table then this is an error. The control plane must remove these addresses
2905  * first and then change the table. VPP will not automatically move the
2906  * addresses from the old to the new table as it does not know the validity
2907  * of such a change.
2908  *
2909  * @cliexpar
2910  * Example of how to add an interface to an IPv4 FIB table (where 2 is the table-id):
2911  * @cliexcmd{set interface ip table GigabitEthernet2/0/0 2}
2912  ?*/
2913 /* *INDENT-OFF* */
2914 VLIB_CLI_COMMAND (set_interface_ip_table_command, static) =
2915 {
2916   .path = "set interface ip table",
2917   .function = add_del_interface_table,
2918   .short_help = "set interface ip table <interface> <table-id>",
2919 };
2920 /* *INDENT-ON* */
2921
2922 int
2923 ip4_lookup_validate (ip4_address_t * a, u32 fib_index0)
2924 {
2925   ip4_fib_mtrie_t *mtrie0;
2926   ip4_fib_mtrie_leaf_t leaf0;
2927   u32 lbi0;
2928
2929   mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
2930
2931   leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, a);
2932   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 1);
2933   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
2934   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);
2935
2936   lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
2937
2938   return lbi0 == ip4_fib_table_lookup_lb (ip4_fib_get (fib_index0), a);
2939 }
2940
2941 static clib_error_t *
2942 test_lookup_command_fn (vlib_main_t * vm,
2943                         unformat_input_t * input, vlib_cli_command_t * cmd)
2944 {
2945   ip4_fib_t *fib;
2946   u32 table_id = 0;
2947   f64 count = 1;
2948   u32 n;
2949   int i;
2950   ip4_address_t ip4_base_address;
2951   u64 errors = 0;
2952
2953   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
2954     {
2955       if (unformat (input, "table %d", &table_id))
2956         {
2957           /* Make sure the entry exists. */
2958           fib = ip4_fib_get (table_id);
2959           if ((fib) && (fib->index != table_id))
2960             return clib_error_return (0, "<fib-index> %d does not exist",
2961                                       table_id);
2962         }
2963       else if (unformat (input, "count %f", &count))
2964         ;
2965
2966       else if (unformat (input, "%U",
2967                          unformat_ip4_address, &ip4_base_address))
2968         ;
2969       else
2970         return clib_error_return (0, "unknown input `%U'",
2971                                   format_unformat_error, input);
2972     }
2973
2974   n = count;
2975
2976   for (i = 0; i < n; i++)
2977     {
2978       if (!ip4_lookup_validate (&ip4_base_address, table_id))
2979         errors++;
2980
2981       ip4_base_address.as_u32 =
2982         clib_host_to_net_u32 (1 +
2983                               clib_net_to_host_u32 (ip4_base_address.as_u32));
2984     }
2985
2986   if (errors)
2987     vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
2988   else
2989     vlib_cli_output (vm, "No errors in %d lookups\n", n);
2990
2991   return 0;
2992 }
2993
2994 /*?
2995  * Perform a lookup of an IPv4 Address (or range of addresses) in the
2996  * given FIB table to determine if there is a conflict with the
2997  * adjacency table. The fib-id can be determined by using the
2998  * '<em>show ip fib</em>' command. If fib-id is not entered, default value
2999  * of 0 is used.
3000  *
3001  * @todo This command uses fib-id, other commands use table-id (not
3002  * just a name, they are different indexes). Would like to change this
3003  * to table-id for consistency.
3004  *
3005  * @cliexpar
3006  * Example of how to run the test lookup command:
3007  * @cliexstart{test lookup 172.16.1.1 table 1 count 2}
3008  * No errors in 2 lookups
3009  * @cliexend
3010 ?*/
3011 /* *INDENT-OFF* */
3012 VLIB_CLI_COMMAND (lookup_test_command, static) =
3013 {
3014   .path = "test lookup",
3015   .short_help = "test lookup <ipv4-addr> [table <fib-id>] [count <nn>]",
3016   .function = test_lookup_command_fn,
3017 };
3018 /* *INDENT-ON* */
3019
3020 int
3021 vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
3022 {
3023   ip4_main_t *im4 = &ip4_main;
3024   ip4_fib_t *fib;
3025   uword *p = hash_get (im4->fib_index_by_table_id, table_id);
3026
3027   if (p == 0)
3028     return VNET_API_ERROR_NO_SUCH_FIB;
3029
3030   fib = ip4_fib_get (p[0]);
3031
3032   fib->flow_hash_config = flow_hash_config;
3033   return 0;
3034 }
3035
3036 static clib_error_t *
3037 set_ip_flow_hash_command_fn (vlib_main_t * vm,
3038                              unformat_input_t * input,
3039                              vlib_cli_command_t * cmd)
3040 {
3041   int matched = 0;
3042   u32 table_id = 0;
3043   u32 flow_hash_config = 0;
3044   int rv;
3045
3046   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3047     {
3048       if (unformat (input, "table %d", &table_id))
3049         matched = 1;
3050 #define _(a,v) \
3051     else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
3052       foreach_flow_hash_bit
3053 #undef _
3054         else
3055         break;
3056     }
3057
3058   if (matched == 0)
3059     return clib_error_return (0, "unknown input `%U'",
3060                               format_unformat_error, input);
3061
3062   rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
3063   switch (rv)
3064     {
3065     case 0:
3066       break;
3067
3068     case VNET_API_ERROR_NO_SUCH_FIB:
3069       return clib_error_return (0, "no such FIB table %d", table_id);
3070
3071     default:
3072       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
3073       break;
3074     }
3075
3076   return 0;
3077 }
3078
3079 /*?
3080  * Configure the set of IPv4 fields used by the flow hash.
3081  *
3082  * @cliexpar
3083  * Example of how to set the flow hash on a given table:
3084  * @cliexcmd{set ip flow-hash table 7 dst sport dport proto}
3085  * Example of how to display the configured flow hash:
3086  * @cliexstart{show ip fib}
3087  * ipv4-VRF:0, fib_index 0, flow hash: src dst sport dport proto
3088  * 0.0.0.0/0
3089  *   unicast-ip4-chain
3090  *   [@0]: dpo-load-balance: [index:0 buckets:1 uRPF:0 to:[0:0]]
3091  *     [0] [@0]: dpo-drop ip6
3092  * 0.0.0.0/32
3093  *   unicast-ip4-chain
3094  *   [@0]: dpo-load-balance: [index:1 buckets:1 uRPF:1 to:[0:0]]
3095  *     [0] [@0]: dpo-drop ip6
3096  * 224.0.0.0/8
3097  *   unicast-ip4-chain
3098  *   [@0]: dpo-load-balance: [index:3 buckets:1 uRPF:3 to:[0:0]]
3099  *     [0] [@0]: dpo-drop ip6
3100  * 6.0.1.2/32
3101  *   unicast-ip4-chain
3102  *   [@0]: dpo-load-balance: [index:30 buckets:1 uRPF:29 to:[0:0]]
3103  *     [0] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3104  * 7.0.0.1/32
3105  *   unicast-ip4-chain
3106  *   [@0]: dpo-load-balance: [index:31 buckets:4 uRPF:30 to:[0:0]]
3107  *     [0] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3108  *     [1] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3109  *     [2] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3110  *     [3] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3111  * 240.0.0.0/8
3112  *   unicast-ip4-chain
3113  *   [@0]: dpo-load-balance: [index:2 buckets:1 uRPF:2 to:[0:0]]
3114  *     [0] [@0]: dpo-drop ip6
3115  * 255.255.255.255/32
3116  *   unicast-ip4-chain
3117  *   [@0]: dpo-load-balance: [index:4 buckets:1 uRPF:4 to:[0:0]]
3118  *     [0] [@0]: dpo-drop ip6
3119  * ipv4-VRF:7, fib_index 1, flow hash: dst sport dport proto
3120  * 0.0.0.0/0
3121  *   unicast-ip4-chain
3122  *   [@0]: dpo-load-balance: [index:12 buckets:1 uRPF:11 to:[0:0]]
3123  *     [0] [@0]: dpo-drop ip6
3124  * 0.0.0.0/32
3125  *   unicast-ip4-chain
3126  *   [@0]: dpo-load-balance: [index:13 buckets:1 uRPF:12 to:[0:0]]
3127  *     [0] [@0]: dpo-drop ip6
3128  * 172.16.1.0/24
3129  *   unicast-ip4-chain
3130  *   [@0]: dpo-load-balance: [index:17 buckets:1 uRPF:16 to:[0:0]]
3131  *     [0] [@4]: ipv4-glean: af_packet0
3132  * 172.16.1.1/32
3133  *   unicast-ip4-chain
3134  *   [@0]: dpo-load-balance: [index:18 buckets:1 uRPF:17 to:[1:84]]
3135  *     [0] [@2]: dpo-receive: 172.16.1.1 on af_packet0
3136  * 172.16.1.2/32
3137  *   unicast-ip4-chain
3138  *   [@0]: dpo-load-balance: [index:21 buckets:1 uRPF:20 to:[0:0]]
3139  *     [0] [@5]: ipv4 via 172.16.1.2 af_packet0: IP4: 02:fe:9e:70:7a:2b -> 26:a5:f6:9c:3a:36
3140  * 172.16.2.0/24
3141  *   unicast-ip4-chain
3142  *   [@0]: dpo-load-balance: [index:19 buckets:1 uRPF:18 to:[0:0]]
3143  *     [0] [@4]: ipv4-glean: af_packet1
3144  * 172.16.2.1/32
3145  *   unicast-ip4-chain
3146  *   [@0]: dpo-load-balance: [index:20 buckets:1 uRPF:19 to:[0:0]]
3147  *     [0] [@2]: dpo-receive: 172.16.2.1 on af_packet1
3148  * 224.0.0.0/8
3149  *   unicast-ip4-chain
3150  *   [@0]: dpo-load-balance: [index:15 buckets:1 uRPF:14 to:[0:0]]
3151  *     [0] [@0]: dpo-drop ip6
3152  * 240.0.0.0/8
3153  *   unicast-ip4-chain
3154  *   [@0]: dpo-load-balance: [index:14 buckets:1 uRPF:13 to:[0:0]]
3155  *     [0] [@0]: dpo-drop ip6
3156  * 255.255.255.255/32
3157  *   unicast-ip4-chain
3158  *   [@0]: dpo-load-balance: [index:16 buckets:1 uRPF:15 to:[0:0]]
3159  *     [0] [@0]: dpo-drop ip6
3160  * @cliexend
3161 ?*/
3162 /* *INDENT-OFF* */
3163 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) =
3164 {
3165   .path = "set ip flow-hash",
3166   .short_help =
3167   "set ip flow-hash table <table-id> [src] [dst] [sport] [dport] [proto] [reverse]",
3168   .function = set_ip_flow_hash_command_fn,
3169 };
3170 /* *INDENT-ON* */
3171
3172 int
3173 vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
3174                              u32 table_index)
3175 {
3176   vnet_main_t *vnm = vnet_get_main ();
3177   vnet_interface_main_t *im = &vnm->interface_main;
3178   ip4_main_t *ipm = &ip4_main;
3179   ip_lookup_main_t *lm = &ipm->lookup_main;
3180   vnet_classify_main_t *cm = &vnet_classify_main;
3181   ip4_address_t *if_addr;
3182
3183   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
3184     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
3185
3186   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
3187     return VNET_API_ERROR_NO_SUCH_ENTRY;
3188
3189   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
3190   lm->classify_table_index_by_sw_if_index[sw_if_index] = table_index;
3191
3192   if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);
3193
3194   if (NULL != if_addr)
3195     {
3196       fib_prefix_t pfx = {
3197         .fp_len = 32,
3198         .fp_proto = FIB_PROTOCOL_IP4,
3199         .fp_addr.ip4 = *if_addr,
3200       };
3201       u32 fib_index;
3202
3203       fib_index = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4,
3204                                                        sw_if_index);
3205
3206
3207       if (table_index != (u32) ~ 0)
3208         {
3209           dpo_id_t dpo = DPO_INVALID;
3210
3211           dpo_set (&dpo,
3212                    DPO_CLASSIFY,
3213                    DPO_PROTO_IP4,
3214                    classify_dpo_create (DPO_PROTO_IP4, table_index));
3215
3216           fib_table_entry_special_dpo_add (fib_index,
3217                                            &pfx,
3218                                            FIB_SOURCE_CLASSIFY,
3219                                            FIB_ENTRY_FLAG_NONE, &dpo);
3220           dpo_reset (&dpo);
3221         }
3222       else
3223         {
3224           fib_table_entry_special_remove (fib_index,
3225                                           &pfx, FIB_SOURCE_CLASSIFY);
3226         }
3227     }
3228
3229   return 0;
3230 }
3231
3232 static clib_error_t *
3233 set_ip_classify_command_fn (vlib_main_t * vm,
3234                             unformat_input_t * input,
3235                             vlib_cli_command_t * cmd)
3236 {
3237   u32 table_index = ~0;
3238   int table_index_set = 0;
3239   u32 sw_if_index = ~0;
3240   int rv;
3241
3242   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3243     {
3244       if (unformat (input, "table-index %d", &table_index))
3245         table_index_set = 1;
3246       else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
3247                          vnet_get_main (), &sw_if_index))
3248         ;
3249       else
3250         break;
3251     }
3252
3253   if (table_index_set == 0)
3254     return clib_error_return (0, "classify table-index must be specified");
3255
3256   if (sw_if_index == ~0)
3257     return clib_error_return (0, "interface / subif must be specified");
3258
3259   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
3260
3261   switch (rv)
3262     {
3263     case 0:
3264       break;
3265
3266     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
3267       return clib_error_return (0, "No such interface");
3268
3269     case VNET_API_ERROR_NO_SUCH_ENTRY:
3270       return clib_error_return (0, "No such classifier table");
3271     }
3272   return 0;
3273 }
3274
3275 /*?
3276  * Assign a classification table to an interface. The classification
3277  * table is created using the '<em>classify table</em>' and '<em>classify session</em>'
3278  * commands. Once the table is created, use this command to filter packets
3279  * on an interface.
3280  *
3281  * @cliexpar
3282  * Example of how to assign a classification table to an interface:
3283  * @cliexcmd{set ip classify intfc GigabitEthernet2/0/0 table-index 1}
3284 ?*/
3285 /* *INDENT-OFF* */
3286 VLIB_CLI_COMMAND (set_ip_classify_command, static) =
3287 {
3288     .path = "set ip classify",
3289     .short_help =
3290     "set ip classify intfc <interface> table-index <classify-idx>",
3291     .function = set_ip_classify_command_fn,
3292 };
3293 /* *INDENT-ON* */
3294
3295 /*
3296  * fd.io coding-style-patch-verification: ON
3297  *
3298  * Local Variables:
3299  * eval: (c-set-style "gnu")
3300  * End:
3301  */