api: autogenerate api trace print/endian
[vpp.git] / src / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ip/ip_frag.h>
43 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
44 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
45 #include <vnet/ppp/ppp.h>
46 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
47 #include <vnet/api_errno.h>     /* for API error numbers */
48 #include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
49 #include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
50 #include <vnet/fib/fib_urpf_list.h>     /* for FIB uRPF check */
51 #include <vnet/fib/ip4_fib.h>
52 #include <vnet/dpo/load_balance.h>
53 #include <vnet/dpo/load_balance_map.h>
54 #include <vnet/dpo/classify_dpo.h>
55 #include <vnet/mfib/mfib_table.h>       /* for mFIB table and entry creation */
56
57 #include <vnet/ip/ip4_forward.h>
58 #include <vnet/interface_output.h>
59
60 /** @brief IPv4 lookup node.
61     @node ip4-lookup
62
63     This is the main IPv4 lookup dispatch node.
64
65     @param vm vlib_main_t corresponding to the current thread
66     @param node vlib_node_runtime_t
67     @param frame vlib_frame_t whose contents should be dispatched
68
69     @par Graph mechanics: buffer metadata, next index usage
70
71     @em Uses:
72     - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
73         - Indicates the @c sw_if_index value of the interface that the
74           packet was received on.
75     - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
76         - When the value is @c ~0 then the node performs a longest prefix
77           match (LPM) for the packet destination address in the FIB attached
78           to the receive interface.
79         - Otherwise perform LPM for the packet destination address in the
80           indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
81           value (0, 1, ...) and not a VRF id.
82
83     @em Sets:
84     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
85         - The lookup result adjacency index.
86
87     <em>Next Index:</em>
88     - Dispatches the packet to the node index found in
89       ip_adjacency_t @c adj->lookup_next_index
90       (where @c adj is the lookup result adjacency).
91 */
92 VLIB_NODE_FN (ip4_lookup_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
93                                 vlib_frame_t * frame)
94 {
95   return ip4_lookup_inline (vm, node, frame);
96 }
97
98 static u8 *format_ip4_lookup_trace (u8 * s, va_list * args);
99
100 /* *INDENT-OFF* */
101 VLIB_REGISTER_NODE (ip4_lookup_node) =
102 {
103   .name = "ip4-lookup",
104   .vector_size = sizeof (u32),
105   .format_trace = format_ip4_lookup_trace,
106   .n_next_nodes = IP_LOOKUP_N_NEXT,
107   .next_nodes = IP4_LOOKUP_NEXT_NODES,
108 };
109 /* *INDENT-ON* */
110
111 VLIB_NODE_FN (ip4_load_balance_node) (vlib_main_t * vm,
112                                       vlib_node_runtime_t * node,
113                                       vlib_frame_t * frame)
114 {
115   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
116   u32 n_left, *from;
117   u32 thread_index = vm->thread_index;
118   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
119   u16 nexts[VLIB_FRAME_SIZE], *next;
120
121   from = vlib_frame_vector_args (frame);
122   n_left = frame->n_vectors;
123   next = nexts;
124
125   vlib_get_buffers (vm, from, bufs, n_left);
126
127   while (n_left >= 4)
128     {
129       const load_balance_t *lb0, *lb1;
130       const ip4_header_t *ip0, *ip1;
131       u32 lbi0, hc0, lbi1, hc1;
132       const dpo_id_t *dpo0, *dpo1;
133
134       /* Prefetch next iteration. */
135       {
136         vlib_prefetch_buffer_header (b[2], LOAD);
137         vlib_prefetch_buffer_header (b[3], LOAD);
138
139         CLIB_PREFETCH (b[2]->data, sizeof (ip0[0]), LOAD);
140         CLIB_PREFETCH (b[3]->data, sizeof (ip0[0]), LOAD);
141       }
142
143       ip0 = vlib_buffer_get_current (b[0]);
144       ip1 = vlib_buffer_get_current (b[1]);
145       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
146       lbi1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
147
148       lb0 = load_balance_get (lbi0);
149       lb1 = load_balance_get (lbi1);
150
151       /*
152        * this node is for via FIBs we can re-use the hash value from the
153        * to node if present.
154        * We don't want to use the same hash value at each level in the recursion
155        * graph as that would lead to polarisation
156        */
157       hc0 = hc1 = 0;
158
159       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
160         {
161           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
162             {
163               hc0 = vnet_buffer (b[0])->ip.flow_hash =
164                 vnet_buffer (b[0])->ip.flow_hash >> 1;
165             }
166           else
167             {
168               hc0 = vnet_buffer (b[0])->ip.flow_hash =
169                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
170             }
171           dpo0 = load_balance_get_fwd_bucket
172             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
173         }
174       else
175         {
176           dpo0 = load_balance_get_bucket_i (lb0, 0);
177         }
178       if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
179         {
180           if (PREDICT_TRUE (vnet_buffer (b[1])->ip.flow_hash))
181             {
182               hc1 = vnet_buffer (b[1])->ip.flow_hash =
183                 vnet_buffer (b[1])->ip.flow_hash >> 1;
184             }
185           else
186             {
187               hc1 = vnet_buffer (b[1])->ip.flow_hash =
188                 ip4_compute_flow_hash (ip1, lb1->lb_hash_config);
189             }
190           dpo1 = load_balance_get_fwd_bucket
191             (lb1, (hc1 & (lb1->lb_n_buckets_minus_1)));
192         }
193       else
194         {
195           dpo1 = load_balance_get_bucket_i (lb1, 0);
196         }
197
198       next[0] = dpo0->dpoi_next_node;
199       next[1] = dpo1->dpoi_next_node;
200
201       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
202       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
203
204       vlib_increment_combined_counter
205         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
206       vlib_increment_combined_counter
207         (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b[1]));
208
209       b += 2;
210       next += 2;
211       n_left -= 2;
212     }
213
214   while (n_left > 0)
215     {
216       const load_balance_t *lb0;
217       const ip4_header_t *ip0;
218       const dpo_id_t *dpo0;
219       u32 lbi0, hc0;
220
221       ip0 = vlib_buffer_get_current (b[0]);
222       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
223
224       lb0 = load_balance_get (lbi0);
225
226       hc0 = 0;
227       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
228         {
229           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
230             {
231               hc0 = vnet_buffer (b[0])->ip.flow_hash =
232                 vnet_buffer (b[0])->ip.flow_hash >> 1;
233             }
234           else
235             {
236               hc0 = vnet_buffer (b[0])->ip.flow_hash =
237                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
238             }
239           dpo0 = load_balance_get_fwd_bucket
240             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
241         }
242       else
243         {
244           dpo0 = load_balance_get_bucket_i (lb0, 0);
245         }
246
247       next[0] = dpo0->dpoi_next_node;
248       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
249
250       vlib_increment_combined_counter
251         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
252
253       b += 1;
254       next += 1;
255       n_left -= 1;
256     }
257
258   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
259   if (node->flags & VLIB_NODE_FLAG_TRACE)
260     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
261
262   return frame->n_vectors;
263 }
264
265 /* *INDENT-OFF* */
266 VLIB_REGISTER_NODE (ip4_load_balance_node) =
267 {
268   .name = "ip4-load-balance",
269   .vector_size = sizeof (u32),
270   .sibling_of = "ip4-lookup",
271   .format_trace = format_ip4_lookup_trace,
272 };
273 /* *INDENT-ON* */
274
275 #ifndef CLIB_MARCH_VARIANT
276 /* get first interface address */
277 ip4_address_t *
278 ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
279                              ip_interface_address_t ** result_ia)
280 {
281   ip_lookup_main_t *lm = &im->lookup_main;
282   ip_interface_address_t *ia = 0;
283   ip4_address_t *result = 0;
284
285   /* *INDENT-OFF* */
286   foreach_ip_interface_address
287     (lm, ia, sw_if_index,
288      1 /* honor unnumbered */ ,
289      ({
290        ip4_address_t * a =
291          ip_interface_address_get_address (lm, ia);
292        result = a;
293        break;
294      }));
295   /* *INDENT-OFF* */
296   if (result_ia)
297     *result_ia = result ? ia : 0;
298   return result;
299 }
300 #endif
301
302 static void
303 ip4_add_subnet_bcast_route (u32 fib_index,
304                             fib_prefix_t *pfx,
305                             u32 sw_if_index)
306 {
307   vnet_sw_interface_flags_t iflags;
308
309   iflags = vnet_sw_interface_get_flags(vnet_get_main(), sw_if_index);
310
311   fib_table_entry_special_remove(fib_index,
312                                  pfx,
313                                  FIB_SOURCE_INTERFACE);
314
315   if (iflags & VNET_SW_INTERFACE_FLAG_DIRECTED_BCAST)
316     {
317       fib_table_entry_update_one_path (fib_index, pfx,
318                                        FIB_SOURCE_INTERFACE,
319                                        FIB_ENTRY_FLAG_NONE,
320                                        DPO_PROTO_IP4,
321                                        /* No next-hop address */
322                                        &ADJ_BCAST_ADDR,
323                                        sw_if_index,
324                                        // invalid FIB index
325                                        ~0,
326                                        1,
327                                        // no out-label stack
328                                        NULL,
329                                        FIB_ROUTE_PATH_FLAG_NONE);
330     }
331   else
332     {
333         fib_table_entry_special_add(fib_index,
334                                     pfx,
335                                     FIB_SOURCE_INTERFACE,
336                                     (FIB_ENTRY_FLAG_DROP |
337                                      FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
338     }
339 }
340
341 static void
342 ip4_add_interface_prefix_routes (ip4_main_t *im,
343                                  u32 sw_if_index,
344                                  u32 fib_index,
345                                  ip_interface_address_t * a)
346 {
347   ip_lookup_main_t *lm = &im->lookup_main;
348   ip_interface_prefix_t *if_prefix;
349   ip4_address_t *address = ip_interface_address_get_address (lm, a);
350
351   ip_interface_prefix_key_t key = {
352     .prefix = {
353       .fp_len = a->address_length,
354       .fp_proto = FIB_PROTOCOL_IP4,
355       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[a->address_length],
356     },
357     .sw_if_index = sw_if_index,
358   };
359
360   fib_prefix_t pfx_special = {
361     .fp_proto = FIB_PROTOCOL_IP4,
362   };
363
364   /* If prefix already set on interface, just increment ref count & return */
365   if_prefix = ip_get_interface_prefix (lm, &key);
366   if (if_prefix)
367     {
368       if_prefix->ref_count += 1;
369       return;
370     }
371
372   /* New prefix - allocate a pool entry, initialize it, add to the hash */
373   pool_get (lm->if_prefix_pool, if_prefix);
374   if_prefix->ref_count = 1;
375   if_prefix->src_ia_index = a - lm->if_address_pool;
376   clib_memcpy (&if_prefix->key, &key, sizeof (key));
377   mhash_set (&lm->prefix_to_if_prefix_index, &key,
378              if_prefix - lm->if_prefix_pool, 0 /* old value */);
379
380   /* length <= 30 - add glean, drop first address, maybe drop bcast address */
381   if (a->address_length <= 30)
382     {
383       pfx_special.fp_len = a->address_length;
384       pfx_special.fp_addr.ip4.as_u32 = address->as_u32;
385
386       /* set the glean route for the prefix */
387       fib_table_entry_update_one_path (fib_index, &pfx_special,
388                                        FIB_SOURCE_INTERFACE,
389                                        (FIB_ENTRY_FLAG_CONNECTED |
390                                         FIB_ENTRY_FLAG_ATTACHED),
391                                        DPO_PROTO_IP4,
392                                        /* No next-hop address */
393                                        NULL,
394                                        sw_if_index,
395                                        /* invalid FIB index */
396                                        ~0,
397                                        1,
398                                        /* no out-label stack */
399                                        NULL,
400                                        FIB_ROUTE_PATH_FLAG_NONE);
401
402       /* set a drop route for the base address of the prefix */
403       pfx_special.fp_len = 32;
404       pfx_special.fp_addr.ip4.as_u32 =
405         address->as_u32 & im->fib_masks[a->address_length];
406
407       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
408         fib_table_entry_special_add (fib_index, &pfx_special,
409                                      FIB_SOURCE_INTERFACE,
410                                      (FIB_ENTRY_FLAG_DROP |
411                                       FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
412
413       /* set a route for the broadcast address of the prefix */
414       pfx_special.fp_len = 32;
415       pfx_special.fp_addr.ip4.as_u32 =
416         address->as_u32 | ~im->fib_masks[a->address_length];
417       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
418         ip4_add_subnet_bcast_route (fib_index, &pfx_special, sw_if_index);
419
420
421     }
422   /* length == 31 - add an attached route for the other address */
423   else if (a->address_length == 31)
424     {
425       pfx_special.fp_len = 32;
426       pfx_special.fp_addr.ip4.as_u32 =
427         address->as_u32 ^ clib_host_to_net_u32(1);
428
429       fib_table_entry_update_one_path (fib_index, &pfx_special,
430                                        FIB_SOURCE_INTERFACE,
431                                        (FIB_ENTRY_FLAG_ATTACHED),
432                                        DPO_PROTO_IP4,
433                                        &pfx_special.fp_addr,
434                                        sw_if_index,
435                                        /* invalid FIB index */
436                                        ~0,
437                                        1,
438                                        NULL,
439                                        FIB_ROUTE_PATH_FLAG_NONE);
440     }
441 }
442
443 static void
444 ip4_add_interface_routes (u32 sw_if_index,
445                           ip4_main_t * im, u32 fib_index,
446                           ip_interface_address_t * a)
447 {
448   ip_lookup_main_t *lm = &im->lookup_main;
449   ip4_address_t *address = ip_interface_address_get_address (lm, a);
450   fib_prefix_t pfx = {
451     .fp_len = 32,
452     .fp_proto = FIB_PROTOCOL_IP4,
453     .fp_addr.ip4 = *address,
454   };
455
456   /* set special routes for the prefix if needed */
457   ip4_add_interface_prefix_routes (im, sw_if_index, fib_index, a);
458
459   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
460     {
461       u32 classify_table_index =
462         lm->classify_table_index_by_sw_if_index[sw_if_index];
463       if (classify_table_index != (u32) ~ 0)
464         {
465           dpo_id_t dpo = DPO_INVALID;
466
467           dpo_set (&dpo,
468                    DPO_CLASSIFY,
469                    DPO_PROTO_IP4,
470                    classify_dpo_create (DPO_PROTO_IP4, classify_table_index));
471
472           fib_table_entry_special_dpo_add (fib_index,
473                                            &pfx,
474                                            FIB_SOURCE_CLASSIFY,
475                                            FIB_ENTRY_FLAG_NONE, &dpo);
476           dpo_reset (&dpo);
477         }
478     }
479
480   fib_table_entry_update_one_path (fib_index, &pfx,
481                                    FIB_SOURCE_INTERFACE,
482                                    (FIB_ENTRY_FLAG_CONNECTED |
483                                     FIB_ENTRY_FLAG_LOCAL),
484                                    DPO_PROTO_IP4,
485                                    &pfx.fp_addr,
486                                    sw_if_index,
487                                    // invalid FIB index
488                                    ~0,
489                                    1, NULL,
490                                    FIB_ROUTE_PATH_FLAG_NONE);
491 }
492
493 static void
494 ip4_del_interface_prefix_routes (ip4_main_t * im,
495                                  u32 sw_if_index,
496                                  u32 fib_index,
497                                  ip4_address_t * address,
498                                  u32 address_length)
499 {
500   ip_lookup_main_t *lm = &im->lookup_main;
501   ip_interface_prefix_t *if_prefix;
502
503   ip_interface_prefix_key_t key = {
504     .prefix = {
505       .fp_len = address_length,
506       .fp_proto = FIB_PROTOCOL_IP4,
507       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[address_length],
508     },
509     .sw_if_index = sw_if_index,
510   };
511
512   fib_prefix_t pfx_special = {
513     .fp_len = 32,
514     .fp_proto = FIB_PROTOCOL_IP4,
515   };
516
517   if_prefix = ip_get_interface_prefix (lm, &key);
518   if (!if_prefix)
519     {
520       clib_warning ("Prefix not found while deleting %U",
521                     format_ip4_address_and_length, address, address_length);
522       return;
523     }
524
525   if_prefix->ref_count -= 1;
526
527   /*
528    * Routes need to be adjusted if:
529    * - deleting last intf addr in prefix
530    * - deleting intf addr used as default source address in glean adjacency
531    *
532    * We're done now otherwise
533    */
534   if ((if_prefix->ref_count > 0) &&
535       !pool_is_free_index (lm->if_address_pool, if_prefix->src_ia_index))
536     return;
537
538   /* length <= 30, delete glean route, first address, last address */
539   if (address_length <= 30)
540     {
541
542       /* remove glean route for prefix */
543       pfx_special.fp_addr.ip4 = *address;
544       pfx_special.fp_len = address_length;
545       fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
546
547       /* if no more intf addresses in prefix, remove other special routes */
548       if (!if_prefix->ref_count)
549         {
550           /* first address in prefix */
551           pfx_special.fp_addr.ip4.as_u32 =
552             address->as_u32 & im->fib_masks[address_length];
553           pfx_special.fp_len = 32;
554
555           if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
556           fib_table_entry_special_remove (fib_index,
557                                           &pfx_special,
558                                           FIB_SOURCE_INTERFACE);
559
560           /* prefix broadcast address */
561           pfx_special.fp_addr.ip4.as_u32 =
562             address->as_u32 | ~im->fib_masks[address_length];
563           pfx_special.fp_len = 32;
564
565           if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
566           fib_table_entry_special_remove (fib_index,
567                                           &pfx_special,
568                                           FIB_SOURCE_INTERFACE);
569         }
570       else
571         /* default source addr just got deleted, find another */
572         {
573           ip_interface_address_t *new_src_ia = NULL;
574           ip4_address_t *new_src_addr = NULL;
575
576           new_src_addr =
577             ip4_interface_address_matching_destination
578               (im, address, sw_if_index, &new_src_ia);
579
580           if_prefix->src_ia_index = new_src_ia - lm->if_address_pool;
581
582           pfx_special.fp_len = address_length;
583           pfx_special.fp_addr.ip4 = *new_src_addr;
584
585           /* set new glean route for the prefix */
586           fib_table_entry_update_one_path (fib_index, &pfx_special,
587                                            FIB_SOURCE_INTERFACE,
588                                            (FIB_ENTRY_FLAG_CONNECTED |
589                                             FIB_ENTRY_FLAG_ATTACHED),
590                                            DPO_PROTO_IP4,
591                                            /* No next-hop address */
592                                            NULL,
593                                            sw_if_index,
594                                            /* invalid FIB index */
595                                            ~0,
596                                            1,
597                                            /* no out-label stack */
598                                            NULL,
599                                            FIB_ROUTE_PATH_FLAG_NONE);
600           return;
601         }
602     }
603   /* length == 31, delete attached route for the other address */
604   else if (address_length == 31)
605     {
606       pfx_special.fp_addr.ip4.as_u32 =
607         address->as_u32 ^ clib_host_to_net_u32(1);
608
609       fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
610     }
611
612   mhash_unset (&lm->prefix_to_if_prefix_index, &key, 0 /* old_value */);
613   pool_put (lm->if_prefix_pool, if_prefix);
614 }
615
616 static void
617 ip4_del_interface_routes (u32 sw_if_index,
618                           ip4_main_t * im,
619                           u32 fib_index,
620                           ip4_address_t * address, u32 address_length)
621 {
622   fib_prefix_t pfx = {
623     .fp_len = address_length,
624     .fp_proto = FIB_PROTOCOL_IP4,
625     .fp_addr.ip4 = *address,
626   };
627
628   ip4_del_interface_prefix_routes (im, sw_if_index, fib_index,
629                                    address, address_length);
630
631   pfx.fp_len = 32;
632   fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
633 }
634
635 #ifndef CLIB_MARCH_VARIANT
636 void
637 ip4_sw_interface_enable_disable (u32 sw_if_index, u32 is_enable)
638 {
639   ip4_main_t *im = &ip4_main;
640
641   vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
642
643   /*
644    * enable/disable only on the 1<->0 transition
645    */
646   if (is_enable)
647     {
648       if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
649         return;
650     }
651   else
652     {
653       ASSERT (im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
654       if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
655         return;
656     }
657   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
658                                !is_enable, 0, 0);
659
660
661   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
662                                sw_if_index, !is_enable, 0, 0);
663
664   {
665     ip4_enable_disable_interface_callback_t *cb;
666     vec_foreach (cb, im->enable_disable_interface_callbacks)
667       cb->function (im, cb->function_opaque, sw_if_index, is_enable);
668   }
669 }
670
671 static clib_error_t *
672 ip4_add_del_interface_address_internal (vlib_main_t * vm,
673                                         u32 sw_if_index,
674                                         ip4_address_t * address,
675                                         u32 address_length, u32 is_del)
676 {
677   vnet_main_t *vnm = vnet_get_main ();
678   ip4_main_t *im = &ip4_main;
679   ip_lookup_main_t *lm = &im->lookup_main;
680   clib_error_t *error = 0;
681   u32 if_address_index, elts_before;
682   ip4_address_fib_t ip4_af, *addr_fib = 0;
683
684   /* local0 interface doesn't support IP addressing  */
685   if (sw_if_index == 0)
686     {
687       return
688        clib_error_create ("local0 interface doesn't support IP addressing");
689     }
690
691   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
692   ip4_addr_fib_init (&ip4_af, address,
693                      vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
694   vec_add1 (addr_fib, ip4_af);
695
696   /*
697    * there is no support for adj-fib handling in the presence of overlapping
698    * subnets on interfaces. Easy fix - disallow overlapping subnets, like
699    * most routers do.
700    */
701   /* *INDENT-OFF* */
702   if (!is_del)
703     {
704       /* When adding an address check that it does not conflict
705          with an existing address on any interface in this table. */
706       ip_interface_address_t *ia;
707       vnet_sw_interface_t *sif;
708
709       pool_foreach(sif, vnm->interface_main.sw_interfaces,
710       ({
711           if (im->fib_index_by_sw_if_index[sw_if_index] ==
712               im->fib_index_by_sw_if_index[sif->sw_if_index])
713             {
714               foreach_ip_interface_address
715                 (&im->lookup_main, ia, sif->sw_if_index,
716                  0 /* honor unnumbered */ ,
717                  ({
718                    ip4_address_t * x =
719                      ip_interface_address_get_address
720                      (&im->lookup_main, ia);
721                    if (ip4_destination_matches_route
722                        (im, address, x, ia->address_length) ||
723                        ip4_destination_matches_route (im,
724                                                       x,
725                                                       address,
726                                                       address_length))
727                      {
728                        /* an intf may have >1 addr from the same prefix */
729                        if ((sw_if_index == sif->sw_if_index) &&
730                            (ia->address_length == address_length) &&
731                            (x->as_u32 != address->as_u32))
732                          continue;
733
734                        /* error if the length or intf was different */
735                        vnm->api_errno = VNET_API_ERROR_DUPLICATE_IF_ADDRESS;
736
737                        return
738                          clib_error_create
739                          ("failed to add %U on %U which conflicts with %U for interface %U",
740                           format_ip4_address_and_length, address,
741                           address_length,
742                           format_vnet_sw_if_index_name, vnm,
743                           sw_if_index,
744                           format_ip4_address_and_length, x,
745                           ia->address_length,
746                           format_vnet_sw_if_index_name, vnm,
747                           sif->sw_if_index);
748                      }
749                  }));
750             }
751       }));
752     }
753   /* *INDENT-ON* */
754
755   elts_before = pool_elts (lm->if_address_pool);
756
757   error = ip_interface_address_add_del
758     (lm, sw_if_index, addr_fib, address_length, is_del, &if_address_index);
759   if (error)
760     goto done;
761
762   ip4_sw_interface_enable_disable (sw_if_index, !is_del);
763
764   /* intf addr routes are added/deleted on admin up/down */
765   if (vnet_sw_interface_is_admin_up (vnm, sw_if_index))
766     {
767       if (is_del)
768         ip4_del_interface_routes (sw_if_index,
769                                   im, ip4_af.fib_index, address,
770                                   address_length);
771       else
772         ip4_add_interface_routes (sw_if_index,
773                                   im, ip4_af.fib_index,
774                                   pool_elt_at_index
775                                   (lm->if_address_pool, if_address_index));
776     }
777
778   /* If pool did not grow/shrink: add duplicate address. */
779   if (elts_before != pool_elts (lm->if_address_pool))
780     {
781       ip4_add_del_interface_address_callback_t *cb;
782       vec_foreach (cb, im->add_del_interface_address_callbacks)
783         cb->function (im, cb->function_opaque, sw_if_index,
784                       address, address_length, if_address_index, is_del);
785     }
786
787 done:
788   vec_free (addr_fib);
789   return error;
790 }
791
792 clib_error_t *
793 ip4_add_del_interface_address (vlib_main_t * vm,
794                                u32 sw_if_index,
795                                ip4_address_t * address,
796                                u32 address_length, u32 is_del)
797 {
798   return ip4_add_del_interface_address_internal
799     (vm, sw_if_index, address, address_length, is_del);
800 }
801
802 void
803 ip4_directed_broadcast (u32 sw_if_index, u8 enable)
804 {
805   ip_interface_address_t *ia;
806   ip4_main_t *im;
807
808   im = &ip4_main;
809
810   /*
811    * when directed broadcast is enabled, the subnet braodcast route will forward
812    * packets using an adjacency with a broadcast MAC. otherwise it drops
813    */
814   /* *INDENT-OFF* */
815   foreach_ip_interface_address(&im->lookup_main, ia,
816                                sw_if_index, 0,
817      ({
818        if (ia->address_length <= 30)
819          {
820            ip4_address_t *ipa;
821
822            ipa = ip_interface_address_get_address (&im->lookup_main, ia);
823
824            fib_prefix_t pfx = {
825              .fp_len = 32,
826              .fp_proto = FIB_PROTOCOL_IP4,
827              .fp_addr = {
828                .ip4.as_u32 = (ipa->as_u32 | ~im->fib_masks[ia->address_length]),
829              },
830            };
831
832            ip4_add_subnet_bcast_route
833              (fib_table_get_index_for_sw_if_index(FIB_PROTOCOL_IP4,
834                                                   sw_if_index),
835               &pfx, sw_if_index);
836          }
837      }));
838   /* *INDENT-ON* */
839 }
840 #endif
841
842 static clib_error_t *
843 ip4_sw_interface_admin_up_down (vnet_main_t * vnm, u32 sw_if_index, u32 flags)
844 {
845   ip4_main_t *im = &ip4_main;
846   ip_interface_address_t *ia;
847   ip4_address_t *a;
848   u32 is_admin_up, fib_index;
849
850   /* Fill in lookup tables with default table (0). */
851   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
852
853   vec_validate_init_empty (im->
854                            lookup_main.if_address_pool_index_by_sw_if_index,
855                            sw_if_index, ~0);
856
857   is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
858
859   fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
860
861   /* *INDENT-OFF* */
862   foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
863                                 0 /* honor unnumbered */,
864   ({
865     a = ip_interface_address_get_address (&im->lookup_main, ia);
866     if (is_admin_up)
867       ip4_add_interface_routes (sw_if_index,
868                                 im, fib_index,
869                                 ia);
870     else
871       ip4_del_interface_routes (sw_if_index,
872                                 im, fib_index,
873                                 a, ia->address_length);
874   }));
875   /* *INDENT-ON* */
876
877   return 0;
878 }
879
880 VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down);
881
882 /* Built-in ip4 unicast rx feature path definition */
883 /* *INDENT-OFF* */
884 VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
885 {
886   .arc_name = "ip4-unicast",
887   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
888   .last_in_arc = "ip4-lookup",
889   .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
890 };
891
892 VNET_FEATURE_INIT (ip4_flow_classify, static) =
893 {
894   .arc_name = "ip4-unicast",
895   .node_name = "ip4-flow-classify",
896   .runs_before = VNET_FEATURES ("ip4-inacl"),
897 };
898
899 VNET_FEATURE_INIT (ip4_inacl, static) =
900 {
901   .arc_name = "ip4-unicast",
902   .node_name = "ip4-inacl",
903   .runs_before = VNET_FEATURES ("ip4-source-check-via-rx"),
904 };
905
906 VNET_FEATURE_INIT (ip4_source_check_1, static) =
907 {
908   .arc_name = "ip4-unicast",
909   .node_name = "ip4-source-check-via-rx",
910   .runs_before = VNET_FEATURES ("ip4-source-check-via-any"),
911 };
912
913 VNET_FEATURE_INIT (ip4_source_check_2, static) =
914 {
915   .arc_name = "ip4-unicast",
916   .node_name = "ip4-source-check-via-any",
917   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
918 };
919
920 VNET_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) =
921 {
922   .arc_name = "ip4-unicast",
923   .node_name = "ip4-source-and-port-range-check-rx",
924   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
925 };
926
927 VNET_FEATURE_INIT (ip4_policer_classify, static) =
928 {
929   .arc_name = "ip4-unicast",
930   .node_name = "ip4-policer-classify",
931   .runs_before = VNET_FEATURES ("ipsec4-input-feature"),
932 };
933
934 VNET_FEATURE_INIT (ip4_ipsec, static) =
935 {
936   .arc_name = "ip4-unicast",
937   .node_name = "ipsec4-input-feature",
938   .runs_before = VNET_FEATURES ("vpath-input-ip4"),
939 };
940
941 VNET_FEATURE_INIT (ip4_vpath, static) =
942 {
943   .arc_name = "ip4-unicast",
944   .node_name = "vpath-input-ip4",
945   .runs_before = VNET_FEATURES ("ip4-vxlan-bypass"),
946 };
947
948 VNET_FEATURE_INIT (ip4_vxlan_bypass, static) =
949 {
950   .arc_name = "ip4-unicast",
951   .node_name = "ip4-vxlan-bypass",
952   .runs_before = VNET_FEATURES ("ip4-lookup"),
953 };
954
955 VNET_FEATURE_INIT (ip4_not_enabled, static) =
956 {
957   .arc_name = "ip4-unicast",
958   .node_name = "ip4-not-enabled",
959   .runs_before = VNET_FEATURES ("ip4-lookup"),
960 };
961
962 VNET_FEATURE_INIT (ip4_lookup, static) =
963 {
964   .arc_name = "ip4-unicast",
965   .node_name = "ip4-lookup",
966   .runs_before = 0,     /* not before any other features */
967 };
968
969 /* Built-in ip4 multicast rx feature path definition */
970 VNET_FEATURE_ARC_INIT (ip4_multicast, static) =
971 {
972   .arc_name = "ip4-multicast",
973   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
974   .last_in_arc = "ip4-mfib-forward-lookup",
975   .arc_index_ptr = &ip4_main.lookup_main.mcast_feature_arc_index,
976 };
977
978 VNET_FEATURE_INIT (ip4_vpath_mc, static) =
979 {
980   .arc_name = "ip4-multicast",
981   .node_name = "vpath-input-ip4",
982   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
983 };
984
985 VNET_FEATURE_INIT (ip4_mc_not_enabled, static) =
986 {
987   .arc_name = "ip4-multicast",
988   .node_name = "ip4-not-enabled",
989   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
990 };
991
992 VNET_FEATURE_INIT (ip4_lookup_mc, static) =
993 {
994   .arc_name = "ip4-multicast",
995   .node_name = "ip4-mfib-forward-lookup",
996   .runs_before = 0,     /* last feature */
997 };
998
999 /* Source and port-range check ip4 tx feature path definition */
1000 VNET_FEATURE_ARC_INIT (ip4_output, static) =
1001 {
1002   .arc_name = "ip4-output",
1003   .start_nodes = VNET_FEATURES ("ip4-rewrite", "ip4-midchain", "ip4-dvr-dpo"),
1004   .last_in_arc = "interface-output",
1005   .arc_index_ptr = &ip4_main.lookup_main.output_feature_arc_index,
1006 };
1007
1008 VNET_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) =
1009 {
1010   .arc_name = "ip4-output",
1011   .node_name = "ip4-source-and-port-range-check-tx",
1012   .runs_before = VNET_FEATURES ("ip4-outacl"),
1013 };
1014
1015 VNET_FEATURE_INIT (ip4_outacl, static) =
1016 {
1017   .arc_name = "ip4-output",
1018   .node_name = "ip4-outacl",
1019   .runs_before = VNET_FEATURES ("ipsec4-output-feature"),
1020 };
1021
1022 VNET_FEATURE_INIT (ip4_ipsec_output, static) =
1023 {
1024   .arc_name = "ip4-output",
1025   .node_name = "ipsec4-output-feature",
1026   .runs_before = VNET_FEATURES ("interface-output"),
1027 };
1028
1029 /* Built-in ip4 tx feature path definition */
1030 VNET_FEATURE_INIT (ip4_interface_output, static) =
1031 {
1032   .arc_name = "ip4-output",
1033   .node_name = "interface-output",
1034   .runs_before = 0,     /* not before any other features */
1035 };
1036 /* *INDENT-ON* */
1037
1038 static clib_error_t *
1039 ip4_sw_interface_add_del (vnet_main_t * vnm, u32 sw_if_index, u32 is_add)
1040 {
1041   ip4_main_t *im = &ip4_main;
1042
1043   /* Fill in lookup tables with default table (0). */
1044   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
1045   vec_validate (im->mfib_index_by_sw_if_index, sw_if_index);
1046
1047   if (!is_add)
1048     {
1049       ip4_main_t *im4 = &ip4_main;
1050       ip_lookup_main_t *lm4 = &im4->lookup_main;
1051       ip_interface_address_t *ia = 0;
1052       ip4_address_t *address;
1053       vlib_main_t *vm = vlib_get_main ();
1054
1055       vnet_sw_interface_update_unnumbered (sw_if_index, ~0, 0);
1056       /* *INDENT-OFF* */
1057       foreach_ip_interface_address (lm4, ia, sw_if_index, 0,
1058       ({
1059         address = ip_interface_address_get_address (lm4, ia);
1060         ip4_add_del_interface_address(vm, sw_if_index, address, ia->address_length, 1);
1061       }));
1062       /* *INDENT-ON* */
1063     }
1064
1065   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
1066                                is_add, 0, 0);
1067
1068   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
1069                                sw_if_index, is_add, 0, 0);
1070
1071   return /* no error */ 0;
1072 }
1073
1074 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
1075
1076 /* Global IP4 main. */
1077 #ifndef CLIB_MARCH_VARIANT
1078 ip4_main_t ip4_main;
1079 #endif /* CLIB_MARCH_VARIANT */
1080
1081 static clib_error_t *
1082 ip4_lookup_init (vlib_main_t * vm)
1083 {
1084   ip4_main_t *im = &ip4_main;
1085   clib_error_t *error;
1086   uword i;
1087
1088   if ((error = vlib_call_init_function (vm, vnet_feature_init)))
1089     return error;
1090   if ((error = vlib_call_init_function (vm, ip4_mtrie_module_init)))
1091     return (error);
1092   if ((error = vlib_call_init_function (vm, fib_module_init)))
1093     return error;
1094   if ((error = vlib_call_init_function (vm, mfib_module_init)))
1095     return error;
1096
1097   for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
1098     {
1099       u32 m;
1100
1101       if (i < 32)
1102         m = pow2_mask (i) << (32 - i);
1103       else
1104         m = ~0;
1105       im->fib_masks[i] = clib_host_to_net_u32 (m);
1106     }
1107
1108   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
1109
1110   /* Create FIB with index 0 and table id of 0. */
1111   fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
1112                                      FIB_SOURCE_DEFAULT_ROUTE);
1113   mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
1114                                       MFIB_SOURCE_DEFAULT_ROUTE);
1115
1116   {
1117     pg_node_t *pn;
1118     pn = pg_get_node (ip4_lookup_node.index);
1119     pn->unformat_edit = unformat_pg_ip4_header;
1120   }
1121
1122   {
1123     ethernet_arp_header_t h;
1124
1125     clib_memset (&h, 0, sizeof (h));
1126
1127 #define _16(f,v) h.f = clib_host_to_net_u16 (v);
1128 #define _8(f,v) h.f = v;
1129     _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
1130     _16 (l3_type, ETHERNET_TYPE_IP4);
1131     _8 (n_l2_address_bytes, 6);
1132     _8 (n_l3_address_bytes, 4);
1133     _16 (opcode, ETHERNET_ARP_OPCODE_request);
1134 #undef _16
1135 #undef _8
1136
1137     vlib_packet_template_init (vm, &im->ip4_arp_request_packet_template,
1138                                /* data */ &h,
1139                                sizeof (h),
1140                                /* alloc chunk size */ 8,
1141                                "ip4 arp");
1142   }
1143
1144   return error;
1145 }
1146
1147 VLIB_INIT_FUNCTION (ip4_lookup_init);
1148
1149 typedef struct
1150 {
1151   /* Adjacency taken. */
1152   u32 dpo_index;
1153   u32 flow_hash;
1154   u32 fib_index;
1155
1156   /* Packet data, possibly *after* rewrite. */
1157   u8 packet_data[64 - 1 * sizeof (u32)];
1158 }
1159 ip4_forward_next_trace_t;
1160
1161 #ifndef CLIB_MARCH_VARIANT
1162 u8 *
1163 format_ip4_forward_next_trace (u8 * s, va_list * args)
1164 {
1165   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1166   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1167   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1168   u32 indent = format_get_indent (s);
1169   s = format (s, "%U%U",
1170               format_white_space, indent,
1171               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1172   return s;
1173 }
1174 #endif
1175
1176 static u8 *
1177 format_ip4_lookup_trace (u8 * s, va_list * args)
1178 {
1179   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1180   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1181   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1182   u32 indent = format_get_indent (s);
1183
1184   s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
1185               t->fib_index, t->dpo_index, t->flow_hash);
1186   s = format (s, "\n%U%U",
1187               format_white_space, indent,
1188               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1189   return s;
1190 }
1191
1192 static u8 *
1193 format_ip4_rewrite_trace (u8 * s, va_list * args)
1194 {
1195   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1196   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1197   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1198   u32 indent = format_get_indent (s);
1199
1200   s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
1201               t->fib_index, t->dpo_index, format_ip_adjacency,
1202               t->dpo_index, FORMAT_IP_ADJACENCY_NONE, t->flow_hash);
1203   s = format (s, "\n%U%U",
1204               format_white_space, indent,
1205               format_ip_adjacency_packet_data,
1206               t->dpo_index, t->packet_data, sizeof (t->packet_data));
1207   return s;
1208 }
1209
1210 #ifndef CLIB_MARCH_VARIANT
1211 /* Common trace function for all ip4-forward next nodes. */
1212 void
1213 ip4_forward_next_trace (vlib_main_t * vm,
1214                         vlib_node_runtime_t * node,
1215                         vlib_frame_t * frame, vlib_rx_or_tx_t which_adj_index)
1216 {
1217   u32 *from, n_left;
1218   ip4_main_t *im = &ip4_main;
1219
1220   n_left = frame->n_vectors;
1221   from = vlib_frame_vector_args (frame);
1222
1223   while (n_left >= 4)
1224     {
1225       u32 bi0, bi1;
1226       vlib_buffer_t *b0, *b1;
1227       ip4_forward_next_trace_t *t0, *t1;
1228
1229       /* Prefetch next iteration. */
1230       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1231       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1232
1233       bi0 = from[0];
1234       bi1 = from[1];
1235
1236       b0 = vlib_get_buffer (vm, bi0);
1237       b1 = vlib_get_buffer (vm, bi1);
1238
1239       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1240         {
1241           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1242           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1243           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1244           t0->fib_index =
1245             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1246              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1247             vec_elt (im->fib_index_by_sw_if_index,
1248                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1249
1250           clib_memcpy_fast (t0->packet_data,
1251                             vlib_buffer_get_current (b0),
1252                             sizeof (t0->packet_data));
1253         }
1254       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1255         {
1256           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1257           t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1258           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1259           t1->fib_index =
1260             (vnet_buffer (b1)->sw_if_index[VLIB_TX] !=
1261              (u32) ~ 0) ? vnet_buffer (b1)->sw_if_index[VLIB_TX] :
1262             vec_elt (im->fib_index_by_sw_if_index,
1263                      vnet_buffer (b1)->sw_if_index[VLIB_RX]);
1264           clib_memcpy_fast (t1->packet_data, vlib_buffer_get_current (b1),
1265                             sizeof (t1->packet_data));
1266         }
1267       from += 2;
1268       n_left -= 2;
1269     }
1270
1271   while (n_left >= 1)
1272     {
1273       u32 bi0;
1274       vlib_buffer_t *b0;
1275       ip4_forward_next_trace_t *t0;
1276
1277       bi0 = from[0];
1278
1279       b0 = vlib_get_buffer (vm, bi0);
1280
1281       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1282         {
1283           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1284           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1285           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1286           t0->fib_index =
1287             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1288              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1289             vec_elt (im->fib_index_by_sw_if_index,
1290                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1291           clib_memcpy_fast (t0->packet_data, vlib_buffer_get_current (b0),
1292                             sizeof (t0->packet_data));
1293         }
1294       from += 1;
1295       n_left -= 1;
1296     }
1297 }
1298
1299 /* Compute TCP/UDP/ICMP4 checksum in software. */
1300 u16
1301 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
1302                               ip4_header_t * ip0)
1303 {
1304   ip_csum_t sum0;
1305   u32 ip_header_length, payload_length_host_byte_order;
1306   u32 n_this_buffer, n_bytes_left, n_ip_bytes_this_buffer;
1307   u16 sum16;
1308   u8 *data_this_buffer;
1309   u8 length_odd;
1310
1311   /* Initialize checksum with ip header. */
1312   ip_header_length = ip4_header_bytes (ip0);
1313   payload_length_host_byte_order =
1314     clib_net_to_host_u16 (ip0->length) - ip_header_length;
1315   sum0 =
1316     clib_host_to_net_u32 (payload_length_host_byte_order +
1317                           (ip0->protocol << 16));
1318
1319   if (BITS (uword) == 32)
1320     {
1321       sum0 =
1322         ip_csum_with_carry (sum0,
1323                             clib_mem_unaligned (&ip0->src_address, u32));
1324       sum0 =
1325         ip_csum_with_carry (sum0,
1326                             clib_mem_unaligned (&ip0->dst_address, u32));
1327     }
1328   else
1329     sum0 =
1330       ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
1331
1332   n_bytes_left = n_this_buffer = payload_length_host_byte_order;
1333   data_this_buffer = (u8 *) ip0 + ip_header_length;
1334   n_ip_bytes_this_buffer =
1335     p0->current_length - (((u8 *) ip0 - p0->data) - p0->current_data);
1336   if (n_this_buffer + ip_header_length > n_ip_bytes_this_buffer)
1337     {
1338       n_this_buffer = n_ip_bytes_this_buffer > ip_header_length ?
1339         n_ip_bytes_this_buffer - ip_header_length : 0;
1340     }
1341
1342   while (1)
1343     {
1344       sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
1345       n_bytes_left -= n_this_buffer;
1346       if (n_bytes_left == 0)
1347         break;
1348
1349       ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT);
1350       if (!(p0->flags & VLIB_BUFFER_NEXT_PRESENT))
1351         return 0xfefe;
1352
1353       length_odd = (n_this_buffer & 1);
1354
1355       p0 = vlib_get_buffer (vm, p0->next_buffer);
1356       data_this_buffer = vlib_buffer_get_current (p0);
1357       n_this_buffer = clib_min (p0->current_length, n_bytes_left);
1358
1359       if (PREDICT_FALSE (length_odd))
1360         {
1361           /* Prepend a 0 or the resulting checksum will be incorrect. */
1362           data_this_buffer--;
1363           n_this_buffer++;
1364           n_bytes_left++;
1365           data_this_buffer[0] = 0;
1366         }
1367     }
1368
1369   sum16 = ~ip_csum_fold (sum0);
1370   return sum16;
1371 }
1372
1373 u32
1374 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1375 {
1376   ip4_header_t *ip0 = vlib_buffer_get_current (p0);
1377   udp_header_t *udp0;
1378   u16 sum16;
1379
1380   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1381           || ip0->protocol == IP_PROTOCOL_UDP);
1382
1383   udp0 = (void *) (ip0 + 1);
1384   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1385     {
1386       p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1387                     | VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
1388       return p0->flags;
1389     }
1390
1391   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1392
1393   p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1394                 | ((sum16 == 0) << VNET_BUFFER_F_LOG2_L4_CHECKSUM_CORRECT));
1395
1396   return p0->flags;
1397 }
1398 #endif
1399
1400 /* *INDENT-OFF* */
1401 VNET_FEATURE_ARC_INIT (ip4_local) =
1402 {
1403   .arc_name  = "ip4-local",
1404   .start_nodes = VNET_FEATURES ("ip4-local"),
1405   .last_in_arc = "ip4-local-end-of-arc",
1406 };
1407 /* *INDENT-ON* */
1408
1409 static inline void
1410 ip4_local_l4_csum_validate (vlib_main_t * vm, vlib_buffer_t * p,
1411                             ip4_header_t * ip, u8 is_udp, u8 * error,
1412                             u8 * good_tcp_udp)
1413 {
1414   u32 flags0;
1415   flags0 = ip4_tcp_udp_validate_checksum (vm, p);
1416   *good_tcp_udp = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0;
1417   if (is_udp)
1418     {
1419       udp_header_t *udp;
1420       u32 ip_len, udp_len;
1421       i32 len_diff;
1422       udp = ip4_next_header (ip);
1423       /* Verify UDP length. */
1424       ip_len = clib_net_to_host_u16 (ip->length);
1425       udp_len = clib_net_to_host_u16 (udp->length);
1426
1427       len_diff = ip_len - udp_len;
1428       *good_tcp_udp &= len_diff >= 0;
1429       *error = len_diff < 0 ? IP4_ERROR_UDP_LENGTH : *error;
1430     }
1431 }
1432
/*
 * Checksum-state predicates on a buffer pointer _b.
 *
 * Every argument and every expansion is fully parenthesized so the macros
 * can be negated, combined with other operators, or given a non-trivial
 * pointer expression without changing meaning.  Each macro may evaluate
 * _b more than once; do not pass expressions with side effects.
 */

/* True when TCP or UDP checksum offload is flagged on the buffer. */
#define ip4_local_csum_is_offloaded(_b)                                 \
    ((((_b)->flags) & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM)                  \
     || (((_b)->flags) & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM))

/*
 * True when a TCP/UDP packet still needs a software checksum check: the
 * checksum is neither already computed nor offloaded.
 */
#define ip4_local_need_csum_check(is_tcp_udp, _b)                       \
    ((is_tcp_udp)                                                       \
     && !((((_b)->flags) & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED)          \
          || ip4_local_csum_is_offloaded (_b)))

/* 1 when the checksum is flagged correct or is offloaded, else 0. */
#define ip4_local_csum_is_valid(_b)                                     \
    (((((_b)->flags) & VNET_BUFFER_F_L4_CHECKSUM_CORRECT)               \
      || ip4_local_csum_is_offloaded (_b)) != 0)
1444
1445 static inline void
1446 ip4_local_check_l4_csum (vlib_main_t * vm, vlib_buffer_t * b,
1447                          ip4_header_t * ih, u8 * error)
1448 {
1449   u8 is_udp, is_tcp_udp, good_tcp_udp;
1450
1451   is_udp = ih->protocol == IP_PROTOCOL_UDP;
1452   is_tcp_udp = is_udp || ih->protocol == IP_PROTOCOL_TCP;
1453
1454   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp, b)))
1455     ip4_local_l4_csum_validate (vm, b, ih, is_udp, error, &good_tcp_udp);
1456   else
1457     good_tcp_udp = ip4_local_csum_is_valid (b);
1458
1459   ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1460   *error = (is_tcp_udp && !good_tcp_udp
1461             ? IP4_ERROR_TCP_CHECKSUM + is_udp : *error);
1462 }
1463
1464 static inline void
1465 ip4_local_check_l4_csum_x2 (vlib_main_t * vm, vlib_buffer_t ** b,
1466                             ip4_header_t ** ih, u8 * error)
1467 {
1468   u8 is_udp[2], is_tcp_udp[2], good_tcp_udp[2];
1469
1470   is_udp[0] = ih[0]->protocol == IP_PROTOCOL_UDP;
1471   is_udp[1] = ih[1]->protocol == IP_PROTOCOL_UDP;
1472
1473   is_tcp_udp[0] = is_udp[0] || ih[0]->protocol == IP_PROTOCOL_TCP;
1474   is_tcp_udp[1] = is_udp[1] || ih[1]->protocol == IP_PROTOCOL_TCP;
1475
1476   good_tcp_udp[0] = ip4_local_csum_is_valid (b[0]);
1477   good_tcp_udp[1] = ip4_local_csum_is_valid (b[1]);
1478
1479   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp[0], b[0])
1480                      || ip4_local_need_csum_check (is_tcp_udp[1], b[1])))
1481     {
1482       if (is_tcp_udp[0])
1483         ip4_local_l4_csum_validate (vm, b[0], ih[0], is_udp[0], &error[0],
1484                                     &good_tcp_udp[0]);
1485       if (is_tcp_udp[1])
1486         ip4_local_l4_csum_validate (vm, b[1], ih[1], is_udp[1], &error[1],
1487                                     &good_tcp_udp[1]);
1488     }
1489
1490   error[0] = (is_tcp_udp[0] && !good_tcp_udp[0] ?
1491               IP4_ERROR_TCP_CHECKSUM + is_udp[0] : error[0]);
1492   error[1] = (is_tcp_udp[1] && !good_tcp_udp[1] ?
1493               IP4_ERROR_TCP_CHECKSUM + is_udp[1] : error[1]);
1494 }
1495
1496 static inline void
1497 ip4_local_set_next_and_error (vlib_node_runtime_t * error_node,
1498                               vlib_buffer_t * b, u16 * next, u8 error,
1499                               u8 head_of_feature_arc)
1500 {
1501   u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index;
1502   u32 next_index;
1503
1504   *next = error != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : *next;
1505   b->error = error ? error_node->errors[error] : 0;
1506   if (head_of_feature_arc)
1507     {
1508       next_index = *next;
1509       if (PREDICT_TRUE (error == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
1510         {
1511           vnet_feature_arc_start (arc_index,
1512                                   vnet_buffer (b)->sw_if_index[VLIB_RX],
1513                                   &next_index, b);
1514           *next = next_index;
1515         }
1516     }
1517 }
1518
1519 typedef struct
1520 {
1521   ip4_address_t src;
1522   u32 lbi;
1523   u8 error;
1524   u8 first;
1525 } ip4_local_last_check_t;
1526
1527 static inline void
1528 ip4_local_check_src (vlib_buffer_t * b, ip4_header_t * ip0,
1529                      ip4_local_last_check_t * last_check, u8 * error0)
1530 {
1531   ip4_fib_mtrie_leaf_t leaf0;
1532   ip4_fib_mtrie_t *mtrie0;
1533   const dpo_id_t *dpo0;
1534   load_balance_t *lb0;
1535   u32 lbi0;
1536
1537   vnet_buffer (b)->ip.fib_index =
1538     vnet_buffer (b)->sw_if_index[VLIB_TX] != ~0 ?
1539     vnet_buffer (b)->sw_if_index[VLIB_TX] : vnet_buffer (b)->ip.fib_index;
1540
1541   /*
1542    * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
1543    *  adjacency for the destination address (the local interface address).
1544    * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
1545    *  adjacency for the source address (the remote sender's address)
1546    */
1547   if (PREDICT_FALSE (last_check->first ||
1548                      (last_check->src.as_u32 != ip0->src_address.as_u32)))
1549     {
1550       mtrie0 = &ip4_fib_get (vnet_buffer (b)->ip.fib_index)->mtrie;
1551       leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, &ip0->src_address);
1552       leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1553       leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1554       lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1555
1556       vnet_buffer (b)->ip.adj_index[VLIB_RX] =
1557         vnet_buffer (b)->ip.adj_index[VLIB_TX];
1558       vnet_buffer (b)->ip.adj_index[VLIB_TX] = lbi0;
1559
1560       lb0 = load_balance_get (lbi0);
1561       dpo0 = load_balance_get_bucket_i (lb0, 0);
1562
1563       /*
1564        * Must have a route to source otherwise we drop the packet.
1565        * ip4 broadcasts are accepted, e.g. to make dhcp client work
1566        *
1567        * The checks are:
1568        *  - the source is a recieve => it's from us => bogus, do this
1569        *    first since it sets a different error code.
1570        *  - uRPF check for any route to source - accept if passes.
1571        *  - allow packets destined to the broadcast address from unknown sources
1572        */
1573
1574       *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1575                   && dpo0->dpoi_type == DPO_RECEIVE) ?
1576                  IP4_ERROR_SPOOFED_LOCAL_PACKETS : *error0);
1577       *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1578                   && !fib_urpf_check_size (lb0->lb_urpf)
1579                   && ip0->dst_address.as_u32 != 0xFFFFFFFF) ?
1580                  IP4_ERROR_SRC_LOOKUP_MISS : *error0);
1581
1582       last_check->src.as_u32 = ip0->src_address.as_u32;
1583       last_check->lbi = lbi0;
1584       last_check->error = *error0;
1585     }
1586   else
1587     {
1588       vnet_buffer (b)->ip.adj_index[VLIB_RX] =
1589         vnet_buffer (b)->ip.adj_index[VLIB_TX];
1590       vnet_buffer (b)->ip.adj_index[VLIB_TX] = last_check->lbi;
1591       *error0 = last_check->error;
1592       last_check->first = 0;
1593     }
1594 }
1595
1596 static inline void
1597 ip4_local_check_src_x2 (vlib_buffer_t ** b, ip4_header_t ** ip,
1598                         ip4_local_last_check_t * last_check, u8 * error)
1599 {
1600   ip4_fib_mtrie_leaf_t leaf[2];
1601   ip4_fib_mtrie_t *mtrie[2];
1602   const dpo_id_t *dpo[2];
1603   load_balance_t *lb[2];
1604   u32 not_last_hit;
1605   u32 lbi[2];
1606
1607   not_last_hit = last_check->first;
1608   not_last_hit |= ip[0]->src_address.as_u32 ^ last_check->src.as_u32;
1609   not_last_hit |= ip[1]->src_address.as_u32 ^ last_check->src.as_u32;
1610
1611   vnet_buffer (b[0])->ip.fib_index =
1612     vnet_buffer (b[0])->sw_if_index[VLIB_TX] != ~0 ?
1613     vnet_buffer (b[0])->sw_if_index[VLIB_TX] :
1614     vnet_buffer (b[0])->ip.fib_index;
1615
1616   vnet_buffer (b[1])->ip.fib_index =
1617     vnet_buffer (b[1])->sw_if_index[VLIB_TX] != ~0 ?
1618     vnet_buffer (b[1])->sw_if_index[VLIB_TX] :
1619     vnet_buffer (b[1])->ip.fib_index;
1620
1621   /*
1622    * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
1623    *  adjacency for the destination address (the local interface address).
1624    * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
1625    *  adjacency for the source address (the remote sender's address)
1626    */
1627   if (PREDICT_FALSE (not_last_hit))
1628     {
1629       mtrie[0] = &ip4_fib_get (vnet_buffer (b[0])->ip.fib_index)->mtrie;
1630       mtrie[1] = &ip4_fib_get (vnet_buffer (b[1])->ip.fib_index)->mtrie;
1631
1632       leaf[0] = ip4_fib_mtrie_lookup_step_one (mtrie[0], &ip[0]->src_address);
1633       leaf[1] = ip4_fib_mtrie_lookup_step_one (mtrie[1], &ip[1]->src_address);
1634
1635       leaf[0] = ip4_fib_mtrie_lookup_step (mtrie[0], leaf[0],
1636                                            &ip[0]->src_address, 2);
1637       leaf[1] = ip4_fib_mtrie_lookup_step (mtrie[1], leaf[1],
1638                                            &ip[1]->src_address, 2);
1639
1640       leaf[0] = ip4_fib_mtrie_lookup_step (mtrie[0], leaf[0],
1641                                            &ip[0]->src_address, 3);
1642       leaf[1] = ip4_fib_mtrie_lookup_step (mtrie[1], leaf[1],
1643                                            &ip[1]->src_address, 3);
1644
1645       lbi[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf[0]);
1646       lbi[1] = ip4_fib_mtrie_leaf_get_adj_index (leaf[1]);
1647
1648       vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
1649         vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
1650       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = lbi[0];
1651
1652       vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
1653         vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
1654       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = lbi[1];
1655
1656       lb[0] = load_balance_get (lbi[0]);
1657       lb[1] = load_balance_get (lbi[1]);
1658
1659       dpo[0] = load_balance_get_bucket_i (lb[0], 0);
1660       dpo[1] = load_balance_get_bucket_i (lb[1], 0);
1661
1662       error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1663                    dpo[0]->dpoi_type == DPO_RECEIVE) ?
1664                   IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[0]);
1665       error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1666                    !fib_urpf_check_size (lb[0]->lb_urpf) &&
1667                    ip[0]->dst_address.as_u32 != 0xFFFFFFFF)
1668                   ? IP4_ERROR_SRC_LOOKUP_MISS : error[0]);
1669
1670       error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1671                    dpo[1]->dpoi_type == DPO_RECEIVE) ?
1672                   IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[1]);
1673       error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1674                    !fib_urpf_check_size (lb[1]->lb_urpf) &&
1675                    ip[1]->dst_address.as_u32 != 0xFFFFFFFF)
1676                   ? IP4_ERROR_SRC_LOOKUP_MISS : error[1]);
1677
1678       last_check->src.as_u32 = ip[1]->src_address.as_u32;
1679       last_check->lbi = lbi[1];
1680       last_check->error = error[1];
1681     }
1682   else
1683     {
1684       vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
1685         vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
1686       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = last_check->lbi;
1687
1688       vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
1689         vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
1690       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = last_check->lbi;
1691
1692       error[0] = last_check->error;
1693       error[1] = last_check->error;
1694       last_check->first = 0;
1695     }
1696 }
1697
/* Packet classes returned by ip4_local_classify(). */
enum ip_local_packet_type_e
{
  /* Neither a fragment nor flagged as NATed. */
  IP_LOCAL_PACKET_TYPE_L4,
  /* Buffer carries the VNET_BUFFER_F_IS_NATED flag. */
  IP_LOCAL_PACKET_TYPE_NAT,
  /* IPv4 fragment; handed to reassembly. */
  IP_LOCAL_PACKET_TYPE_FRAG,
};
1704
1705 /**
1706  * Determine packet type and next node.
1707  *
1708  * The expectation is that all packets that are not L4 will skip
1709  * checksums and source checks.
1710  */
1711 always_inline u8
1712 ip4_local_classify (vlib_buffer_t * b, ip4_header_t * ip, u16 * next)
1713 {
1714   ip_lookup_main_t *lm = &ip4_main.lookup_main;
1715
1716   if (PREDICT_FALSE (ip4_is_fragment (ip)))
1717     {
1718       *next = IP_LOCAL_NEXT_REASSEMBLY;
1719       return IP_LOCAL_PACKET_TYPE_FRAG;
1720     }
1721   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_IS_NATED))
1722     {
1723       *next = lm->local_next_by_ip_protocol[ip->protocol];
1724       return IP_LOCAL_PACKET_TYPE_NAT;
1725     }
1726
1727   *next = lm->local_next_by_ip_protocol[ip->protocol];
1728   return IP_LOCAL_PACKET_TYPE_L4;
1729 }
1730
/**
 * Shared worker for the "ip4-local" and "ip4-local-end-of-arc" node
 * functions.
 *
 * Each buffer in the frame is classified with ip4_local_classify(), which
 * also selects a provisional next node.  When head_of_feature_arc is
 * non-zero, packets classified as plain L4 (packet type 0) additionally go
 * through the L4 checksum check and the source address check.  Packets with
 * a non-zero packet type skip both checks, as does every packet when
 * head_of_feature_arc is zero.  The error and next index of each packet are
 * then passed to ip4_local_set_next_and_error() and the whole frame is
 * enqueued with vlib_buffer_enqueue_to_next().
 *
 * @param vm                  vlib main.
 * @param node                runtime of the node being dispatched.
 * @param frame               frame of buffer indices to process.
 * @param head_of_feature_arc non-zero to run the checksum/source checks.
 * @return the number of buffers in the frame (frame->n_vectors).
 */
1731 static inline uword
1732 ip4_local_inline (vlib_main_t * vm,
1733                   vlib_node_runtime_t * node,
1734                   vlib_frame_t * frame, int head_of_feature_arc)
1735 {
1736   u32 *from, n_left_from;
       /* The runtime of ip4-input is what gets handed to
        * ip4_local_set_next_and_error() as the error node. */
1737   vlib_node_runtime_t *error_node =
1738     vlib_node_get_runtime (vm, ip4_input_node.index);
1739   u16 nexts[VLIB_FRAME_SIZE], *next;
1740   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1741   ip4_header_t *ip[2];
1742   u8 error[2], pt[2];
1743
       /* State shared by all source checks of this frame; passed by pointer
        * to ip4_local_check_src() / ip4_local_check_src_x2(). */
1744   ip4_local_last_check_t last_check = {
1745     /*
1746      * 0.0.0.0 can appear as the source address of an IP packet,
1747      * as can any other address, hence the need to use the 'first'
1748      * member to make sure the .lbi is initialised for the first
1749      * packet.
1750      */
1751     .src = {.as_u32 = 0},
1752     .lbi = ~0,
1753     .error = IP4_ERROR_UNKNOWN_PROTOCOL,
1754     .first = 1,
1755   };
1756
1757   from = vlib_frame_vector_args (frame);
1758   n_left_from = frame->n_vectors;
1759
1760   if (node->flags & VLIB_NODE_FLAG_TRACE)
1761     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1762
       /* Translate all buffer indices to pointers up front; b and next walk
        * the two arrays in lock-step. */
1763   vlib_get_buffers (vm, from, bufs, n_left_from);
1764   b = bufs;
1765   next = nexts;
1766
       /* Dual loop: handles b[0] and b[1] per iteration.  At least 6 buffers
        * must remain because b[4] and b[5] are prefetched. */
1767   while (n_left_from >= 6)
1768     {
1769       u8 not_batch = 0;
1770
1771       /* Prefetch next iteration. */
1772       {
1773         vlib_prefetch_buffer_header (b[4], LOAD);
1774         vlib_prefetch_buffer_header (b[5], LOAD);
1775
1776         CLIB_PREFETCH (b[4]->data, CLIB_CACHE_LINE_BYTES, LOAD);
1777         CLIB_PREFETCH (b[5]->data, CLIB_CACHE_LINE_BYTES, LOAD);
1778       }
1779
1780       error[0] = error[1] = IP4_ERROR_UNKNOWN_PROTOCOL;
1781
1782       ip[0] = vlib_buffer_get_current (b[0]);
1783       ip[1] = vlib_buffer_get_current (b[1]);
1784
           /* Record where the IPv4 header starts in each buffer */
1785       vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
1786       vnet_buffer (b[1])->l3_hdr_offset = b[1]->current_data;
1787
1788       pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
1789       pt[1] = ip4_local_classify (b[1], ip[1], &next[1]);
1790
           /* Non-zero when the two packets have different packet types */
1791       not_batch = pt[0] ^ pt[1];
1792
           /* No checks when not at the head of the arc, or when both packets
            * share the same non-L4 type. */
1793       if (head_of_feature_arc == 0 || (pt[0] && not_batch == 0))
1794         goto skip_checks;
1795
1796       if (PREDICT_TRUE (not_batch == 0))
1797         {
               /* Same type and pt[0] == 0 here, so both packets are plain L4 */
1798           ip4_local_check_l4_csum_x2 (vm, b, ip, error);
1799           ip4_local_check_src_x2 (b, ip, &last_check, error);
1800         }
1801       else
1802         {
               /* Mixed pair: check individually whichever packet is plain L4 */
1803           if (!pt[0])
1804             {
1805               ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
1806               ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
1807             }
1808           if (!pt[1])
1809             {
1810               ip4_local_check_l4_csum (vm, b[1], ip[1], &error[1]);
1811               ip4_local_check_src (b[1], ip[1], &last_check, &error[1]);
1812             }
1813         }
1814
1815     skip_checks:
1816
1817       ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
1818                                     head_of_feature_arc);
1819       ip4_local_set_next_and_error (error_node, b[1], &next[1], error[1],
1820                                     head_of_feature_arc);
1821
1822       b += 2;
1823       next += 2;
1824       n_left_from -= 2;
1825     }
1826
       /* Single loop: same steps as above for the remaining buffers, one at
        * a time and without prefetching. */
1827   while (n_left_from > 0)
1828     {
1829       error[0] = IP4_ERROR_UNKNOWN_PROTOCOL;
1830
1831       ip[0] = vlib_buffer_get_current (b[0]);
1832       vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
1833       pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
1834
1835       if (head_of_feature_arc == 0 || pt[0])
1836         goto skip_check;
1837
1838       ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
1839       ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
1840
1841     skip_check:
1842
1843       ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
1844                                     head_of_feature_arc);
1845
1846       b += 1;
1847       next += 1;
1848       n_left_from -= 1;
1849     }
1850
       /* Hand every buffer to the next node recorded for it in nexts[] */
1851   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
1852   return frame->n_vectors;
1853 }
1854
1855 VLIB_NODE_FN (ip4_local_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
1856                                vlib_frame_t * frame)
1857 {
1858   return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ );
1859 }
1860
1861 /* *INDENT-OFF* */
1862 VLIB_REGISTER_NODE (ip4_local_node) =
1863 {
1864   .name = "ip4-local",
1865   .vector_size = sizeof (u32),
1866   .format_trace = format_ip4_forward_next_trace,
1867   .n_next_nodes = IP_LOCAL_N_NEXT,
1868   .next_nodes =
1869   {
1870     [IP_LOCAL_NEXT_DROP] = "ip4-drop",
1871     [IP_LOCAL_NEXT_PUNT] = "ip4-punt",
1872     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
1873     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
1874     [IP_LOCAL_NEXT_REASSEMBLY] = "ip4-reassembly",
1875   },
1876 };
1877 /* *INDENT-ON* */
1878
1879
1880 VLIB_NODE_FN (ip4_local_end_of_arc_node) (vlib_main_t * vm,
1881                                           vlib_node_runtime_t * node,
1882                                           vlib_frame_t * frame)
1883 {
1884   return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ );
1885 }
1886
1887 /* *INDENT-OFF* */
1888 VLIB_REGISTER_NODE (ip4_local_end_of_arc_node) = {
1889   .name = "ip4-local-end-of-arc",
1890   .vector_size = sizeof (u32),
1891
1892   .format_trace = format_ip4_forward_next_trace,
1893   .sibling_of = "ip4-local",
1894 };
1895
1896 VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = {
1897   .arc_name = "ip4-local",
1898   .node_name = "ip4-local-end-of-arc",
1899   .runs_before = 0, /* not before any other features */
1900 };
1901 /* *INDENT-ON* */
1902
1903 #ifndef CLIB_MARCH_VARIANT
1904 void
1905 ip4_register_protocol (u32 protocol, u32 node_index)
1906 {
1907   vlib_main_t *vm = vlib_get_main ();
1908   ip4_main_t *im = &ip4_main;
1909   ip_lookup_main_t *lm = &im->lookup_main;
1910
1911   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1912   lm->local_next_by_ip_protocol[protocol] =
1913     vlib_node_add_next (vm, ip4_local_node.index, node_index);
1914 }
1915
1916 void
1917 ip4_unregister_protocol (u32 protocol)
1918 {
1919   ip4_main_t *im = &ip4_main;
1920   ip_lookup_main_t *lm = &im->lookup_main;
1921
1922   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1923   lm->local_next_by_ip_protocol[protocol] = IP_LOCAL_NEXT_PUNT;
1924 }
1925 #endif
1926
1927 static clib_error_t *
1928 show_ip_local_command_fn (vlib_main_t * vm,
1929                           unformat_input_t * input, vlib_cli_command_t * cmd)
1930 {
1931   ip4_main_t *im = &ip4_main;
1932   ip_lookup_main_t *lm = &im->lookup_main;
1933   int i;
1934
1935   vlib_cli_output (vm, "Protocols handled by ip4_local");
1936   for (i = 0; i < ARRAY_LEN (lm->local_next_by_ip_protocol); i++)
1937     {
1938       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
1939         {
1940           u32 node_index = vlib_get_node (vm,
1941                                           ip4_local_node.index)->
1942             next_nodes[lm->local_next_by_ip_protocol[i]];
1943           vlib_cli_output (vm, "%U: %U", format_ip_protocol, i,
1944                            format_vlib_node_name, vm, node_index);
1945         }
1946     }
1947   return 0;
1948 }
1949
1950
1951
1952 /*?
1953  * Display the set of protocols handled by the local IPv4 stack.
1954  *
1955  * @cliexpar
1956  * Example of how to display local protocol table:
1957  * @cliexstart{show ip local}
1958  * Protocols handled by ip4_local
1959  * 1
1960  * 17
1961  * 47
1962  * @cliexend
1963 ?*/
1964 /* *INDENT-OFF* */
1965 VLIB_CLI_COMMAND (show_ip_local, static) =
1966 {
1967   .path = "show ip local",
1968   .function = show_ip_local_command_fn,
1969   .short_help = "show ip local",
1970 };
1971 /* *INDENT-ON* */
1972
/**
 * Shared worker for the "ip4-arp" and "ip4-glean" node functions.
 *
 * Every incoming buffer is queued to the IP4_ARP_NEXT_DROP next, with
 * p0->error recording what happened to it.  Unless the request is
 * throttled, the adjacency is not in the expected state, no buffer can be
 * obtained or no source address is available, a new buffer is taken from
 * the ARP request packet template, filled in and sent to the next node
 * named by the adjacency's rewrite header.
 *
 * @param vm        vlib main.
 * @param node      runtime of the node being dispatched.
 * @param frame     frame of buffer indices to process.
 * @param is_glean  non-zero: resolve the packet's destination address and
 *                  use the glean adjacency's receive address as the ARP
 *                  sender address; zero: resolve the adjacency's next-hop
 *                  and obtain the sender address from
 *                  ip4_src_address_for_packet().
 * @return the number of buffers in the frame (frame->n_vectors).
 */
1973 always_inline uword
1974 ip4_arp_inline (vlib_main_t * vm,
1975                 vlib_node_runtime_t * node,
1976                 vlib_frame_t * frame, int is_glean)
1977 {
1978   vnet_main_t *vnm = vnet_get_main ();
1979   ip4_main_t *im = &ip4_main;
1980   ip_lookup_main_t *lm = &im->lookup_main;
1981   u32 *from, *to_next_drop;
1982   uword n_left_from, n_left_to_next_drop, next_index;
1983   u32 thread_index = vm->thread_index;
1984   u64 seed;
1985
1986   if (node->flags & VLIB_NODE_FLAG_TRACE)
1987     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1988
       /* Seed used by every throttle_check() call made for this frame */
1989   seed = throttle_seed (&im->arp_throttle, thread_index, vlib_time_now (vm));
1990
1991   from = vlib_frame_vector_args (frame);
1992   n_left_from = frame->n_vectors;
       /* NOTE(review): next_index is assigned here but not read again in
        * this function - confirm whether it can be removed. */
1993   next_index = node->cached_next_index;
1994   if (next_index == IP4_ARP_NEXT_DROP)
1995     next_index = IP4_ARP_N_NEXT;        /* point to first interface */
1996
1997   while (n_left_from > 0)
1998     {
1999       vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
2000                            to_next_drop, n_left_to_next_drop);
2001
2002       while (n_left_from > 0 && n_left_to_next_drop > 0)
2003         {
2004           u32 pi0, bi0, adj_index0, sw_if_index0;
2005           ip_adjacency_t *adj0;
2006           vlib_buffer_t *p0, *b0;
2007           ip4_address_t resolve0;
2008           ethernet_arp_header_t *h0;
2009           vnet_hw_interface_t *hw_if0;
2010           u64 r0;
2011
2012           pi0 = from[0];
2013           p0 = vlib_get_buffer (vm, pi0);
2014
               /* The original packet is always queued to the drop next; the
                * 'continue' statements below only skip sending a request. */
2015           from += 1;
2016           n_left_from -= 1;
2017           to_next_drop[0] = pi0;
2018           to_next_drop += 1;
2019           n_left_to_next_drop -= 1;
2020
2021           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
2022           adj0 = adj_get (adj_index0);
2023
2024           if (is_glean)
2025             {
2026               /* resolve the packet's destination */
2027               ip4_header_t *ip0 = vlib_buffer_get_current (p0);
2028               resolve0 = ip0->dst_address;
2029             }
2030           else
2031             {
2032               /* resolve the incomplete adj */
2033               resolve0 = adj0->sub_type.nbr.next_hop.ip4;
2034             }
2035
2036           /* combine the address and interface for the hash key */
2037           sw_if_index0 = adj0->rewrite_header.sw_if_index;
2038           r0 = (u64) resolve0.data_u32 << 32;
2039           r0 |= sw_if_index0;
2040
2041           if (throttle_check (&im->arp_throttle, thread_index, r0, seed))
2042             {
2043               p0->error = node->errors[IP4_ARP_ERROR_THROTTLED];
2044               continue;
2045             }
2046
2047           /*
2048            * the adj has been updated to a rewrite but the node the DPO that got
2049            * us here hasn't - yet. no big deal. we'll drop while we wait.
2050            */
2051           if (IP_LOOKUP_NEXT_REWRITE == adj0->lookup_next_index)
2052             {
2053               p0->error = node->errors[IP4_ARP_ERROR_RESOLVED];
2054               continue;
2055             }
2056
2057           /*
2058            * Can happen if the control-plane is programming tables
2059            * with traffic flowing; at least that's today's lame excuse.
2060            */
2061           if ((is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN)
2062               || (!is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP))
2063             {
2064               p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
2065               continue;
2066             }
2067           /* Send ARP request. */
2068           h0 =
2069             vlib_packet_template_get_packet (vm,
2070                                              &im->ip4_arp_request_packet_template,
2071                                              &bi0);
2072           /* Seems we're out of buffers */
2073           if (PREDICT_FALSE (!h0))
2074             {
2075               p0->error = node->errors[IP4_ARP_ERROR_NO_BUFFERS];
2076               continue;
2077             }
2078
2079           b0 = vlib_get_buffer (vm, bi0);
2080
2081           /* copy the persistent fields from the original */
2082           clib_memcpy_fast (b0->opaque2, p0->opaque2, sizeof (p0->opaque2));
2083
2084           /* Add rewrite/encap string for ARP packet. */
2085           vnet_rewrite_one_header (adj0[0], h0, sizeof (ethernet_header_t));
2086
2087           hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
2088
2089           /* Src ethernet address in ARP header. */
2090           mac_address_from_bytes (&h0->ip4_over_ethernet[0].mac,
2091                                   hw_if0->hw_address);
2092           if (is_glean)
2093             {
2094               /* The interface's source address is stashed in the Glean Adj */
2095               h0->ip4_over_ethernet[0].ip4 =
2096                 adj0->sub_type.glean.receive_addr.ip4;
2097             }
2098           else
2099             {
2100               /* Src IP address in ARP header. */
2101               if (ip4_src_address_for_packet (lm, sw_if_index0,
2102                                               &h0->ip4_over_ethernet[0].ip4))
2103                 {
2104                   /* No source address available */
2105                   p0->error = node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
                       /* Give back the request buffer allocated above */
2106                   vlib_buffer_free (vm, &bi0, 1);
2107                   continue;
2108                 }
2109             }
               /* Address being resolved goes in the second ARP address slot */
2110           h0->ip4_over_ethernet[1].ip4 = resolve0;
2111
2112           p0->error = node->errors[IP4_ARP_ERROR_REQUEST_SENT];
2113
2114           vlib_buffer_copy_trace_flag (vm, p0, bi0);
2115           VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
2116           vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
2117
               /* Move the buffer start back over the rewrite bytes */
2118           vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
2119
               /* The request goes to the adjacency's rewrite next node */
2120           vlib_set_next_frame_buffer (vm, node,
2121                                       adj0->rewrite_header.next_index, bi0);
2122         }
2123
2124       vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
2125     }
2126
2127   return frame->n_vectors;
2128 }
2129
2130 VLIB_NODE_FN (ip4_arp_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2131                              vlib_frame_t * frame)
2132 {
2133   return (ip4_arp_inline (vm, node, frame, 0));
2134 }
2135
2136 VLIB_NODE_FN (ip4_glean_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2137                                vlib_frame_t * frame)
2138 {
2139   return (ip4_arp_inline (vm, node, frame, 1));
2140 }
2141
2142 static char *ip4_arp_error_strings[] = {
2143   [IP4_ARP_ERROR_THROTTLED] = "ARP requests throttled",
2144   [IP4_ARP_ERROR_RESOLVED] = "ARP requests resolved",
2145   [IP4_ARP_ERROR_NO_BUFFERS] = "ARP requests out of buffer",
2146   [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
2147   [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
2148   [IP4_ARP_ERROR_NO_SOURCE_ADDRESS] = "no source address for ARP request",
2149 };
2150
2151 /* *INDENT-OFF* */
2152 VLIB_REGISTER_NODE (ip4_arp_node) =
2153 {
2154   .name = "ip4-arp",
2155   .vector_size = sizeof (u32),
2156   .format_trace = format_ip4_forward_next_trace,
2157   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
2158   .error_strings = ip4_arp_error_strings,
2159   .n_next_nodes = IP4_ARP_N_NEXT,
2160   .next_nodes =
2161   {
2162     [IP4_ARP_NEXT_DROP] = "error-drop",
2163   },
2164 };
2165
2166 VLIB_REGISTER_NODE (ip4_glean_node) =
2167 {
2168   .name = "ip4-glean",
2169   .vector_size = sizeof (u32),
2170   .format_trace = format_ip4_forward_next_trace,
2171   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
2172   .error_strings = ip4_arp_error_strings,
2173   .n_next_nodes = IP4_ARP_N_NEXT,
2174   .next_nodes = {
2175   [IP4_ARP_NEXT_DROP] = "error-drop",
2176   },
2177 };
2178 /* *INDENT-ON* */
2179
2180 #define foreach_notrace_ip4_arp_error           \
2181 _(THROTTLED)                                    \
2182 _(RESOLVED)                                     \
2183 _(NO_BUFFERS)                                   \
2184 _(REQUEST_SENT)                                 \
2185 _(NON_ARP_ADJ)                                  \
2186 _(NO_SOURCE_ADDRESS)
2187
2188 static clib_error_t *
2189 arp_notrace_init (vlib_main_t * vm)
2190 {
2191   vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, ip4_arp_node.index);
2192
2193   /* don't trace ARP request packets */
2194 #define _(a)                                    \
2195     vnet_pcap_drop_trace_filter_add_del         \
2196         (rt->errors[IP4_ARP_ERROR_##a],         \
2197          1 /* is_add */);
2198   foreach_notrace_ip4_arp_error;
2199 #undef _
2200   return 0;
2201 }
2202
2203 VLIB_INIT_FUNCTION (arp_notrace_init);
2204
2205
2206 #ifndef CLIB_MARCH_VARIANT
2207 /* Send an ARP request to see if given destination is reachable on given interface. */
2208 clib_error_t *
2209 ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index,
2210                     u8 refresh)
2211 {
2212   vnet_main_t *vnm = vnet_get_main ();
2213   ip4_main_t *im = &ip4_main;
2214   ethernet_arp_header_t *h;
2215   ip4_address_t *src;
2216   ip_interface_address_t *ia;
2217   ip_adjacency_t *adj;
2218   vnet_hw_interface_t *hi;
2219   vnet_sw_interface_t *si;
2220   vlib_buffer_t *b;
2221   adj_index_t ai;
2222   u32 bi = 0;
2223   u8 unicast_rewrite = 0;
2224
2225   si = vnet_get_sw_interface (vnm, sw_if_index);
2226
2227   if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
2228     {
2229       return clib_error_return (0, "%U: interface %U down",
2230                                 format_ip4_address, dst,
2231                                 format_vnet_sw_if_index_name, vnm,
2232                                 sw_if_index);
2233     }
2234
2235   src =
2236     ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
2237   if (!src)
2238     {
2239       vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
2240       return clib_error_return
2241         (0,
2242          "no matching interface address for destination %U (interface %U)",
2243          format_ip4_address, dst, format_vnet_sw_if_index_name, vnm,
2244          sw_if_index);
2245     }
2246
2247   h = vlib_packet_template_get_packet (vm,
2248                                        &im->ip4_arp_request_packet_template,
2249                                        &bi);
2250
2251   if (!h)
2252     return clib_error_return (0, "ARP request packet allocation failed");
2253
2254   hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
2255   if (PREDICT_FALSE (!hi->hw_address))
2256     {
2257       return clib_error_return (0, "%U: interface %U do not support ip probe",
2258                                 format_ip4_address, dst,
2259                                 format_vnet_sw_if_index_name, vnm,
2260                                 sw_if_index);
2261     }
2262
2263   mac_address_from_bytes (&h->ip4_over_ethernet[0].mac, hi->hw_address);
2264
2265   h->ip4_over_ethernet[0].ip4 = src[0];
2266   h->ip4_over_ethernet[1].ip4 = dst[0];
2267
2268   b = vlib_get_buffer (vm, bi);
2269   vnet_buffer (b)->sw_if_index[VLIB_RX] =
2270     vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
2271
2272   ip46_address_t nh = {
2273     .ip4 = *dst,
2274   };
2275
2276   ai = adj_nbr_add_or_lock (FIB_PROTOCOL_IP4,
2277                             VNET_LINK_IP4, &nh, sw_if_index);
2278   adj = adj_get (ai);
2279
2280   /* Peer has been previously resolved, retrieve glean adj instead */
2281   if (adj->lookup_next_index == IP_LOOKUP_NEXT_REWRITE)
2282     {
2283       if (refresh)
2284         unicast_rewrite = 1;
2285       else
2286         {
2287           adj_unlock (ai);
2288           ai = adj_glean_add_or_lock (FIB_PROTOCOL_IP4,
2289                                       VNET_LINK_IP4, sw_if_index, &nh);
2290           adj = adj_get (ai);
2291         }
2292     }
2293
2294   /* Add encapsulation string for software interface (e.g. ethernet header). */
2295   vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
2296   if (unicast_rewrite)
2297     {
2298       u16 *etype = vlib_buffer_get_current (b) - 2;
2299       etype[0] = clib_host_to_net_u16 (ETHERNET_TYPE_ARP);
2300     }
2301   vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
2302
2303   {
2304     vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index);
2305     u32 *to_next = vlib_frame_vector_args (f);
2306     to_next[0] = bi;
2307     f->n_vectors = 1;
2308     vlib_put_frame_to_node (vm, hi->output_node_index, f);
2309   }
2310
2311   adj_unlock (ai);
2312   return /* no error */ 0;
2313 }
2314 #endif
2315
/** Statically known next nodes of the ip4 rewrite nodes. */
typedef enum
{
  /* Default disposition: every packet starts out as a drop */
  IP4_REWRITE_NEXT_DROP = 0,
  /* Chosen on TTL expiry, or on MTU exceeded with don't-fragment set */
  IP4_REWRITE_NEXT_ICMP_ERROR = 1,
  /* Chosen on MTU exceeded when fragmentation is allowed */
  IP4_REWRITE_NEXT_FRAGMENT = 2,
  /* Number of entries above; must stay last */
  IP4_REWRITE_N_NEXT = 3,
} ip4_rewrite_next_t;
2323
/**
 * The bits of an IPv4 address (held in network byte order in a u32) that
 * are kept when constructing a multicast MAC address: the low-order
 * 23 bits of the address.
 */
#if !CLIB_ARCH_IS_BIG_ENDIAN
#define IP4_MCAST_ADDR_MASK 0xffff7f00
#else
#define IP4_MCAST_ADDR_MASK 0x007fffff
#endif
2333
2334 always_inline void
2335 ip4_mtu_check (vlib_buffer_t * b, u16 packet_len,
2336                u16 adj_packet_bytes, bool df, u16 * next, u32 * error)
2337 {
2338   if (packet_len > adj_packet_bytes)
2339     {
2340       *error = IP4_ERROR_MTU_EXCEEDED;
2341       if (df)
2342         {
2343           icmp4_error_set_vnet_buffer
2344             (b, ICMP4_destination_unreachable,
2345              ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
2346              adj_packet_bytes);
2347           *next = IP4_REWRITE_NEXT_ICMP_ERROR;
2348         }
2349       else
2350         {
2351           /* IP fragmentation */
2352           ip_frag_set_vnet_buffer (b, adj_packet_bytes,
2353                                    IP4_FRAG_NEXT_IP4_REWRITE, 0);
2354           *next = IP4_REWRITE_NEXT_FRAGMENT;
2355         }
2356     }
2357 }
2358
2359 /* Decrement TTL & update checksum.
2360    Works either endian, so no need for byte swap. */
2361 static_always_inline void
2362 ip4_ttl_and_checksum_check (vlib_buffer_t * b, ip4_header_t * ip, u16 * next,
2363                             u32 * error)
2364 {
2365   i32 ttl;
2366   u32 checksum;
2367   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
2368     {
2369       b->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED;
2370       return;
2371     }
2372
2373   ttl = ip->ttl;
2374
2375   /* Input node should have reject packets with ttl 0. */
2376   ASSERT (ip->ttl > 0);
2377
2378   checksum = ip->checksum + clib_host_to_net_u16 (0x0100);
2379   checksum += checksum >= 0xffff;
2380
2381   ip->checksum = checksum;
2382   ttl -= 1;
2383   ip->ttl = ttl;
2384
2385   /*
2386    * If the ttl drops below 1 when forwarding, generate
2387    * an ICMP response.
2388    */
2389   if (PREDICT_FALSE (ttl <= 0))
2390     {
2391       *error = IP4_ERROR_TIME_EXPIRED;
2392       vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~ 0;
2393       icmp4_error_set_vnet_buffer (b, ICMP4_time_exceeded,
2394                                    ICMP4_time_exceeded_ttl_exceeded_in_transit,
2395                                    0);
2396       *next = IP4_REWRITE_NEXT_ICMP_ERROR;
2397     }
2398
2399   /* Verify checksum. */
2400   ASSERT ((ip->checksum == ip4_header_checksum (ip)) ||
2401           (b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM));
2402 }
2403
2404
2405 always_inline uword
2406 ip4_rewrite_inline_with_gso (vlib_main_t * vm,
2407                              vlib_node_runtime_t * node,
2408                              vlib_frame_t * frame,
2409                              int do_counters, int is_midchain, int is_mcast,
2410                              int do_gso)
2411 {
2412   ip_lookup_main_t *lm = &ip4_main.lookup_main;
2413   u32 *from = vlib_frame_vector_args (frame);
2414   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
2415   u16 nexts[VLIB_FRAME_SIZE], *next;
2416   u32 n_left_from;
2417   vlib_node_runtime_t *error_node =
2418     vlib_node_get_runtime (vm, ip4_input_node.index);
2419
2420   n_left_from = frame->n_vectors;
2421   u32 thread_index = vm->thread_index;
2422
2423   vlib_get_buffers (vm, from, bufs, n_left_from);
2424   clib_memset_u16 (nexts, IP4_REWRITE_NEXT_DROP, n_left_from);
2425
2426 #if (CLIB_N_PREFETCHES >= 8)
2427   if (n_left_from >= 6)
2428     {
2429       int i;
2430       for (i = 2; i < 6; i++)
2431         vlib_prefetch_buffer_header (bufs[i], LOAD);
2432     }
2433
2434   next = nexts;
2435   b = bufs;
2436   while (n_left_from >= 8)
2437     {
2438       ip_adjacency_t *adj0, *adj1;
2439       ip4_header_t *ip0, *ip1;
2440       u32 rw_len0, error0, adj_index0;
2441       u32 rw_len1, error1, adj_index1;
2442       u32 tx_sw_if_index0, tx_sw_if_index1;
2443       u8 *p;
2444
2445       vlib_prefetch_buffer_header (b[6], LOAD);
2446       vlib_prefetch_buffer_header (b[7], LOAD);
2447
2448       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2449       adj_index1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
2450
2451       /*
2452        * pre-fetch the per-adjacency counters
2453        */
2454       if (do_counters)
2455         {
2456           vlib_prefetch_combined_counter (&adjacency_counters,
2457                                           thread_index, adj_index0);
2458           vlib_prefetch_combined_counter (&adjacency_counters,
2459                                           thread_index, adj_index1);
2460         }
2461
2462       ip0 = vlib_buffer_get_current (b[0]);
2463       ip1 = vlib_buffer_get_current (b[1]);
2464
2465       error0 = error1 = IP4_ERROR_NONE;
2466
2467       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2468       ip4_ttl_and_checksum_check (b[1], ip1, next + 1, &error1);
2469
2470       /* Rewrite packet header and updates lengths. */
2471       adj0 = adj_get (adj_index0);
2472       adj1 = adj_get (adj_index1);
2473
2474       /* Worth pipelining. No guarantee that adj0,1 are hot... */
2475       rw_len0 = adj0[0].rewrite_header.data_bytes;
2476       rw_len1 = adj1[0].rewrite_header.data_bytes;
2477       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2478       vnet_buffer (b[1])->ip.save_rewrite_length = rw_len1;
2479
2480       p = vlib_buffer_get_current (b[2]);
2481       CLIB_PREFETCH (p - CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES, STORE);
2482       CLIB_PREFETCH (p, CLIB_CACHE_LINE_BYTES, LOAD);
2483
2484       p = vlib_buffer_get_current (b[3]);
2485       CLIB_PREFETCH (p - CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES, STORE);
2486       CLIB_PREFETCH (p, CLIB_CACHE_LINE_BYTES, LOAD);
2487
2488       /* Check MTU of outgoing interface. */
2489       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2490       u16 ip1_len = clib_net_to_host_u16 (ip1->length);
2491
2492       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2493         ip0_len = gso_mtu_sz (b[0]);
2494       if (do_gso && (b[1]->flags & VNET_BUFFER_F_GSO))
2495         ip1_len = gso_mtu_sz (b[1]);
2496
2497       ip4_mtu_check (b[0], ip0_len,
2498                      adj0[0].rewrite_header.max_l3_packet_bytes,
2499                      ip0->flags_and_fragment_offset &
2500                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2501                      next + 0, &error0);
2502       ip4_mtu_check (b[1], ip1_len,
2503                      adj1[0].rewrite_header.max_l3_packet_bytes,
2504                      ip1->flags_and_fragment_offset &
2505                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2506                      next + 1, &error1);
2507
2508       if (is_mcast)
2509         {
2510           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2511                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2512                     IP4_ERROR_SAME_INTERFACE : error0);
2513           error1 = ((adj1[0].rewrite_header.sw_if_index ==
2514                      vnet_buffer (b[1])->sw_if_index[VLIB_RX]) ?
2515                     IP4_ERROR_SAME_INTERFACE : error1);
2516         }
2517
2518       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2519        * to see the IP header */
2520       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2521         {
2522           u32 next_index = adj0[0].rewrite_header.next_index;
2523           vlib_buffer_advance (b[0], -(word) rw_len0);
2524           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2525           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2526
2527           if (PREDICT_FALSE
2528               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2529             vnet_feature_arc_start (lm->output_feature_arc_index,
2530                                     tx_sw_if_index0, &next_index, b[0]);
2531           next[0] = next_index;
2532         }
2533       else
2534         {
2535           b[0]->error = error_node->errors[error0];
2536         }
2537       if (PREDICT_TRUE (error1 == IP4_ERROR_NONE))
2538         {
2539           u32 next_index = adj1[0].rewrite_header.next_index;
2540           vlib_buffer_advance (b[1], -(word) rw_len1);
2541
2542           tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
2543           vnet_buffer (b[1])->sw_if_index[VLIB_TX] = tx_sw_if_index1;
2544
2545           if (PREDICT_FALSE
2546               (adj1[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2547             vnet_feature_arc_start (lm->output_feature_arc_index,
2548                                     tx_sw_if_index1, &next_index, b[1]);
2549           next[1] = next_index;
2550         }
2551       else
2552         {
2553           b[1]->error = error_node->errors[error1];
2554         }
2555       if (is_midchain)
2556         {
2557           calc_checksums (vm, b[0]);
2558           calc_checksums (vm, b[1]);
2559         }
2560       /* Guess we are only writing on simple Ethernet header. */
2561       vnet_rewrite_two_headers (adj0[0], adj1[0],
2562                                 ip0, ip1, sizeof (ethernet_header_t));
2563
2564       /*
2565        * Bump the per-adjacency counters
2566        */
2567       if (do_counters)
2568         {
2569           vlib_increment_combined_counter
2570             (&adjacency_counters,
2571              thread_index,
2572              adj_index0, 1, vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2573
2574           vlib_increment_combined_counter
2575             (&adjacency_counters,
2576              thread_index,
2577              adj_index1, 1, vlib_buffer_length_in_chain (vm, b[1]) + rw_len1);
2578         }
2579
2580       if (is_midchain)
2581         {
2582           if (adj0->sub_type.midchain.fixup_func)
2583             adj0->sub_type.midchain.fixup_func
2584               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2585           if (adj1->sub_type.midchain.fixup_func)
2586             adj1->sub_type.midchain.fixup_func
2587               (vm, adj1, b[1], adj1->sub_type.midchain.fixup_data);
2588         }
2589
2590       if (is_mcast)
2591         {
2592           /*
2593            * copy bytes from the IP address into the MAC rewrite
2594            */
2595           vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2596                                       adj0->rewrite_header.dst_mcast_offset,
2597                                       &ip0->dst_address.as_u32, (u8 *) ip0);
2598           vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2599                                       adj1->rewrite_header.dst_mcast_offset,
2600                                       &ip1->dst_address.as_u32, (u8 *) ip1);
2601         }
2602
2603       next += 2;
2604       b += 2;
2605       n_left_from -= 2;
2606     }
2607 #elif (CLIB_N_PREFETCHES >= 4)
2608   next = nexts;
2609   b = bufs;
2610   while (n_left_from >= 1)
2611     {
2612       ip_adjacency_t *adj0;
2613       ip4_header_t *ip0;
2614       u32 rw_len0, error0, adj_index0;
2615       u32 tx_sw_if_index0;
2616       u8 *p;
2617
2618       /* Prefetch next iteration */
2619       if (PREDICT_TRUE (n_left_from >= 4))
2620         {
2621           ip_adjacency_t *adj2;
2622           u32 adj_index2;
2623
2624           vlib_prefetch_buffer_header (b[3], LOAD);
2625           vlib_prefetch_buffer_data (b[2], LOAD);
2626
2627           /* Prefetch adj->rewrite_header */
2628           adj_index2 = vnet_buffer (b[2])->ip.adj_index[VLIB_TX];
2629           adj2 = adj_get (adj_index2);
2630           p = (u8 *) adj2;
2631           CLIB_PREFETCH (p + CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES,
2632                          LOAD);
2633         }
2634
2635       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2636
2637       /*
2638        * Prefetch the per-adjacency counters
2639        */
2640       if (do_counters)
2641         {
2642           vlib_prefetch_combined_counter (&adjacency_counters,
2643                                           thread_index, adj_index0);
2644         }
2645
2646       ip0 = vlib_buffer_get_current (b[0]);
2647
2648       error0 = IP4_ERROR_NONE;
2649
2650       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2651
2652       /* Rewrite packet header and updates lengths. */
2653       adj0 = adj_get (adj_index0);
2654
2655       /* Rewrite header was prefetched. */
2656       rw_len0 = adj0[0].rewrite_header.data_bytes;
2657       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2658
2659       /* Check MTU of outgoing interface. */
2660       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2661
2662       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2663         ip0_len = gso_mtu_sz (b[0]);
2664
2665       ip4_mtu_check (b[0], ip0_len,
2666                      adj0[0].rewrite_header.max_l3_packet_bytes,
2667                      ip0->flags_and_fragment_offset &
2668                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2669                      next + 0, &error0);
2670
2671       if (is_mcast)
2672         {
2673           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2674                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2675                     IP4_ERROR_SAME_INTERFACE : error0);
2676         }
2677
2678       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2679        * to see the IP header */
2680       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2681         {
2682           u32 next_index = adj0[0].rewrite_header.next_index;
2683           vlib_buffer_advance (b[0], -(word) rw_len0);
2684           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2685           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2686
2687           if (PREDICT_FALSE
2688               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2689             vnet_feature_arc_start (lm->output_feature_arc_index,
2690                                     tx_sw_if_index0, &next_index, b[0]);
2691           next[0] = next_index;
2692         }
2693       else
2694         {
2695           b[0]->error = error_node->errors[error0];
2696         }
2697       if (is_midchain)
2698         {
2699           calc_checksums (vm, b[0]);
2700         }
2701       /* Guess we are only writing on simple Ethernet header. */
2702       vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
2703
2704       /*
2705        * Bump the per-adjacency counters
2706        */
2707       if (do_counters)
2708         {
2709           vlib_increment_combined_counter
2710             (&adjacency_counters,
2711              thread_index,
2712              adj_index0, 1, vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2713         }
2714
2715       if (is_midchain)
2716         {
2717           if (adj0->sub_type.midchain.fixup_func)
2718             adj0->sub_type.midchain.fixup_func
2719               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2720         }
2721
2722       if (is_mcast)
2723         {
2724           /*
2725            * copy bytes from the IP address into the MAC rewrite
2726            */
2727           vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2728                                       adj0->rewrite_header.dst_mcast_offset,
2729                                       &ip0->dst_address.as_u32, (u8 *) ip0);
2730         }
2731
2732       next += 1;
2733       b += 1;
2734       n_left_from -= 1;
2735     }
2736 #endif
2737
2738   while (n_left_from > 0)
2739     {
2740       ip_adjacency_t *adj0;
2741       ip4_header_t *ip0;
2742       u32 rw_len0, adj_index0, error0;
2743       u32 tx_sw_if_index0;
2744
2745       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2746
2747       adj0 = adj_get (adj_index0);
2748
2749       if (do_counters)
2750         vlib_prefetch_combined_counter (&adjacency_counters,
2751                                         thread_index, adj_index0);
2752
2753       ip0 = vlib_buffer_get_current (b[0]);
2754
2755       error0 = IP4_ERROR_NONE;
2756
2757       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2758
2759
2760       /* Update packet buffer attributes/set output interface. */
2761       rw_len0 = adj0[0].rewrite_header.data_bytes;
2762       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2763
2764       /* Check MTU of outgoing interface. */
2765       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2766       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2767         ip0_len = gso_mtu_sz (b[0]);
2768
2769       ip4_mtu_check (b[0], ip0_len,
2770                      adj0[0].rewrite_header.max_l3_packet_bytes,
2771                      ip0->flags_and_fragment_offset &
2772                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2773                      next + 0, &error0);
2774
2775       if (is_mcast)
2776         {
2777           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2778                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2779                     IP4_ERROR_SAME_INTERFACE : error0);
2780         }
2781
2782       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2783        * to see the IP header */
2784       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2785         {
2786           u32 next_index = adj0[0].rewrite_header.next_index;
2787           vlib_buffer_advance (b[0], -(word) rw_len0);
2788           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2789           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2790
2791           if (PREDICT_FALSE
2792               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2793             vnet_feature_arc_start (lm->output_feature_arc_index,
2794                                     tx_sw_if_index0, &next_index, b[0]);
2795           next[0] = next_index;
2796         }
2797       else
2798         {
2799           b[0]->error = error_node->errors[error0];
2800         }
2801       if (is_midchain)
2802         {
2803           calc_checksums (vm, b[0]);
2804         }
2805       /* Guess we are only writing on simple Ethernet header. */
2806       vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
2807
2808       if (do_counters)
2809         vlib_increment_combined_counter
2810           (&adjacency_counters,
2811            thread_index, adj_index0, 1,
2812            vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2813
2814       if (is_midchain)
2815         {
2816           if (adj0->sub_type.midchain.fixup_func)
2817             adj0->sub_type.midchain.fixup_func
2818               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2819         }
2820
2821       if (is_mcast)
2822         {
2823           /*
2824            * copy bytes from the IP address into the MAC rewrite
2825            */
2826           vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2827                                       adj0->rewrite_header.dst_mcast_offset,
2828                                       &ip0->dst_address.as_u32, (u8 *) ip0);
2829         }
2830
2831       next += 1;
2832       b += 1;
2833       n_left_from -= 1;
2834     }
2835
2836
2837   /* Need to do trace after rewrites to pick up new packet data. */
2838   if (node->flags & VLIB_NODE_FLAG_TRACE)
2839     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2840
2841   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
2842   return frame->n_vectors;
2843 }
2844
2845 always_inline uword
2846 ip4_rewrite_inline (vlib_main_t * vm,
2847                     vlib_node_runtime_t * node,
2848                     vlib_frame_t * frame,
2849                     int do_counters, int is_midchain, int is_mcast)
2850 {
2851   vnet_main_t *vnm = vnet_get_main ();
2852   if (PREDICT_FALSE (vnm->interface_main.gso_interface_count > 0))
2853     return ip4_rewrite_inline_with_gso (vm, node, frame, do_counters,
2854                                         is_midchain, is_mcast,
2855                                         1 /* do_gso */ );
2856   else
2857     return ip4_rewrite_inline_with_gso (vm, node, frame, do_counters,
2858                                         is_midchain, is_mcast,
2859                                         0 /* no do_gso */ );
2860 }
2861
2862
2863 /** @brief IPv4 rewrite node.
2864     @node ip4-rewrite
2865
2866     This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
2867     header checksum, fetch the ip adjacency, check the outbound mtu,
2868     apply the adjacency rewrite, and send pkts to the adjacency
2869     rewrite header's rewrite_next_index.
2870
2871     @param vm vlib_main_t corresponding to the current thread
2872     @param node vlib_node_runtime_t
2873     @param frame vlib_frame_t whose contents should be dispatched
2874
2875     @par Graph mechanics: buffer metadata, next index usage
2876
2877     @em Uses:
2878     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
2879         - the rewrite adjacency index
2880     - <code>adj->lookup_next_index</code>
2881         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2882           the packet will be dropped.
2883     - <code>adj->rewrite_header</code>
2884         - Rewrite string length, rewrite string, next_index
2885
2886     @em Sets:
2887     - <code>b->current_data, b->current_length</code>
2888         - Updated net of applying the rewrite string
2889
2890     <em>Next Indices:</em>
2891     - <code> adj->rewrite_header.next_index </code>
2892       or @c ip4-drop
2893 */
2894
2895 VLIB_NODE_FN (ip4_rewrite_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2896                                  vlib_frame_t * frame)
2897 {
2898   if (adj_are_counters_enabled ())
2899     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2900   else
2901     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2902 }
2903
2904 VLIB_NODE_FN (ip4_rewrite_bcast_node) (vlib_main_t * vm,
2905                                        vlib_node_runtime_t * node,
2906                                        vlib_frame_t * frame)
2907 {
2908   if (adj_are_counters_enabled ())
2909     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2910   else
2911     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2912 }
2913
2914 VLIB_NODE_FN (ip4_midchain_node) (vlib_main_t * vm,
2915                                   vlib_node_runtime_t * node,
2916                                   vlib_frame_t * frame)
2917 {
2918   if (adj_are_counters_enabled ())
2919     return ip4_rewrite_inline (vm, node, frame, 1, 1, 0);
2920   else
2921     return ip4_rewrite_inline (vm, node, frame, 0, 1, 0);
2922 }
2923
2924 VLIB_NODE_FN (ip4_rewrite_mcast_node) (vlib_main_t * vm,
2925                                        vlib_node_runtime_t * node,
2926                                        vlib_frame_t * frame)
2927 {
2928   if (adj_are_counters_enabled ())
2929     return ip4_rewrite_inline (vm, node, frame, 1, 0, 1);
2930   else
2931     return ip4_rewrite_inline (vm, node, frame, 0, 0, 1);
2932 }
2933
2934 VLIB_NODE_FN (ip4_mcast_midchain_node) (vlib_main_t * vm,
2935                                         vlib_node_runtime_t * node,
2936                                         vlib_frame_t * frame)
2937 {
2938   if (adj_are_counters_enabled ())
2939     return ip4_rewrite_inline (vm, node, frame, 1, 1, 1);
2940   else
2941     return ip4_rewrite_inline (vm, node, frame, 0, 1, 1);
2942 }
2943
2944 /* *INDENT-OFF* */
2945 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2946   .name = "ip4-rewrite",
2947   .vector_size = sizeof (u32),
2948
2949   .format_trace = format_ip4_rewrite_trace,
2950
2951   .n_next_nodes = IP4_REWRITE_N_NEXT,
2952   .next_nodes = {
2953     [IP4_REWRITE_NEXT_DROP] = "ip4-drop",
2954     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2955     [IP4_REWRITE_NEXT_FRAGMENT] = "ip4-frag",
2956   },
2957 };
2958
2959 VLIB_REGISTER_NODE (ip4_rewrite_bcast_node) = {
2960   .name = "ip4-rewrite-bcast",
2961   .vector_size = sizeof (u32),
2962
2963   .format_trace = format_ip4_rewrite_trace,
2964   .sibling_of = "ip4-rewrite",
2965 };
2966
2967 VLIB_REGISTER_NODE (ip4_rewrite_mcast_node) = {
2968   .name = "ip4-rewrite-mcast",
2969   .vector_size = sizeof (u32),
2970
2971   .format_trace = format_ip4_rewrite_trace,
2972   .sibling_of = "ip4-rewrite",
2973 };
2974
2975 VLIB_REGISTER_NODE (ip4_mcast_midchain_node) = {
2976   .name = "ip4-mcast-midchain",
2977   .vector_size = sizeof (u32),
2978
2979   .format_trace = format_ip4_rewrite_trace,
2980   .sibling_of = "ip4-rewrite",
2981 };
2982
2983 VLIB_REGISTER_NODE (ip4_midchain_node) = {
2984   .name = "ip4-midchain",
2985   .vector_size = sizeof (u32),
2986   .format_trace = format_ip4_forward_next_trace,
2987   .sibling_of =  "ip4-rewrite",
2988 };
2989 /* *INDENT-ON* */
2990
2991 static int
2992 ip4_lookup_validate (ip4_address_t * a, u32 fib_index0)
2993 {
2994   ip4_fib_mtrie_t *mtrie0;
2995   ip4_fib_mtrie_leaf_t leaf0;
2996   u32 lbi0;
2997
2998   mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
2999
3000   leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, a);
3001   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
3002   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);
3003
3004   lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
3005
3006   return lbi0 == ip4_fib_table_lookup_lb (ip4_fib_get (fib_index0), a);
3007 }
3008
3009 static clib_error_t *
3010 test_lookup_command_fn (vlib_main_t * vm,
3011                         unformat_input_t * input, vlib_cli_command_t * cmd)
3012 {
3013   ip4_fib_t *fib;
3014   u32 table_id = 0;
3015   f64 count = 1;
3016   u32 n;
3017   int i;
3018   ip4_address_t ip4_base_address;
3019   u64 errors = 0;
3020
3021   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3022     {
3023       if (unformat (input, "table %d", &table_id))
3024         {
3025           /* Make sure the entry exists. */
3026           fib = ip4_fib_get (table_id);
3027           if ((fib) && (fib->index != table_id))
3028             return clib_error_return (0, "<fib-index> %d does not exist",
3029                                       table_id);
3030         }
3031       else if (unformat (input, "count %f", &count))
3032         ;
3033
3034       else if (unformat (input, "%U",
3035                          unformat_ip4_address, &ip4_base_address))
3036         ;
3037       else
3038         return clib_error_return (0, "unknown input `%U'",
3039                                   format_unformat_error, input);
3040     }
3041
3042   n = count;
3043
3044   for (i = 0; i < n; i++)
3045     {
3046       if (!ip4_lookup_validate (&ip4_base_address, table_id))
3047         errors++;
3048
3049       ip4_base_address.as_u32 =
3050         clib_host_to_net_u32 (1 +
3051                               clib_net_to_host_u32 (ip4_base_address.as_u32));
3052     }
3053
3054   if (errors)
3055     vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
3056   else
3057     vlib_cli_output (vm, "No errors in %d lookups\n", n);
3058
3059   return 0;
3060 }
3061
3062 /*?
3063  * Perform a lookup of an IPv4 Address (or range of addresses) in the
3064  * given FIB table to determine if there is a conflict with the
3065  * adjacency table. The fib-id can be determined by using the
3066  * '<em>show ip fib</em>' command. If fib-id is not entered, default value
3067  * of 0 is used.
3068  *
3069  * @todo This command uses fib-id, other commands use table-id (not
3070  * just a name, they are different indexes). Would like to change this
3071  * to table-id for consistency.
3072  *
3073  * @cliexpar
3074  * Example of how to run the test lookup command:
3075  * @cliexstart{test lookup 172.16.1.1 table 1 count 2}
3076  * No errors in 2 lookups
3077  * @cliexend
3078 ?*/
3079 /* *INDENT-OFF* */
3080 VLIB_CLI_COMMAND (lookup_test_command, static) =
3081 {
3082   .path = "test lookup",
3083   .short_help = "test lookup <ipv4-addr> [table <fib-id>] [count <nn>]",
3084   .function = test_lookup_command_fn,
3085 };
3086 /* *INDENT-ON* */
3087
3088 #ifndef CLIB_MARCH_VARIANT
3089 int
3090 vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
3091 {
3092   u32 fib_index;
3093
3094   fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id);
3095
3096   if (~0 == fib_index)
3097     return VNET_API_ERROR_NO_SUCH_FIB;
3098
3099   fib_table_set_flow_hash_config (fib_index, FIB_PROTOCOL_IP4,
3100                                   flow_hash_config);
3101
3102   return 0;
3103 }
3104 #endif
3105
3106 static clib_error_t *
3107 set_ip_flow_hash_command_fn (vlib_main_t * vm,
3108                              unformat_input_t * input,
3109                              vlib_cli_command_t * cmd)
3110 {
3111   int matched = 0;
3112   u32 table_id = 0;
3113   u32 flow_hash_config = 0;
3114   int rv;
3115
3116   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3117     {
3118       if (unformat (input, "table %d", &table_id))
3119         matched = 1;
3120 #define _(a,v) \
3121     else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
3122       foreach_flow_hash_bit
3123 #undef _
3124         else
3125         break;
3126     }
3127
3128   if (matched == 0)
3129     return clib_error_return (0, "unknown input `%U'",
3130                               format_unformat_error, input);
3131
3132   rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
3133   switch (rv)
3134     {
3135     case 0:
3136       break;
3137
3138     case VNET_API_ERROR_NO_SUCH_FIB:
3139       return clib_error_return (0, "no such FIB table %d", table_id);
3140
3141     default:
3142       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
3143       break;
3144     }
3145
3146   return 0;
3147 }
3148
3149 /*?
3150  * Configure the set of IPv4 fields used by the flow hash.
3151  *
3152  * @cliexpar
3153  * Example of how to set the flow hash on a given table:
3154  * @cliexcmd{set ip flow-hash table 7 dst sport dport proto}
3155  * Example of displaying the configured flow hash:
3156  * @cliexstart{show ip fib}
3157  * ipv4-VRF:0, fib_index 0, flow hash: src dst sport dport proto
3158  * 0.0.0.0/0
3159  *   unicast-ip4-chain
3160  *   [@0]: dpo-load-balance: [index:0 buckets:1 uRPF:0 to:[0:0]]
3161  *     [0] [@0]: dpo-drop ip6
3162  * 0.0.0.0/32
3163  *   unicast-ip4-chain
3164  *   [@0]: dpo-load-balance: [index:1 buckets:1 uRPF:1 to:[0:0]]
3165  *     [0] [@0]: dpo-drop ip6
3166  * 224.0.0.0/8
3167  *   unicast-ip4-chain
3168  *   [@0]: dpo-load-balance: [index:3 buckets:1 uRPF:3 to:[0:0]]
3169  *     [0] [@0]: dpo-drop ip6
3170  * 6.0.1.2/32
3171  *   unicast-ip4-chain
3172  *   [@0]: dpo-load-balance: [index:30 buckets:1 uRPF:29 to:[0:0]]
3173  *     [0] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3174  * 7.0.0.1/32
3175  *   unicast-ip4-chain
3176  *   [@0]: dpo-load-balance: [index:31 buckets:4 uRPF:30 to:[0:0]]
3177  *     [0] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3178  *     [1] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3179  *     [2] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3180  *     [3] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3181  * 240.0.0.0/8
3182  *   unicast-ip4-chain
3183  *   [@0]: dpo-load-balance: [index:2 buckets:1 uRPF:2 to:[0:0]]
3184  *     [0] [@0]: dpo-drop ip6
3185  * 255.255.255.255/32
3186  *   unicast-ip4-chain
3187  *   [@0]: dpo-load-balance: [index:4 buckets:1 uRPF:4 to:[0:0]]
3188  *     [0] [@0]: dpo-drop ip6
3189  * ipv4-VRF:7, fib_index 1, flow hash: dst sport dport proto
3190  * 0.0.0.0/0
3191  *   unicast-ip4-chain
3192  *   [@0]: dpo-load-balance: [index:12 buckets:1 uRPF:11 to:[0:0]]
3193  *     [0] [@0]: dpo-drop ip6
3194  * 0.0.0.0/32
3195  *   unicast-ip4-chain
3196  *   [@0]: dpo-load-balance: [index:13 buckets:1 uRPF:12 to:[0:0]]
3197  *     [0] [@0]: dpo-drop ip6
3198  * 172.16.1.0/24
3199  *   unicast-ip4-chain
3200  *   [@0]: dpo-load-balance: [index:17 buckets:1 uRPF:16 to:[0:0]]
3201  *     [0] [@4]: ipv4-glean: af_packet0
3202  * 172.16.1.1/32
3203  *   unicast-ip4-chain
3204  *   [@0]: dpo-load-balance: [index:18 buckets:1 uRPF:17 to:[1:84]]
3205  *     [0] [@2]: dpo-receive: 172.16.1.1 on af_packet0
3206  * 172.16.1.2/32
3207  *   unicast-ip4-chain
3208  *   [@0]: dpo-load-balance: [index:21 buckets:1 uRPF:20 to:[0:0]]
3209  *     [0] [@5]: ipv4 via 172.16.1.2 af_packet0: IP4: 02:fe:9e:70:7a:2b -> 26:a5:f6:9c:3a:36
3210  * 172.16.2.0/24
3211  *   unicast-ip4-chain
3212  *   [@0]: dpo-load-balance: [index:19 buckets:1 uRPF:18 to:[0:0]]
3213  *     [0] [@4]: ipv4-glean: af_packet1
3214  * 172.16.2.1/32
3215  *   unicast-ip4-chain
3216  *   [@0]: dpo-load-balance: [index:20 buckets:1 uRPF:19 to:[0:0]]
3217  *     [0] [@2]: dpo-receive: 172.16.2.1 on af_packet1
3218  * 224.0.0.0/8
3219  *   unicast-ip4-chain
3220  *   [@0]: dpo-load-balance: [index:15 buckets:1 uRPF:14 to:[0:0]]
3221  *     [0] [@0]: dpo-drop ip6
3222  * 240.0.0.0/8
3223  *   unicast-ip4-chain
3224  *   [@0]: dpo-load-balance: [index:14 buckets:1 uRPF:13 to:[0:0]]
3225  *     [0] [@0]: dpo-drop ip6
3226  * 255.255.255.255/32
3227  *   unicast-ip4-chain
3228  *   [@0]: dpo-load-balance: [index:16 buckets:1 uRPF:15 to:[0:0]]
3229  *     [0] [@0]: dpo-drop ip6
3230  * @cliexend
3231 ?*/
3232 /* *INDENT-OFF* */
3233 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) =
3234 {
3235   .path = "set ip flow-hash",
3236   .short_help =
3237   "set ip flow-hash table <table-id> [src] [dst] [sport] [dport] [proto] [reverse]",
3238   .function = set_ip_flow_hash_command_fn,
3239 };
3240 /* *INDENT-ON* */
3241
3242 #ifndef CLIB_MARCH_VARIANT
3243 int
3244 vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
3245                              u32 table_index)
3246 {
3247   vnet_main_t *vnm = vnet_get_main ();
3248   vnet_interface_main_t *im = &vnm->interface_main;
3249   ip4_main_t *ipm = &ip4_main;
3250   ip_lookup_main_t *lm = &ipm->lookup_main;
3251   vnet_classify_main_t *cm = &vnet_classify_main;
3252   ip4_address_t *if_addr;
3253
3254   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
3255     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
3256
3257   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
3258     return VNET_API_ERROR_NO_SUCH_ENTRY;
3259
3260   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
3261   lm->classify_table_index_by_sw_if_index[sw_if_index] = table_index;
3262
3263   if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);
3264
3265   if (NULL != if_addr)
3266     {
3267       fib_prefix_t pfx = {
3268         .fp_len = 32,
3269         .fp_proto = FIB_PROTOCOL_IP4,
3270         .fp_addr.ip4 = *if_addr,
3271       };
3272       u32 fib_index;
3273
3274       fib_index = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4,
3275                                                        sw_if_index);
3276
3277
3278       if (table_index != (u32) ~ 0)
3279         {
3280           dpo_id_t dpo = DPO_INVALID;
3281
3282           dpo_set (&dpo,
3283                    DPO_CLASSIFY,
3284                    DPO_PROTO_IP4,
3285                    classify_dpo_create (DPO_PROTO_IP4, table_index));
3286
3287           fib_table_entry_special_dpo_add (fib_index,
3288                                            &pfx,
3289                                            FIB_SOURCE_CLASSIFY,
3290                                            FIB_ENTRY_FLAG_NONE, &dpo);
3291           dpo_reset (&dpo);
3292         }
3293       else
3294         {
3295           fib_table_entry_special_remove (fib_index,
3296                                           &pfx, FIB_SOURCE_CLASSIFY);
3297         }
3298     }
3299
3300   return 0;
3301 }
3302 #endif
3303
3304 static clib_error_t *
3305 set_ip_classify_command_fn (vlib_main_t * vm,
3306                             unformat_input_t * input,
3307                             vlib_cli_command_t * cmd)
3308 {
3309   u32 table_index = ~0;
3310   int table_index_set = 0;
3311   u32 sw_if_index = ~0;
3312   int rv;
3313
3314   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3315     {
3316       if (unformat (input, "table-index %d", &table_index))
3317         table_index_set = 1;
3318       else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
3319                          vnet_get_main (), &sw_if_index))
3320         ;
3321       else
3322         break;
3323     }
3324
3325   if (table_index_set == 0)
3326     return clib_error_return (0, "classify table-index must be specified");
3327
3328   if (sw_if_index == ~0)
3329     return clib_error_return (0, "interface / subif must be specified");
3330
3331   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
3332
3333   switch (rv)
3334     {
3335     case 0:
3336       break;
3337
3338     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
3339       return clib_error_return (0, "No such interface");
3340
3341     case VNET_API_ERROR_NO_SUCH_ENTRY:
3342       return clib_error_return (0, "No such classifier table");
3343     }
3344   return 0;
3345 }
3346
3347 /*?
3348  * Assign a classification table to an interface. The classification
3349  * table is created using the '<em>classify table</em>' and '<em>classify session</em>'
3350  * commands. Once the table is created, use this command to filter packets
3351  * on an interface.
3352  *
3353  * @cliexpar
3354  * Example of how to assign a classification table to an interface:
3355  * @cliexcmd{set ip classify intfc GigabitEthernet2/0/0 table-index 1}
3356 ?*/
3357 /* *INDENT-OFF* */
3358 VLIB_CLI_COMMAND (set_ip_classify_command, static) =
3359 {
3360     .path = "set ip classify",
3361     .short_help =
3362     "set ip classify intfc <interface> table-index <classify-idx>",
3363     .function = set_ip_classify_command_fn,
3364 };
3365 /* *INDENT-ON* */
3366
3367 static clib_error_t *
3368 ip4_config (vlib_main_t * vm, unformat_input_t * input)
3369 {
3370   ip4_main_t *im = &ip4_main;
3371   uword heapsize = 0;
3372
3373   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3374     {
3375       if (unformat (input, "heap-size %U", unformat_memory_size, &heapsize))
3376         ;
3377       else
3378         return clib_error_return (0,
3379                                   "invalid heap-size parameter `%U'",
3380                                   format_unformat_error, input);
3381     }
3382
3383   im->mtrie_heap_size = heapsize;
3384
3385   return 0;
3386 }
3387
3388 VLIB_EARLY_CONFIG_FUNCTION (ip4_config, "ip");
3389
3390 /*
3391  * fd.io coding-style-patch-verification: ON
3392  *
3393  * Local Variables:
3394  * eval: (c-set-style "gnu")
3395  * End:
3396  */