ip: Fragmentation fixes
[vpp.git] / src / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ip/ip_frag.h>
43 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
44 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
45 #include <vnet/ppp/ppp.h>
46 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
47 #include <vnet/api_errno.h>     /* for API error numbers */
48 #include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
49 #include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
50 #include <vnet/fib/fib_urpf_list.h>     /* for FIB uRPF check */
51 #include <vnet/fib/ip4_fib.h>
52 #include <vnet/dpo/load_balance.h>
53 #include <vnet/dpo/load_balance_map.h>
54 #include <vnet/dpo/classify_dpo.h>
55 #include <vnet/mfib/mfib_table.h>       /* for mFIB table and entry creation */
56
57 #include <vnet/ip/ip4_forward.h>
58 #include <vnet/interface_output.h>
59
60 /** @brief IPv4 lookup node.
61     @node ip4-lookup
62
63     This is the main IPv4 lookup dispatch node.
64
65     @param vm vlib_main_t corresponding to the current thread
66     @param node vlib_node_runtime_t
67     @param frame vlib_frame_t whose contents should be dispatched
68
69     @par Graph mechanics: buffer metadata, next index usage
70
71     @em Uses:
72     - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
73         - Indicates the @c sw_if_index value of the interface that the
74           packet was received on.
75     - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
76         - When the value is @c ~0 then the node performs a longest prefix
77           match (LPM) for the packet destination address in the FIB attached
78           to the receive interface.
79         - Otherwise perform LPM for the packet destination address in the
80           indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
81           value (0, 1, ...) and not a VRF id.
82
83     @em Sets:
84     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
85         - The lookup result adjacency index.
86
87     <em>Next Index:</em>
88     - Dispatches the packet to the node index found in
89       ip_adjacency_t @c adj->lookup_next_index
90       (where @c adj is the lookup result adjacency).
91 */
92 VLIB_NODE_FN (ip4_lookup_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
93                                 vlib_frame_t * frame)
94 {
95   return ip4_lookup_inline (vm, node, frame);
96 }
97
98 static u8 *format_ip4_lookup_trace (u8 * s, va_list * args);
99
100 /* *INDENT-OFF* */
101 VLIB_REGISTER_NODE (ip4_lookup_node) =
102 {
103   .name = "ip4-lookup",
104   .vector_size = sizeof (u32),
105   .format_trace = format_ip4_lookup_trace,
106   .n_next_nodes = IP_LOOKUP_N_NEXT,
107   .next_nodes = IP4_LOOKUP_NEXT_NODES,
108 };
109 /* *INDENT-ON* */
110
111 VLIB_NODE_FN (ip4_load_balance_node) (vlib_main_t * vm,
112                                       vlib_node_runtime_t * node,
113                                       vlib_frame_t * frame)
114 {
115   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
116   u32 n_left, *from;
117   u32 thread_index = vm->thread_index;
118   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
119   u16 nexts[VLIB_FRAME_SIZE], *next;
120
121   from = vlib_frame_vector_args (frame);
122   n_left = frame->n_vectors;
123   next = nexts;
124
125   vlib_get_buffers (vm, from, bufs, n_left);
126
127   while (n_left >= 4)
128     {
129       const load_balance_t *lb0, *lb1;
130       const ip4_header_t *ip0, *ip1;
131       u32 lbi0, hc0, lbi1, hc1;
132       const dpo_id_t *dpo0, *dpo1;
133
134       /* Prefetch next iteration. */
135       {
136         vlib_prefetch_buffer_header (b[2], LOAD);
137         vlib_prefetch_buffer_header (b[3], LOAD);
138
139         CLIB_PREFETCH (b[2]->data, sizeof (ip0[0]), LOAD);
140         CLIB_PREFETCH (b[3]->data, sizeof (ip0[0]), LOAD);
141       }
142
143       ip0 = vlib_buffer_get_current (b[0]);
144       ip1 = vlib_buffer_get_current (b[1]);
145       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
146       lbi1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
147
148       lb0 = load_balance_get (lbi0);
149       lb1 = load_balance_get (lbi1);
150
151       /*
152        * this node is for via FIBs we can re-use the hash value from the
153        * to node if present.
154        * We don't want to use the same hash value at each level in the recursion
155        * graph as that would lead to polarisation
156        */
157       hc0 = hc1 = 0;
158
159       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
160         {
161           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
162             {
163               hc0 = vnet_buffer (b[0])->ip.flow_hash =
164                 vnet_buffer (b[0])->ip.flow_hash >> 1;
165             }
166           else
167             {
168               hc0 = vnet_buffer (b[0])->ip.flow_hash =
169                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
170             }
171           dpo0 = load_balance_get_fwd_bucket
172             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
173         }
174       else
175         {
176           dpo0 = load_balance_get_bucket_i (lb0, 0);
177         }
178       if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
179         {
180           if (PREDICT_TRUE (vnet_buffer (b[1])->ip.flow_hash))
181             {
182               hc1 = vnet_buffer (b[1])->ip.flow_hash =
183                 vnet_buffer (b[1])->ip.flow_hash >> 1;
184             }
185           else
186             {
187               hc1 = vnet_buffer (b[1])->ip.flow_hash =
188                 ip4_compute_flow_hash (ip1, lb1->lb_hash_config);
189             }
190           dpo1 = load_balance_get_fwd_bucket
191             (lb1, (hc1 & (lb1->lb_n_buckets_minus_1)));
192         }
193       else
194         {
195           dpo1 = load_balance_get_bucket_i (lb1, 0);
196         }
197
198       next[0] = dpo0->dpoi_next_node;
199       next[1] = dpo1->dpoi_next_node;
200
201       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
202       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
203
204       vlib_increment_combined_counter
205         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
206       vlib_increment_combined_counter
207         (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b[1]));
208
209       b += 2;
210       next += 2;
211       n_left -= 2;
212     }
213
214   while (n_left > 0)
215     {
216       const load_balance_t *lb0;
217       const ip4_header_t *ip0;
218       const dpo_id_t *dpo0;
219       u32 lbi0, hc0;
220
221       ip0 = vlib_buffer_get_current (b[0]);
222       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
223
224       lb0 = load_balance_get (lbi0);
225
226       hc0 = 0;
227       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
228         {
229           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
230             {
231               hc0 = vnet_buffer (b[0])->ip.flow_hash =
232                 vnet_buffer (b[0])->ip.flow_hash >> 1;
233             }
234           else
235             {
236               hc0 = vnet_buffer (b[0])->ip.flow_hash =
237                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
238             }
239           dpo0 = load_balance_get_fwd_bucket
240             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
241         }
242       else
243         {
244           dpo0 = load_balance_get_bucket_i (lb0, 0);
245         }
246
247       next[0] = dpo0->dpoi_next_node;
248       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
249
250       vlib_increment_combined_counter
251         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
252
253       b += 1;
254       next += 1;
255       n_left -= 1;
256     }
257
258   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
259   if (node->flags & VLIB_NODE_FLAG_TRACE)
260     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
261
262   return frame->n_vectors;
263 }
264
265 /* *INDENT-OFF* */
266 VLIB_REGISTER_NODE (ip4_load_balance_node) =
267 {
268   .name = "ip4-load-balance",
269   .vector_size = sizeof (u32),
270   .sibling_of = "ip4-lookup",
271   .format_trace = format_ip4_lookup_trace,
272 };
273 /* *INDENT-ON* */
274
275 #ifndef CLIB_MARCH_VARIANT
276 /* get first interface address */
277 ip4_address_t *
278 ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
279                              ip_interface_address_t ** result_ia)
280 {
281   ip_lookup_main_t *lm = &im->lookup_main;
282   ip_interface_address_t *ia = 0;
283   ip4_address_t *result = 0;
284
285   /* *INDENT-OFF* */
286   foreach_ip_interface_address
287     (lm, ia, sw_if_index,
288      1 /* honor unnumbered */ ,
289      ({
290        ip4_address_t * a =
291          ip_interface_address_get_address (lm, ia);
292        result = a;
293        break;
294      }));
295   /* *INDENT-OFF* */
296   if (result_ia)
297     *result_ia = result ? ia : 0;
298   return result;
299 }
300 #endif
301
302 static void
303 ip4_add_subnet_bcast_route (u32 fib_index,
304                             fib_prefix_t *pfx,
305                             u32 sw_if_index)
306 {
307   vnet_sw_interface_flags_t iflags;
308
309   iflags = vnet_sw_interface_get_flags(vnet_get_main(), sw_if_index);
310
311   fib_table_entry_special_remove(fib_index,
312                                  pfx,
313                                  FIB_SOURCE_INTERFACE);
314
315   if (iflags & VNET_SW_INTERFACE_FLAG_DIRECTED_BCAST)
316     {
317       fib_table_entry_update_one_path (fib_index, pfx,
318                                        FIB_SOURCE_INTERFACE,
319                                        FIB_ENTRY_FLAG_NONE,
320                                        DPO_PROTO_IP4,
321                                        /* No next-hop address */
322                                        &ADJ_BCAST_ADDR,
323                                        sw_if_index,
324                                        // invalid FIB index
325                                        ~0,
326                                        1,
327                                        // no out-label stack
328                                        NULL,
329                                        FIB_ROUTE_PATH_FLAG_NONE);
330     }
331   else
332     {
333         fib_table_entry_special_add(fib_index,
334                                     pfx,
335                                     FIB_SOURCE_INTERFACE,
336                                     (FIB_ENTRY_FLAG_DROP |
337                                      FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
338     }
339 }
340
341 static void
342 ip4_add_interface_prefix_routes (ip4_main_t *im,
343                                  u32 sw_if_index,
344                                  u32 fib_index,
345                                  ip_interface_address_t * a)
346 {
347   ip_lookup_main_t *lm = &im->lookup_main;
348   ip_interface_prefix_t *if_prefix;
349   ip4_address_t *address = ip_interface_address_get_address (lm, a);
350
351   ip_interface_prefix_key_t key = {
352     .prefix = {
353       .fp_len = a->address_length,
354       .fp_proto = FIB_PROTOCOL_IP4,
355       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[a->address_length],
356     },
357     .sw_if_index = sw_if_index,
358   };
359
360   fib_prefix_t pfx_special = {
361     .fp_proto = FIB_PROTOCOL_IP4,
362   };
363
364   /* If prefix already set on interface, just increment ref count & return */
365   if_prefix = ip_get_interface_prefix (lm, &key);
366   if (if_prefix)
367     {
368       if_prefix->ref_count += 1;
369       return;
370     }
371
372   /* New prefix - allocate a pool entry, initialize it, add to the hash */
373   pool_get (lm->if_prefix_pool, if_prefix);
374   if_prefix->ref_count = 1;
375   if_prefix->src_ia_index = a - lm->if_address_pool;
376   clib_memcpy (&if_prefix->key, &key, sizeof (key));
377   mhash_set (&lm->prefix_to_if_prefix_index, &key,
378              if_prefix - lm->if_prefix_pool, 0 /* old value */);
379
380   /* length <= 30 - add glean, drop first address, maybe drop bcast address */
381   if (a->address_length <= 30)
382     {
383       pfx_special.fp_len = a->address_length;
384       pfx_special.fp_addr.ip4.as_u32 = address->as_u32;
385
386       /* set the glean route for the prefix */
387       fib_table_entry_update_one_path (fib_index, &pfx_special,
388                                        FIB_SOURCE_INTERFACE,
389                                        (FIB_ENTRY_FLAG_CONNECTED |
390                                         FIB_ENTRY_FLAG_ATTACHED),
391                                        DPO_PROTO_IP4,
392                                        /* No next-hop address */
393                                        NULL,
394                                        sw_if_index,
395                                        /* invalid FIB index */
396                                        ~0,
397                                        1,
398                                        /* no out-label stack */
399                                        NULL,
400                                        FIB_ROUTE_PATH_FLAG_NONE);
401
402       /* set a drop route for the base address of the prefix */
403       pfx_special.fp_len = 32;
404       pfx_special.fp_addr.ip4.as_u32 =
405         address->as_u32 & im->fib_masks[a->address_length];
406
407       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
408         fib_table_entry_special_add (fib_index, &pfx_special,
409                                      FIB_SOURCE_INTERFACE,
410                                      (FIB_ENTRY_FLAG_DROP |
411                                       FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
412
413       /* set a route for the broadcast address of the prefix */
414       pfx_special.fp_len = 32;
415       pfx_special.fp_addr.ip4.as_u32 =
416         address->as_u32 | ~im->fib_masks[a->address_length];
417       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
418         ip4_add_subnet_bcast_route (fib_index, &pfx_special, sw_if_index);
419
420
421     }
422   /* length == 31 - add an attached route for the other address */
423   else if (a->address_length == 31)
424     {
425       pfx_special.fp_len = 32;
426       pfx_special.fp_addr.ip4.as_u32 =
427         address->as_u32 ^ clib_host_to_net_u32(1);
428
429       fib_table_entry_update_one_path (fib_index, &pfx_special,
430                                        FIB_SOURCE_INTERFACE,
431                                        (FIB_ENTRY_FLAG_ATTACHED),
432                                        DPO_PROTO_IP4,
433                                        &pfx_special.fp_addr,
434                                        sw_if_index,
435                                        /* invalid FIB index */
436                                        ~0,
437                                        1,
438                                        NULL,
439                                        FIB_ROUTE_PATH_FLAG_NONE);
440     }
441 }
442
443 static void
444 ip4_add_interface_routes (u32 sw_if_index,
445                           ip4_main_t * im, u32 fib_index,
446                           ip_interface_address_t * a)
447 {
448   ip_lookup_main_t *lm = &im->lookup_main;
449   ip4_address_t *address = ip_interface_address_get_address (lm, a);
450   fib_prefix_t pfx = {
451     .fp_len = 32,
452     .fp_proto = FIB_PROTOCOL_IP4,
453     .fp_addr.ip4 = *address,
454   };
455
456   /* set special routes for the prefix if needed */
457   ip4_add_interface_prefix_routes (im, sw_if_index, fib_index, a);
458
459   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
460     {
461       u32 classify_table_index =
462         lm->classify_table_index_by_sw_if_index[sw_if_index];
463       if (classify_table_index != (u32) ~ 0)
464         {
465           dpo_id_t dpo = DPO_INVALID;
466
467           dpo_set (&dpo,
468                    DPO_CLASSIFY,
469                    DPO_PROTO_IP4,
470                    classify_dpo_create (DPO_PROTO_IP4, classify_table_index));
471
472           fib_table_entry_special_dpo_add (fib_index,
473                                            &pfx,
474                                            FIB_SOURCE_CLASSIFY,
475                                            FIB_ENTRY_FLAG_NONE, &dpo);
476           dpo_reset (&dpo);
477         }
478     }
479
480   fib_table_entry_update_one_path (fib_index, &pfx,
481                                    FIB_SOURCE_INTERFACE,
482                                    (FIB_ENTRY_FLAG_CONNECTED |
483                                     FIB_ENTRY_FLAG_LOCAL),
484                                    DPO_PROTO_IP4,
485                                    &pfx.fp_addr,
486                                    sw_if_index,
487                                    // invalid FIB index
488                                    ~0,
489                                    1, NULL,
490                                    FIB_ROUTE_PATH_FLAG_NONE);
491 }
492
493 static void
494 ip4_del_interface_prefix_routes (ip4_main_t * im,
495                                  u32 sw_if_index,
496                                  u32 fib_index,
497                                  ip4_address_t * address,
498                                  u32 address_length)
499 {
500   ip_lookup_main_t *lm = &im->lookup_main;
501   ip_interface_prefix_t *if_prefix;
502
503   ip_interface_prefix_key_t key = {
504     .prefix = {
505       .fp_len = address_length,
506       .fp_proto = FIB_PROTOCOL_IP4,
507       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[address_length],
508     },
509     .sw_if_index = sw_if_index,
510   };
511
512   fib_prefix_t pfx_special = {
513     .fp_len = 32,
514     .fp_proto = FIB_PROTOCOL_IP4,
515   };
516
517   if_prefix = ip_get_interface_prefix (lm, &key);
518   if (!if_prefix)
519     {
520       clib_warning ("Prefix not found while deleting %U",
521                     format_ip4_address_and_length, address, address_length);
522       return;
523     }
524
525   if_prefix->ref_count -= 1;
526
527   /*
528    * Routes need to be adjusted if:
529    * - deleting last intf addr in prefix
530    * - deleting intf addr used as default source address in glean adjacency
531    *
532    * We're done now otherwise
533    */
534   if ((if_prefix->ref_count > 0) &&
535       !pool_is_free_index (lm->if_address_pool, if_prefix->src_ia_index))
536     return;
537
538   /* length <= 30, delete glean route, first address, last address */
539   if (address_length <= 30)
540     {
541
542       /* remove glean route for prefix */
543       pfx_special.fp_addr.ip4 = *address;
544       pfx_special.fp_len = address_length;
545       fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
546
547       /* if no more intf addresses in prefix, remove other special routes */
548       if (!if_prefix->ref_count)
549         {
550           /* first address in prefix */
551           pfx_special.fp_addr.ip4.as_u32 =
552             address->as_u32 & im->fib_masks[address_length];
553           pfx_special.fp_len = 32;
554
555           if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
556           fib_table_entry_special_remove (fib_index,
557                                           &pfx_special,
558                                           FIB_SOURCE_INTERFACE);
559
560           /* prefix broadcast address */
561           pfx_special.fp_addr.ip4.as_u32 =
562             address->as_u32 | ~im->fib_masks[address_length];
563           pfx_special.fp_len = 32;
564
565           if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
566           fib_table_entry_special_remove (fib_index,
567                                           &pfx_special,
568                                           FIB_SOURCE_INTERFACE);
569         }
570       else
571         /* default source addr just got deleted, find another */
572         {
573           ip_interface_address_t *new_src_ia = NULL;
574           ip4_address_t *new_src_addr = NULL;
575
576           new_src_addr =
577             ip4_interface_address_matching_destination
578               (im, address, sw_if_index, &new_src_ia);
579
580           if_prefix->src_ia_index = new_src_ia - lm->if_address_pool;
581
582           pfx_special.fp_len = address_length;
583           pfx_special.fp_addr.ip4 = *new_src_addr;
584
585           /* set new glean route for the prefix */
586           fib_table_entry_update_one_path (fib_index, &pfx_special,
587                                            FIB_SOURCE_INTERFACE,
588                                            (FIB_ENTRY_FLAG_CONNECTED |
589                                             FIB_ENTRY_FLAG_ATTACHED),
590                                            DPO_PROTO_IP4,
591                                            /* No next-hop address */
592                                            NULL,
593                                            sw_if_index,
594                                            /* invalid FIB index */
595                                            ~0,
596                                            1,
597                                            /* no out-label stack */
598                                            NULL,
599                                            FIB_ROUTE_PATH_FLAG_NONE);
600           return;
601         }
602     }
603   /* length == 31, delete attached route for the other address */
604   else if (address_length == 31)
605     {
606       pfx_special.fp_addr.ip4.as_u32 =
607         address->as_u32 ^ clib_host_to_net_u32(1);
608
609       fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
610     }
611
612   mhash_unset (&lm->prefix_to_if_prefix_index, &key, 0 /* old_value */);
613   pool_put (lm->if_prefix_pool, if_prefix);
614 }
615
616 static void
617 ip4_del_interface_routes (u32 sw_if_index,
618                           ip4_main_t * im,
619                           u32 fib_index,
620                           ip4_address_t * address, u32 address_length)
621 {
622   fib_prefix_t pfx = {
623     .fp_len = address_length,
624     .fp_proto = FIB_PROTOCOL_IP4,
625     .fp_addr.ip4 = *address,
626   };
627
628   ip4_del_interface_prefix_routes (im, sw_if_index, fib_index,
629                                    address, address_length);
630
631   pfx.fp_len = 32;
632   fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
633 }
634
635 #ifndef CLIB_MARCH_VARIANT
636 void
637 ip4_sw_interface_enable_disable (u32 sw_if_index, u32 is_enable)
638 {
639   ip4_main_t *im = &ip4_main;
640
641   vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
642
643   /*
644    * enable/disable only on the 1<->0 transition
645    */
646   if (is_enable)
647     {
648       if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
649         return;
650     }
651   else
652     {
653       ASSERT (im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
654       if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
655         return;
656     }
657   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
658                                !is_enable, 0, 0);
659
660
661   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
662                                sw_if_index, !is_enable, 0, 0);
663
664   {
665     ip4_enable_disable_interface_callback_t *cb;
666     vec_foreach (cb, im->enable_disable_interface_callbacks)
667       cb->function (im, cb->function_opaque, sw_if_index, is_enable);
668   }
669 }
670
671 static clib_error_t *
672 ip4_add_del_interface_address_internal (vlib_main_t * vm,
673                                         u32 sw_if_index,
674                                         ip4_address_t * address,
675                                         u32 address_length, u32 is_del)
676 {
677   vnet_main_t *vnm = vnet_get_main ();
678   ip4_main_t *im = &ip4_main;
679   ip_lookup_main_t *lm = &im->lookup_main;
680   clib_error_t *error = 0;
681   u32 if_address_index, elts_before;
682   ip4_address_fib_t ip4_af, *addr_fib = 0;
683
684   /* local0 interface doesn't support IP addressing  */
685   if (sw_if_index == 0)
686     {
687       return
688        clib_error_create ("local0 interface doesn't support IP addressing");
689     }
690
691   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
692   ip4_addr_fib_init (&ip4_af, address,
693                      vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
694   vec_add1 (addr_fib, ip4_af);
695
696   /*
697    * there is no support for adj-fib handling in the presence of overlapping
698    * subnets on interfaces. Easy fix - disallow overlapping subnets, like
699    * most routers do.
700    */
701   /* *INDENT-OFF* */
702   if (!is_del)
703     {
704       /* When adding an address check that it does not conflict
705          with an existing address on any interface in this table. */
706       ip_interface_address_t *ia;
707       vnet_sw_interface_t *sif;
708
709       pool_foreach(sif, vnm->interface_main.sw_interfaces,
710       ({
711           if (im->fib_index_by_sw_if_index[sw_if_index] ==
712               im->fib_index_by_sw_if_index[sif->sw_if_index])
713             {
714               foreach_ip_interface_address
715                 (&im->lookup_main, ia, sif->sw_if_index,
716                  0 /* honor unnumbered */ ,
717                  ({
718                    ip4_address_t * x =
719                      ip_interface_address_get_address
720                      (&im->lookup_main, ia);
721                    if (ip4_destination_matches_route
722                        (im, address, x, ia->address_length) ||
723                        ip4_destination_matches_route (im,
724                                                       x,
725                                                       address,
726                                                       address_length))
727                      {
728                        /* an intf may have >1 addr from the same prefix */
729                        if ((sw_if_index == sif->sw_if_index) &&
730                            (ia->address_length == address_length) &&
731                            (x->as_u32 != address->as_u32))
732                          continue;
733
734                        /* error if the length or intf was different */
735                        vnm->api_errno = VNET_API_ERROR_DUPLICATE_IF_ADDRESS;
736
737                        return
738                          clib_error_create
739                          ("failed to add %U on %U which conflicts with %U for interface %U",
740                           format_ip4_address_and_length, address,
741                           address_length,
742                           format_vnet_sw_if_index_name, vnm,
743                           sw_if_index,
744                           format_ip4_address_and_length, x,
745                           ia->address_length,
746                           format_vnet_sw_if_index_name, vnm,
747                           sif->sw_if_index);
748                      }
749                  }));
750             }
751       }));
752     }
753   /* *INDENT-ON* */
754
755   elts_before = pool_elts (lm->if_address_pool);
756
757   error = ip_interface_address_add_del
758     (lm, sw_if_index, addr_fib, address_length, is_del, &if_address_index);
759   if (error)
760     goto done;
761
762   ip4_sw_interface_enable_disable (sw_if_index, !is_del);
763
764   /* intf addr routes are added/deleted on admin up/down */
765   if (vnet_sw_interface_is_admin_up (vnm, sw_if_index))
766     {
767       if (is_del)
768         ip4_del_interface_routes (sw_if_index,
769                                   im, ip4_af.fib_index, address,
770                                   address_length);
771       else
772         ip4_add_interface_routes (sw_if_index,
773                                   im, ip4_af.fib_index,
774                                   pool_elt_at_index
775                                   (lm->if_address_pool, if_address_index));
776     }
777
778   /* If pool did not grow/shrink: add duplicate address. */
779   if (elts_before != pool_elts (lm->if_address_pool))
780     {
781       ip4_add_del_interface_address_callback_t *cb;
782       vec_foreach (cb, im->add_del_interface_address_callbacks)
783         cb->function (im, cb->function_opaque, sw_if_index,
784                       address, address_length, if_address_index, is_del);
785     }
786
787 done:
788   vec_free (addr_fib);
789   return error;
790 }
791
792 clib_error_t *
793 ip4_add_del_interface_address (vlib_main_t * vm,
794                                u32 sw_if_index,
795                                ip4_address_t * address,
796                                u32 address_length, u32 is_del)
797 {
798   return ip4_add_del_interface_address_internal
799     (vm, sw_if_index, address, address_length, is_del);
800 }
801
802 void
803 ip4_directed_broadcast (u32 sw_if_index, u8 enable)
804 {
805   ip_interface_address_t *ia;
806   ip4_main_t *im;
807
808   im = &ip4_main;
809
810   /*
811    * when directed broadcast is enabled, the subnet braodcast route will forward
812    * packets using an adjacency with a broadcast MAC. otherwise it drops
813    */
814   /* *INDENT-OFF* */
815   foreach_ip_interface_address(&im->lookup_main, ia,
816                                sw_if_index, 0,
817      ({
818        if (ia->address_length <= 30)
819          {
820            ip4_address_t *ipa;
821
822            ipa = ip_interface_address_get_address (&im->lookup_main, ia);
823
824            fib_prefix_t pfx = {
825              .fp_len = 32,
826              .fp_proto = FIB_PROTOCOL_IP4,
827              .fp_addr = {
828                .ip4.as_u32 = (ipa->as_u32 | ~im->fib_masks[ia->address_length]),
829              },
830            };
831
832            ip4_add_subnet_bcast_route
833              (fib_table_get_index_for_sw_if_index(FIB_PROTOCOL_IP4,
834                                                   sw_if_index),
835               &pfx, sw_if_index);
836          }
837      }));
838   /* *INDENT-ON* */
839 }
840 #endif
841
842 static clib_error_t *
843 ip4_sw_interface_admin_up_down (vnet_main_t * vnm, u32 sw_if_index, u32 flags)
844 {
845   ip4_main_t *im = &ip4_main;
846   ip_interface_address_t *ia;
847   ip4_address_t *a;
848   u32 is_admin_up, fib_index;
849
850   /* Fill in lookup tables with default table (0). */
851   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
852
853   vec_validate_init_empty (im->
854                            lookup_main.if_address_pool_index_by_sw_if_index,
855                            sw_if_index, ~0);
856
857   is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
858
859   fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
860
861   /* *INDENT-OFF* */
862   foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
863                                 0 /* honor unnumbered */,
864   ({
865     a = ip_interface_address_get_address (&im->lookup_main, ia);
866     if (is_admin_up)
867       ip4_add_interface_routes (sw_if_index,
868                                 im, fib_index,
869                                 ia);
870     else
871       ip4_del_interface_routes (sw_if_index,
872                                 im, fib_index,
873                                 a, ia->address_length);
874   }));
875   /* *INDENT-ON* */
876
877   return 0;
878 }
879
880 VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down);
881
882 /* Built-in ip4 unicast rx feature path definition */
883 /* *INDENT-OFF* */
884 VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
885 {
886   .arc_name = "ip4-unicast",
887   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
888   .last_in_arc = "ip4-lookup",
889   .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
890 };
891
892 VNET_FEATURE_INIT (ip4_flow_classify, static) =
893 {
894   .arc_name = "ip4-unicast",
895   .node_name = "ip4-flow-classify",
896   .runs_before = VNET_FEATURES ("ip4-inacl"),
897 };
898
899 VNET_FEATURE_INIT (ip4_inacl, static) =
900 {
901   .arc_name = "ip4-unicast",
902   .node_name = "ip4-inacl",
903   .runs_before = VNET_FEATURES ("ip4-source-check-via-rx"),
904 };
905
906 VNET_FEATURE_INIT (ip4_source_check_1, static) =
907 {
908   .arc_name = "ip4-unicast",
909   .node_name = "ip4-source-check-via-rx",
910   .runs_before = VNET_FEATURES ("ip4-source-check-via-any"),
911 };
912
913 VNET_FEATURE_INIT (ip4_source_check_2, static) =
914 {
915   .arc_name = "ip4-unicast",
916   .node_name = "ip4-source-check-via-any",
917   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
918 };
919
920 VNET_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) =
921 {
922   .arc_name = "ip4-unicast",
923   .node_name = "ip4-source-and-port-range-check-rx",
924   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
925 };
926
927 VNET_FEATURE_INIT (ip4_policer_classify, static) =
928 {
929   .arc_name = "ip4-unicast",
930   .node_name = "ip4-policer-classify",
931   .runs_before = VNET_FEATURES ("ipsec4-input-feature"),
932 };
933
934 VNET_FEATURE_INIT (ip4_ipsec, static) =
935 {
936   .arc_name = "ip4-unicast",
937   .node_name = "ipsec4-input-feature",
938   .runs_before = VNET_FEATURES ("vpath-input-ip4"),
939 };
940
941 VNET_FEATURE_INIT (ip4_vpath, static) =
942 {
943   .arc_name = "ip4-unicast",
944   .node_name = "vpath-input-ip4",
945   .runs_before = VNET_FEATURES ("ip4-vxlan-bypass"),
946 };
947
948 VNET_FEATURE_INIT (ip4_vxlan_bypass, static) =
949 {
950   .arc_name = "ip4-unicast",
951   .node_name = "ip4-vxlan-bypass",
952   .runs_before = VNET_FEATURES ("ip4-lookup"),
953 };
954
955 VNET_FEATURE_INIT (ip4_not_enabled, static) =
956 {
957   .arc_name = "ip4-unicast",
958   .node_name = "ip4-not-enabled",
959   .runs_before = VNET_FEATURES ("ip4-lookup"),
960 };
961
962 VNET_FEATURE_INIT (ip4_lookup, static) =
963 {
964   .arc_name = "ip4-unicast",
965   .node_name = "ip4-lookup",
966   .runs_before = 0,     /* not before any other features */
967 };
968
969 /* Built-in ip4 multicast rx feature path definition */
970 VNET_FEATURE_ARC_INIT (ip4_multicast, static) =
971 {
972   .arc_name = "ip4-multicast",
973   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
974   .last_in_arc = "ip4-mfib-forward-lookup",
975   .arc_index_ptr = &ip4_main.lookup_main.mcast_feature_arc_index,
976 };
977
978 VNET_FEATURE_INIT (ip4_vpath_mc, static) =
979 {
980   .arc_name = "ip4-multicast",
981   .node_name = "vpath-input-ip4",
982   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
983 };
984
985 VNET_FEATURE_INIT (ip4_mc_not_enabled, static) =
986 {
987   .arc_name = "ip4-multicast",
988   .node_name = "ip4-not-enabled",
989   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
990 };
991
992 VNET_FEATURE_INIT (ip4_lookup_mc, static) =
993 {
994   .arc_name = "ip4-multicast",
995   .node_name = "ip4-mfib-forward-lookup",
996   .runs_before = 0,     /* last feature */
997 };
998
999 /* Source and port-range check ip4 tx feature path definition */
1000 VNET_FEATURE_ARC_INIT (ip4_output, static) =
1001 {
1002   .arc_name = "ip4-output",
1003   .start_nodes = VNET_FEATURES ("ip4-rewrite", "ip4-midchain", "ip4-dvr-dpo"),
1004   .last_in_arc = "interface-output",
1005   .arc_index_ptr = &ip4_main.lookup_main.output_feature_arc_index,
1006 };
1007
1008 VNET_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) =
1009 {
1010   .arc_name = "ip4-output",
1011   .node_name = "ip4-source-and-port-range-check-tx",
1012   .runs_before = VNET_FEATURES ("ip4-outacl"),
1013 };
1014
1015 VNET_FEATURE_INIT (ip4_outacl, static) =
1016 {
1017   .arc_name = "ip4-output",
1018   .node_name = "ip4-outacl",
1019   .runs_before = VNET_FEATURES ("ipsec4-output-feature"),
1020 };
1021
1022 VNET_FEATURE_INIT (ip4_ipsec_output, static) =
1023 {
1024   .arc_name = "ip4-output",
1025   .node_name = "ipsec4-output-feature",
1026   .runs_before = VNET_FEATURES ("interface-output"),
1027 };
1028
1029 /* Built-in ip4 tx feature path definition */
1030 VNET_FEATURE_INIT (ip4_interface_output, static) =
1031 {
1032   .arc_name = "ip4-output",
1033   .node_name = "interface-output",
1034   .runs_before = 0,     /* not before any other features */
1035 };
1036 /* *INDENT-ON* */
1037
1038 static clib_error_t *
1039 ip4_sw_interface_add_del (vnet_main_t * vnm, u32 sw_if_index, u32 is_add)
1040 {
1041   ip4_main_t *im = &ip4_main;
1042
1043   /* Fill in lookup tables with default table (0). */
1044   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
1045   vec_validate (im->mfib_index_by_sw_if_index, sw_if_index);
1046
1047   if (!is_add)
1048     {
1049       ip4_main_t *im4 = &ip4_main;
1050       ip_lookup_main_t *lm4 = &im4->lookup_main;
1051       ip_interface_address_t *ia = 0;
1052       ip4_address_t *address;
1053       vlib_main_t *vm = vlib_get_main ();
1054
1055       vnet_sw_interface_update_unnumbered (sw_if_index, ~0, 0);
1056       /* *INDENT-OFF* */
1057       foreach_ip_interface_address (lm4, ia, sw_if_index, 0,
1058       ({
1059         address = ip_interface_address_get_address (lm4, ia);
1060         ip4_add_del_interface_address(vm, sw_if_index, address, ia->address_length, 1);
1061       }));
1062       /* *INDENT-ON* */
1063     }
1064
1065   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
1066                                is_add, 0, 0);
1067
1068   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
1069                                sw_if_index, is_add, 0, 0);
1070
1071   return /* no error */ 0;
1072 }
1073
1074 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
1075
/*
 * Global IP4 main: the single ip4_main_t instance referenced throughout
 * this file as ip4_main.  It is defined only when CLIB_MARCH_VARIANT is
 * not set, so that a build which compiles this file again with
 * CLIB_MARCH_VARIANT defined does not define it a second time.
 */
#ifndef CLIB_MARCH_VARIANT
ip4_main_t ip4_main;
#endif /* CLIB_MARCH_VARIANT */
1080
1081 static clib_error_t *
1082 ip4_lookup_init (vlib_main_t * vm)
1083 {
1084   ip4_main_t *im = &ip4_main;
1085   clib_error_t *error;
1086   uword i;
1087
1088   if ((error = vlib_call_init_function (vm, vnet_feature_init)))
1089     return error;
1090   if ((error = vlib_call_init_function (vm, ip4_mtrie_module_init)))
1091     return (error);
1092   if ((error = vlib_call_init_function (vm, fib_module_init)))
1093     return error;
1094   if ((error = vlib_call_init_function (vm, mfib_module_init)))
1095     return error;
1096
1097   for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
1098     {
1099       u32 m;
1100
1101       if (i < 32)
1102         m = pow2_mask (i) << (32 - i);
1103       else
1104         m = ~0;
1105       im->fib_masks[i] = clib_host_to_net_u32 (m);
1106     }
1107
1108   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
1109
1110   /* Create FIB with index 0 and table id of 0. */
1111   fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
1112                                      FIB_SOURCE_DEFAULT_ROUTE);
1113   mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
1114                                       MFIB_SOURCE_DEFAULT_ROUTE);
1115
1116   {
1117     pg_node_t *pn;
1118     pn = pg_get_node (ip4_lookup_node.index);
1119     pn->unformat_edit = unformat_pg_ip4_header;
1120   }
1121
1122   {
1123     ethernet_arp_header_t h;
1124
1125     clib_memset (&h, 0, sizeof (h));
1126
1127 #define _16(f,v) h.f = clib_host_to_net_u16 (v);
1128 #define _8(f,v) h.f = v;
1129     _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
1130     _16 (l3_type, ETHERNET_TYPE_IP4);
1131     _8 (n_l2_address_bytes, 6);
1132     _8 (n_l3_address_bytes, 4);
1133     _16 (opcode, ETHERNET_ARP_OPCODE_request);
1134 #undef _16
1135 #undef _8
1136
1137     vlib_packet_template_init (vm, &im->ip4_arp_request_packet_template,
1138                                /* data */ &h,
1139                                sizeof (h),
1140                                /* alloc chunk size */ 8,
1141                                "ip4 arp");
1142   }
1143
1144   return error;
1145 }
1146
1147 VLIB_INIT_FUNCTION (ip4_lookup_init);
1148
1149 typedef struct
1150 {
1151   /* Adjacency taken. */
1152   u32 dpo_index;
1153   u32 flow_hash;
1154   u32 fib_index;
1155
1156   /* Packet data, possibly *after* rewrite. */
1157   u8 packet_data[64 - 1 * sizeof (u32)];
1158 }
1159 ip4_forward_next_trace_t;
1160
1161 #ifndef CLIB_MARCH_VARIANT
1162 u8 *
1163 format_ip4_forward_next_trace (u8 * s, va_list * args)
1164 {
1165   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1166   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1167   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1168   u32 indent = format_get_indent (s);
1169   s = format (s, "%U%U",
1170               format_white_space, indent,
1171               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1172   return s;
1173 }
1174 #endif
1175
1176 static u8 *
1177 format_ip4_lookup_trace (u8 * s, va_list * args)
1178 {
1179   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1180   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1181   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1182   u32 indent = format_get_indent (s);
1183
1184   s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
1185               t->fib_index, t->dpo_index, t->flow_hash);
1186   s = format (s, "\n%U%U",
1187               format_white_space, indent,
1188               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1189   return s;
1190 }
1191
1192 static u8 *
1193 format_ip4_rewrite_trace (u8 * s, va_list * args)
1194 {
1195   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1196   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1197   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1198   u32 indent = format_get_indent (s);
1199
1200   s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
1201               t->fib_index, t->dpo_index, format_ip_adjacency,
1202               t->dpo_index, FORMAT_IP_ADJACENCY_NONE, t->flow_hash);
1203   s = format (s, "\n%U%U",
1204               format_white_space, indent,
1205               format_ip_adjacency_packet_data,
1206               t->packet_data, sizeof (t->packet_data));
1207   return s;
1208 }
1209
1210 #ifndef CLIB_MARCH_VARIANT
1211 /* Common trace function for all ip4-forward next nodes. */
1212 void
1213 ip4_forward_next_trace (vlib_main_t * vm,
1214                         vlib_node_runtime_t * node,
1215                         vlib_frame_t * frame, vlib_rx_or_tx_t which_adj_index)
1216 {
1217   u32 *from, n_left;
1218   ip4_main_t *im = &ip4_main;
1219
1220   n_left = frame->n_vectors;
1221   from = vlib_frame_vector_args (frame);
1222
1223   while (n_left >= 4)
1224     {
1225       u32 bi0, bi1;
1226       vlib_buffer_t *b0, *b1;
1227       ip4_forward_next_trace_t *t0, *t1;
1228
1229       /* Prefetch next iteration. */
1230       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1231       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1232
1233       bi0 = from[0];
1234       bi1 = from[1];
1235
1236       b0 = vlib_get_buffer (vm, bi0);
1237       b1 = vlib_get_buffer (vm, bi1);
1238
1239       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1240         {
1241           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1242           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1243           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1244           t0->fib_index =
1245             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1246              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1247             vec_elt (im->fib_index_by_sw_if_index,
1248                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1249
1250           clib_memcpy_fast (t0->packet_data,
1251                             vlib_buffer_get_current (b0),
1252                             sizeof (t0->packet_data));
1253         }
1254       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1255         {
1256           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1257           t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1258           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1259           t1->fib_index =
1260             (vnet_buffer (b1)->sw_if_index[VLIB_TX] !=
1261              (u32) ~ 0) ? vnet_buffer (b1)->sw_if_index[VLIB_TX] :
1262             vec_elt (im->fib_index_by_sw_if_index,
1263                      vnet_buffer (b1)->sw_if_index[VLIB_RX]);
1264           clib_memcpy_fast (t1->packet_data, vlib_buffer_get_current (b1),
1265                             sizeof (t1->packet_data));
1266         }
1267       from += 2;
1268       n_left -= 2;
1269     }
1270
1271   while (n_left >= 1)
1272     {
1273       u32 bi0;
1274       vlib_buffer_t *b0;
1275       ip4_forward_next_trace_t *t0;
1276
1277       bi0 = from[0];
1278
1279       b0 = vlib_get_buffer (vm, bi0);
1280
1281       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1282         {
1283           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1284           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1285           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1286           t0->fib_index =
1287             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1288              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1289             vec_elt (im->fib_index_by_sw_if_index,
1290                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1291           clib_memcpy_fast (t0->packet_data, vlib_buffer_get_current (b0),
1292                             sizeof (t0->packet_data));
1293         }
1294       from += 1;
1295       n_left -= 1;
1296     }
1297 }
1298
1299 /* Compute TCP/UDP/ICMP4 checksum in software. */
1300 u16
1301 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
1302                               ip4_header_t * ip0)
1303 {
1304   ip_csum_t sum0;
1305   u32 ip_header_length, payload_length_host_byte_order;
1306
1307   /* Initialize checksum with ip header. */
1308   ip_header_length = ip4_header_bytes (ip0);
1309   payload_length_host_byte_order =
1310     clib_net_to_host_u16 (ip0->length) - ip_header_length;
1311   sum0 =
1312     clib_host_to_net_u32 (payload_length_host_byte_order +
1313                           (ip0->protocol << 16));
1314
1315   if (BITS (uword) == 32)
1316     {
1317       sum0 =
1318         ip_csum_with_carry (sum0,
1319                             clib_mem_unaligned (&ip0->src_address, u32));
1320       sum0 =
1321         ip_csum_with_carry (sum0,
1322                             clib_mem_unaligned (&ip0->dst_address, u32));
1323     }
1324   else
1325     sum0 =
1326       ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
1327
1328   return ip_calculate_l4_checksum (vm, p0, sum0,
1329                                    payload_length_host_byte_order, (u8 *) ip0,
1330                                    ip_header_length, NULL);
1331 }
1332
1333 u32
1334 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1335 {
1336   ip4_header_t *ip0 = vlib_buffer_get_current (p0);
1337   udp_header_t *udp0;
1338   u16 sum16;
1339
1340   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1341           || ip0->protocol == IP_PROTOCOL_UDP);
1342
1343   udp0 = (void *) (ip0 + 1);
1344   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1345     {
1346       p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1347                     | VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
1348       return p0->flags;
1349     }
1350
1351   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1352
1353   p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1354                 | ((sum16 == 0) << VNET_BUFFER_F_LOG2_L4_CHECKSUM_CORRECT));
1355
1356   return p0->flags;
1357 }
1358 #endif
1359
1360 /* *INDENT-OFF* */
1361 VNET_FEATURE_ARC_INIT (ip4_local) =
1362 {
1363   .arc_name  = "ip4-local",
1364   .start_nodes = VNET_FEATURES ("ip4-local"),
1365   .last_in_arc = "ip4-local-end-of-arc",
1366 };
1367 /* *INDENT-ON* */
1368
1369 static inline void
1370 ip4_local_l4_csum_validate (vlib_main_t * vm, vlib_buffer_t * p,
1371                             ip4_header_t * ip, u8 is_udp, u8 * error,
1372                             u8 * good_tcp_udp)
1373 {
1374   u32 flags0;
1375   flags0 = ip4_tcp_udp_validate_checksum (vm, p);
1376   *good_tcp_udp = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0;
1377   if (is_udp)
1378     {
1379       udp_header_t *udp;
1380       u32 ip_len, udp_len;
1381       i32 len_diff;
1382       udp = ip4_next_header (ip);
1383       /* Verify UDP length. */
1384       ip_len = clib_net_to_host_u16 (ip->length);
1385       udp_len = clib_net_to_host_u16 (udp->length);
1386
1387       len_diff = ip_len - udp_len;
1388       *good_tcp_udp &= len_diff >= 0;
1389       *error = len_diff < 0 ? IP4_ERROR_UDP_LENGTH : *error;
1390     }
1391 }
1392
/*
 * Checksum-state predicates on a buffer pointer _b.
 *
 * Every argument and every expansion is fully parenthesised so that the
 * macros behave as single expressions wherever they are used (for example
 * under '!' or next to '&&'), and accept any pointer-valued expression.
 */

/* Non-zero when TCP or UDP checksum offload is flagged on the buffer. */
#define ip4_local_csum_is_offloaded(_b)                                 \
    (((_b)->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM)                    \
        || ((_b)->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM))

/* Non-zero when a TCP/UDP packet still needs its checksum verified in
 * software: not already computed and not offloaded. */
#define ip4_local_need_csum_check(is_tcp_udp, _b)                       \
    ((is_tcp_udp)                                                       \
        && !(((_b)->flags & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED)         \
             || ip4_local_csum_is_offloaded (_b)))

/* 1 when the checksum is flagged correct or is offloaded, else 0. */
#define ip4_local_csum_is_valid(_b)                                     \
    ((((_b)->flags & VNET_BUFFER_F_L4_CHECKSUM_CORRECT)                 \
        || ip4_local_csum_is_offloaded (_b)) != 0)
1404
1405 static inline void
1406 ip4_local_check_l4_csum (vlib_main_t * vm, vlib_buffer_t * b,
1407                          ip4_header_t * ih, u8 * error)
1408 {
1409   u8 is_udp, is_tcp_udp, good_tcp_udp;
1410
1411   is_udp = ih->protocol == IP_PROTOCOL_UDP;
1412   is_tcp_udp = is_udp || ih->protocol == IP_PROTOCOL_TCP;
1413
1414   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp, b)))
1415     ip4_local_l4_csum_validate (vm, b, ih, is_udp, error, &good_tcp_udp);
1416   else
1417     good_tcp_udp = ip4_local_csum_is_valid (b);
1418
1419   ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1420   *error = (is_tcp_udp && !good_tcp_udp
1421             ? IP4_ERROR_TCP_CHECKSUM + is_udp : *error);
1422 }
1423
1424 static inline void
1425 ip4_local_check_l4_csum_x2 (vlib_main_t * vm, vlib_buffer_t ** b,
1426                             ip4_header_t ** ih, u8 * error)
1427 {
1428   u8 is_udp[2], is_tcp_udp[2], good_tcp_udp[2];
1429
1430   is_udp[0] = ih[0]->protocol == IP_PROTOCOL_UDP;
1431   is_udp[1] = ih[1]->protocol == IP_PROTOCOL_UDP;
1432
1433   is_tcp_udp[0] = is_udp[0] || ih[0]->protocol == IP_PROTOCOL_TCP;
1434   is_tcp_udp[1] = is_udp[1] || ih[1]->protocol == IP_PROTOCOL_TCP;
1435
1436   good_tcp_udp[0] = ip4_local_csum_is_valid (b[0]);
1437   good_tcp_udp[1] = ip4_local_csum_is_valid (b[1]);
1438
1439   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp[0], b[0])
1440                      || ip4_local_need_csum_check (is_tcp_udp[1], b[1])))
1441     {
1442       if (is_tcp_udp[0])
1443         ip4_local_l4_csum_validate (vm, b[0], ih[0], is_udp[0], &error[0],
1444                                     &good_tcp_udp[0]);
1445       if (is_tcp_udp[1])
1446         ip4_local_l4_csum_validate (vm, b[1], ih[1], is_udp[1], &error[1],
1447                                     &good_tcp_udp[1]);
1448     }
1449
1450   error[0] = (is_tcp_udp[0] && !good_tcp_udp[0] ?
1451               IP4_ERROR_TCP_CHECKSUM + is_udp[0] : error[0]);
1452   error[1] = (is_tcp_udp[1] && !good_tcp_udp[1] ?
1453               IP4_ERROR_TCP_CHECKSUM + is_udp[1] : error[1]);
1454 }
1455
1456 static inline void
1457 ip4_local_set_next_and_error (vlib_node_runtime_t * error_node,
1458                               vlib_buffer_t * b, u16 * next, u8 error,
1459                               u8 head_of_feature_arc)
1460 {
1461   u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index;
1462   u32 next_index;
1463
1464   *next = error != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : *next;
1465   b->error = error ? error_node->errors[error] : 0;
1466   if (head_of_feature_arc)
1467     {
1468       next_index = *next;
1469       if (PREDICT_TRUE (error == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
1470         {
1471           vnet_feature_arc_start (arc_index,
1472                                   vnet_buffer (b)->sw_if_index[VLIB_RX],
1473                                   &next_index, b);
1474           *next = next_index;
1475         }
1476     }
1477 }
1478
1479 typedef struct
1480 {
1481   ip4_address_t src;
1482   u32 lbi;
1483   u8 error;
1484   u8 first;
1485 } ip4_local_last_check_t;
1486
1487 static inline void
1488 ip4_local_check_src (vlib_buffer_t * b, ip4_header_t * ip0,
1489                      ip4_local_last_check_t * last_check, u8 * error0)
1490 {
1491   ip4_fib_mtrie_leaf_t leaf0;
1492   ip4_fib_mtrie_t *mtrie0;
1493   const dpo_id_t *dpo0;
1494   load_balance_t *lb0;
1495   u32 lbi0;
1496
1497   vnet_buffer (b)->ip.fib_index =
1498     vnet_buffer (b)->sw_if_index[VLIB_TX] != ~0 ?
1499     vnet_buffer (b)->sw_if_index[VLIB_TX] : vnet_buffer (b)->ip.fib_index;
1500
1501   /*
1502    * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
1503    *  adjacency for the destination address (the local interface address).
1504    * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
1505    *  adjacency for the source address (the remote sender's address)
1506    */
1507   if (PREDICT_FALSE (last_check->first ||
1508                      (last_check->src.as_u32 != ip0->src_address.as_u32)))
1509     {
1510       mtrie0 = &ip4_fib_get (vnet_buffer (b)->ip.fib_index)->mtrie;
1511       leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, &ip0->src_address);
1512       leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1513       leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1514       lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1515
1516       vnet_buffer (b)->ip.adj_index[VLIB_RX] =
1517         vnet_buffer (b)->ip.adj_index[VLIB_TX];
1518       vnet_buffer (b)->ip.adj_index[VLIB_TX] = lbi0;
1519
1520       lb0 = load_balance_get (lbi0);
1521       dpo0 = load_balance_get_bucket_i (lb0, 0);
1522
1523       /*
1524        * Must have a route to source otherwise we drop the packet.
1525        * ip4 broadcasts are accepted, e.g. to make dhcp client work
1526        *
1527        * The checks are:
1528        *  - the source is a recieve => it's from us => bogus, do this
1529        *    first since it sets a different error code.
1530        *  - uRPF check for any route to source - accept if passes.
1531        *  - allow packets destined to the broadcast address from unknown sources
1532        */
1533
1534       *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1535                   && dpo0->dpoi_type == DPO_RECEIVE) ?
1536                  IP4_ERROR_SPOOFED_LOCAL_PACKETS : *error0);
1537       *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1538                   && !fib_urpf_check_size (lb0->lb_urpf)
1539                   && ip0->dst_address.as_u32 != 0xFFFFFFFF) ?
1540                  IP4_ERROR_SRC_LOOKUP_MISS : *error0);
1541
1542       last_check->src.as_u32 = ip0->src_address.as_u32;
1543       last_check->lbi = lbi0;
1544       last_check->error = *error0;
1545     }
1546   else
1547     {
1548       vnet_buffer (b)->ip.adj_index[VLIB_RX] =
1549         vnet_buffer (b)->ip.adj_index[VLIB_TX];
1550       vnet_buffer (b)->ip.adj_index[VLIB_TX] = last_check->lbi;
1551       *error0 = last_check->error;
1552       last_check->first = 0;
1553     }
1554 }
1555
1556 static inline void
1557 ip4_local_check_src_x2 (vlib_buffer_t ** b, ip4_header_t ** ip,
1558                         ip4_local_last_check_t * last_check, u8 * error)
1559 {
1560   ip4_fib_mtrie_leaf_t leaf[2];
1561   ip4_fib_mtrie_t *mtrie[2];
1562   const dpo_id_t *dpo[2];
1563   load_balance_t *lb[2];
1564   u32 not_last_hit;
1565   u32 lbi[2];
1566
1567   not_last_hit = last_check->first;
1568   not_last_hit |= ip[0]->src_address.as_u32 ^ last_check->src.as_u32;
1569   not_last_hit |= ip[1]->src_address.as_u32 ^ last_check->src.as_u32;
1570
1571   vnet_buffer (b[0])->ip.fib_index =
1572     vnet_buffer (b[0])->sw_if_index[VLIB_TX] != ~0 ?
1573     vnet_buffer (b[0])->sw_if_index[VLIB_TX] :
1574     vnet_buffer (b[0])->ip.fib_index;
1575
1576   vnet_buffer (b[1])->ip.fib_index =
1577     vnet_buffer (b[1])->sw_if_index[VLIB_TX] != ~0 ?
1578     vnet_buffer (b[1])->sw_if_index[VLIB_TX] :
1579     vnet_buffer (b[1])->ip.fib_index;
1580
1581   /*
1582    * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
1583    *  adjacency for the destination address (the local interface address).
1584    * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
1585    *  adjacency for the source address (the remote sender's address)
1586    */
1587   if (PREDICT_FALSE (not_last_hit))
1588     {
1589       mtrie[0] = &ip4_fib_get (vnet_buffer (b[0])->ip.fib_index)->mtrie;
1590       mtrie[1] = &ip4_fib_get (vnet_buffer (b[1])->ip.fib_index)->mtrie;
1591
1592       leaf[0] = ip4_fib_mtrie_lookup_step_one (mtrie[0], &ip[0]->src_address);
1593       leaf[1] = ip4_fib_mtrie_lookup_step_one (mtrie[1], &ip[1]->src_address);
1594
1595       leaf[0] = ip4_fib_mtrie_lookup_step (mtrie[0], leaf[0],
1596                                            &ip[0]->src_address, 2);
1597       leaf[1] = ip4_fib_mtrie_lookup_step (mtrie[1], leaf[1],
1598                                            &ip[1]->src_address, 2);
1599
1600       leaf[0] = ip4_fib_mtrie_lookup_step (mtrie[0], leaf[0],
1601                                            &ip[0]->src_address, 3);
1602       leaf[1] = ip4_fib_mtrie_lookup_step (mtrie[1], leaf[1],
1603                                            &ip[1]->src_address, 3);
1604
1605       lbi[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf[0]);
1606       lbi[1] = ip4_fib_mtrie_leaf_get_adj_index (leaf[1]);
1607
1608       vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
1609         vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
1610       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = lbi[0];
1611
1612       vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
1613         vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
1614       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = lbi[1];
1615
1616       lb[0] = load_balance_get (lbi[0]);
1617       lb[1] = load_balance_get (lbi[1]);
1618
1619       dpo[0] = load_balance_get_bucket_i (lb[0], 0);
1620       dpo[1] = load_balance_get_bucket_i (lb[1], 0);
1621
1622       error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1623                    dpo[0]->dpoi_type == DPO_RECEIVE) ?
1624                   IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[0]);
1625       error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1626                    !fib_urpf_check_size (lb[0]->lb_urpf) &&
1627                    ip[0]->dst_address.as_u32 != 0xFFFFFFFF)
1628                   ? IP4_ERROR_SRC_LOOKUP_MISS : error[0]);
1629
1630       error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1631                    dpo[1]->dpoi_type == DPO_RECEIVE) ?
1632                   IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[1]);
1633       error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1634                    !fib_urpf_check_size (lb[1]->lb_urpf) &&
1635                    ip[1]->dst_address.as_u32 != 0xFFFFFFFF)
1636                   ? IP4_ERROR_SRC_LOOKUP_MISS : error[1]);
1637
1638       last_check->src.as_u32 = ip[1]->src_address.as_u32;
1639       last_check->lbi = lbi[1];
1640       last_check->error = error[1];
1641     }
1642   else
1643     {
1644       vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
1645         vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
1646       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = last_check->lbi;
1647
1648       vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
1649         vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
1650       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = last_check->lbi;
1651
1652       error[0] = last_check->error;
1653       error[1] = last_check->error;
1654       last_check->first = 0;
1655     }
1656 }
1657
/*
 * Classification of a packet arriving at ip4-local.  The L4 type is the
 * zero value; the other types are non-zero.
 */
enum ip_local_packet_type_e
{
  IP_LOCAL_PACKET_TYPE_L4 = 0,
  IP_LOCAL_PACKET_TYPE_NAT = 1,
  IP_LOCAL_PACKET_TYPE_FRAG = 2,
};
1664
1665 /**
1666  * Determine packet type and next node.
1667  *
1668  * The expectation is that all packets that are not L4 will skip
1669  * checksums and source checks.
1670  */
1671 always_inline u8
1672 ip4_local_classify (vlib_buffer_t * b, ip4_header_t * ip, u16 * next)
1673 {
1674   ip_lookup_main_t *lm = &ip4_main.lookup_main;
1675
1676   if (PREDICT_FALSE (ip4_is_fragment (ip)))
1677     {
1678       *next = IP_LOCAL_NEXT_REASSEMBLY;
1679       return IP_LOCAL_PACKET_TYPE_FRAG;
1680     }
1681   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_IS_NATED))
1682     {
1683       *next = lm->local_next_by_ip_protocol[ip->protocol];
1684       return IP_LOCAL_PACKET_TYPE_NAT;
1685     }
1686
1687   *next = lm->local_next_by_ip_protocol[ip->protocol];
1688   return IP_LOCAL_PACKET_TYPE_L4;
1689 }
1690
/**
 * Shared worker for the ip4-local and ip4-local-end-of-arc nodes.
 *
 * Each packet in the frame is classified with ip4_local_classify(). When
 * head_of_feature_arc is non-zero and the packet is classified as L4, its L4
 * checksum and source address are checked; all other packets skip both
 * checks. The resulting next index and error are then finalised by
 * ip4_local_set_next_and_error() and the whole frame is enqueued.
 *
 * @param vm                   vlib main
 * @param node                 runtime of the calling node
 * @param frame                frame of buffer indices to process
 * @param head_of_feature_arc  non-zero to run the checksum/source checks
 * @return number of packets in the frame
 */
static inline uword
ip4_local_inline (vlib_main_t * vm,
                  vlib_node_runtime_t * node,
                  vlib_frame_t * frame, int head_of_feature_arc)
{
  u32 *from, n_left_from;
  /* Errors are looked up through the runtime of ip4_input_node. */
  vlib_node_runtime_t *error_node =
    vlib_node_get_runtime (vm, ip4_input_node.index);
  u16 nexts[VLIB_FRAME_SIZE], *next;
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  ip4_header_t *ip[2];
  u8 error[2], pt[2];

  /* State carried from one source check to the next across the frame. */
  ip4_local_last_check_t last_check = {
    /*
     * 0.0.0.0 can appear as the source address of an IP packet,
     * as can any other address, hence the need to use the 'first'
     * member to make sure the .lbi is initialised for the first
     * packet.
     */
    .src = {.as_u32 = 0},
    .lbi = ~0,
    .error = IP4_ERROR_UNKNOWN_PROTOCOL,
    .first = 1,
  };

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    ip4_forward_next_trace (vm, node, frame, VLIB_TX);

  vlib_get_buffers (vm, from, bufs, n_left_from);
  b = bufs;
  next = nexts;

  /*
   * Dual loop: two packets per iteration. At least 6 must remain because
   * b[4] and b[5] are prefetched below.
   */
  while (n_left_from >= 6)
    {
      u8 not_batch = 0;

      /* Prefetch the buffers that will be processed two iterations ahead. */
      {
        vlib_prefetch_buffer_header (b[4], LOAD);
        vlib_prefetch_buffer_header (b[5], LOAD);

        CLIB_PREFETCH (b[4]->data, CLIB_CACHE_LINE_BYTES, LOAD);
        CLIB_PREFETCH (b[5]->data, CLIB_CACHE_LINE_BYTES, LOAD);
      }

      error[0] = error[1] = IP4_ERROR_UNKNOWN_PROTOCOL;

      ip[0] = vlib_buffer_get_current (b[0]);
      ip[1] = vlib_buffer_get_current (b[1]);

      /* Record where the IPv4 header starts in each buffer. */
      vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
      vnet_buffer (b[1])->l3_hdr_offset = b[1]->current_data;

      pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
      pt[1] = ip4_local_classify (b[1], ip[1], &next[1]);

      /* Non-zero when the two packets were classified differently. */
      not_batch = pt[0] ^ pt[1];

      /*
       * Nothing to check when not at the head of the arc, or when both
       * packets share the same non-L4 classification.
       */
      if (head_of_feature_arc == 0 || (pt[0] && not_batch == 0))
        goto skip_checks;

      if (PREDICT_TRUE (not_batch == 0))
        {
          /* Both packets are L4: use the paired helpers. */
          ip4_local_check_l4_csum_x2 (vm, b, ip, error);
          ip4_local_check_src_x2 (b, ip, &last_check, error);
        }
      else
        {
          /* Mixed pair: check only the packet(s) classified as L4. */
          if (!pt[0])
            {
              ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
              ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
            }
          if (!pt[1])
            {
              ip4_local_check_l4_csum (vm, b[1], ip[1], &error[1]);
              ip4_local_check_src (b[1], ip[1], &last_check, &error[1]);
            }
        }

    skip_checks:

      ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
                                    head_of_feature_arc);
      ip4_local_set_next_and_error (error_node, b[1], &next[1], error[1],
                                    head_of_feature_arc);

      b += 2;
      next += 2;
      n_left_from -= 2;
    }

  /* Single loop: handle the remaining packets one at a time. */
  while (n_left_from > 0)
    {
      error[0] = IP4_ERROR_UNKNOWN_PROTOCOL;

      ip[0] = vlib_buffer_get_current (b[0]);
      vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
      pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);

      /* Same skip rule as above: not head of arc, or packet is not L4. */
      if (head_of_feature_arc == 0 || pt[0])
        goto skip_check;

      ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
      ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);

    skip_check:

      ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
                                    head_of_feature_arc);

      b += 1;
      next += 1;
      n_left_from -= 1;
    }

  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
  return frame->n_vectors;
}
1814
/*
 * ip4-local node function: runs ip4_local_inline() as the head of the
 * feature arc, i.e. with the checksum and source checks enabled.
 */
VLIB_NODE_FN (ip4_local_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
{
  return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ );
}
1820
1821 /* *INDENT-OFF* */
/*
 * Registration of the "ip4-local" graph node and its fixed next nodes.
 * Further next nodes are added at run time by ip4_register_protocol().
 */
VLIB_REGISTER_NODE (ip4_local_node) =
{
  .name = "ip4-local",
  .vector_size = sizeof (u32),
  .format_trace = format_ip4_forward_next_trace,
  .n_next_nodes = IP_LOCAL_N_NEXT,
  .next_nodes =
  {
    [IP_LOCAL_NEXT_DROP] = "ip4-drop",
    [IP_LOCAL_NEXT_PUNT] = "ip4-punt",
    [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
    [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
    /* Target chosen by ip4_local_classify() for fragments. */
    [IP_LOCAL_NEXT_REASSEMBLY] = "ip4-full-reassembly",
  },
};
1837 /* *INDENT-ON* */
1838
1839
/*
 * ip4-local-end-of-arc node function: runs ip4_local_inline() with
 * head_of_feature_arc == 0, so the checksum and source checks are skipped.
 */
VLIB_NODE_FN (ip4_local_end_of_arc_node) (vlib_main_t * vm,
                                          vlib_node_runtime_t * node,
                                          vlib_frame_t * frame)
{
  return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ );
}
1846
1847 /* *INDENT-OFF* */
/*
 * Registration of the "ip4-local-end-of-arc" graph node. It is declared a
 * sibling of "ip4-local" and therefore lists no next nodes of its own.
 */
VLIB_REGISTER_NODE (ip4_local_end_of_arc_node) = {
  .name = "ip4-local-end-of-arc",
  .vector_size = sizeof (u32),

  .format_trace = format_ip4_forward_next_trace,
  .sibling_of = "ip4-local",
};
1855
/*
 * Registers "ip4-local-end-of-arc" as a feature on the "ip4-local" arc,
 * with no ordering constraint relative to other features.
 */
VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = {
  .arc_name = "ip4-local",
  .node_name = "ip4-local-end-of-arc",
  .runs_before = 0, /* not before any other features */
};
1861 /* *INDENT-ON* */
1862
1863 #ifndef CLIB_MARCH_VARIANT
1864 void
1865 ip4_register_protocol (u32 protocol, u32 node_index)
1866 {
1867   vlib_main_t *vm = vlib_get_main ();
1868   ip4_main_t *im = &ip4_main;
1869   ip_lookup_main_t *lm = &im->lookup_main;
1870
1871   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1872   lm->local_next_by_ip_protocol[protocol] =
1873     vlib_node_add_next (vm, ip4_local_node.index, node_index);
1874 }
1875
1876 void
1877 ip4_unregister_protocol (u32 protocol)
1878 {
1879   ip4_main_t *im = &ip4_main;
1880   ip_lookup_main_t *lm = &im->lookup_main;
1881
1882   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1883   lm->local_next_by_ip_protocol[protocol] = IP_LOCAL_NEXT_PUNT;
1884 }
1885 #endif
1886
1887 static clib_error_t *
1888 show_ip_local_command_fn (vlib_main_t * vm,
1889                           unformat_input_t * input, vlib_cli_command_t * cmd)
1890 {
1891   ip4_main_t *im = &ip4_main;
1892   ip_lookup_main_t *lm = &im->lookup_main;
1893   int i;
1894
1895   vlib_cli_output (vm, "Protocols handled by ip4_local");
1896   for (i = 0; i < ARRAY_LEN (lm->local_next_by_ip_protocol); i++)
1897     {
1898       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
1899         {
1900           u32 node_index = vlib_get_node (vm,
1901                                           ip4_local_node.index)->
1902             next_nodes[lm->local_next_by_ip_protocol[i]];
1903           vlib_cli_output (vm, "%U: %U", format_ip_protocol, i,
1904                            format_vlib_node_name, vm, node_index);
1905         }
1906     }
1907   return 0;
1908 }
1909
1910
1911
/*?
 * Display the set of protocols handled by the local IPv4 stack.
 *
 * Each protocol with a registered handler is listed on its own line in the
 * form '<protocol>: <next node name>'.
 *
 * @cliexpar
 * Example of how to display local protocol table (illustrative output; the
 * actual lines depend on which protocols have been registered):
 * @cliexstart{show ip local}
 * Protocols handled by ip4_local
 * ICMP: ip4-icmp-input
 * UDP: ip4-udp-lookup
 * @cliexend
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip_local, static) =
{
  .path = "show ip local",
  .function = show_ip_local_command_fn,
  .short_help = "show ip local",
};
1931 /* *INDENT-ON* */
1932
1933 always_inline uword
1934 ip4_arp_inline (vlib_main_t * vm,
1935                 vlib_node_runtime_t * node,
1936                 vlib_frame_t * frame, int is_glean)
1937 {
1938   vnet_main_t *vnm = vnet_get_main ();
1939   ip4_main_t *im = &ip4_main;
1940   ip_lookup_main_t *lm = &im->lookup_main;
1941   u32 *from, *to_next_drop;
1942   uword n_left_from, n_left_to_next_drop, next_index;
1943   u32 thread_index = vm->thread_index;
1944   u64 seed;
1945
1946   if (node->flags & VLIB_NODE_FLAG_TRACE)
1947     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1948
1949   seed = throttle_seed (&im->arp_throttle, thread_index, vlib_time_now (vm));
1950
1951   from = vlib_frame_vector_args (frame);
1952   n_left_from = frame->n_vectors;
1953   next_index = node->cached_next_index;
1954   if (next_index == IP4_ARP_NEXT_DROP)
1955     next_index = IP4_ARP_N_NEXT;        /* point to first interface */
1956
1957   while (n_left_from > 0)
1958     {
1959       vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
1960                            to_next_drop, n_left_to_next_drop);
1961
1962       while (n_left_from > 0 && n_left_to_next_drop > 0)
1963         {
1964           u32 pi0, bi0, adj_index0, sw_if_index0;
1965           ip_adjacency_t *adj0;
1966           vlib_buffer_t *p0, *b0;
1967           ip4_address_t resolve0;
1968           ethernet_arp_header_t *h0;
1969           vnet_hw_interface_t *hw_if0;
1970           u64 r0;
1971
1972           pi0 = from[0];
1973           p0 = vlib_get_buffer (vm, pi0);
1974
1975           from += 1;
1976           n_left_from -= 1;
1977           to_next_drop[0] = pi0;
1978           to_next_drop += 1;
1979           n_left_to_next_drop -= 1;
1980
1981           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
1982           adj0 = adj_get (adj_index0);
1983
1984           if (is_glean)
1985             {
1986               /* resolve the packet's destination */
1987               ip4_header_t *ip0 = vlib_buffer_get_current (p0);
1988               resolve0 = ip0->dst_address;
1989             }
1990           else
1991             {
1992               /* resolve the incomplete adj */
1993               resolve0 = adj0->sub_type.nbr.next_hop.ip4;
1994             }
1995
1996           /* combine the address and interface for the hash key */
1997           sw_if_index0 = adj0->rewrite_header.sw_if_index;
1998           r0 = (u64) resolve0.data_u32 << 32;
1999           r0 |= sw_if_index0;
2000
2001           if (throttle_check (&im->arp_throttle, thread_index, r0, seed))
2002             {
2003               p0->error = node->errors[IP4_ARP_ERROR_THROTTLED];
2004               continue;
2005             }
2006
2007           /*
2008            * the adj has been updated to a rewrite but the node the DPO that got
2009            * us here hasn't - yet. no big deal. we'll drop while we wait.
2010            */
2011           if (IP_LOOKUP_NEXT_REWRITE == adj0->lookup_next_index)
2012             {
2013               p0->error = node->errors[IP4_ARP_ERROR_RESOLVED];
2014               continue;
2015             }
2016
2017           /*
2018            * Can happen if the control-plane is programming tables
2019            * with traffic flowing; at least that's today's lame excuse.
2020            */
2021           if ((is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN)
2022               || (!is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP))
2023             {
2024               p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
2025               continue;
2026             }
2027           /* Send ARP request. */
2028           h0 =
2029             vlib_packet_template_get_packet (vm,
2030                                              &im->ip4_arp_request_packet_template,
2031                                              &bi0);
2032           /* Seems we're out of buffers */
2033           if (PREDICT_FALSE (!h0))
2034             {
2035               p0->error = node->errors[IP4_ARP_ERROR_NO_BUFFERS];
2036               continue;
2037             }
2038
2039           b0 = vlib_get_buffer (vm, bi0);
2040
2041           /* copy the persistent fields from the original */
2042           clib_memcpy_fast (b0->opaque2, p0->opaque2, sizeof (p0->opaque2));
2043
2044           /* Add rewrite/encap string for ARP packet. */
2045           vnet_rewrite_one_header (adj0[0], h0, sizeof (ethernet_header_t));
2046
2047           hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
2048
2049           /* Src ethernet address in ARP header. */
2050           mac_address_from_bytes (&h0->ip4_over_ethernet[0].mac,
2051                                   hw_if0->hw_address);
2052           if (is_glean)
2053             {
2054               /* The interface's source address is stashed in the Glean Adj */
2055               h0->ip4_over_ethernet[0].ip4 =
2056                 adj0->sub_type.glean.receive_addr.ip4;
2057             }
2058           else
2059             {
2060               /* Src IP address in ARP header. */
2061               if (ip4_src_address_for_packet (lm, sw_if_index0,
2062                                               &h0->ip4_over_ethernet[0].ip4))
2063                 {
2064                   /* No source address available */
2065                   p0->error = node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
2066                   vlib_buffer_free (vm, &bi0, 1);
2067                   continue;
2068                 }
2069             }
2070           h0->ip4_over_ethernet[1].ip4 = resolve0;
2071
2072           p0->error = node->errors[IP4_ARP_ERROR_REQUEST_SENT];
2073
2074           vlib_buffer_copy_trace_flag (vm, p0, bi0);
2075           VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
2076           vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
2077
2078           vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
2079
2080           vlib_set_next_frame_buffer (vm, node,
2081                                       adj0->rewrite_header.next_index, bi0);
2082         }
2083
2084       vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
2085     }
2086
2087   return frame->n_vectors;
2088 }
2089
2090 VLIB_NODE_FN (ip4_arp_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2091                              vlib_frame_t * frame)
2092 {
2093   return (ip4_arp_inline (vm, node, frame, 0));
2094 }
2095
2096 VLIB_NODE_FN (ip4_glean_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2097                                vlib_frame_t * frame)
2098 {
2099   return (ip4_arp_inline (vm, node, frame, 1));
2100 }
2101
/*
 * Counter descriptions indexed by IP4_ARP_ERROR_*; shared by the ip4-arp
 * and ip4-glean node registrations.
 */
static char *ip4_arp_error_strings[] = {
  [IP4_ARP_ERROR_THROTTLED] = "ARP requests throttled",
  [IP4_ARP_ERROR_RESOLVED] = "ARP requests resolved",
  [IP4_ARP_ERROR_NO_BUFFERS] = "ARP requests out of buffer",
  [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
  [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
  [IP4_ARP_ERROR_NO_SOURCE_ADDRESS] = "no source address for ARP request",
};
2110
2111 /* *INDENT-OFF* */
/*
 * Registration of the "ip4-arp" graph node. Only the drop next is static;
 * generated ARP requests are sent to the next index stored in the
 * adjacency's rewrite header.
 */
VLIB_REGISTER_NODE (ip4_arp_node) =
{
  .name = "ip4-arp",
  .vector_size = sizeof (u32),
  .format_trace = format_ip4_forward_next_trace,
  .n_errors = ARRAY_LEN (ip4_arp_error_strings),
  .error_strings = ip4_arp_error_strings,
  .n_next_nodes = IP4_ARP_N_NEXT,
  .next_nodes =
  {
    [IP4_ARP_NEXT_DROP] = "error-drop",
  },
};
2125
/*
 * Registration of the "ip4-glean" graph node. It uses the same error
 * strings and the same single static (drop) next as "ip4-arp".
 */
VLIB_REGISTER_NODE (ip4_glean_node) =
{
  .name = "ip4-glean",
  .vector_size = sizeof (u32),
  .format_trace = format_ip4_forward_next_trace,
  .n_errors = ARRAY_LEN (ip4_arp_error_strings),
  .error_strings = ip4_arp_error_strings,
  .n_next_nodes = IP4_ARP_N_NEXT,
  .next_nodes = {
  [IP4_ARP_NEXT_DROP] = "error-drop",
  },
};
2138 /* *INDENT-ON* */
2139
/*
 * X-macro listing the IP4_ARP_ERROR_* suffixes that arp_notrace_init()
 * registers with the pcap drop-trace filter. The user supplies _(name).
 */
#define foreach_notrace_ip4_arp_error           \
_(THROTTLED)                                    \
_(RESOLVED)                                     \
_(NO_BUFFERS)                                   \
_(REQUEST_SENT)                                 \
_(NON_ARP_ADJ)                                  \
_(NO_SOURCE_ADDRESS)
2147
/**
 * Init function: adds every error listed in foreach_notrace_ip4_arp_error,
 * as seen in the ip4-arp node runtime, to the pcap drop-trace filter.
 *
 * NOTE(review): only the runtime of ip4_arp_node is consulted here, although
 * ip4_glean_node sets the same IP4_ARP_ERROR_* errors on its packets;
 * confirm whether the glean node is meant to be filtered as well.
 *
 * @return always 0 (no error)
 */
static clib_error_t *
arp_notrace_init (vlib_main_t * vm)
{
  vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, ip4_arp_node.index);

  /* don't trace ARP request packets */
#define _(a)                                    \
    vnet_pcap_drop_trace_filter_add_del         \
        (rt->errors[IP4_ARP_ERROR_##a],         \
         1 /* is_add */);
  foreach_notrace_ip4_arp_error;
#undef _
  return 0;
}

VLIB_INIT_FUNCTION (arp_notrace_init);
2164
2165
2166 #ifndef CLIB_MARCH_VARIANT
2167 /* Send an ARP request to see if given destination is reachable on given interface. */
2168 clib_error_t *
2169 ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index,
2170                     u8 refresh)
2171 {
2172   vnet_main_t *vnm = vnet_get_main ();
2173   ip4_main_t *im = &ip4_main;
2174   ethernet_arp_header_t *h;
2175   ip4_address_t *src;
2176   ip_interface_address_t *ia;
2177   ip_adjacency_t *adj;
2178   vnet_hw_interface_t *hi;
2179   vnet_sw_interface_t *si;
2180   vlib_buffer_t *b;
2181   adj_index_t ai;
2182   u32 bi = 0;
2183   u8 unicast_rewrite = 0;
2184
2185   si = vnet_get_sw_interface (vnm, sw_if_index);
2186
2187   if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
2188     {
2189       return clib_error_return (0, "%U: interface %U down",
2190                                 format_ip4_address, dst,
2191                                 format_vnet_sw_if_index_name, vnm,
2192                                 sw_if_index);
2193     }
2194
2195   src =
2196     ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
2197   if (!src)
2198     {
2199       vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
2200       return clib_error_return
2201         (0,
2202          "no matching interface address for destination %U (interface %U)",
2203          format_ip4_address, dst, format_vnet_sw_if_index_name, vnm,
2204          sw_if_index);
2205     }
2206
2207   h = vlib_packet_template_get_packet (vm,
2208                                        &im->ip4_arp_request_packet_template,
2209                                        &bi);
2210
2211   if (!h)
2212     return clib_error_return (0, "ARP request packet allocation failed");
2213
2214   hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
2215   if (PREDICT_FALSE (!hi->hw_address))
2216     {
2217       return clib_error_return (0, "%U: interface %U do not support ip probe",
2218                                 format_ip4_address, dst,
2219                                 format_vnet_sw_if_index_name, vnm,
2220                                 sw_if_index);
2221     }
2222
2223   mac_address_from_bytes (&h->ip4_over_ethernet[0].mac, hi->hw_address);
2224
2225   h->ip4_over_ethernet[0].ip4 = src[0];
2226   h->ip4_over_ethernet[1].ip4 = dst[0];
2227
2228   b = vlib_get_buffer (vm, bi);
2229   vnet_buffer (b)->sw_if_index[VLIB_RX] =
2230     vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
2231
2232   ip46_address_t nh = {
2233     .ip4 = *dst,
2234   };
2235
2236   ai = adj_nbr_add_or_lock (FIB_PROTOCOL_IP4,
2237                             VNET_LINK_IP4, &nh, sw_if_index);
2238   adj = adj_get (ai);
2239
2240   /* Peer has been previously resolved, retrieve glean adj instead */
2241   if (adj->lookup_next_index == IP_LOOKUP_NEXT_REWRITE)
2242     {
2243       if (refresh)
2244         unicast_rewrite = 1;
2245       else
2246         {
2247           adj_unlock (ai);
2248           ai = adj_glean_add_or_lock (FIB_PROTOCOL_IP4,
2249                                       VNET_LINK_IP4, sw_if_index, &nh);
2250           adj = adj_get (ai);
2251         }
2252     }
2253
2254   /* Add encapsulation string for software interface (e.g. ethernet header). */
2255   vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
2256   if (unicast_rewrite)
2257     {
2258       u16 *etype = vlib_buffer_get_current (b) - 2;
2259       etype[0] = clib_host_to_net_u16 (ETHERNET_TYPE_ARP);
2260     }
2261   vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
2262
2263   {
2264     vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index);
2265     u32 *to_next = vlib_frame_vector_args (f);
2266     to_next[0] = bi;
2267     f->n_vectors = 1;
2268     vlib_put_frame_to_node (vm, hi->output_node_index, f);
2269   }
2270
2271   adj_unlock (ai);
2272   return /* no error */ 0;
2273 }
2274 #endif
2275
/* Static next nodes of the ip4 rewrite nodes. */
typedef enum
{
  IP4_REWRITE_NEXT_DROP,
  /* Chosen on TTL expiry and on MTU exceeded with don't-fragment set. */
  IP4_REWRITE_NEXT_ICMP_ERROR,
  /* Chosen on MTU exceeded when fragmentation is permitted. */
  IP4_REWRITE_NEXT_FRAGMENT,
  IP4_REWRITE_N_NEXT            /* Last */
} ip4_rewrite_next_t;
2283
/**
 * The bits of an IPv4 address to keep (mask) when constructing a multicast
 * MAC address.
 *
 * The big-endian value selects the low-order 23 bits; the little-endian
 * value is the same mask with its bytes swapped.
 */
#if CLIB_ARCH_IS_BIG_ENDIAN
#define IP4_MCAST_ADDR_MASK 0x007fffff
#else
#define IP4_MCAST_ADDR_MASK 0xffff7f00
#endif
2293
2294 always_inline void
2295 ip4_mtu_check (vlib_buffer_t * b, u16 packet_len,
2296                u16 adj_packet_bytes, bool df, u16 * next, u32 * error,
2297                u8 is_midchain)
2298 {
2299   if (packet_len > adj_packet_bytes)
2300     {
2301       *error = IP4_ERROR_MTU_EXCEEDED;
2302       if (df)
2303         {
2304           icmp4_error_set_vnet_buffer
2305             (b, ICMP4_destination_unreachable,
2306              ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
2307              adj_packet_bytes);
2308           *next = IP4_REWRITE_NEXT_ICMP_ERROR;
2309         }
2310       else
2311         {
2312           /* IP fragmentation */
2313           ip_frag_set_vnet_buffer (b, adj_packet_bytes,
2314                                    (is_midchain ?
2315                                     IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN :
2316                                     IP4_FRAG_NEXT_IP4_REWRITE), 0);
2317           *next = IP4_REWRITE_NEXT_FRAGMENT;
2318         }
2319     }
2320 }
2321
2322 /* increment TTL & update checksum.
2323    Works either endian, so no need for byte swap. */
2324 static_always_inline void
2325 ip4_ttl_inc (vlib_buffer_t * b, ip4_header_t * ip)
2326 {
2327   i32 ttl;
2328   u32 checksum;
2329   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
2330     {
2331       b->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED;
2332       return;
2333     }
2334
2335   ttl = ip->ttl;
2336
2337   checksum = ip->checksum - clib_host_to_net_u16 (0x0100);
2338   checksum += checksum >= 0xffff;
2339
2340   ip->checksum = checksum;
2341   ttl += 1;
2342   ip->ttl = ttl;
2343
2344   ASSERT (ip->checksum == ip4_header_checksum (ip));
2345 }
2346
2347 /* Decrement TTL & update checksum.
2348    Works either endian, so no need for byte swap. */
2349 static_always_inline void
2350 ip4_ttl_and_checksum_check (vlib_buffer_t * b, ip4_header_t * ip, u16 * next,
2351                             u32 * error)
2352 {
2353   i32 ttl;
2354   u32 checksum;
2355   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
2356     {
2357       b->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED;
2358       return;
2359     }
2360
2361   ttl = ip->ttl;
2362
2363   /* Input node should have reject packets with ttl 0. */
2364   ASSERT (ip->ttl > 0);
2365
2366   checksum = ip->checksum + clib_host_to_net_u16 (0x0100);
2367   checksum += checksum >= 0xffff;
2368
2369   ip->checksum = checksum;
2370   ttl -= 1;
2371   ip->ttl = ttl;
2372
2373   /*
2374    * If the ttl drops below 1 when forwarding, generate
2375    * an ICMP response.
2376    */
2377   if (PREDICT_FALSE (ttl <= 0))
2378     {
2379       *error = IP4_ERROR_TIME_EXPIRED;
2380       vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~ 0;
2381       icmp4_error_set_vnet_buffer (b, ICMP4_time_exceeded,
2382                                    ICMP4_time_exceeded_ttl_exceeded_in_transit,
2383                                    0);
2384       *next = IP4_REWRITE_NEXT_ICMP_ERROR;
2385     }
2386
2387   /* Verify checksum. */
2388   ASSERT ((ip->checksum == ip4_header_checksum (ip)) ||
2389           (b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM));
2390 }
2391
2392
2393 always_inline uword
2394 ip4_rewrite_inline_with_gso (vlib_main_t * vm,
2395                              vlib_node_runtime_t * node,
2396                              vlib_frame_t * frame,
2397                              int do_counters, int is_midchain, int is_mcast,
2398                              int do_gso)
2399 {
2400   ip_lookup_main_t *lm = &ip4_main.lookup_main;
2401   u32 *from = vlib_frame_vector_args (frame);
2402   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
2403   u16 nexts[VLIB_FRAME_SIZE], *next;
2404   u32 n_left_from;
2405   vlib_node_runtime_t *error_node =
2406     vlib_node_get_runtime (vm, ip4_input_node.index);
2407
2408   n_left_from = frame->n_vectors;
2409   u32 thread_index = vm->thread_index;
2410
2411   vlib_get_buffers (vm, from, bufs, n_left_from);
2412   clib_memset_u16 (nexts, IP4_REWRITE_NEXT_DROP, n_left_from);
2413
2414 #if (CLIB_N_PREFETCHES >= 8)
2415   if (n_left_from >= 6)
2416     {
2417       int i;
2418       for (i = 2; i < 6; i++)
2419         vlib_prefetch_buffer_header (bufs[i], LOAD);
2420     }
2421
2422   next = nexts;
2423   b = bufs;
2424   while (n_left_from >= 8)
2425     {
2426       ip_adjacency_t *adj0, *adj1;
2427       ip4_header_t *ip0, *ip1;
2428       u32 rw_len0, error0, adj_index0;
2429       u32 rw_len1, error1, adj_index1;
2430       u32 tx_sw_if_index0, tx_sw_if_index1;
2431       u8 *p;
2432
2433       vlib_prefetch_buffer_header (b[6], LOAD);
2434       vlib_prefetch_buffer_header (b[7], LOAD);
2435
2436       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2437       adj_index1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
2438
2439       /*
2440        * pre-fetch the per-adjacency counters
2441        */
2442       if (do_counters)
2443         {
2444           vlib_prefetch_combined_counter (&adjacency_counters,
2445                                           thread_index, adj_index0);
2446           vlib_prefetch_combined_counter (&adjacency_counters,
2447                                           thread_index, adj_index1);
2448         }
2449
2450       ip0 = vlib_buffer_get_current (b[0]);
2451       ip1 = vlib_buffer_get_current (b[1]);
2452
2453       error0 = error1 = IP4_ERROR_NONE;
2454
2455       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2456       ip4_ttl_and_checksum_check (b[1], ip1, next + 1, &error1);
2457
2458       /* Rewrite packet header and updates lengths. */
2459       adj0 = adj_get (adj_index0);
2460       adj1 = adj_get (adj_index1);
2461
2462       /* Worth pipelining. No guarantee that adj0,1 are hot... */
2463       rw_len0 = adj0[0].rewrite_header.data_bytes;
2464       rw_len1 = adj1[0].rewrite_header.data_bytes;
2465       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2466       vnet_buffer (b[1])->ip.save_rewrite_length = rw_len1;
2467
2468       p = vlib_buffer_get_current (b[2]);
2469       CLIB_PREFETCH (p - CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES, STORE);
2470       CLIB_PREFETCH (p, CLIB_CACHE_LINE_BYTES, LOAD);
2471
2472       p = vlib_buffer_get_current (b[3]);
2473       CLIB_PREFETCH (p - CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES, STORE);
2474       CLIB_PREFETCH (p, CLIB_CACHE_LINE_BYTES, LOAD);
2475
2476       /* Check MTU of outgoing interface. */
2477       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2478       u16 ip1_len = clib_net_to_host_u16 (ip1->length);
2479
2480       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2481         ip0_len = gso_mtu_sz (b[0]);
2482       if (do_gso && (b[1]->flags & VNET_BUFFER_F_GSO))
2483         ip1_len = gso_mtu_sz (b[1]);
2484
2485       ip4_mtu_check (b[0], ip0_len,
2486                      adj0[0].rewrite_header.max_l3_packet_bytes,
2487                      ip0->flags_and_fragment_offset &
2488                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2489                      next + 0, &error0, is_midchain);
2490       ip4_mtu_check (b[1], ip1_len,
2491                      adj1[0].rewrite_header.max_l3_packet_bytes,
2492                      ip1->flags_and_fragment_offset &
2493                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2494                      next + 1, &error1, is_midchain);
2495
2496       if (is_mcast)
2497         {
2498           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2499                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2500                     IP4_ERROR_SAME_INTERFACE : error0);
2501           error1 = ((adj1[0].rewrite_header.sw_if_index ==
2502                      vnet_buffer (b[1])->sw_if_index[VLIB_RX]) ?
2503                     IP4_ERROR_SAME_INTERFACE : error1);
2504         }
2505
2506       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2507        * to see the IP header */
2508       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2509         {
2510           u32 next_index = adj0[0].rewrite_header.next_index;
2511           vlib_buffer_advance (b[0], -(word) rw_len0);
2512
2513           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2514           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2515
2516           if (PREDICT_FALSE
2517               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2518             vnet_feature_arc_start (lm->output_feature_arc_index,
2519                                     tx_sw_if_index0, &next_index, b[0]);
2520           next[0] = next_index;
2521           if (is_midchain)
2522             calc_checksums (vm, b[0]);
2523         }
2524       else
2525         {
2526           b[0]->error = error_node->errors[error0];
2527           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2528             ip4_ttl_inc (b[0], ip0);
2529         }
2530       if (PREDICT_TRUE (error1 == IP4_ERROR_NONE))
2531         {
2532           u32 next_index = adj1[0].rewrite_header.next_index;
2533           vlib_buffer_advance (b[1], -(word) rw_len1);
2534
2535           tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
2536           vnet_buffer (b[1])->sw_if_index[VLIB_TX] = tx_sw_if_index1;
2537
2538           if (PREDICT_FALSE
2539               (adj1[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2540             vnet_feature_arc_start (lm->output_feature_arc_index,
2541                                     tx_sw_if_index1, &next_index, b[1]);
2542           next[1] = next_index;
2543           if (is_midchain)
2544             calc_checksums (vm, b[1]);
2545         }
2546       else
2547         {
2548           b[1]->error = error_node->errors[error1];
2549           if (error1 == IP4_ERROR_MTU_EXCEEDED)
2550             ip4_ttl_inc (b[1], ip1);
2551         }
2552
2553       /* Guess we are only writing on simple Ethernet header. */
2554       vnet_rewrite_two_headers (adj0[0], adj1[0],
2555                                 ip0, ip1, sizeof (ethernet_header_t));
2556
2557       if (do_counters)
2558         {
2559           if (error0 == IP4_ERROR_NONE)
2560             vlib_increment_combined_counter
2561               (&adjacency_counters,
2562                thread_index,
2563                adj_index0, 1,
2564                vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2565
2566           if (error1 == IP4_ERROR_NONE)
2567             vlib_increment_combined_counter
2568               (&adjacency_counters,
2569                thread_index,
2570                adj_index1, 1,
2571                vlib_buffer_length_in_chain (vm, b[1]) + rw_len1);
2572         }
2573
2574       if (is_midchain)
2575         {
2576           if (error0 == IP4_ERROR_NONE && adj0->sub_type.midchain.fixup_func)
2577             adj0->sub_type.midchain.fixup_func
2578               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2579           if (error1 == IP4_ERROR_NONE && adj1->sub_type.midchain.fixup_func)
2580             adj1->sub_type.midchain.fixup_func
2581               (vm, adj1, b[1], adj1->sub_type.midchain.fixup_data);
2582         }
2583
2584       if (is_mcast)
2585         {
2586           /* copy bytes from the IP address into the MAC rewrite */
2587           if (error0 == IP4_ERROR_NONE)
2588             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2589                                         adj0->rewrite_header.dst_mcast_offset,
2590                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2591           if (error1 == IP4_ERROR_NONE)
2592             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2593                                         adj1->rewrite_header.dst_mcast_offset,
2594                                         &ip1->dst_address.as_u32, (u8 *) ip1);
2595         }
2596
2597       next += 2;
2598       b += 2;
2599       n_left_from -= 2;
2600     }
2601 #elif (CLIB_N_PREFETCHES >= 4)
2602   next = nexts;
2603   b = bufs;
2604   while (n_left_from >= 1)
2605     {
2606       ip_adjacency_t *adj0;
2607       ip4_header_t *ip0;
2608       u32 rw_len0, error0, adj_index0;
2609       u32 tx_sw_if_index0;
2610       u8 *p;
2611
2612       /* Prefetch next iteration */
2613       if (PREDICT_TRUE (n_left_from >= 4))
2614         {
2615           ip_adjacency_t *adj2;
2616           u32 adj_index2;
2617
2618           vlib_prefetch_buffer_header (b[3], LOAD);
2619           vlib_prefetch_buffer_data (b[2], LOAD);
2620
2621           /* Prefetch adj->rewrite_header */
2622           adj_index2 = vnet_buffer (b[2])->ip.adj_index[VLIB_TX];
2623           adj2 = adj_get (adj_index2);
2624           p = (u8 *) adj2;
2625           CLIB_PREFETCH (p + CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES,
2626                          LOAD);
2627         }
2628
2629       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2630
2631       /*
2632        * Prefetch the per-adjacency counters
2633        */
2634       if (do_counters)
2635         {
2636           vlib_prefetch_combined_counter (&adjacency_counters,
2637                                           thread_index, adj_index0);
2638         }
2639
2640       ip0 = vlib_buffer_get_current (b[0]);
2641
2642       error0 = IP4_ERROR_NONE;
2643
2644       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2645
2646       /* Rewrite packet header and updates lengths. */
2647       adj0 = adj_get (adj_index0);
2648
2649       /* Rewrite header was prefetched. */
2650       rw_len0 = adj0[0].rewrite_header.data_bytes;
2651       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2652
2653       /* Check MTU of outgoing interface. */
2654       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2655
2656       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2657         ip0_len = gso_mtu_sz (b[0]);
2658
2659       ip4_mtu_check (b[0], ip0_len,
2660                      adj0[0].rewrite_header.max_l3_packet_bytes,
2661                      ip0->flags_and_fragment_offset &
2662                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2663                      next + 0, &error0, is_midchain);
2664
2665       if (is_mcast)
2666         {
2667           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2668                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2669                     IP4_ERROR_SAME_INTERFACE : error0);
2670         }
2671
2672       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2673        * to see the IP header */
2674       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2675         {
2676           u32 next_index = adj0[0].rewrite_header.next_index;
2677           vlib_buffer_advance (b[0], -(word) rw_len0);
2678           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2679           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2680
2681           if (PREDICT_FALSE
2682               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2683             vnet_feature_arc_start (lm->output_feature_arc_index,
2684                                     tx_sw_if_index0, &next_index, b[0]);
2685           next[0] = next_index;
2686
2687           if (is_midchain)
2688             calc_checksums (vm, b[0]);
2689
2690           /* Guess we are only writing on simple Ethernet header. */
2691           vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
2692
2693           /*
2694            * Bump the per-adjacency counters
2695            */
2696           if (do_counters)
2697             vlib_increment_combined_counter
2698               (&adjacency_counters,
2699                thread_index,
2700                adj_index0, 1, vlib_buffer_length_in_chain (vm,
2701                                                            b[0]) + rw_len0);
2702
2703           if (is_midchain && adj0->sub_type.midchain.fixup_func)
2704             adj0->sub_type.midchain.fixup_func
2705               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2706
2707           if (is_mcast)
2708             /* copy bytes from the IP address into the MAC rewrite */
2709             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2710                                         adj0->rewrite_header.dst_mcast_offset,
2711                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2712         }
2713       else
2714         {
2715           b[0]->error = error_node->errors[error0];
2716           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2717             ip4_ttl_inc (b[0], ip0);
2718         }
2719
2720       next += 1;
2721       b += 1;
2722       n_left_from -= 1;
2723     }
2724 #endif
2725
2726   while (n_left_from > 0)
2727     {
2728       ip_adjacency_t *adj0;
2729       ip4_header_t *ip0;
2730       u32 rw_len0, adj_index0, error0;
2731       u32 tx_sw_if_index0;
2732
2733       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2734
2735       adj0 = adj_get (adj_index0);
2736
2737       if (do_counters)
2738         vlib_prefetch_combined_counter (&adjacency_counters,
2739                                         thread_index, adj_index0);
2740
2741       ip0 = vlib_buffer_get_current (b[0]);
2742
2743       error0 = IP4_ERROR_NONE;
2744
2745       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2746
2747
2748       /* Update packet buffer attributes/set output interface. */
2749       rw_len0 = adj0[0].rewrite_header.data_bytes;
2750       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2751
2752       /* Check MTU of outgoing interface. */
2753       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2754       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2755         ip0_len = gso_mtu_sz (b[0]);
2756
2757       ip4_mtu_check (b[0], ip0_len,
2758                      adj0[0].rewrite_header.max_l3_packet_bytes,
2759                      ip0->flags_and_fragment_offset &
2760                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2761                      next + 0, &error0, is_midchain);
2762
2763       if (is_mcast)
2764         {
2765           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2766                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2767                     IP4_ERROR_SAME_INTERFACE : error0);
2768         }
2769
2770       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2771        * to see the IP header */
2772       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2773         {
2774           u32 next_index = adj0[0].rewrite_header.next_index;
2775           vlib_buffer_advance (b[0], -(word) rw_len0);
2776           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2777           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2778
2779           if (PREDICT_FALSE
2780               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2781             vnet_feature_arc_start (lm->output_feature_arc_index,
2782                                     tx_sw_if_index0, &next_index, b[0]);
2783           next[0] = next_index;
2784
2785           if (is_midchain)
2786             /* this acts on the packet that is about to be encapped */
2787             calc_checksums (vm, b[0]);
2788
2789           /* Guess we are only writing on simple Ethernet header. */
2790           vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
2791
2792           if (do_counters)
2793             vlib_increment_combined_counter
2794               (&adjacency_counters,
2795                thread_index, adj_index0, 1,
2796                vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2797
2798           if (is_midchain && adj0->sub_type.midchain.fixup_func)
2799             adj0->sub_type.midchain.fixup_func
2800               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2801
2802           if (is_mcast)
2803             /* copy bytes from the IP address into the MAC rewrite */
2804             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2805                                         adj0->rewrite_header.dst_mcast_offset,
2806                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2807         }
2808       else
2809         {
2810           b[0]->error = error_node->errors[error0];
2811           /* undo the TTL decrement - we'll be back to do it again */
2812           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2813             ip4_ttl_inc (b[0], ip0);
2814         }
2815
2816       next += 1;
2817       b += 1;
2818       n_left_from -= 1;
2819     }
2820
2821
2822   /* Need to do trace after rewrites to pick up new packet data. */
2823   if (node->flags & VLIB_NODE_FLAG_TRACE)
2824     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2825
2826   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
2827   return frame->n_vectors;
2828 }
2829
2830 always_inline uword
2831 ip4_rewrite_inline (vlib_main_t * vm,
2832                     vlib_node_runtime_t * node,
2833                     vlib_frame_t * frame,
2834                     int do_counters, int is_midchain, int is_mcast)
2835 {
2836   vnet_main_t *vnm = vnet_get_main ();
2837   if (PREDICT_FALSE (vnm->interface_main.gso_interface_count > 0))
2838     return ip4_rewrite_inline_with_gso (vm, node, frame, do_counters,
2839                                         is_midchain, is_mcast,
2840                                         1 /* do_gso */ );
2841   else
2842     return ip4_rewrite_inline_with_gso (vm, node, frame, do_counters,
2843                                         is_midchain, is_mcast,
2844                                         0 /* no do_gso */ );
2845 }
2846
2847
2848 /** @brief IPv4 rewrite node.
2849     @node ip4-rewrite
2850
2851     This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
2852     header checksum, fetch the ip adjacency, check the outbound mtu,
2853     apply the adjacency rewrite, and send pkts to the adjacency
2854     rewrite header's rewrite_next_index.
2855
2856     @param vm vlib_main_t corresponding to the current thread
2857     @param node vlib_node_runtime_t
2858     @param frame vlib_frame_t whose contents should be dispatched
2859
2860     @par Graph mechanics: buffer metadata, next index usage
2861
2862     @em Uses:
2863     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
2864         - the rewrite adjacency index
2865     - <code>adj->lookup_next_index</code>
2866         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2867           the packet will be dropped.
2868     - <code>adj->rewrite_header</code>
2869         - Rewrite string length, rewrite string, next_index
2870
2871     @em Sets:
2872     - <code>b->current_data, b->current_length</code>
2873         - Updated net of applying the rewrite string
2874
2875     <em>Next Indices:</em>
2876     - <code> adj->rewrite_header.next_index </code>
2877       or @c ip4-drop
2878 */
2879
2880 VLIB_NODE_FN (ip4_rewrite_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2881                                  vlib_frame_t * frame)
2882 {
2883   if (adj_are_counters_enabled ())
2884     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2885   else
2886     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2887 }
2888
2889 VLIB_NODE_FN (ip4_rewrite_bcast_node) (vlib_main_t * vm,
2890                                        vlib_node_runtime_t * node,
2891                                        vlib_frame_t * frame)
2892 {
2893   if (adj_are_counters_enabled ())
2894     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2895   else
2896     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2897 }
2898
2899 VLIB_NODE_FN (ip4_midchain_node) (vlib_main_t * vm,
2900                                   vlib_node_runtime_t * node,
2901                                   vlib_frame_t * frame)
2902 {
2903   if (adj_are_counters_enabled ())
2904     return ip4_rewrite_inline (vm, node, frame, 1, 1, 0);
2905   else
2906     return ip4_rewrite_inline (vm, node, frame, 0, 1, 0);
2907 }
2908
2909 VLIB_NODE_FN (ip4_rewrite_mcast_node) (vlib_main_t * vm,
2910                                        vlib_node_runtime_t * node,
2911                                        vlib_frame_t * frame)
2912 {
2913   if (adj_are_counters_enabled ())
2914     return ip4_rewrite_inline (vm, node, frame, 1, 0, 1);
2915   else
2916     return ip4_rewrite_inline (vm, node, frame, 0, 0, 1);
2917 }
2918
2919 VLIB_NODE_FN (ip4_mcast_midchain_node) (vlib_main_t * vm,
2920                                         vlib_node_runtime_t * node,
2921                                         vlib_frame_t * frame)
2922 {
2923   if (adj_are_counters_enabled ())
2924     return ip4_rewrite_inline (vm, node, frame, 1, 1, 1);
2925   else
2926     return ip4_rewrite_inline (vm, node, frame, 0, 1, 1);
2927 }
2928
2929 /* *INDENT-OFF* */
2930 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2931   .name = "ip4-rewrite",
2932   .vector_size = sizeof (u32),
2933
2934   .format_trace = format_ip4_rewrite_trace,
2935
2936   .n_next_nodes = IP4_REWRITE_N_NEXT,
2937   .next_nodes = {
2938     [IP4_REWRITE_NEXT_DROP] = "ip4-drop",
2939     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2940     [IP4_REWRITE_NEXT_FRAGMENT] = "ip4-frag",
2941   },
2942 };
2943
2944 VLIB_REGISTER_NODE (ip4_rewrite_bcast_node) = {
2945   .name = "ip4-rewrite-bcast",
2946   .vector_size = sizeof (u32),
2947
2948   .format_trace = format_ip4_rewrite_trace,
2949   .sibling_of = "ip4-rewrite",
2950 };
2951
2952 VLIB_REGISTER_NODE (ip4_rewrite_mcast_node) = {
2953   .name = "ip4-rewrite-mcast",
2954   .vector_size = sizeof (u32),
2955
2956   .format_trace = format_ip4_rewrite_trace,
2957   .sibling_of = "ip4-rewrite",
2958 };
2959
2960 VLIB_REGISTER_NODE (ip4_mcast_midchain_node) = {
2961   .name = "ip4-mcast-midchain",
2962   .vector_size = sizeof (u32),
2963
2964   .format_trace = format_ip4_rewrite_trace,
2965   .sibling_of = "ip4-rewrite",
2966 };
2967
2968 VLIB_REGISTER_NODE (ip4_midchain_node) = {
2969   .name = "ip4-midchain",
2970   .vector_size = sizeof (u32),
2971   .format_trace = format_ip4_rewrite_trace,
2972   .sibling_of = "ip4-rewrite",
2973 };
2974 /* *INDENT-ON */
2975
2976 static int
2977 ip4_lookup_validate (ip4_address_t * a, u32 fib_index0)
2978 {
2979   ip4_fib_mtrie_t *mtrie0;
2980   ip4_fib_mtrie_leaf_t leaf0;
2981   u32 lbi0;
2982
2983   mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
2984
2985   leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, a);
2986   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
2987   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);
2988
2989   lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
2990
2991   return lbi0 == ip4_fib_table_lookup_lb (ip4_fib_get (fib_index0), a);
2992 }
2993
2994 static clib_error_t *
2995 test_lookup_command_fn (vlib_main_t * vm,
2996                         unformat_input_t * input, vlib_cli_command_t * cmd)
2997 {
2998   ip4_fib_t *fib;
2999   u32 table_id = 0;
3000   f64 count = 1;
3001   u32 n;
3002   int i;
3003   ip4_address_t ip4_base_address;
3004   u64 errors = 0;
3005
3006   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3007     {
3008       if (unformat (input, "table %d", &table_id))
3009         {
3010           /* Make sure the entry exists. */
3011           fib = ip4_fib_get (table_id);
3012           if ((fib) && (fib->index != table_id))
3013             return clib_error_return (0, "<fib-index> %d does not exist",
3014                                       table_id);
3015         }
3016       else if (unformat (input, "count %f", &count))
3017         ;
3018
3019       else if (unformat (input, "%U",
3020                          unformat_ip4_address, &ip4_base_address))
3021         ;
3022       else
3023         return clib_error_return (0, "unknown input `%U'",
3024                                   format_unformat_error, input);
3025     }
3026
3027   n = count;
3028
3029   for (i = 0; i < n; i++)
3030     {
3031       if (!ip4_lookup_validate (&ip4_base_address, table_id))
3032         errors++;
3033
3034       ip4_base_address.as_u32 =
3035         clib_host_to_net_u32 (1 +
3036                               clib_net_to_host_u32 (ip4_base_address.as_u32));
3037     }
3038
3039   if (errors)
3040     vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
3041   else
3042     vlib_cli_output (vm, "No errors in %d lookups\n", n);
3043
3044   return 0;
3045 }
3046
3047 /*?
3048  * Perform a lookup of an IPv4 Address (or range of addresses) in the
3049  * given FIB table to determine if there is a conflict with the
3050  * adjacency table. The fib-id can be determined by using the
3051  * '<em>show ip fib</em>' command. If fib-id is not entered, default value
3052  * of 0 is used.
3053  *
3054  * @todo This command uses fib-id, other commands use table-id (not
3055  * just a name, they are different indexes). Would like to change this
3056  * to table-id for consistency.
3057  *
3058  * @cliexpar
3059  * Example of how to run the test lookup command:
3060  * @cliexstart{test lookup 172.16.1.1 table 1 count 2}
3061  * No errors in 2 lookups
3062  * @cliexend
3063 ?*/
3064 /* *INDENT-OFF* */
3065 VLIB_CLI_COMMAND (lookup_test_command, static) =
3066 {
3067   .path = "test lookup",
3068   .short_help = "test lookup <ipv4-addr> [table <fib-id>] [count <nn>]",
3069   .function = test_lookup_command_fn,
3070 };
3071 /* *INDENT-ON* */
3072
3073 #ifndef CLIB_MARCH_VARIANT
3074 int
3075 vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
3076 {
3077   u32 fib_index;
3078
3079   fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id);
3080
3081   if (~0 == fib_index)
3082     return VNET_API_ERROR_NO_SUCH_FIB;
3083
3084   fib_table_set_flow_hash_config (fib_index, FIB_PROTOCOL_IP4,
3085                                   flow_hash_config);
3086
3087   return 0;
3088 }
3089 #endif
3090
3091 static clib_error_t *
3092 set_ip_flow_hash_command_fn (vlib_main_t * vm,
3093                              unformat_input_t * input,
3094                              vlib_cli_command_t * cmd)
3095 {
3096   int matched = 0;
3097   u32 table_id = 0;
3098   u32 flow_hash_config = 0;
3099   int rv;
3100
3101   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3102     {
3103       if (unformat (input, "table %d", &table_id))
3104         matched = 1;
3105 #define _(a,v) \
3106     else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
3107       foreach_flow_hash_bit
3108 #undef _
3109         else
3110         break;
3111     }
3112
3113   if (matched == 0)
3114     return clib_error_return (0, "unknown input `%U'",
3115                               format_unformat_error, input);
3116
3117   rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
3118   switch (rv)
3119     {
3120     case 0:
3121       break;
3122
3123     case VNET_API_ERROR_NO_SUCH_FIB:
3124       return clib_error_return (0, "no such FIB table %d", table_id);
3125
3126     default:
3127       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
3128       break;
3129     }
3130
3131   return 0;
3132 }
3133
3134 /*?
3135  * Configure the set of IPv4 fields used by the flow hash.
3136  *
3137  * @cliexpar
3138  * Example of how to set the flow hash on a given table:
3139  * @cliexcmd{set ip flow-hash table 7 dst sport dport proto}
 * Example of how to display the configured flow hash:
3141  * @cliexstart{show ip fib}
3142  * ipv4-VRF:0, fib_index 0, flow hash: src dst sport dport proto
3143  * 0.0.0.0/0
3144  *   unicast-ip4-chain
3145  *   [@0]: dpo-load-balance: [index:0 buckets:1 uRPF:0 to:[0:0]]
3146  *     [0] [@0]: dpo-drop ip6
3147  * 0.0.0.0/32
3148  *   unicast-ip4-chain
3149  *   [@0]: dpo-load-balance: [index:1 buckets:1 uRPF:1 to:[0:0]]
3150  *     [0] [@0]: dpo-drop ip6
3151  * 224.0.0.0/8
3152  *   unicast-ip4-chain
3153  *   [@0]: dpo-load-balance: [index:3 buckets:1 uRPF:3 to:[0:0]]
3154  *     [0] [@0]: dpo-drop ip6
3155  * 6.0.1.2/32
3156  *   unicast-ip4-chain
3157  *   [@0]: dpo-load-balance: [index:30 buckets:1 uRPF:29 to:[0:0]]
3158  *     [0] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3159  * 7.0.0.1/32
3160  *   unicast-ip4-chain
3161  *   [@0]: dpo-load-balance: [index:31 buckets:4 uRPF:30 to:[0:0]]
3162  *     [0] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3163  *     [1] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3164  *     [2] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3165  *     [3] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3166  * 240.0.0.0/8
3167  *   unicast-ip4-chain
3168  *   [@0]: dpo-load-balance: [index:2 buckets:1 uRPF:2 to:[0:0]]
3169  *     [0] [@0]: dpo-drop ip6
3170  * 255.255.255.255/32
3171  *   unicast-ip4-chain
3172  *   [@0]: dpo-load-balance: [index:4 buckets:1 uRPF:4 to:[0:0]]
3173  *     [0] [@0]: dpo-drop ip6
3174  * ipv4-VRF:7, fib_index 1, flow hash: dst sport dport proto
3175  * 0.0.0.0/0
3176  *   unicast-ip4-chain
3177  *   [@0]: dpo-load-balance: [index:12 buckets:1 uRPF:11 to:[0:0]]
3178  *     [0] [@0]: dpo-drop ip6
3179  * 0.0.0.0/32
3180  *   unicast-ip4-chain
3181  *   [@0]: dpo-load-balance: [index:13 buckets:1 uRPF:12 to:[0:0]]
3182  *     [0] [@0]: dpo-drop ip6
3183  * 172.16.1.0/24
3184  *   unicast-ip4-chain
3185  *   [@0]: dpo-load-balance: [index:17 buckets:1 uRPF:16 to:[0:0]]
3186  *     [0] [@4]: ipv4-glean: af_packet0
3187  * 172.16.1.1/32
3188  *   unicast-ip4-chain
3189  *   [@0]: dpo-load-balance: [index:18 buckets:1 uRPF:17 to:[1:84]]
3190  *     [0] [@2]: dpo-receive: 172.16.1.1 on af_packet0
3191  * 172.16.1.2/32
3192  *   unicast-ip4-chain
3193  *   [@0]: dpo-load-balance: [index:21 buckets:1 uRPF:20 to:[0:0]]
3194  *     [0] [@5]: ipv4 via 172.16.1.2 af_packet0: IP4: 02:fe:9e:70:7a:2b -> 26:a5:f6:9c:3a:36
3195  * 172.16.2.0/24
3196  *   unicast-ip4-chain
3197  *   [@0]: dpo-load-balance: [index:19 buckets:1 uRPF:18 to:[0:0]]
3198  *     [0] [@4]: ipv4-glean: af_packet1
3199  * 172.16.2.1/32
3200  *   unicast-ip4-chain
3201  *   [@0]: dpo-load-balance: [index:20 buckets:1 uRPF:19 to:[0:0]]
3202  *     [0] [@2]: dpo-receive: 172.16.2.1 on af_packet1
3203  * 224.0.0.0/8
3204  *   unicast-ip4-chain
3205  *   [@0]: dpo-load-balance: [index:15 buckets:1 uRPF:14 to:[0:0]]
3206  *     [0] [@0]: dpo-drop ip6
3207  * 240.0.0.0/8
3208  *   unicast-ip4-chain
3209  *   [@0]: dpo-load-balance: [index:14 buckets:1 uRPF:13 to:[0:0]]
3210  *     [0] [@0]: dpo-drop ip6
3211  * 255.255.255.255/32
3212  *   unicast-ip4-chain
3213  *   [@0]: dpo-load-balance: [index:16 buckets:1 uRPF:15 to:[0:0]]
3214  *     [0] [@0]: dpo-drop ip6
3215  * @cliexend
3216 ?*/
3217 /* *INDENT-OFF* */
3218 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) =
3219 {
3220   .path = "set ip flow-hash",
3221   .short_help =
3222   "set ip flow-hash table <table-id> [src] [dst] [sport] [dport] [proto] [reverse]",
3223   .function = set_ip_flow_hash_command_fn,
3224 };
3225 /* *INDENT-ON* */
3226
3227 #ifndef CLIB_MARCH_VARIANT
3228 int
3229 vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
3230                              u32 table_index)
3231 {
3232   vnet_main_t *vnm = vnet_get_main ();
3233   vnet_interface_main_t *im = &vnm->interface_main;
3234   ip4_main_t *ipm = &ip4_main;
3235   ip_lookup_main_t *lm = &ipm->lookup_main;
3236   vnet_classify_main_t *cm = &vnet_classify_main;
3237   ip4_address_t *if_addr;
3238
3239   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
3240     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
3241
3242   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
3243     return VNET_API_ERROR_NO_SUCH_ENTRY;
3244
3245   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
3246   lm->classify_table_index_by_sw_if_index[sw_if_index] = table_index;
3247
3248   if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);
3249
3250   if (NULL != if_addr)
3251     {
3252       fib_prefix_t pfx = {
3253         .fp_len = 32,
3254         .fp_proto = FIB_PROTOCOL_IP4,
3255         .fp_addr.ip4 = *if_addr,
3256       };
3257       u32 fib_index;
3258
3259       fib_index = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4,
3260                                                        sw_if_index);
3261
3262
3263       if (table_index != (u32) ~ 0)
3264         {
3265           dpo_id_t dpo = DPO_INVALID;
3266
3267           dpo_set (&dpo,
3268                    DPO_CLASSIFY,
3269                    DPO_PROTO_IP4,
3270                    classify_dpo_create (DPO_PROTO_IP4, table_index));
3271
3272           fib_table_entry_special_dpo_add (fib_index,
3273                                            &pfx,
3274                                            FIB_SOURCE_CLASSIFY,
3275                                            FIB_ENTRY_FLAG_NONE, &dpo);
3276           dpo_reset (&dpo);
3277         }
3278       else
3279         {
3280           fib_table_entry_special_remove (fib_index,
3281                                           &pfx, FIB_SOURCE_CLASSIFY);
3282         }
3283     }
3284
3285   return 0;
3286 }
3287 #endif
3288
3289 static clib_error_t *
3290 set_ip_classify_command_fn (vlib_main_t * vm,
3291                             unformat_input_t * input,
3292                             vlib_cli_command_t * cmd)
3293 {
3294   u32 table_index = ~0;
3295   int table_index_set = 0;
3296   u32 sw_if_index = ~0;
3297   int rv;
3298
3299   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3300     {
3301       if (unformat (input, "table-index %d", &table_index))
3302         table_index_set = 1;
3303       else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
3304                          vnet_get_main (), &sw_if_index))
3305         ;
3306       else
3307         break;
3308     }
3309
3310   if (table_index_set == 0)
3311     return clib_error_return (0, "classify table-index must be specified");
3312
3313   if (sw_if_index == ~0)
3314     return clib_error_return (0, "interface / subif must be specified");
3315
3316   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
3317
3318   switch (rv)
3319     {
3320     case 0:
3321       break;
3322
3323     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
3324       return clib_error_return (0, "No such interface");
3325
3326     case VNET_API_ERROR_NO_SUCH_ENTRY:
3327       return clib_error_return (0, "No such classifier table");
3328     }
3329   return 0;
3330 }
3331
3332 /*?
3333  * Assign a classification table to an interface. The classification
3334  * table is created using the '<em>classify table</em>' and '<em>classify session</em>'
 * commands. Once the table is created, use this command to filter packets
3336  * on an interface.
3337  *
3338  * @cliexpar
3339  * Example of how to assign a classification table to an interface:
3340  * @cliexcmd{set ip classify intfc GigabitEthernet2/0/0 table-index 1}
3341 ?*/
3342 /* *INDENT-OFF* */
3343 VLIB_CLI_COMMAND (set_ip_classify_command, static) =
3344 {
3345     .path = "set ip classify",
3346     .short_help =
3347     "set ip classify intfc <interface> table-index <classify-idx>",
3348     .function = set_ip_classify_command_fn,
3349 };
3350 /* *INDENT-ON* */
3351
3352 static clib_error_t *
3353 ip4_config (vlib_main_t * vm, unformat_input_t * input)
3354 {
3355   ip4_main_t *im = &ip4_main;
3356   uword heapsize = 0;
3357
3358   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3359     {
3360       if (unformat (input, "heap-size %U", unformat_memory_size, &heapsize))
3361         ;
3362       else
3363         return clib_error_return (0,
3364                                   "invalid heap-size parameter `%U'",
3365                                   format_unformat_error, input);
3366     }
3367
3368   im->mtrie_heap_size = heapsize;
3369
3370   return 0;
3371 }
3372
3373 VLIB_EARLY_CONFIG_FUNCTION (ip4_config, "ip");
3374
3375 /*
3376  * fd.io coding-style-patch-verification: ON
3377  *
3378  * Local Variables:
3379  * eval: (c-set-style "gnu")
3380  * End:
3381  */