ip: apply dual loop unrolling in ip4_rewrite
[vpp.git] / src / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ip/ip_frag.h>
43 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
44 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
45 #include <vnet/ppp/ppp.h>
46 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
47 #include <vnet/api_errno.h>     /* for API error numbers */
48 #include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
49 #include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
50 #include <vnet/fib/fib_urpf_list.h>     /* for FIB uRPF check */
51 #include <vnet/fib/ip4_fib.h>
52 #include <vnet/dpo/load_balance.h>
53 #include <vnet/dpo/load_balance_map.h>
54 #include <vnet/dpo/classify_dpo.h>
55 #include <vnet/mfib/mfib_table.h>       /* for mFIB table and entry creation */
56
57 #include <vnet/ip/ip4_forward.h>
58 #include <vnet/interface_output.h>
59
60 /** @brief IPv4 lookup node.
61     @node ip4-lookup
62
63     This is the main IPv4 lookup dispatch node.
64
65     @param vm vlib_main_t corresponding to the current thread
66     @param node vlib_node_runtime_t
67     @param frame vlib_frame_t whose contents should be dispatched
68
69     @par Graph mechanics: buffer metadata, next index usage
70
71     @em Uses:
72     - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
73         - Indicates the @c sw_if_index value of the interface that the
74           packet was received on.
75     - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
76         - When the value is @c ~0 then the node performs a longest prefix
77           match (LPM) for the packet destination address in the FIB attached
78           to the receive interface.
79         - Otherwise perform LPM for the packet destination address in the
80           indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
81           value (0, 1, ...) and not a VRF id.
82
83     @em Sets:
84     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
85         - The lookup result adjacency index.
86
87     <em>Next Index:</em>
88     - Dispatches the packet to the node index found in
89       ip_adjacency_t @c adj->lookup_next_index
90       (where @c adj is the lookup result adjacency).
91 */
92 VLIB_NODE_FN (ip4_lookup_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
93                                 vlib_frame_t * frame)
94 {
95   return ip4_lookup_inline (vm, node, frame);
96 }
97
98 static u8 *format_ip4_lookup_trace (u8 * s, va_list * args);
99
100 /* *INDENT-OFF* */
101 VLIB_REGISTER_NODE (ip4_lookup_node) =
102 {
103   .name = "ip4-lookup",
104   .vector_size = sizeof (u32),
105   .format_trace = format_ip4_lookup_trace,
106   .n_next_nodes = IP_LOOKUP_N_NEXT,
107   .next_nodes = IP4_LOOKUP_NEXT_NODES,
108 };
109 /* *INDENT-ON* */
110
111 VLIB_NODE_FN (ip4_load_balance_node) (vlib_main_t * vm,
112                                       vlib_node_runtime_t * node,
113                                       vlib_frame_t * frame)
114 {
115   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
116   u32 n_left, *from;
117   u32 thread_index = vm->thread_index;
118   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
119   u16 nexts[VLIB_FRAME_SIZE], *next;
120
121   from = vlib_frame_vector_args (frame);
122   n_left = frame->n_vectors;
123   next = nexts;
124
125   vlib_get_buffers (vm, from, bufs, n_left);
126
127   while (n_left >= 4)
128     {
129       const load_balance_t *lb0, *lb1;
130       const ip4_header_t *ip0, *ip1;
131       u32 lbi0, hc0, lbi1, hc1;
132       const dpo_id_t *dpo0, *dpo1;
133
134       /* Prefetch next iteration. */
135       {
136         vlib_prefetch_buffer_header (b[2], LOAD);
137         vlib_prefetch_buffer_header (b[3], LOAD);
138
139         CLIB_PREFETCH (b[2]->data, sizeof (ip0[0]), LOAD);
140         CLIB_PREFETCH (b[3]->data, sizeof (ip0[0]), LOAD);
141       }
142
143       ip0 = vlib_buffer_get_current (b[0]);
144       ip1 = vlib_buffer_get_current (b[1]);
145       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
146       lbi1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
147
148       lb0 = load_balance_get (lbi0);
149       lb1 = load_balance_get (lbi1);
150
151       /*
152        * this node is for via FIBs we can re-use the hash value from the
153        * to node if present.
154        * We don't want to use the same hash value at each level in the recursion
155        * graph as that would lead to polarisation
156        */
157       hc0 = hc1 = 0;
158
159       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
160         {
161           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
162             {
163               hc0 = vnet_buffer (b[0])->ip.flow_hash =
164                 vnet_buffer (b[0])->ip.flow_hash >> 1;
165             }
166           else
167             {
168               hc0 = vnet_buffer (b[0])->ip.flow_hash =
169                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
170             }
171           dpo0 = load_balance_get_fwd_bucket
172             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
173         }
174       else
175         {
176           dpo0 = load_balance_get_bucket_i (lb0, 0);
177         }
178       if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
179         {
180           if (PREDICT_TRUE (vnet_buffer (b[1])->ip.flow_hash))
181             {
182               hc1 = vnet_buffer (b[1])->ip.flow_hash =
183                 vnet_buffer (b[1])->ip.flow_hash >> 1;
184             }
185           else
186             {
187               hc1 = vnet_buffer (b[1])->ip.flow_hash =
188                 ip4_compute_flow_hash (ip1, lb1->lb_hash_config);
189             }
190           dpo1 = load_balance_get_fwd_bucket
191             (lb1, (hc1 & (lb1->lb_n_buckets_minus_1)));
192         }
193       else
194         {
195           dpo1 = load_balance_get_bucket_i (lb1, 0);
196         }
197
198       next[0] = dpo0->dpoi_next_node;
199       next[1] = dpo1->dpoi_next_node;
200
201       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
202       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
203
204       vlib_increment_combined_counter
205         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
206       vlib_increment_combined_counter
207         (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b[1]));
208
209       b += 2;
210       next += 2;
211       n_left -= 2;
212     }
213
214   while (n_left > 0)
215     {
216       const load_balance_t *lb0;
217       const ip4_header_t *ip0;
218       const dpo_id_t *dpo0;
219       u32 lbi0, hc0;
220
221       ip0 = vlib_buffer_get_current (b[0]);
222       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
223
224       lb0 = load_balance_get (lbi0);
225
226       hc0 = 0;
227       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
228         {
229           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
230             {
231               hc0 = vnet_buffer (b[0])->ip.flow_hash =
232                 vnet_buffer (b[0])->ip.flow_hash >> 1;
233             }
234           else
235             {
236               hc0 = vnet_buffer (b[0])->ip.flow_hash =
237                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
238             }
239           dpo0 = load_balance_get_fwd_bucket
240             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
241         }
242       else
243         {
244           dpo0 = load_balance_get_bucket_i (lb0, 0);
245         }
246
247       next[0] = dpo0->dpoi_next_node;
248       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
249
250       vlib_increment_combined_counter
251         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
252
253       b += 1;
254       next += 1;
255       n_left -= 1;
256     }
257
258   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
259   if (node->flags & VLIB_NODE_FLAG_TRACE)
260     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
261
262   return frame->n_vectors;
263 }
264
265 /* *INDENT-OFF* */
266 VLIB_REGISTER_NODE (ip4_load_balance_node) =
267 {
268   .name = "ip4-load-balance",
269   .vector_size = sizeof (u32),
270   .sibling_of = "ip4-lookup",
271   .format_trace = format_ip4_lookup_trace,
272 };
273 /* *INDENT-ON* */
274
275 #ifndef CLIB_MARCH_VARIANT
276 /* get first interface address */
277 ip4_address_t *
278 ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
279                              ip_interface_address_t ** result_ia)
280 {
281   ip_lookup_main_t *lm = &im->lookup_main;
282   ip_interface_address_t *ia = 0;
283   ip4_address_t *result = 0;
284
285   /* *INDENT-OFF* */
286   foreach_ip_interface_address
287     (lm, ia, sw_if_index,
288      1 /* honor unnumbered */ ,
289      ({
290        ip4_address_t * a =
291          ip_interface_address_get_address (lm, ia);
292        result = a;
293        break;
294      }));
295   /* *INDENT-OFF* */
296   if (result_ia)
297     *result_ia = result ? ia : 0;
298   return result;
299 }
300 #endif
301
302 static void
303 ip4_add_subnet_bcast_route (u32 fib_index,
304                             fib_prefix_t *pfx,
305                             u32 sw_if_index)
306 {
307   vnet_sw_interface_flags_t iflags;
308
309   iflags = vnet_sw_interface_get_flags(vnet_get_main(), sw_if_index);
310
311   fib_table_entry_special_remove(fib_index,
312                                  pfx,
313                                  FIB_SOURCE_INTERFACE);
314
315   if (iflags & VNET_SW_INTERFACE_FLAG_DIRECTED_BCAST)
316     {
317       fib_table_entry_update_one_path (fib_index, pfx,
318                                        FIB_SOURCE_INTERFACE,
319                                        FIB_ENTRY_FLAG_NONE,
320                                        DPO_PROTO_IP4,
321                                        /* No next-hop address */
322                                        &ADJ_BCAST_ADDR,
323                                        sw_if_index,
324                                        // invalid FIB index
325                                        ~0,
326                                        1,
327                                        // no out-label stack
328                                        NULL,
329                                        FIB_ROUTE_PATH_FLAG_NONE);
330     }
331   else
332     {
333         fib_table_entry_special_add(fib_index,
334                                     pfx,
335                                     FIB_SOURCE_INTERFACE,
336                                     (FIB_ENTRY_FLAG_DROP |
337                                      FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
338     }
339 }
340
341 static void
342 ip4_add_interface_prefix_routes (ip4_main_t *im,
343                                  u32 sw_if_index,
344                                  u32 fib_index,
345                                  ip_interface_address_t * a)
346 {
347   ip_lookup_main_t *lm = &im->lookup_main;
348   ip_interface_prefix_t *if_prefix;
349   ip4_address_t *address = ip_interface_address_get_address (lm, a);
350
351   ip_interface_prefix_key_t key = {
352     .prefix = {
353       .fp_len = a->address_length,
354       .fp_proto = FIB_PROTOCOL_IP4,
355       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[a->address_length],
356     },
357     .sw_if_index = sw_if_index,
358   };
359
360   fib_prefix_t pfx_special = {
361     .fp_proto = FIB_PROTOCOL_IP4,
362   };
363
364   /* If prefix already set on interface, just increment ref count & return */
365   if_prefix = ip_get_interface_prefix (lm, &key);
366   if (if_prefix)
367     {
368       if_prefix->ref_count += 1;
369       return;
370     }
371
372   /* New prefix - allocate a pool entry, initialize it, add to the hash */
373   pool_get (lm->if_prefix_pool, if_prefix);
374   if_prefix->ref_count = 1;
375   if_prefix->src_ia_index = a - lm->if_address_pool;
376   clib_memcpy (&if_prefix->key, &key, sizeof (key));
377   mhash_set (&lm->prefix_to_if_prefix_index, &key,
378              if_prefix - lm->if_prefix_pool, 0 /* old value */);
379
380   /* length <= 30 - add glean, drop first address, maybe drop bcast address */
381   if (a->address_length <= 30)
382     {
383       pfx_special.fp_len = a->address_length;
384       pfx_special.fp_addr.ip4.as_u32 = address->as_u32;
385
386       /* set the glean route for the prefix */
387       fib_table_entry_update_one_path (fib_index, &pfx_special,
388                                        FIB_SOURCE_INTERFACE,
389                                        (FIB_ENTRY_FLAG_CONNECTED |
390                                         FIB_ENTRY_FLAG_ATTACHED),
391                                        DPO_PROTO_IP4,
392                                        /* No next-hop address */
393                                        NULL,
394                                        sw_if_index,
395                                        /* invalid FIB index */
396                                        ~0,
397                                        1,
398                                        /* no out-label stack */
399                                        NULL,
400                                        FIB_ROUTE_PATH_FLAG_NONE);
401
402       /* set a drop route for the base address of the prefix */
403       pfx_special.fp_len = 32;
404       pfx_special.fp_addr.ip4.as_u32 =
405         address->as_u32 & im->fib_masks[a->address_length];
406
407       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
408         fib_table_entry_special_add (fib_index, &pfx_special,
409                                      FIB_SOURCE_INTERFACE,
410                                      (FIB_ENTRY_FLAG_DROP |
411                                       FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
412
413       /* set a route for the broadcast address of the prefix */
414       pfx_special.fp_len = 32;
415       pfx_special.fp_addr.ip4.as_u32 =
416         address->as_u32 | ~im->fib_masks[a->address_length];
417       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
418         ip4_add_subnet_bcast_route (fib_index, &pfx_special, sw_if_index);
419
420
421     }
422   /* length == 31 - add an attached route for the other address */
423   else if (a->address_length == 31)
424     {
425       pfx_special.fp_len = 32;
426       pfx_special.fp_addr.ip4.as_u32 =
427         address->as_u32 ^ clib_host_to_net_u32(1);
428
429       fib_table_entry_update_one_path (fib_index, &pfx_special,
430                                        FIB_SOURCE_INTERFACE,
431                                        (FIB_ENTRY_FLAG_ATTACHED),
432                                        DPO_PROTO_IP4,
433                                        &pfx_special.fp_addr,
434                                        sw_if_index,
435                                        /* invalid FIB index */
436                                        ~0,
437                                        1,
438                                        NULL,
439                                        FIB_ROUTE_PATH_FLAG_NONE);
440     }
441 }
442
443 static void
444 ip4_add_interface_routes (u32 sw_if_index,
445                           ip4_main_t * im, u32 fib_index,
446                           ip_interface_address_t * a)
447 {
448   ip_lookup_main_t *lm = &im->lookup_main;
449   ip4_address_t *address = ip_interface_address_get_address (lm, a);
450   fib_prefix_t pfx = {
451     .fp_len = 32,
452     .fp_proto = FIB_PROTOCOL_IP4,
453     .fp_addr.ip4 = *address,
454   };
455
456   /* set special routes for the prefix if needed */
457   ip4_add_interface_prefix_routes (im, sw_if_index, fib_index, a);
458
459   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
460     {
461       u32 classify_table_index =
462         lm->classify_table_index_by_sw_if_index[sw_if_index];
463       if (classify_table_index != (u32) ~ 0)
464         {
465           dpo_id_t dpo = DPO_INVALID;
466
467           dpo_set (&dpo,
468                    DPO_CLASSIFY,
469                    DPO_PROTO_IP4,
470                    classify_dpo_create (DPO_PROTO_IP4, classify_table_index));
471
472           fib_table_entry_special_dpo_add (fib_index,
473                                            &pfx,
474                                            FIB_SOURCE_CLASSIFY,
475                                            FIB_ENTRY_FLAG_NONE, &dpo);
476           dpo_reset (&dpo);
477         }
478     }
479
480   fib_table_entry_update_one_path (fib_index, &pfx,
481                                    FIB_SOURCE_INTERFACE,
482                                    (FIB_ENTRY_FLAG_CONNECTED |
483                                     FIB_ENTRY_FLAG_LOCAL),
484                                    DPO_PROTO_IP4,
485                                    &pfx.fp_addr,
486                                    sw_if_index,
487                                    // invalid FIB index
488                                    ~0,
489                                    1, NULL,
490                                    FIB_ROUTE_PATH_FLAG_NONE);
491 }
492
493 static void
494 ip4_del_interface_prefix_routes (ip4_main_t * im,
495                                  u32 sw_if_index,
496                                  u32 fib_index,
497                                  ip4_address_t * address,
498                                  u32 address_length)
499 {
500   ip_lookup_main_t *lm = &im->lookup_main;
501   ip_interface_prefix_t *if_prefix;
502
503   ip_interface_prefix_key_t key = {
504     .prefix = {
505       .fp_len = address_length,
506       .fp_proto = FIB_PROTOCOL_IP4,
507       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[address_length],
508     },
509     .sw_if_index = sw_if_index,
510   };
511
512   fib_prefix_t pfx_special = {
513     .fp_len = 32,
514     .fp_proto = FIB_PROTOCOL_IP4,
515   };
516
517   if_prefix = ip_get_interface_prefix (lm, &key);
518   if (!if_prefix)
519     {
520       clib_warning ("Prefix not found while deleting %U",
521                     format_ip4_address_and_length, address, address_length);
522       return;
523     }
524
525   if_prefix->ref_count -= 1;
526
527   /*
528    * Routes need to be adjusted if:
529    * - deleting last intf addr in prefix
530    * - deleting intf addr used as default source address in glean adjacency
531    *
532    * We're done now otherwise
533    */
534   if ((if_prefix->ref_count > 0) &&
535       !pool_is_free_index (lm->if_address_pool, if_prefix->src_ia_index))
536     return;
537
538   /* length <= 30, delete glean route, first address, last address */
539   if (address_length <= 30)
540     {
541
542       /* remove glean route for prefix */
543       pfx_special.fp_addr.ip4 = *address;
544       pfx_special.fp_len = address_length;
545       fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
546
547       /* if no more intf addresses in prefix, remove other special routes */
548       if (!if_prefix->ref_count)
549         {
550           /* first address in prefix */
551           pfx_special.fp_addr.ip4.as_u32 =
552             address->as_u32 & im->fib_masks[address_length];
553           pfx_special.fp_len = 32;
554
555           if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
556           fib_table_entry_special_remove (fib_index,
557                                           &pfx_special,
558                                           FIB_SOURCE_INTERFACE);
559
560           /* prefix broadcast address */
561           pfx_special.fp_addr.ip4.as_u32 =
562             address->as_u32 | ~im->fib_masks[address_length];
563           pfx_special.fp_len = 32;
564
565           if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
566           fib_table_entry_special_remove (fib_index,
567                                           &pfx_special,
568                                           FIB_SOURCE_INTERFACE);
569         }
570       else
571         /* default source addr just got deleted, find another */
572         {
573           ip_interface_address_t *new_src_ia = NULL;
574           ip4_address_t *new_src_addr = NULL;
575
576           new_src_addr =
577             ip4_interface_address_matching_destination
578               (im, address, sw_if_index, &new_src_ia);
579
580           if_prefix->src_ia_index = new_src_ia - lm->if_address_pool;
581
582           pfx_special.fp_len = address_length;
583           pfx_special.fp_addr.ip4 = *new_src_addr;
584
585           /* set new glean route for the prefix */
586           fib_table_entry_update_one_path (fib_index, &pfx_special,
587                                            FIB_SOURCE_INTERFACE,
588                                            (FIB_ENTRY_FLAG_CONNECTED |
589                                             FIB_ENTRY_FLAG_ATTACHED),
590                                            DPO_PROTO_IP4,
591                                            /* No next-hop address */
592                                            NULL,
593                                            sw_if_index,
594                                            /* invalid FIB index */
595                                            ~0,
596                                            1,
597                                            /* no out-label stack */
598                                            NULL,
599                                            FIB_ROUTE_PATH_FLAG_NONE);
600           return;
601         }
602     }
603   /* length == 31, delete attached route for the other address */
604   else if (address_length == 31)
605     {
606       pfx_special.fp_addr.ip4.as_u32 =
607         address->as_u32 ^ clib_host_to_net_u32(1);
608
609       fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
610     }
611
612   mhash_unset (&lm->prefix_to_if_prefix_index, &key, 0 /* old_value */);
613   pool_put (lm->if_prefix_pool, if_prefix);
614 }
615
616 static void
617 ip4_del_interface_routes (u32 sw_if_index,
618                           ip4_main_t * im,
619                           u32 fib_index,
620                           ip4_address_t * address, u32 address_length)
621 {
622   fib_prefix_t pfx = {
623     .fp_len = address_length,
624     .fp_proto = FIB_PROTOCOL_IP4,
625     .fp_addr.ip4 = *address,
626   };
627
628   ip4_del_interface_prefix_routes (im, sw_if_index, fib_index,
629                                    address, address_length);
630
631   pfx.fp_len = 32;
632   fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
633 }
634
635 #ifndef CLIB_MARCH_VARIANT
636 void
637 ip4_sw_interface_enable_disable (u32 sw_if_index, u32 is_enable)
638 {
639   ip4_main_t *im = &ip4_main;
640
641   vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
642
643   /*
644    * enable/disable only on the 1<->0 transition
645    */
646   if (is_enable)
647     {
648       if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
649         return;
650     }
651   else
652     {
653       ASSERT (im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
654       if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
655         return;
656     }
657   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
658                                !is_enable, 0, 0);
659
660
661   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
662                                sw_if_index, !is_enable, 0, 0);
663
664   {
665     ip4_enable_disable_interface_callback_t *cb;
666     vec_foreach (cb, im->enable_disable_interface_callbacks)
667       cb->function (im, cb->function_opaque, sw_if_index, is_enable);
668   }
669 }
670
671 static clib_error_t *
672 ip4_add_del_interface_address_internal (vlib_main_t * vm,
673                                         u32 sw_if_index,
674                                         ip4_address_t * address,
675                                         u32 address_length, u32 is_del)
676 {
677   vnet_main_t *vnm = vnet_get_main ();
678   ip4_main_t *im = &ip4_main;
679   ip_lookup_main_t *lm = &im->lookup_main;
680   clib_error_t *error = 0;
681   u32 if_address_index, elts_before;
682   ip4_address_fib_t ip4_af, *addr_fib = 0;
683
684   /* local0 interface doesn't support IP addressing  */
685   if (sw_if_index == 0)
686     {
687       return
688        clib_error_create ("local0 interface doesn't support IP addressing");
689     }
690
691   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
692   ip4_addr_fib_init (&ip4_af, address,
693                      vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
694   vec_add1 (addr_fib, ip4_af);
695
696   /*
697    * there is no support for adj-fib handling in the presence of overlapping
698    * subnets on interfaces. Easy fix - disallow overlapping subnets, like
699    * most routers do.
700    */
701   /* *INDENT-OFF* */
702   if (!is_del)
703     {
704       /* When adding an address check that it does not conflict
705          with an existing address on any interface in this table. */
706       ip_interface_address_t *ia;
707       vnet_sw_interface_t *sif;
708
709       pool_foreach(sif, vnm->interface_main.sw_interfaces,
710       ({
711           if (im->fib_index_by_sw_if_index[sw_if_index] ==
712               im->fib_index_by_sw_if_index[sif->sw_if_index])
713             {
714               foreach_ip_interface_address
715                 (&im->lookup_main, ia, sif->sw_if_index,
716                  0 /* honor unnumbered */ ,
717                  ({
718                    ip4_address_t * x =
719                      ip_interface_address_get_address
720                      (&im->lookup_main, ia);
721                    if (ip4_destination_matches_route
722                        (im, address, x, ia->address_length) ||
723                        ip4_destination_matches_route (im,
724                                                       x,
725                                                       address,
726                                                       address_length))
727                      {
728                        /* an intf may have >1 addr from the same prefix */
729                        if ((sw_if_index == sif->sw_if_index) &&
730                            (ia->address_length == address_length) &&
731                            (x->as_u32 != address->as_u32))
732                          continue;
733
734                        /* error if the length or intf was different */
735                        vnm->api_errno = VNET_API_ERROR_DUPLICATE_IF_ADDRESS;
736
737                        return
738                          clib_error_create
739                          ("failed to add %U which conflicts with %U for interface %U",
740                           format_ip4_address_and_length, address,
741                           address_length,
742                           format_ip4_address_and_length, x,
743                           ia->address_length,
744                           format_vnet_sw_if_index_name, vnm,
745                           sif->sw_if_index);
746                      }
747                  }));
748             }
749       }));
750     }
751   /* *INDENT-ON* */
752
753   elts_before = pool_elts (lm->if_address_pool);
754
755   error = ip_interface_address_add_del
756     (lm, sw_if_index, addr_fib, address_length, is_del, &if_address_index);
757   if (error)
758     goto done;
759
760   ip4_sw_interface_enable_disable (sw_if_index, !is_del);
761
762   /* intf addr routes are added/deleted on admin up/down */
763   if (vnet_sw_interface_is_admin_up (vnm, sw_if_index))
764     {
765       if (is_del)
766         ip4_del_interface_routes (sw_if_index,
767                                   im, ip4_af.fib_index, address,
768                                   address_length);
769       else
770         ip4_add_interface_routes (sw_if_index,
771                                   im, ip4_af.fib_index,
772                                   pool_elt_at_index
773                                   (lm->if_address_pool, if_address_index));
774     }
775
776   /* If pool did not grow/shrink: add duplicate address. */
777   if (elts_before != pool_elts (lm->if_address_pool))
778     {
779       ip4_add_del_interface_address_callback_t *cb;
780       vec_foreach (cb, im->add_del_interface_address_callbacks)
781         cb->function (im, cb->function_opaque, sw_if_index,
782                       address, address_length, if_address_index, is_del);
783     }
784
785 done:
786   vec_free (addr_fib);
787   return error;
788 }
789
790 clib_error_t *
791 ip4_add_del_interface_address (vlib_main_t * vm,
792                                u32 sw_if_index,
793                                ip4_address_t * address,
794                                u32 address_length, u32 is_del)
795 {
796   return ip4_add_del_interface_address_internal
797     (vm, sw_if_index, address, address_length, is_del);
798 }
799
800 void
801 ip4_directed_broadcast (u32 sw_if_index, u8 enable)
802 {
803   ip_interface_address_t *ia;
804   ip4_main_t *im;
805
806   im = &ip4_main;
807
808   /*
809    * when directed broadcast is enabled, the subnet braodcast route will forward
810    * packets using an adjacency with a broadcast MAC. otherwise it drops
811    */
812   /* *INDENT-OFF* */
813   foreach_ip_interface_address(&im->lookup_main, ia,
814                                sw_if_index, 0,
815      ({
816        if (ia->address_length <= 30)
817          {
818            ip4_address_t *ipa;
819
820            ipa = ip_interface_address_get_address (&im->lookup_main, ia);
821
822            fib_prefix_t pfx = {
823              .fp_len = 32,
824              .fp_proto = FIB_PROTOCOL_IP4,
825              .fp_addr = {
826                .ip4.as_u32 = (ipa->as_u32 | ~im->fib_masks[ia->address_length]),
827              },
828            };
829
830            ip4_add_subnet_bcast_route
831              (fib_table_get_index_for_sw_if_index(FIB_PROTOCOL_IP4,
832                                                   sw_if_index),
833               &pfx, sw_if_index);
834          }
835      }));
836   /* *INDENT-ON* */
837 }
838 #endif
839
840 static clib_error_t *
841 ip4_sw_interface_admin_up_down (vnet_main_t * vnm, u32 sw_if_index, u32 flags)
842 {
843   ip4_main_t *im = &ip4_main;
844   ip_interface_address_t *ia;
845   ip4_address_t *a;
846   u32 is_admin_up, fib_index;
847
848   /* Fill in lookup tables with default table (0). */
849   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
850
851   vec_validate_init_empty (im->
852                            lookup_main.if_address_pool_index_by_sw_if_index,
853                            sw_if_index, ~0);
854
855   is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
856
857   fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
858
859   /* *INDENT-OFF* */
860   foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
861                                 0 /* honor unnumbered */,
862   ({
863     a = ip_interface_address_get_address (&im->lookup_main, ia);
864     if (is_admin_up)
865       ip4_add_interface_routes (sw_if_index,
866                                 im, fib_index,
867                                 ia);
868     else
869       ip4_del_interface_routes (sw_if_index,
870                                 im, fib_index,
871                                 a, ia->address_length);
872   }));
873   /* *INDENT-ON* */
874
875   return 0;
876 }
877
878 VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down);
879
880 /* Built-in ip4 unicast rx feature path definition.
     *
     * The arc "ip4-unicast" lists "ip4-input" and "ip4-input-no-checksum"
     * as its start nodes and "ip4-lookup" as its last node; its index is
     * stored through ip4_main.lookup_main.ucast_feature_arc_index.
     *
     * Read together, the .runs_before entries below chain the features as:
     *   ip4-flow-classify -> ip4-inacl -> ip4-source-check-via-rx ->
     *   ip4-source-check-via-any -> ip4-policer-classify ->
     *   ipsec4-input-feature -> vpath-input-ip4 -> ip4-vxlan-bypass ->
     *   ip4-lookup
     * "ip4-source-and-port-range-check-rx" is only constrained to run
     * before "ip4-policer-classify", and "ip4-not-enabled" only before
     * "ip4-lookup".
     */
881 /* *INDENT-OFF* */
882 VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
883 {
884   .arc_name = "ip4-unicast",
885   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
886   .last_in_arc = "ip4-lookup",
887   .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
888 };
889
890 VNET_FEATURE_INIT (ip4_flow_classify, static) =
891 {
892   .arc_name = "ip4-unicast",
893   .node_name = "ip4-flow-classify",
894   .runs_before = VNET_FEATURES ("ip4-inacl"),
895 };
896
897 VNET_FEATURE_INIT (ip4_inacl, static) =
898 {
899   .arc_name = "ip4-unicast",
900   .node_name = "ip4-inacl",
901   .runs_before = VNET_FEATURES ("ip4-source-check-via-rx"),
902 };
903
904 VNET_FEATURE_INIT (ip4_source_check_1, static) =
905 {
906   .arc_name = "ip4-unicast",
907   .node_name = "ip4-source-check-via-rx",
908   .runs_before = VNET_FEATURES ("ip4-source-check-via-any"),
909 };
910
911 VNET_FEATURE_INIT (ip4_source_check_2, static) =
912 {
913   .arc_name = "ip4-unicast",
914   .node_name = "ip4-source-check-via-any",
915   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
916 };
917
918 VNET_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) =
919 {
920   .arc_name = "ip4-unicast",
921   .node_name = "ip4-source-and-port-range-check-rx",
922   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
923 };
924
925 VNET_FEATURE_INIT (ip4_policer_classify, static) =
926 {
927   .arc_name = "ip4-unicast",
928   .node_name = "ip4-policer-classify",
929   .runs_before = VNET_FEATURES ("ipsec4-input-feature"),
930 };
931
932 VNET_FEATURE_INIT (ip4_ipsec, static) =
933 {
934   .arc_name = "ip4-unicast",
935   .node_name = "ipsec4-input-feature",
936   .runs_before = VNET_FEATURES ("vpath-input-ip4"),
937 };
938
939 VNET_FEATURE_INIT (ip4_vpath, static) =
940 {
941   .arc_name = "ip4-unicast",
942   .node_name = "vpath-input-ip4",
943   .runs_before = VNET_FEATURES ("ip4-vxlan-bypass"),
944 };
945
946 VNET_FEATURE_INIT (ip4_vxlan_bypass, static) =
947 {
948   .arc_name = "ip4-unicast",
949   .node_name = "ip4-vxlan-bypass",
950   .runs_before = VNET_FEATURES ("ip4-lookup"),
951 };
952
    /* ip4_sw_interface_add_del() in this file toggles this feature per
     * interface on both the unicast and the multicast arc. */
953 VNET_FEATURE_INIT (ip4_not_enabled, static) =
954 {
955   .arc_name = "ip4-unicast",
956   .node_name = "ip4-not-enabled",
957   .runs_before = VNET_FEATURES ("ip4-lookup"),
958 };
959
    /* Matches .last_in_arc of the arc declaration above. */
960 VNET_FEATURE_INIT (ip4_lookup, static) =
961 {
962   .arc_name = "ip4-unicast",
963   .node_name = "ip4-lookup",
964   .runs_before = 0,     /* not before any other features */
965 };
966
967 /* Built-in ip4 multicast rx feature path definition.
     *
     * The arc "ip4-multicast" lists the same start nodes as the unicast
     * arc ("ip4-input", "ip4-input-no-checksum"); its last node is
     * "ip4-mfib-forward-lookup" and its index is stored through
     * ip4_main.lookup_main.mcast_feature_arc_index.  Both features below
     * are only constrained to run before "ip4-mfib-forward-lookup".
     */
968 VNET_FEATURE_ARC_INIT (ip4_multicast, static) =
969 {
970   .arc_name = "ip4-multicast",
971   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
972   .last_in_arc = "ip4-mfib-forward-lookup",
973   .arc_index_ptr = &ip4_main.lookup_main.mcast_feature_arc_index,
974 };
975
976 VNET_FEATURE_INIT (ip4_vpath_mc, static) =
977 {
978   .arc_name = "ip4-multicast",
979   .node_name = "vpath-input-ip4",
980   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
981 };
982
983 VNET_FEATURE_INIT (ip4_mc_not_enabled, static) =
984 {
985   .arc_name = "ip4-multicast",
986   .node_name = "ip4-not-enabled",
987   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
988 };
989
990 VNET_FEATURE_INIT (ip4_lookup_mc, static) =
991 {
992   .arc_name = "ip4-multicast",
993   .node_name = "ip4-mfib-forward-lookup",
994   .runs_before = 0,     /* last feature */
995 };
996
997 /* Source and port-range check ip4 tx feature path definition.
     *
     * The arc "ip4-output" lists "ip4-rewrite", "ip4-midchain" and
     * "ip4-dvr-dpo" as start nodes and "interface-output" as its last
     * node; its index is stored through
     * ip4_main.lookup_main.output_feature_arc_index.
     *
     * The .runs_before entries below chain the features as:
     *   ip4-source-and-port-range-check-tx -> ip4-outacl ->
     *   ipsec4-output-feature -> interface-output
     */
998 VNET_FEATURE_ARC_INIT (ip4_output, static) =
999 {
1000   .arc_name = "ip4-output",
1001   .start_nodes = VNET_FEATURES ("ip4-rewrite", "ip4-midchain", "ip4-dvr-dpo"),
1002   .last_in_arc = "interface-output",
1003   .arc_index_ptr = &ip4_main.lookup_main.output_feature_arc_index,
1004 };
1005
1006 VNET_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) =
1007 {
1008   .arc_name = "ip4-output",
1009   .node_name = "ip4-source-and-port-range-check-tx",
1010   .runs_before = VNET_FEATURES ("ip4-outacl"),
1011 };
1012
1013 VNET_FEATURE_INIT (ip4_outacl, static) =
1014 {
1015   .arc_name = "ip4-output",
1016   .node_name = "ip4-outacl",
1017   .runs_before = VNET_FEATURES ("ipsec4-output-feature"),
1018 };
1019
1020 VNET_FEATURE_INIT (ip4_ipsec_output, static) =
1021 {
1022   .arc_name = "ip4-output",
1023   .node_name = "ipsec4-output-feature",
1024   .runs_before = VNET_FEATURES ("interface-output"),
1025 };
1026
1027 /* Built-in ip4 tx feature path definition: the terminal feature of the
      * "ip4-output" arc (matches .last_in_arc above). */
1028 VNET_FEATURE_INIT (ip4_interface_output, static) =
1029 {
1030   .arc_name = "ip4-output",
1031   .node_name = "interface-output",
1032   .runs_before = 0,     /* not before any other features */
1033 };
1034 /* *INDENT-ON* */
1035
1036 static clib_error_t *
1037 ip4_sw_interface_add_del (vnet_main_t * vnm, u32 sw_if_index, u32 is_add)
1038 {
1039   ip4_main_t *im = &ip4_main;
1040
1041   /* Fill in lookup tables with default table (0). */
1042   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
1043   vec_validate (im->mfib_index_by_sw_if_index, sw_if_index);
1044
1045   if (!is_add)
1046     {
1047       ip4_main_t *im4 = &ip4_main;
1048       ip_lookup_main_t *lm4 = &im4->lookup_main;
1049       ip_interface_address_t *ia = 0;
1050       ip4_address_t *address;
1051       vlib_main_t *vm = vlib_get_main ();
1052
1053       vnet_sw_interface_update_unnumbered (sw_if_index, ~0, 0);
1054       /* *INDENT-OFF* */
1055       foreach_ip_interface_address (lm4, ia, sw_if_index, 0,
1056       ({
1057         address = ip_interface_address_get_address (lm4, ia);
1058         ip4_add_del_interface_address(vm, sw_if_index, address, ia->address_length, 1);
1059       }));
1060       /* *INDENT-ON* */
1061     }
1062
1063   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
1064                                is_add, 0, 0);
1065
1066   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
1067                                sw_if_index, is_add, 0, 0);
1068
1069   return /* no error */ 0;
1070 }
1071
1072 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
1073
1074 /* Global IP4 main: the single ip4_main_t instance that the functions in
      * this file reach through &ip4_main.  It is defined only when
      * CLIB_MARCH_VARIANT is not set.
      * NOTE(review): presumably so that per-march-variant compilations of
      * this file do not emit a second definition - confirm. */
1075 #ifndef CLIB_MARCH_VARIANT
1076 ip4_main_t ip4_main;
1077 #endif /* CLIB_MARCH_VARIANT */
1078
1079 static clib_error_t *
1080 ip4_lookup_init (vlib_main_t * vm)
1081 {
1082   ip4_main_t *im = &ip4_main;
1083   clib_error_t *error;
1084   uword i;
1085
1086   if ((error = vlib_call_init_function (vm, vnet_feature_init)))
1087     return error;
1088   if ((error = vlib_call_init_function (vm, ip4_mtrie_module_init)))
1089     return (error);
1090   if ((error = vlib_call_init_function (vm, fib_module_init)))
1091     return error;
1092   if ((error = vlib_call_init_function (vm, mfib_module_init)))
1093     return error;
1094
1095   for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
1096     {
1097       u32 m;
1098
1099       if (i < 32)
1100         m = pow2_mask (i) << (32 - i);
1101       else
1102         m = ~0;
1103       im->fib_masks[i] = clib_host_to_net_u32 (m);
1104     }
1105
1106   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
1107
1108   /* Create FIB with index 0 and table id of 0. */
1109   fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
1110                                      FIB_SOURCE_DEFAULT_ROUTE);
1111   mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
1112                                       MFIB_SOURCE_DEFAULT_ROUTE);
1113
1114   {
1115     pg_node_t *pn;
1116     pn = pg_get_node (ip4_lookup_node.index);
1117     pn->unformat_edit = unformat_pg_ip4_header;
1118   }
1119
1120   {
1121     ethernet_arp_header_t h;
1122
1123     clib_memset (&h, 0, sizeof (h));
1124
1125 #define _16(f,v) h.f = clib_host_to_net_u16 (v);
1126 #define _8(f,v) h.f = v;
1127     _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
1128     _16 (l3_type, ETHERNET_TYPE_IP4);
1129     _8 (n_l2_address_bytes, 6);
1130     _8 (n_l3_address_bytes, 4);
1131     _16 (opcode, ETHERNET_ARP_OPCODE_request);
1132 #undef _16
1133 #undef _8
1134
1135     vlib_packet_template_init (vm, &im->ip4_arp_request_packet_template,
1136                                /* data */ &h,
1137                                sizeof (h),
1138                                /* alloc chunk size */ 8,
1139                                "ip4 arp");
1140   }
1141
1142   return error;
1143 }
1144
1145 VLIB_INIT_FUNCTION (ip4_lookup_init);
1146
1147 typedef struct
1148 {
1149   /* Adjacency taken. */
1150   u32 dpo_index;
1151   u32 flow_hash;
1152   u32 fib_index;
1153
1154   /* Packet data, possibly *after* rewrite. */
1155   u8 packet_data[64 - 1 * sizeof (u32)];
1156 }
1157 ip4_forward_next_trace_t;
1158
1159 #ifndef CLIB_MARCH_VARIANT
1160 u8 *
1161 format_ip4_forward_next_trace (u8 * s, va_list * args)
1162 {
1163   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1164   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1165   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1166   u32 indent = format_get_indent (s);
1167   s = format (s, "%U%U",
1168               format_white_space, indent,
1169               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1170   return s;
1171 }
1172 #endif
1173
1174 static u8 *
1175 format_ip4_lookup_trace (u8 * s, va_list * args)
1176 {
1177   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1178   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1179   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1180   u32 indent = format_get_indent (s);
1181
1182   s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
1183               t->fib_index, t->dpo_index, t->flow_hash);
1184   s = format (s, "\n%U%U",
1185               format_white_space, indent,
1186               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1187   return s;
1188 }
1189
1190 static u8 *
1191 format_ip4_rewrite_trace (u8 * s, va_list * args)
1192 {
1193   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1194   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1195   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1196   u32 indent = format_get_indent (s);
1197
1198   s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
1199               t->fib_index, t->dpo_index, format_ip_adjacency,
1200               t->dpo_index, FORMAT_IP_ADJACENCY_NONE, t->flow_hash);
1201   s = format (s, "\n%U%U",
1202               format_white_space, indent,
1203               format_ip_adjacency_packet_data,
1204               t->dpo_index, t->packet_data, sizeof (t->packet_data));
1205   return s;
1206 }
1207
1208 #ifndef CLIB_MARCH_VARIANT
1209 /* Common trace function for all ip4-forward next nodes. */
1210 void
1211 ip4_forward_next_trace (vlib_main_t * vm,
1212                         vlib_node_runtime_t * node,
1213                         vlib_frame_t * frame, vlib_rx_or_tx_t which_adj_index)
1214 {
1215   u32 *from, n_left;
1216   ip4_main_t *im = &ip4_main;
1217
1218   n_left = frame->n_vectors;
1219   from = vlib_frame_vector_args (frame);
1220
1221   while (n_left >= 4)
1222     {
1223       u32 bi0, bi1;
1224       vlib_buffer_t *b0, *b1;
1225       ip4_forward_next_trace_t *t0, *t1;
1226
1227       /* Prefetch next iteration. */
1228       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1229       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1230
1231       bi0 = from[0];
1232       bi1 = from[1];
1233
1234       b0 = vlib_get_buffer (vm, bi0);
1235       b1 = vlib_get_buffer (vm, bi1);
1236
1237       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1238         {
1239           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1240           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1241           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1242           t0->fib_index =
1243             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1244              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1245             vec_elt (im->fib_index_by_sw_if_index,
1246                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1247
1248           clib_memcpy_fast (t0->packet_data,
1249                             vlib_buffer_get_current (b0),
1250                             sizeof (t0->packet_data));
1251         }
1252       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1253         {
1254           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1255           t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1256           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1257           t1->fib_index =
1258             (vnet_buffer (b1)->sw_if_index[VLIB_TX] !=
1259              (u32) ~ 0) ? vnet_buffer (b1)->sw_if_index[VLIB_TX] :
1260             vec_elt (im->fib_index_by_sw_if_index,
1261                      vnet_buffer (b1)->sw_if_index[VLIB_RX]);
1262           clib_memcpy_fast (t1->packet_data, vlib_buffer_get_current (b1),
1263                             sizeof (t1->packet_data));
1264         }
1265       from += 2;
1266       n_left -= 2;
1267     }
1268
1269   while (n_left >= 1)
1270     {
1271       u32 bi0;
1272       vlib_buffer_t *b0;
1273       ip4_forward_next_trace_t *t0;
1274
1275       bi0 = from[0];
1276
1277       b0 = vlib_get_buffer (vm, bi0);
1278
1279       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1280         {
1281           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1282           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1283           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1284           t0->fib_index =
1285             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1286              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1287             vec_elt (im->fib_index_by_sw_if_index,
1288                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1289           clib_memcpy_fast (t0->packet_data, vlib_buffer_get_current (b0),
1290                             sizeof (t0->packet_data));
1291         }
1292       from += 1;
1293       n_left -= 1;
1294     }
1295 }
1296
1297 /* Compute TCP/UDP/ICMP4 checksum in software. */
1298 u16
1299 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
1300                               ip4_header_t * ip0)
1301 {
1302   ip_csum_t sum0;
1303   u32 ip_header_length, payload_length_host_byte_order;
1304   u32 n_this_buffer, n_bytes_left, n_ip_bytes_this_buffer;
1305   u16 sum16;
1306   u8 *data_this_buffer;
1307   u8 length_odd;
1308
1309   /* Initialize checksum with ip header. */
1310   ip_header_length = ip4_header_bytes (ip0);
1311   payload_length_host_byte_order =
1312     clib_net_to_host_u16 (ip0->length) - ip_header_length;
1313   sum0 =
1314     clib_host_to_net_u32 (payload_length_host_byte_order +
1315                           (ip0->protocol << 16));
1316
1317   if (BITS (uword) == 32)
1318     {
1319       sum0 =
1320         ip_csum_with_carry (sum0,
1321                             clib_mem_unaligned (&ip0->src_address, u32));
1322       sum0 =
1323         ip_csum_with_carry (sum0,
1324                             clib_mem_unaligned (&ip0->dst_address, u32));
1325     }
1326   else
1327     sum0 =
1328       ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
1329
1330   n_bytes_left = n_this_buffer = payload_length_host_byte_order;
1331   data_this_buffer = (u8 *) ip0 + ip_header_length;
1332   n_ip_bytes_this_buffer =
1333     p0->current_length - (((u8 *) ip0 - p0->data) - p0->current_data);
1334   if (n_this_buffer + ip_header_length > n_ip_bytes_this_buffer)
1335     {
1336       n_this_buffer = n_ip_bytes_this_buffer > ip_header_length ?
1337         n_ip_bytes_this_buffer - ip_header_length : 0;
1338     }
1339
1340   while (1)
1341     {
1342       sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
1343       n_bytes_left -= n_this_buffer;
1344       if (n_bytes_left == 0)
1345         break;
1346
1347       ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT);
1348       if (!(p0->flags & VLIB_BUFFER_NEXT_PRESENT))
1349         return 0xfefe;
1350
1351       length_odd = (n_this_buffer & 1);
1352
1353       p0 = vlib_get_buffer (vm, p0->next_buffer);
1354       data_this_buffer = vlib_buffer_get_current (p0);
1355       n_this_buffer = clib_min (p0->current_length, n_bytes_left);
1356
1357       if (PREDICT_FALSE (length_odd))
1358         {
1359           /* Prepend a 0 or the resulting checksum will be incorrect. */
1360           data_this_buffer--;
1361           n_this_buffer++;
1362           n_bytes_left++;
1363           data_this_buffer[0] = 0;
1364         }
1365     }
1366
1367   sum16 = ~ip_csum_fold (sum0);
1368   return sum16;
1369 }
1370
1371 u32
1372 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1373 {
1374   ip4_header_t *ip0 = vlib_buffer_get_current (p0);
1375   udp_header_t *udp0;
1376   u16 sum16;
1377
1378   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1379           || ip0->protocol == IP_PROTOCOL_UDP);
1380
1381   udp0 = (void *) (ip0 + 1);
1382   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1383     {
1384       p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1385                     | VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
1386       return p0->flags;
1387     }
1388
1389   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1390
1391   p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1392                 | ((sum16 == 0) << VNET_BUFFER_F_LOG2_L4_CHECKSUM_CORRECT));
1393
1394   return p0->flags;
1395 }
1396 #endif
1397
1398 /* *INDENT-OFF* */
     /* Feature arc "ip4-local": the only start node is "ip4-local" and the
      * last node is "ip4-local-end-of-arc".  No .arc_index_ptr is given
      * here; ip4_local_set_next_and_error() below reads the index from
      * vnet_feat_arc_ip4_local.feature_arc_index.
      * NOTE(review): presumably that symbol is generated by this macro -
      * confirm. */
1399 VNET_FEATURE_ARC_INIT (ip4_local) =
1400 {
1401   .arc_name  = "ip4-local",
1402   .start_nodes = VNET_FEATURES ("ip4-local"),
1403   .last_in_arc = "ip4-local-end-of-arc",
1404 };
1405 /* *INDENT-ON* */
1406
1407 static inline void
1408 ip4_local_l4_csum_validate (vlib_main_t * vm, vlib_buffer_t * p,
1409                             ip4_header_t * ip, u8 is_udp, u8 * error,
1410                             u8 * good_tcp_udp)
1411 {
1412   u32 flags0;
1413   flags0 = ip4_tcp_udp_validate_checksum (vm, p);
1414   *good_tcp_udp = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0;
1415   if (is_udp)
1416     {
1417       udp_header_t *udp;
1418       u32 ip_len, udp_len;
1419       i32 len_diff;
1420       udp = ip4_next_header (ip);
1421       /* Verify UDP length. */
1422       ip_len = clib_net_to_host_u16 (ip->length);
1423       udp_len = clib_net_to_host_u16 (udp->length);
1424
1425       len_diff = ip_len - udp_len;
1426       *good_tcp_udp &= len_diff >= 0;
1427       *error = len_diff < 0 ? IP4_ERROR_UDP_LENGTH : *error;
1428     }
1429 }
1430
/*
 * Checksum-state predicates on a buffer pointer _b.  Each expands to a
 * fully parenthesised expression yielding 0 or 1, so it can be negated
 * or compared safely.  _b is evaluated more than once: pass an
 * expression without side effects.
 */

/* 1 when _b is flagged for TCP or UDP checksum offload. */
#define ip4_local_csum_is_offloaded(_b)                                 \
    (((_b)->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM)                    \
     || ((_b)->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM))

/* 1 when a TCP/UDP packet still needs a software checksum check, i.e.
 * the checksum is neither already computed nor offloaded. */
#define ip4_local_need_csum_check(is_tcp_udp, _b)                       \
    ((is_tcp_udp)                                                       \
     && !(((_b)->flags & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED)            \
          || ip4_local_csum_is_offloaded (_b)))

/* 1 when the checksum is flagged correct or is offloaded. */
#define ip4_local_csum_is_valid(_b)                                     \
    ((((_b)->flags & VNET_BUFFER_F_L4_CHECKSUM_CORRECT)                 \
      || ip4_local_csum_is_offloaded (_b)) != 0)
1442
1443 static inline void
1444 ip4_local_check_l4_csum (vlib_main_t * vm, vlib_buffer_t * b,
1445                          ip4_header_t * ih, u8 * error)
1446 {
1447   u8 is_udp, is_tcp_udp, good_tcp_udp;
1448
1449   is_udp = ih->protocol == IP_PROTOCOL_UDP;
1450   is_tcp_udp = is_udp || ih->protocol == IP_PROTOCOL_TCP;
1451
1452   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp, b)))
1453     ip4_local_l4_csum_validate (vm, b, ih, is_udp, error, &good_tcp_udp);
1454   else
1455     good_tcp_udp = ip4_local_csum_is_valid (b);
1456
1457   ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1458   *error = (is_tcp_udp && !good_tcp_udp
1459             ? IP4_ERROR_TCP_CHECKSUM + is_udp : *error);
1460 }
1461
1462 static inline void
1463 ip4_local_check_l4_csum_x2 (vlib_main_t * vm, vlib_buffer_t ** b,
1464                             ip4_header_t ** ih, u8 * error)
1465 {
1466   u8 is_udp[2], is_tcp_udp[2], good_tcp_udp[2];
1467
1468   is_udp[0] = ih[0]->protocol == IP_PROTOCOL_UDP;
1469   is_udp[1] = ih[1]->protocol == IP_PROTOCOL_UDP;
1470
1471   is_tcp_udp[0] = is_udp[0] || ih[0]->protocol == IP_PROTOCOL_TCP;
1472   is_tcp_udp[1] = is_udp[1] || ih[1]->protocol == IP_PROTOCOL_TCP;
1473
1474   good_tcp_udp[0] = ip4_local_csum_is_valid (b[0]);
1475   good_tcp_udp[1] = ip4_local_csum_is_valid (b[1]);
1476
1477   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp[0], b[0])
1478                      || ip4_local_need_csum_check (is_tcp_udp[1], b[1])))
1479     {
1480       if (is_tcp_udp[0])
1481         ip4_local_l4_csum_validate (vm, b[0], ih[0], is_udp[0], &error[0],
1482                                     &good_tcp_udp[0]);
1483       if (is_tcp_udp[1])
1484         ip4_local_l4_csum_validate (vm, b[1], ih[1], is_udp[1], &error[1],
1485                                     &good_tcp_udp[1]);
1486     }
1487
1488   error[0] = (is_tcp_udp[0] && !good_tcp_udp[0] ?
1489               IP4_ERROR_TCP_CHECKSUM + is_udp[0] : error[0]);
1490   error[1] = (is_tcp_udp[1] && !good_tcp_udp[1] ?
1491               IP4_ERROR_TCP_CHECKSUM + is_udp[1] : error[1]);
1492 }
1493
1494 static inline void
1495 ip4_local_set_next_and_error (vlib_node_runtime_t * error_node,
1496                               vlib_buffer_t * b, u16 * next, u8 error,
1497                               u8 head_of_feature_arc)
1498 {
1499   u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index;
1500   u32 next_index;
1501
1502   *next = error != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : *next;
1503   b->error = error ? error_node->errors[error] : 0;
1504   if (head_of_feature_arc)
1505     {
1506       next_index = *next;
1507       if (PREDICT_TRUE (error == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
1508         {
1509           vnet_feature_arc_start (arc_index,
1510                                   vnet_buffer (b)->sw_if_index[VLIB_RX],
1511                                   &next_index, b);
1512           *next = next_index;
1513         }
1514     }
1515 }
1516
1517 typedef struct
1518 {
     /* Result of the most recent source-address lookup, reused by
      * ip4_local_check_src() / ip4_local_check_src_x2() for a following
      * packet with the same source address. */
     /* Source address that was last looked up. */
1519   ip4_address_t src;
     /* Load-balance index that lookup produced. */
1520   u32 lbi;
     /* Error code that resulted for that packet. */
1521   u8 error;
     /* While non-zero the cached values are not used and a lookup is
      * forced; ip4_local_inline() initialises it to 1. */
1522   u8 first;
1523 } ip4_local_last_check_t;
1524
1525 static inline void
1526 ip4_local_check_src (vlib_buffer_t * b, ip4_header_t * ip0,
1527                      ip4_local_last_check_t * last_check, u8 * error0)
1528 {
1529   ip4_fib_mtrie_leaf_t leaf0;
1530   ip4_fib_mtrie_t *mtrie0;
1531   const dpo_id_t *dpo0;
1532   load_balance_t *lb0;
1533   u32 lbi0;
1534
1535   vnet_buffer (b)->ip.fib_index =
1536     vnet_buffer (b)->sw_if_index[VLIB_TX] != ~0 ?
1537     vnet_buffer (b)->sw_if_index[VLIB_TX] : vnet_buffer (b)->ip.fib_index;
1538
1539   /*
1540    * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
1541    *  adjacency for the destination address (the local interface address).
1542    * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
1543    *  adjacency for the source address (the remote sender's address)
1544    */
1545   if (PREDICT_FALSE (last_check->first ||
1546                      (last_check->src.as_u32 != ip0->src_address.as_u32)))
1547     {
1548       mtrie0 = &ip4_fib_get (vnet_buffer (b)->ip.fib_index)->mtrie;
1549       leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, &ip0->src_address);
1550       leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1551       leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1552       lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1553
1554       vnet_buffer (b)->ip.adj_index[VLIB_RX] =
1555         vnet_buffer (b)->ip.adj_index[VLIB_TX];
1556       vnet_buffer (b)->ip.adj_index[VLIB_TX] = lbi0;
1557
1558       lb0 = load_balance_get (lbi0);
1559       dpo0 = load_balance_get_bucket_i (lb0, 0);
1560
1561       /*
1562        * Must have a route to source otherwise we drop the packet.
1563        * ip4 broadcasts are accepted, e.g. to make dhcp client work
1564        *
1565        * The checks are:
1566        *  - the source is a recieve => it's from us => bogus, do this
1567        *    first since it sets a different error code.
1568        *  - uRPF check for any route to source - accept if passes.
1569        *  - allow packets destined to the broadcast address from unknown sources
1570        */
1571
1572       *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1573                   && dpo0->dpoi_type == DPO_RECEIVE) ?
1574                  IP4_ERROR_SPOOFED_LOCAL_PACKETS : *error0);
1575       *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1576                   && !fib_urpf_check_size (lb0->lb_urpf)
1577                   && ip0->dst_address.as_u32 != 0xFFFFFFFF) ?
1578                  IP4_ERROR_SRC_LOOKUP_MISS : *error0);
1579
1580       last_check->src.as_u32 = ip0->src_address.as_u32;
1581       last_check->lbi = lbi0;
1582       last_check->error = *error0;
1583     }
1584   else
1585     {
1586       vnet_buffer (b)->ip.adj_index[VLIB_RX] =
1587         vnet_buffer (b)->ip.adj_index[VLIB_TX];
1588       vnet_buffer (b)->ip.adj_index[VLIB_TX] = last_check->lbi;
1589       *error0 = last_check->error;
1590       last_check->first = 0;
1591     }
1592 }
1593
1594 static inline void
1595 ip4_local_check_src_x2 (vlib_buffer_t ** b, ip4_header_t ** ip,
1596                         ip4_local_last_check_t * last_check, u8 * error)
1597 {
1598   ip4_fib_mtrie_leaf_t leaf[2];
1599   ip4_fib_mtrie_t *mtrie[2];
1600   const dpo_id_t *dpo[2];
1601   load_balance_t *lb[2];
1602   u32 not_last_hit;
1603   u32 lbi[2];
1604
1605   not_last_hit = last_check->first;
1606   not_last_hit |= ip[0]->src_address.as_u32 ^ last_check->src.as_u32;
1607   not_last_hit |= ip[1]->src_address.as_u32 ^ last_check->src.as_u32;
1608
1609   vnet_buffer (b[0])->ip.fib_index =
1610     vnet_buffer (b[0])->sw_if_index[VLIB_TX] != ~0 ?
1611     vnet_buffer (b[0])->sw_if_index[VLIB_TX] :
1612     vnet_buffer (b[0])->ip.fib_index;
1613
1614   vnet_buffer (b[1])->ip.fib_index =
1615     vnet_buffer (b[1])->sw_if_index[VLIB_TX] != ~0 ?
1616     vnet_buffer (b[1])->sw_if_index[VLIB_TX] :
1617     vnet_buffer (b[1])->ip.fib_index;
1618
1619   /*
1620    * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
1621    *  adjacency for the destination address (the local interface address).
1622    * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
1623    *  adjacency for the source address (the remote sender's address)
1624    */
1625   if (PREDICT_FALSE (not_last_hit))
1626     {
1627       mtrie[0] = &ip4_fib_get (vnet_buffer (b[0])->ip.fib_index)->mtrie;
1628       mtrie[1] = &ip4_fib_get (vnet_buffer (b[1])->ip.fib_index)->mtrie;
1629
1630       leaf[0] = ip4_fib_mtrie_lookup_step_one (mtrie[0], &ip[0]->src_address);
1631       leaf[1] = ip4_fib_mtrie_lookup_step_one (mtrie[1], &ip[1]->src_address);
1632
1633       leaf[0] = ip4_fib_mtrie_lookup_step (mtrie[0], leaf[0],
1634                                            &ip[0]->src_address, 2);
1635       leaf[1] = ip4_fib_mtrie_lookup_step (mtrie[1], leaf[1],
1636                                            &ip[1]->src_address, 2);
1637
1638       leaf[0] = ip4_fib_mtrie_lookup_step (mtrie[0], leaf[0],
1639                                            &ip[0]->src_address, 3);
1640       leaf[1] = ip4_fib_mtrie_lookup_step (mtrie[1], leaf[1],
1641                                            &ip[1]->src_address, 3);
1642
1643       lbi[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf[0]);
1644       lbi[1] = ip4_fib_mtrie_leaf_get_adj_index (leaf[1]);
1645
1646       vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
1647         vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
1648       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = lbi[0];
1649
1650       vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
1651         vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
1652       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = lbi[1];
1653
1654       lb[0] = load_balance_get (lbi[0]);
1655       lb[1] = load_balance_get (lbi[1]);
1656
1657       dpo[0] = load_balance_get_bucket_i (lb[0], 0);
1658       dpo[1] = load_balance_get_bucket_i (lb[1], 0);
1659
1660       error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1661                    dpo[0]->dpoi_type == DPO_RECEIVE) ?
1662                   IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[0]);
1663       error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1664                    !fib_urpf_check_size (lb[0]->lb_urpf) &&
1665                    ip[0]->dst_address.as_u32 != 0xFFFFFFFF)
1666                   ? IP4_ERROR_SRC_LOOKUP_MISS : error[0]);
1667
1668       error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1669                    dpo[1]->dpoi_type == DPO_RECEIVE) ?
1670                   IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[1]);
1671       error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1672                    !fib_urpf_check_size (lb[1]->lb_urpf) &&
1673                    ip[1]->dst_address.as_u32 != 0xFFFFFFFF)
1674                   ? IP4_ERROR_SRC_LOOKUP_MISS : error[1]);
1675
1676       last_check->src.as_u32 = ip[1]->src_address.as_u32;
1677       last_check->lbi = lbi[1];
1678       last_check->error = error[1];
1679     }
1680   else
1681     {
1682       vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
1683         vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
1684       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = last_check->lbi;
1685
1686       vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
1687         vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
1688       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = last_check->lbi;
1689
1690       error[0] = last_check->error;
1691       error[1] = last_check->error;
1692       last_check->first = 0;
1693     }
1694 }
1695
/* Packet classes returned by ip4_local_classify(). */
enum ip_local_packet_type_e
{
  /* Neither a fragment nor flagged as NATed. */
  IP_LOCAL_PACKET_TYPE_L4 = 0,
  /* Buffer carries VNET_BUFFER_F_IS_NATED. */
  IP_LOCAL_PACKET_TYPE_NAT = 1,
  /* IPv4 fragment; handed to reassembly. */
  IP_LOCAL_PACKET_TYPE_FRAG = 2,
};
1702
1703 /**
1704  * Determine packet type and next node.
1705  *
1706  * The expectation is that all packets that are not L4 will skip
1707  * checksums and source checks.
1708  */
1709 always_inline u8
1710 ip4_local_classify (vlib_buffer_t * b, ip4_header_t * ip, u16 * next)
1711 {
1712   ip_lookup_main_t *lm = &ip4_main.lookup_main;
1713
1714   if (PREDICT_FALSE (ip4_is_fragment (ip)))
1715     {
1716       *next = IP_LOCAL_NEXT_REASSEMBLY;
1717       return IP_LOCAL_PACKET_TYPE_FRAG;
1718     }
1719   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_IS_NATED))
1720     {
1721       *next = lm->local_next_by_ip_protocol[ip->protocol];
1722       return IP_LOCAL_PACKET_TYPE_NAT;
1723     }
1724
1725   *next = lm->local_next_by_ip_protocol[ip->protocol];
1726   return IP_LOCAL_PACKET_TYPE_L4;
1727 }
1728
1729 static inline uword
1730 ip4_local_inline (vlib_main_t * vm,
1731                   vlib_node_runtime_t * node,
1732                   vlib_frame_t * frame, int head_of_feature_arc)
1733 {
1734   u32 *from, n_left_from;
1735   vlib_node_runtime_t *error_node =
1736     vlib_node_get_runtime (vm, ip4_input_node.index);
1737   u16 nexts[VLIB_FRAME_SIZE], *next;
1738   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1739   ip4_header_t *ip[2];
1740   u8 error[2], pt[2];
1741
1742   ip4_local_last_check_t last_check = {
1743     /*
1744      * 0.0.0.0 can appear as the source address of an IP packet,
1745      * as can any other address, hence the need to use the 'first'
1746      * member to make sure the .lbi is initialised for the first
1747      * packet.
1748      */
1749     .src = {.as_u32 = 0},
1750     .lbi = ~0,
1751     .error = IP4_ERROR_UNKNOWN_PROTOCOL,
1752     .first = 1,
1753   };
1754
1755   from = vlib_frame_vector_args (frame);
1756   n_left_from = frame->n_vectors;
1757
1758   if (node->flags & VLIB_NODE_FLAG_TRACE)
1759     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1760
1761   vlib_get_buffers (vm, from, bufs, n_left_from);
1762   b = bufs;
1763   next = nexts;
1764
1765   while (n_left_from >= 6)
1766     {
1767       u8 not_batch = 0;
1768
1769       /* Prefetch next iteration. */
1770       {
1771         vlib_prefetch_buffer_header (b[4], LOAD);
1772         vlib_prefetch_buffer_header (b[5], LOAD);
1773
1774         CLIB_PREFETCH (b[4]->data, CLIB_CACHE_LINE_BYTES, LOAD);
1775         CLIB_PREFETCH (b[5]->data, CLIB_CACHE_LINE_BYTES, LOAD);
1776       }
1777
1778       error[0] = error[1] = IP4_ERROR_UNKNOWN_PROTOCOL;
1779
1780       ip[0] = vlib_buffer_get_current (b[0]);
1781       ip[1] = vlib_buffer_get_current (b[1]);
1782
1783       vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
1784       vnet_buffer (b[1])->l3_hdr_offset = b[1]->current_data;
1785
1786       pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
1787       pt[1] = ip4_local_classify (b[1], ip[1], &next[1]);
1788
1789       not_batch = pt[0] ^ pt[1];
1790
1791       if (head_of_feature_arc == 0 || (pt[0] && not_batch == 0))
1792         goto skip_checks;
1793
1794       if (PREDICT_TRUE (not_batch == 0))
1795         {
1796           ip4_local_check_l4_csum_x2 (vm, b, ip, error);
1797           ip4_local_check_src_x2 (b, ip, &last_check, error);
1798         }
1799       else
1800         {
1801           if (!pt[0])
1802             {
1803               ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
1804               ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
1805             }
1806           if (!pt[1])
1807             {
1808               ip4_local_check_l4_csum (vm, b[1], ip[1], &error[1]);
1809               ip4_local_check_src (b[1], ip[1], &last_check, &error[1]);
1810             }
1811         }
1812
1813     skip_checks:
1814
1815       ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
1816                                     head_of_feature_arc);
1817       ip4_local_set_next_and_error (error_node, b[1], &next[1], error[1],
1818                                     head_of_feature_arc);
1819
1820       b += 2;
1821       next += 2;
1822       n_left_from -= 2;
1823     }
1824
1825   while (n_left_from > 0)
1826     {
1827       error[0] = IP4_ERROR_UNKNOWN_PROTOCOL;
1828
1829       ip[0] = vlib_buffer_get_current (b[0]);
1830       vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
1831       pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
1832
1833       if (head_of_feature_arc == 0 || pt[0])
1834         goto skip_check;
1835
1836       ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
1837       ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
1838
1839     skip_check:
1840
1841       ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
1842                                     head_of_feature_arc);
1843
1844       b += 1;
1845       next += 1;
1846       n_left_from -= 1;
1847     }
1848
1849   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
1850   return frame->n_vectors;
1851 }
1852
1853 VLIB_NODE_FN (ip4_local_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
1854                                vlib_frame_t * frame)
1855 {
1856   return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ );
1857 }
1858
1859 /* *INDENT-OFF* */
1860 VLIB_REGISTER_NODE (ip4_local_node) =
1861 {
1862   .name = "ip4-local",
1863   .vector_size = sizeof (u32),
1864   .format_trace = format_ip4_forward_next_trace,
1865   .n_next_nodes = IP_LOCAL_N_NEXT,
1866   .next_nodes =
1867   {
1868     [IP_LOCAL_NEXT_DROP] = "ip4-drop",
1869     [IP_LOCAL_NEXT_PUNT] = "ip4-punt",
1870     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
1871     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
1872     [IP_LOCAL_NEXT_REASSEMBLY] = "ip4-reassembly",
1873   },
1874 };
1875 /* *INDENT-ON* */
1876
1877
1878 VLIB_NODE_FN (ip4_local_end_of_arc_node) (vlib_main_t * vm,
1879                                           vlib_node_runtime_t * node,
1880                                           vlib_frame_t * frame)
1881 {
1882   return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ );
1883 }
1884
1885 /* *INDENT-OFF* */
1886 VLIB_REGISTER_NODE (ip4_local_end_of_arc_node) = {
1887   .name = "ip4-local-end-of-arc",
1888   .vector_size = sizeof (u32),
1889
1890   .format_trace = format_ip4_forward_next_trace,
1891   .sibling_of = "ip4-local",
1892 };
1893
1894 VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = {
1895   .arc_name = "ip4-local",
1896   .node_name = "ip4-local-end-of-arc",
1897   .runs_before = 0, /* not before any other features */
1898 };
1899 /* *INDENT-ON* */
1900
1901 #ifndef CLIB_MARCH_VARIANT
1902 void
1903 ip4_register_protocol (u32 protocol, u32 node_index)
1904 {
1905   vlib_main_t *vm = vlib_get_main ();
1906   ip4_main_t *im = &ip4_main;
1907   ip_lookup_main_t *lm = &im->lookup_main;
1908
1909   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1910   lm->local_next_by_ip_protocol[protocol] =
1911     vlib_node_add_next (vm, ip4_local_node.index, node_index);
1912 }
1913
1914 void
1915 ip4_unregister_protocol (u32 protocol)
1916 {
1917   ip4_main_t *im = &ip4_main;
1918   ip_lookup_main_t *lm = &im->lookup_main;
1919
1920   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1921   lm->local_next_by_ip_protocol[protocol] = IP_LOCAL_NEXT_PUNT;
1922 }
1923 #endif
1924
1925 static clib_error_t *
1926 show_ip_local_command_fn (vlib_main_t * vm,
1927                           unformat_input_t * input, vlib_cli_command_t * cmd)
1928 {
1929   ip4_main_t *im = &ip4_main;
1930   ip_lookup_main_t *lm = &im->lookup_main;
1931   int i;
1932
1933   vlib_cli_output (vm, "Protocols handled by ip4_local");
1934   for (i = 0; i < ARRAY_LEN (lm->local_next_by_ip_protocol); i++)
1935     {
1936       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
1937         {
1938           u32 node_index = vlib_get_node (vm,
1939                                           ip4_local_node.index)->
1940             next_nodes[lm->local_next_by_ip_protocol[i]];
1941           vlib_cli_output (vm, "%U: %U", format_ip_protocol, i,
1942                            format_vlib_node_name, vm, node_index);
1943         }
1944     }
1945   return 0;
1946 }
1947
1948
1949
1950 /*?
1951  * Display the set of protocols handled by the local IPv4 stack.
1952  *
1953  * @cliexpar
1954  * Example of how to display local protocol table:
1955  * @cliexstart{show ip local}
1956  * Protocols handled by ip4_local
1957  * 1
1958  * 17
1959  * 47
1960  * @cliexend
1961 ?*/
1962 /* *INDENT-OFF* */
1963 VLIB_CLI_COMMAND (show_ip_local, static) =
1964 {
1965   .path = "show ip local",
1966   .function = show_ip_local_command_fn,
1967   .short_help = "show ip local",
1968 };
1969 /* *INDENT-ON* */
1970
1971 always_inline uword
1972 ip4_arp_inline (vlib_main_t * vm,
1973                 vlib_node_runtime_t * node,
1974                 vlib_frame_t * frame, int is_glean)
1975 {
1976   vnet_main_t *vnm = vnet_get_main ();
1977   ip4_main_t *im = &ip4_main;
1978   ip_lookup_main_t *lm = &im->lookup_main;
1979   u32 *from, *to_next_drop;
1980   uword n_left_from, n_left_to_next_drop, next_index;
1981   u32 thread_index = vm->thread_index;
1982   u64 seed;
1983
1984   if (node->flags & VLIB_NODE_FLAG_TRACE)
1985     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1986
1987   seed = throttle_seed (&im->arp_throttle, thread_index, vlib_time_now (vm));
1988
1989   from = vlib_frame_vector_args (frame);
1990   n_left_from = frame->n_vectors;
1991   next_index = node->cached_next_index;
1992   if (next_index == IP4_ARP_NEXT_DROP)
1993     next_index = IP4_ARP_N_NEXT;        /* point to first interface */
1994
1995   while (n_left_from > 0)
1996     {
1997       vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
1998                            to_next_drop, n_left_to_next_drop);
1999
2000       while (n_left_from > 0 && n_left_to_next_drop > 0)
2001         {
2002           u32 pi0, bi0, adj_index0, sw_if_index0;
2003           ip_adjacency_t *adj0;
2004           vlib_buffer_t *p0, *b0;
2005           ip4_address_t resolve0;
2006           ethernet_arp_header_t *h0;
2007           vnet_hw_interface_t *hw_if0;
2008           u64 r0;
2009
2010           pi0 = from[0];
2011           p0 = vlib_get_buffer (vm, pi0);
2012
2013           from += 1;
2014           n_left_from -= 1;
2015           to_next_drop[0] = pi0;
2016           to_next_drop += 1;
2017           n_left_to_next_drop -= 1;
2018
2019           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
2020           adj0 = adj_get (adj_index0);
2021
2022           if (is_glean)
2023             {
2024               /* resolve the packet's destination */
2025               ip4_header_t *ip0 = vlib_buffer_get_current (p0);
2026               resolve0 = ip0->dst_address;
2027             }
2028           else
2029             {
2030               /* resolve the incomplete adj */
2031               resolve0 = adj0->sub_type.nbr.next_hop.ip4;
2032             }
2033
2034           /* combine the address and interface for the hash key */
2035           sw_if_index0 = adj0->rewrite_header.sw_if_index;
2036           r0 = (u64) resolve0.data_u32 << 32;
2037           r0 |= sw_if_index0;
2038
2039           if (throttle_check (&im->arp_throttle, thread_index, r0, seed))
2040             {
2041               p0->error = node->errors[IP4_ARP_ERROR_THROTTLED];
2042               continue;
2043             }
2044
2045           /*
2046            * the adj has been updated to a rewrite but the node the DPO that got
2047            * us here hasn't - yet. no big deal. we'll drop while we wait.
2048            */
2049           if (IP_LOOKUP_NEXT_REWRITE == adj0->lookup_next_index)
2050             {
2051               p0->error = node->errors[IP4_ARP_ERROR_RESOLVED];
2052               continue;
2053             }
2054
2055           /*
2056            * Can happen if the control-plane is programming tables
2057            * with traffic flowing; at least that's today's lame excuse.
2058            */
2059           if ((is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN)
2060               || (!is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP))
2061             {
2062               p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
2063               continue;
2064             }
2065           /* Send ARP request. */
2066           h0 =
2067             vlib_packet_template_get_packet (vm,
2068                                              &im->ip4_arp_request_packet_template,
2069                                              &bi0);
2070           /* Seems we're out of buffers */
2071           if (PREDICT_FALSE (!h0))
2072             {
2073               p0->error = node->errors[IP4_ARP_ERROR_NO_BUFFERS];
2074               continue;
2075             }
2076
2077           b0 = vlib_get_buffer (vm, bi0);
2078
2079           /* copy the persistent fields from the original */
2080           clib_memcpy_fast (b0->opaque2, p0->opaque2, sizeof (p0->opaque2));
2081
2082           /* Add rewrite/encap string for ARP packet. */
2083           vnet_rewrite_one_header (adj0[0], h0, sizeof (ethernet_header_t));
2084
2085           hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
2086
2087           /* Src ethernet address in ARP header. */
2088           mac_address_from_bytes (&h0->ip4_over_ethernet[0].mac,
2089                                   hw_if0->hw_address);
2090           if (is_glean)
2091             {
2092               /* The interface's source address is stashed in the Glean Adj */
2093               h0->ip4_over_ethernet[0].ip4 =
2094                 adj0->sub_type.glean.receive_addr.ip4;
2095             }
2096           else
2097             {
2098               /* Src IP address in ARP header. */
2099               if (ip4_src_address_for_packet (lm, sw_if_index0,
2100                                               &h0->ip4_over_ethernet[0].ip4))
2101                 {
2102                   /* No source address available */
2103                   p0->error = node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
2104                   vlib_buffer_free (vm, &bi0, 1);
2105                   continue;
2106                 }
2107             }
2108           h0->ip4_over_ethernet[1].ip4 = resolve0;
2109
2110           p0->error = node->errors[IP4_ARP_ERROR_REQUEST_SENT];
2111
2112           vlib_buffer_copy_trace_flag (vm, p0, bi0);
2113           VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
2114           vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
2115
2116           vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
2117
2118           vlib_set_next_frame_buffer (vm, node,
2119                                       adj0->rewrite_header.next_index, bi0);
2120         }
2121
2122       vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
2123     }
2124
2125   return frame->n_vectors;
2126 }
2127
2128 VLIB_NODE_FN (ip4_arp_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2129                              vlib_frame_t * frame)
2130 {
2131   return (ip4_arp_inline (vm, node, frame, 0));
2132 }
2133
2134 VLIB_NODE_FN (ip4_glean_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2135                                vlib_frame_t * frame)
2136 {
2137   return (ip4_arp_inline (vm, node, frame, 1));
2138 }
2139
2140 static char *ip4_arp_error_strings[] = {
2141   [IP4_ARP_ERROR_THROTTLED] = "ARP requests throttled",
2142   [IP4_ARP_ERROR_RESOLVED] = "ARP requests resolved",
2143   [IP4_ARP_ERROR_NO_BUFFERS] = "ARP requests out of buffer",
2144   [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
2145   [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
2146   [IP4_ARP_ERROR_NO_SOURCE_ADDRESS] = "no source address for ARP request",
2147 };
2148
2149 /* *INDENT-OFF* */
2150 VLIB_REGISTER_NODE (ip4_arp_node) =
2151 {
2152   .name = "ip4-arp",
2153   .vector_size = sizeof (u32),
2154   .format_trace = format_ip4_forward_next_trace,
2155   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
2156   .error_strings = ip4_arp_error_strings,
2157   .n_next_nodes = IP4_ARP_N_NEXT,
2158   .next_nodes =
2159   {
2160     [IP4_ARP_NEXT_DROP] = "error-drop",
2161   },
2162 };
2163
2164 VLIB_REGISTER_NODE (ip4_glean_node) =
2165 {
2166   .name = "ip4-glean",
2167   .vector_size = sizeof (u32),
2168   .format_trace = format_ip4_forward_next_trace,
2169   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
2170   .error_strings = ip4_arp_error_strings,
2171   .n_next_nodes = IP4_ARP_N_NEXT,
2172   .next_nodes = {
2173   [IP4_ARP_NEXT_DROP] = "error-drop",
2174   },
2175 };
2176 /* *INDENT-ON* */
2177
/*
 * X-macro listing the IP4_ARP_ERROR_* suffixes whose drops are excluded
 * from pcap drop tracing. The user defines _() before expanding it.
 */
#define foreach_notrace_ip4_arp_error \
  _(THROTTLED)                        \
  _(RESOLVED)                         \
  _(NO_BUFFERS)                       \
  _(REQUEST_SENT)                     \
  _(NON_ARP_ADJ)                      \
  _(NO_SOURCE_ADDRESS)
2185
2186 static clib_error_t *
2187 arp_notrace_init (vlib_main_t * vm)
2188 {
2189   vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, ip4_arp_node.index);
2190
2191   /* don't trace ARP request packets */
2192 #define _(a)                                    \
2193     vnet_pcap_drop_trace_filter_add_del         \
2194         (rt->errors[IP4_ARP_ERROR_##a],         \
2195          1 /* is_add */);
2196   foreach_notrace_ip4_arp_error;
2197 #undef _
2198   return 0;
2199 }
2200
2201 VLIB_INIT_FUNCTION (arp_notrace_init);
2202
2203
2204 #ifndef CLIB_MARCH_VARIANT
2205 /* Send an ARP request to see if given destination is reachable on given interface. */
2206 clib_error_t *
2207 ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index,
2208                     u8 refresh)
2209 {
2210   vnet_main_t *vnm = vnet_get_main ();
2211   ip4_main_t *im = &ip4_main;
2212   ethernet_arp_header_t *h;
2213   ip4_address_t *src;
2214   ip_interface_address_t *ia;
2215   ip_adjacency_t *adj;
2216   vnet_hw_interface_t *hi;
2217   vnet_sw_interface_t *si;
2218   vlib_buffer_t *b;
2219   adj_index_t ai;
2220   u32 bi = 0;
2221   u8 unicast_rewrite = 0;
2222
2223   si = vnet_get_sw_interface (vnm, sw_if_index);
2224
2225   if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
2226     {
2227       return clib_error_return (0, "%U: interface %U down",
2228                                 format_ip4_address, dst,
2229                                 format_vnet_sw_if_index_name, vnm,
2230                                 sw_if_index);
2231     }
2232
2233   src =
2234     ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
2235   if (!src)
2236     {
2237       vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
2238       return clib_error_return
2239         (0,
2240          "no matching interface address for destination %U (interface %U)",
2241          format_ip4_address, dst, format_vnet_sw_if_index_name, vnm,
2242          sw_if_index);
2243     }
2244
2245   h = vlib_packet_template_get_packet (vm,
2246                                        &im->ip4_arp_request_packet_template,
2247                                        &bi);
2248
2249   if (!h)
2250     return clib_error_return (0, "ARP request packet allocation failed");
2251
2252   hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
2253   if (PREDICT_FALSE (!hi->hw_address))
2254     {
2255       return clib_error_return (0, "%U: interface %U do not support ip probe",
2256                                 format_ip4_address, dst,
2257                                 format_vnet_sw_if_index_name, vnm,
2258                                 sw_if_index);
2259     }
2260
2261   mac_address_from_bytes (&h->ip4_over_ethernet[0].mac, hi->hw_address);
2262
2263   h->ip4_over_ethernet[0].ip4 = src[0];
2264   h->ip4_over_ethernet[1].ip4 = dst[0];
2265
2266   b = vlib_get_buffer (vm, bi);
2267   vnet_buffer (b)->sw_if_index[VLIB_RX] =
2268     vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
2269
2270   ip46_address_t nh = {
2271     .ip4 = *dst,
2272   };
2273
2274   ai = adj_nbr_add_or_lock (FIB_PROTOCOL_IP4,
2275                             VNET_LINK_IP4, &nh, sw_if_index);
2276   adj = adj_get (ai);
2277
2278   /* Peer has been previously resolved, retrieve glean adj instead */
2279   if (adj->lookup_next_index == IP_LOOKUP_NEXT_REWRITE)
2280     {
2281       if (refresh)
2282         unicast_rewrite = 1;
2283       else
2284         {
2285           adj_unlock (ai);
2286           ai = adj_glean_add_or_lock (FIB_PROTOCOL_IP4,
2287                                       VNET_LINK_IP4, sw_if_index, &nh);
2288           adj = adj_get (ai);
2289         }
2290     }
2291
2292   /* Add encapsulation string for software interface (e.g. ethernet header). */
2293   vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
2294   if (unicast_rewrite)
2295     {
2296       u16 *etype = vlib_buffer_get_current (b) - 2;
2297       etype[0] = clib_host_to_net_u16 (ETHERNET_TYPE_ARP);
2298     }
2299   vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
2300
2301   {
2302     vlib_frame_t *f = vlib_get_frame_to_node (vm, hi->output_node_index);
2303     u32 *to_next = vlib_frame_vector_args (f);
2304     to_next[0] = bi;
2305     f->n_vectors = 1;
2306     vlib_put_frame_to_node (vm, hi->output_node_index, f);
2307   }
2308
2309   adj_unlock (ai);
2310   return /* no error */ 0;
2311 }
2312 #endif
2313
/** Static next nodes of the ip4 rewrite nodes. */
typedef enum
{
  IP4_REWRITE_NEXT_DROP = 0,
  IP4_REWRITE_NEXT_ICMP_ERROR = 1,
  IP4_REWRITE_NEXT_FRAGMENT = 2,
  IP4_REWRITE_N_NEXT = 3,	/* Last: number of static nexts */
} ip4_rewrite_next_t;
2321
/**
 * The bits of an IPv4 address to keep when constructing a multicast
 * MAC address: the low-order 23 bits of the address.
 *
 * The mask is applied to the address as it sits in memory (network byte
 * order), so the little-endian value is the byte-swapped form of the
 * big-endian one.
 */
#if CLIB_ARCH_IS_BIG_ENDIAN
#define IP4_MCAST_ADDR_MASK 0x007fffff
#else
#define IP4_MCAST_ADDR_MASK 0xffff7f00
#endif
2331
2332 always_inline void
2333 ip4_mtu_check (vlib_buffer_t * b, u16 packet_len,
2334                u16 adj_packet_bytes, bool df, u16 * next, u32 * error)
2335 {
2336   if (packet_len > adj_packet_bytes)
2337     {
2338       *error = IP4_ERROR_MTU_EXCEEDED;
2339       if (df)
2340         {
2341           icmp4_error_set_vnet_buffer
2342             (b, ICMP4_destination_unreachable,
2343              ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
2344              adj_packet_bytes);
2345           *next = IP4_REWRITE_NEXT_ICMP_ERROR;
2346         }
2347       else
2348         {
2349           /* IP fragmentation */
2350           ip_frag_set_vnet_buffer (b, adj_packet_bytes,
2351                                    IP4_FRAG_NEXT_IP4_REWRITE, 0);
2352           *next = IP4_REWRITE_NEXT_FRAGMENT;
2353         }
2354     }
2355 }
2356
2357 /* Decrement TTL & update checksum.
2358    Works either endian, so no need for byte swap. */
2359 static_always_inline void
2360 ip4_ttl_and_checksum_check (vlib_buffer_t * b, ip4_header_t * ip, u16 * next,
2361                             u32 * error)
2362 {
2363   i32 ttl;
2364   u32 checksum;
2365   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
2366     {
2367       b->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED;
2368       return;
2369     }
2370
2371   ttl = ip->ttl;
2372
2373   /* Input node should have reject packets with ttl 0. */
2374   ASSERT (ip->ttl > 0);
2375
2376   checksum = ip->checksum + clib_host_to_net_u16 (0x0100);
2377   checksum += checksum >= 0xffff;
2378
2379   ip->checksum = checksum;
2380   ttl -= 1;
2381   ip->ttl = ttl;
2382
2383   /*
2384    * If the ttl drops below 1 when forwarding, generate
2385    * an ICMP response.
2386    */
2387   if (PREDICT_FALSE (ttl <= 0))
2388     {
2389       *error = IP4_ERROR_TIME_EXPIRED;
2390       vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~ 0;
2391       icmp4_error_set_vnet_buffer (b, ICMP4_time_exceeded,
2392                                    ICMP4_time_exceeded_ttl_exceeded_in_transit,
2393                                    0);
2394       *next = IP4_REWRITE_NEXT_ICMP_ERROR;
2395     }
2396
2397   /* Verify checksum. */
2398   ASSERT ((ip->checksum == ip4_header_checksum (ip)) ||
2399           (b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM));
2400 }
2401
2402
2403 always_inline uword
2404 ip4_rewrite_inline_with_gso (vlib_main_t * vm,
2405                              vlib_node_runtime_t * node,
2406                              vlib_frame_t * frame,
2407                              int do_counters, int is_midchain, int is_mcast,
2408                              int do_gso)
2409 {
2410   ip_lookup_main_t *lm = &ip4_main.lookup_main;
2411   u32 *from = vlib_frame_vector_args (frame);
2412   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
2413   u16 nexts[VLIB_FRAME_SIZE], *next;
2414   u32 n_left_from;
2415   vlib_node_runtime_t *error_node =
2416     vlib_node_get_runtime (vm, ip4_input_node.index);
2417
2418   n_left_from = frame->n_vectors;
2419   u32 thread_index = vm->thread_index;
2420
2421   vlib_get_buffers (vm, from, bufs, n_left_from);
2422   clib_memset_u16 (nexts, IP4_REWRITE_NEXT_DROP, n_left_from);
2423
2424 #if (CLIB_N_PREFETCHES >= 8)
2425   if (n_left_from >= 6)
2426     {
2427       int i;
2428       for (i = 2; i < 6; i++)
2429         vlib_prefetch_buffer_header (bufs[i], LOAD);
2430     }
2431
2432   next = nexts;
2433   b = bufs;
2434   while (n_left_from >= 8)
2435     {
2436       ip_adjacency_t *adj0, *adj1;
2437       ip4_header_t *ip0, *ip1;
2438       u32 rw_len0, error0, adj_index0;
2439       u32 rw_len1, error1, adj_index1;
2440       u32 tx_sw_if_index0, tx_sw_if_index1;
2441       u8 *p;
2442
2443       vlib_prefetch_buffer_header (b[6], LOAD);
2444       vlib_prefetch_buffer_header (b[7], LOAD);
2445
2446       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2447       adj_index1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
2448
2449       /*
2450        * pre-fetch the per-adjacency counters
2451        */
2452       if (do_counters)
2453         {
2454           vlib_prefetch_combined_counter (&adjacency_counters,
2455                                           thread_index, adj_index0);
2456           vlib_prefetch_combined_counter (&adjacency_counters,
2457                                           thread_index, adj_index1);
2458         }
2459
2460       ip0 = vlib_buffer_get_current (b[0]);
2461       ip1 = vlib_buffer_get_current (b[1]);
2462
2463       error0 = error1 = IP4_ERROR_NONE;
2464
2465       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2466       ip4_ttl_and_checksum_check (b[1], ip1, next + 1, &error1);
2467
2468       /* Rewrite packet header and updates lengths. */
2469       adj0 = adj_get (adj_index0);
2470       adj1 = adj_get (adj_index1);
2471
2472       /* Worth pipelining. No guarantee that adj0,1 are hot... */
2473       rw_len0 = adj0[0].rewrite_header.data_bytes;
2474       rw_len1 = adj1[0].rewrite_header.data_bytes;
2475       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2476       vnet_buffer (b[1])->ip.save_rewrite_length = rw_len1;
2477
2478       p = vlib_buffer_get_current (b[2]);
2479       CLIB_PREFETCH (p - CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES, STORE);
2480       CLIB_PREFETCH (p, CLIB_CACHE_LINE_BYTES, LOAD);
2481
2482       p = vlib_buffer_get_current (b[3]);
2483       CLIB_PREFETCH (p - CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES, STORE);
2484       CLIB_PREFETCH (p, CLIB_CACHE_LINE_BYTES, LOAD);
2485
2486       /* Check MTU of outgoing interface. */
2487       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2488       u16 ip1_len = clib_net_to_host_u16 (ip1->length);
2489
2490       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2491         ip0_len = gso_mtu_sz (b[0]);
2492       if (do_gso && (b[1]->flags & VNET_BUFFER_F_GSO))
2493         ip1_len = gso_mtu_sz (b[1]);
2494
2495       ip4_mtu_check (b[0], ip0_len,
2496                      adj0[0].rewrite_header.max_l3_packet_bytes,
2497                      ip0->flags_and_fragment_offset &
2498                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2499                      next + 0, &error0);
2500       ip4_mtu_check (b[1], ip1_len,
2501                      adj1[0].rewrite_header.max_l3_packet_bytes,
2502                      ip1->flags_and_fragment_offset &
2503                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2504                      next + 1, &error1);
2505
2506       if (is_mcast)
2507         {
2508           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2509                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2510                     IP4_ERROR_SAME_INTERFACE : error0);
2511           error1 = ((adj1[0].rewrite_header.sw_if_index ==
2512                      vnet_buffer (b[1])->sw_if_index[VLIB_RX]) ?
2513                     IP4_ERROR_SAME_INTERFACE : error1);
2514         }
2515
2516       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2517        * to see the IP header */
2518       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2519         {
2520           u32 next_index = adj0[0].rewrite_header.next_index;
2521           vlib_buffer_advance (b[0], -(word) rw_len0);
2522           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2523           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2524
2525           if (PREDICT_FALSE
2526               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2527             vnet_feature_arc_start (lm->output_feature_arc_index,
2528                                     tx_sw_if_index0, &next_index, b[0]);
2529           next[0] = next_index;
2530         }
2531       else
2532         {
2533           b[0]->error = error_node->errors[error0];
2534         }
2535       if (PREDICT_TRUE (error1 == IP4_ERROR_NONE))
2536         {
2537           u32 next_index = adj1[0].rewrite_header.next_index;
2538           vlib_buffer_advance (b[1], -(word) rw_len1);
2539
2540           tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
2541           vnet_buffer (b[1])->sw_if_index[VLIB_TX] = tx_sw_if_index1;
2542
2543           if (PREDICT_FALSE
2544               (adj1[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2545             vnet_feature_arc_start (lm->output_feature_arc_index,
2546                                     tx_sw_if_index1, &next_index, b[1]);
2547           next[1] = next_index;
2548         }
2549       else
2550         {
2551           b[1]->error = error_node->errors[error1];
2552         }
2553       if (is_midchain)
2554         {
2555           calc_checksums (vm, b[0]);
2556           calc_checksums (vm, b[1]);
2557         }
2558       /* Guess we are only writing on simple Ethernet header. */
2559       vnet_rewrite_two_headers (adj0[0], adj1[0],
2560                                 ip0, ip1, sizeof (ethernet_header_t));
2561
2562       /*
2563        * Bump the per-adjacency counters
2564        */
2565       if (do_counters)
2566         {
2567           vlib_increment_combined_counter
2568             (&adjacency_counters,
2569              thread_index,
2570              adj_index0, 1, vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2571
2572           vlib_increment_combined_counter
2573             (&adjacency_counters,
2574              thread_index,
2575              adj_index1, 1, vlib_buffer_length_in_chain (vm, b[1]) + rw_len1);
2576         }
2577
2578       if (is_midchain)
2579         {
2580           if (adj0->sub_type.midchain.fixup_func)
2581             adj0->sub_type.midchain.fixup_func
2582               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2583           if (adj1->sub_type.midchain.fixup_func)
2584             adj1->sub_type.midchain.fixup_func
2585               (vm, adj1, b[1], adj1->sub_type.midchain.fixup_data);
2586         }
2587
2588       if (is_mcast)
2589         {
2590           /*
2591            * copy bytes from the IP address into the MAC rewrite
2592            */
2593           vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2594                                       adj0->rewrite_header.dst_mcast_offset,
2595                                       &ip0->dst_address.as_u32, (u8 *) ip0);
2596           vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2597                                       adj1->rewrite_header.dst_mcast_offset,
2598                                       &ip1->dst_address.as_u32, (u8 *) ip1);
2599         }
2600
2601       next += 2;
2602       b += 2;
2603       n_left_from -= 2;
2604     }
2605 #elif (CLIB_N_PREFETCHES >= 4)
2606   next = nexts;
2607   b = bufs;
2608   while (n_left_from >= 1)
2609     {
2610       ip_adjacency_t *adj0;
2611       ip4_header_t *ip0;
2612       u32 rw_len0, error0, adj_index0;
2613       u32 tx_sw_if_index0;
2614       u8 *p;
2615
2616       /* Prefetch next iteration */
2617       if (PREDICT_TRUE (n_left_from >= 4))
2618         {
2619           ip_adjacency_t *adj2;
2620           u32 adj_index2;
2621
2622           vlib_prefetch_buffer_header (b[3], LOAD);
2623           vlib_prefetch_buffer_data (b[2], LOAD);
2624
2625           /* Prefetch adj->rewrite_header */
2626           adj_index2 = vnet_buffer (b[2])->ip.adj_index[VLIB_TX];
2627           adj2 = adj_get (adj_index2);
2628           p = (u8 *) adj2;
2629           CLIB_PREFETCH (p + CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES,
2630                          LOAD);
2631         }
2632
2633       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2634
2635       /*
2636        * Prefetch the per-adjacency counters
2637        */
2638       if (do_counters)
2639         {
2640           vlib_prefetch_combined_counter (&adjacency_counters,
2641                                           thread_index, adj_index0);
2642         }
2643
2644       ip0 = vlib_buffer_get_current (b[0]);
2645
2646       error0 = IP4_ERROR_NONE;
2647
2648       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2649
2650       /* Rewrite packet header and updates lengths. */
2651       adj0 = adj_get (adj_index0);
2652
2653       /* Rewrite header was prefetched. */
2654       rw_len0 = adj0[0].rewrite_header.data_bytes;
2655       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2656
2657       /* Check MTU of outgoing interface. */
2658       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2659
2660       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2661         ip0_len = gso_mtu_sz (b[0]);
2662
2663       ip4_mtu_check (b[0], ip0_len,
2664                      adj0[0].rewrite_header.max_l3_packet_bytes,
2665                      ip0->flags_and_fragment_offset &
2666                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2667                      next + 0, &error0);
2668
2669       if (is_mcast)
2670         {
2671           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2672                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2673                     IP4_ERROR_SAME_INTERFACE : error0);
2674         }
2675
2676       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2677        * to see the IP header */
2678       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2679         {
2680           u32 next_index = adj0[0].rewrite_header.next_index;
2681           vlib_buffer_advance (b[0], -(word) rw_len0);
2682           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2683           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2684
2685           if (PREDICT_FALSE
2686               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2687             vnet_feature_arc_start (lm->output_feature_arc_index,
2688                                     tx_sw_if_index0, &next_index, b[0]);
2689           next[0] = next_index;
2690         }
2691       else
2692         {
2693           b[0]->error = error_node->errors[error0];
2694         }
2695       if (is_midchain)
2696         {
2697           calc_checksums (vm, b[0]);
2698         }
2699       /* Guess we are only writing on simple Ethernet header. */
2700       vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
2701
2702       /*
2703        * Bump the per-adjacency counters
2704        */
2705       if (do_counters)
2706         {
2707           vlib_increment_combined_counter
2708             (&adjacency_counters,
2709              thread_index,
2710              adj_index0, 1, vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2711         }
2712
2713       if (is_midchain)
2714         {
2715           if (adj0->sub_type.midchain.fixup_func)
2716             adj0->sub_type.midchain.fixup_func
2717               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2718         }
2719
2720       if (is_mcast)
2721         {
2722           /*
2723            * copy bytes from the IP address into the MAC rewrite
2724            */
2725           vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2726                                       adj0->rewrite_header.dst_mcast_offset,
2727                                       &ip0->dst_address.as_u32, (u8 *) ip0);
2728         }
2729
2730       next += 1;
2731       b += 1;
2732       n_left_from -= 1;
2733     }
2734 #endif
2735
2736   while (n_left_from > 0)
2737     {
2738       ip_adjacency_t *adj0;
2739       ip4_header_t *ip0;
2740       u32 rw_len0, adj_index0, error0;
2741       u32 tx_sw_if_index0;
2742
2743       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2744
2745       adj0 = adj_get (adj_index0);
2746
2747       if (do_counters)
2748         vlib_prefetch_combined_counter (&adjacency_counters,
2749                                         thread_index, adj_index0);
2750
2751       ip0 = vlib_buffer_get_current (b[0]);
2752
2753       error0 = IP4_ERROR_NONE;
2754
2755       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2756
2757
2758       /* Update packet buffer attributes/set output interface. */
2759       rw_len0 = adj0[0].rewrite_header.data_bytes;
2760       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2761
2762       /* Check MTU of outgoing interface. */
2763       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2764       if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
2765         ip0_len = gso_mtu_sz (b[0]);
2766
2767       ip4_mtu_check (b[0], ip0_len,
2768                      adj0[0].rewrite_header.max_l3_packet_bytes,
2769                      ip0->flags_and_fragment_offset &
2770                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2771                      next + 0, &error0);
2772
2773       if (is_mcast)
2774         {
2775           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2776                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2777                     IP4_ERROR_SAME_INTERFACE : error0);
2778         }
2779
2780       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2781        * to see the IP header */
2782       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2783         {
2784           u32 next_index = adj0[0].rewrite_header.next_index;
2785           vlib_buffer_advance (b[0], -(word) rw_len0);
2786           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2787           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2788
2789           if (PREDICT_FALSE
2790               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2791             vnet_feature_arc_start (lm->output_feature_arc_index,
2792                                     tx_sw_if_index0, &next_index, b[0]);
2793           next[0] = next_index;
2794         }
2795       else
2796         {
2797           b[0]->error = error_node->errors[error0];
2798         }
2799       if (is_midchain)
2800         {
2801           calc_checksums (vm, b[0]);
2802         }
2803       /* Guess we are only writing on simple Ethernet header. */
2804       vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
2805
2806       if (do_counters)
2807         vlib_increment_combined_counter
2808           (&adjacency_counters,
2809            thread_index, adj_index0, 1,
2810            vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2811
2812       if (is_midchain)
2813         {
2814           if (adj0->sub_type.midchain.fixup_func)
2815             adj0->sub_type.midchain.fixup_func
2816               (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
2817         }
2818
2819       if (is_mcast)
2820         {
2821           /*
2822            * copy bytes from the IP address into the MAC rewrite
2823            */
2824           vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2825                                       adj0->rewrite_header.dst_mcast_offset,
2826                                       &ip0->dst_address.as_u32, (u8 *) ip0);
2827         }
2828
2829       next += 1;
2830       b += 1;
2831       n_left_from -= 1;
2832     }
2833
2834
2835   /* Need to do trace after rewrites to pick up new packet data. */
2836   if (node->flags & VLIB_NODE_FLAG_TRACE)
2837     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2838
2839   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
2840   return frame->n_vectors;
2841 }
2842
2843 always_inline uword
2844 ip4_rewrite_inline (vlib_main_t * vm,
2845                     vlib_node_runtime_t * node,
2846                     vlib_frame_t * frame,
2847                     int do_counters, int is_midchain, int is_mcast)
2848 {
2849   vnet_main_t *vnm = vnet_get_main ();
2850   if (PREDICT_FALSE (vnm->interface_main.gso_interface_count > 0))
2851     return ip4_rewrite_inline_with_gso (vm, node, frame, do_counters,
2852                                         is_midchain, is_mcast,
2853                                         1 /* do_gso */ );
2854   else
2855     return ip4_rewrite_inline_with_gso (vm, node, frame, do_counters,
2856                                         is_midchain, is_mcast,
2857                                         0 /* no do_gso */ );
2858 }
2859
2860
2861 /** @brief IPv4 rewrite node.
2862     @node ip4-rewrite
2863
2864     This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
2865     header checksum, fetch the ip adjacency, check the outbound mtu,
2866     apply the adjacency rewrite, and send pkts to the adjacency
2867     rewrite header's rewrite_next_index.
2868
2869     @param vm vlib_main_t corresponding to the current thread
2870     @param node vlib_node_runtime_t
2871     @param frame vlib_frame_t whose contents should be dispatched
2872
2873     @par Graph mechanics: buffer metadata, next index usage
2874
2875     @em Uses:
2876     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
2877         - the rewrite adjacency index
2878     - <code>adj->lookup_next_index</code>
2879         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2880           the packet will be dropped.
2881     - <code>adj->rewrite_header</code>
2882         - Rewrite string length, rewrite string, next_index
2883
2884     @em Sets:
2885     - <code>b->current_data, b->current_length</code>
2886         - Updated net of applying the rewrite string
2887
2888     <em>Next Indices:</em>
2889     - <code> adj->rewrite_header.next_index </code>
2890       or @c ip4-drop
2891 */
2892
2893 VLIB_NODE_FN (ip4_rewrite_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2894                                  vlib_frame_t * frame)
2895 {
2896   if (adj_are_counters_enabled ())
2897     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2898   else
2899     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2900 }
2901
2902 VLIB_NODE_FN (ip4_rewrite_bcast_node) (vlib_main_t * vm,
2903                                        vlib_node_runtime_t * node,
2904                                        vlib_frame_t * frame)
2905 {
2906   if (adj_are_counters_enabled ())
2907     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2908   else
2909     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2910 }
2911
2912 VLIB_NODE_FN (ip4_midchain_node) (vlib_main_t * vm,
2913                                   vlib_node_runtime_t * node,
2914                                   vlib_frame_t * frame)
2915 {
2916   if (adj_are_counters_enabled ())
2917     return ip4_rewrite_inline (vm, node, frame, 1, 1, 0);
2918   else
2919     return ip4_rewrite_inline (vm, node, frame, 0, 1, 0);
2920 }
2921
2922 VLIB_NODE_FN (ip4_rewrite_mcast_node) (vlib_main_t * vm,
2923                                        vlib_node_runtime_t * node,
2924                                        vlib_frame_t * frame)
2925 {
2926   if (adj_are_counters_enabled ())
2927     return ip4_rewrite_inline (vm, node, frame, 1, 0, 1);
2928   else
2929     return ip4_rewrite_inline (vm, node, frame, 0, 0, 1);
2930 }
2931
2932 VLIB_NODE_FN (ip4_mcast_midchain_node) (vlib_main_t * vm,
2933                                         vlib_node_runtime_t * node,
2934                                         vlib_frame_t * frame)
2935 {
2936   if (adj_are_counters_enabled ())
2937     return ip4_rewrite_inline (vm, node, frame, 1, 1, 1);
2938   else
2939     return ip4_rewrite_inline (vm, node, frame, 0, 1, 1);
2940 }
2941
2942 /* *INDENT-OFF* */
2943 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2944   .name = "ip4-rewrite",
2945   .vector_size = sizeof (u32),
2946
2947   .format_trace = format_ip4_rewrite_trace,
2948
2949   .n_next_nodes = IP4_REWRITE_N_NEXT,
2950   .next_nodes = {
2951     [IP4_REWRITE_NEXT_DROP] = "ip4-drop",
2952     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2953     [IP4_REWRITE_NEXT_FRAGMENT] = "ip4-frag",
2954   },
2955 };
2956
2957 VLIB_REGISTER_NODE (ip4_rewrite_bcast_node) = {
2958   .name = "ip4-rewrite-bcast",
2959   .vector_size = sizeof (u32),
2960
2961   .format_trace = format_ip4_rewrite_trace,
2962   .sibling_of = "ip4-rewrite",
2963 };
2964
2965 VLIB_REGISTER_NODE (ip4_rewrite_mcast_node) = {
2966   .name = "ip4-rewrite-mcast",
2967   .vector_size = sizeof (u32),
2968
2969   .format_trace = format_ip4_rewrite_trace,
2970   .sibling_of = "ip4-rewrite",
2971 };
2972
2973 VLIB_REGISTER_NODE (ip4_mcast_midchain_node) = {
2974   .name = "ip4-mcast-midchain",
2975   .vector_size = sizeof (u32),
2976
2977   .format_trace = format_ip4_rewrite_trace,
2978   .sibling_of = "ip4-rewrite",
2979 };
2980
2981 VLIB_REGISTER_NODE (ip4_midchain_node) = {
2982   .name = "ip4-midchain",
2983   .vector_size = sizeof (u32),
2984   .format_trace = format_ip4_forward_next_trace,
2985   .sibling_of =  "ip4-rewrite",
2986 };
2987 /* *INDENT-ON* */
2988
2989 static int
2990 ip4_lookup_validate (ip4_address_t * a, u32 fib_index0)
2991 {
2992   ip4_fib_mtrie_t *mtrie0;
2993   ip4_fib_mtrie_leaf_t leaf0;
2994   u32 lbi0;
2995
2996   mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
2997
2998   leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, a);
2999   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
3000   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);
3001
3002   lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
3003
3004   return lbi0 == ip4_fib_table_lookup_lb (ip4_fib_get (fib_index0), a);
3005 }
3006
3007 static clib_error_t *
3008 test_lookup_command_fn (vlib_main_t * vm,
3009                         unformat_input_t * input, vlib_cli_command_t * cmd)
3010 {
3011   ip4_fib_t *fib;
3012   u32 table_id = 0;
3013   f64 count = 1;
3014   u32 n;
3015   int i;
3016   ip4_address_t ip4_base_address;
3017   u64 errors = 0;
3018
3019   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3020     {
3021       if (unformat (input, "table %d", &table_id))
3022         {
3023           /* Make sure the entry exists. */
3024           fib = ip4_fib_get (table_id);
3025           if ((fib) && (fib->index != table_id))
3026             return clib_error_return (0, "<fib-index> %d does not exist",
3027                                       table_id);
3028         }
3029       else if (unformat (input, "count %f", &count))
3030         ;
3031
3032       else if (unformat (input, "%U",
3033                          unformat_ip4_address, &ip4_base_address))
3034         ;
3035       else
3036         return clib_error_return (0, "unknown input `%U'",
3037                                   format_unformat_error, input);
3038     }
3039
3040   n = count;
3041
3042   for (i = 0; i < n; i++)
3043     {
3044       if (!ip4_lookup_validate (&ip4_base_address, table_id))
3045         errors++;
3046
3047       ip4_base_address.as_u32 =
3048         clib_host_to_net_u32 (1 +
3049                               clib_net_to_host_u32 (ip4_base_address.as_u32));
3050     }
3051
3052   if (errors)
3053     vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
3054   else
3055     vlib_cli_output (vm, "No errors in %d lookups\n", n);
3056
3057   return 0;
3058 }
3059
3060 /*?
3061  * Perform a lookup of an IPv4 Address (or range of addresses) in the
3062  * given FIB table to determine if there is a conflict with the
3063  * adjacency table. The fib-id can be determined by using the
3064  * '<em>show ip fib</em>' command. If fib-id is not entered, default value
3065  * of 0 is used.
3066  *
3067  * @todo This command uses fib-id, other commands use table-id (not
3068  * just a name, they are different indexes). Would like to change this
3069  * to table-id for consistency.
3070  *
3071  * @cliexpar
3072  * Example of how to run the test lookup command:
3073  * @cliexstart{test lookup 172.16.1.1 table 1 count 2}
3074  * No errors in 2 lookups
3075  * @cliexend
3076 ?*/
3077 /* *INDENT-OFF* */
3078 VLIB_CLI_COMMAND (lookup_test_command, static) =
3079 {
3080   .path = "test lookup",
3081   .short_help = "test lookup <ipv4-addr> [table <fib-id>] [count <nn>]",
3082   .function = test_lookup_command_fn,
3083 };
3084 /* *INDENT-ON* */
3085
3086 #ifndef CLIB_MARCH_VARIANT
3087 int
3088 vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
3089 {
3090   u32 fib_index;
3091
3092   fib_index = fib_table_find (FIB_PROTOCOL_IP4, table_id);
3093
3094   if (~0 == fib_index)
3095     return VNET_API_ERROR_NO_SUCH_FIB;
3096
3097   fib_table_set_flow_hash_config (fib_index, FIB_PROTOCOL_IP4,
3098                                   flow_hash_config);
3099
3100   return 0;
3101 }
3102 #endif
3103
3104 static clib_error_t *
3105 set_ip_flow_hash_command_fn (vlib_main_t * vm,
3106                              unformat_input_t * input,
3107                              vlib_cli_command_t * cmd)
3108 {
3109   int matched = 0;
3110   u32 table_id = 0;
3111   u32 flow_hash_config = 0;
3112   int rv;
3113
3114   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3115     {
3116       if (unformat (input, "table %d", &table_id))
3117         matched = 1;
3118 #define _(a,v) \
3119     else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
3120       foreach_flow_hash_bit
3121 #undef _
3122         else
3123         break;
3124     }
3125
3126   if (matched == 0)
3127     return clib_error_return (0, "unknown input `%U'",
3128                               format_unformat_error, input);
3129
3130   rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
3131   switch (rv)
3132     {
3133     case 0:
3134       break;
3135
3136     case VNET_API_ERROR_NO_SUCH_FIB:
3137       return clib_error_return (0, "no such FIB table %d", table_id);
3138
3139     default:
3140       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
3141       break;
3142     }
3143
3144   return 0;
3145 }
3146
3147 /*?
3148  * Configure the set of IPv4 fields used by the flow hash.
3149  *
3150  * @cliexpar
3151  * Example of how to set the flow hash on a given table:
3152  * @cliexcmd{set ip flow-hash table 7 dst sport dport proto}
3153  * Example of how to display the configured flow hash:
3154  * @cliexstart{show ip fib}
3155  * ipv4-VRF:0, fib_index 0, flow hash: src dst sport dport proto
3156  * 0.0.0.0/0
3157  *   unicast-ip4-chain
3158  *   [@0]: dpo-load-balance: [index:0 buckets:1 uRPF:0 to:[0:0]]
3159  *     [0] [@0]: dpo-drop ip6
3160  * 0.0.0.0/32
3161  *   unicast-ip4-chain
3162  *   [@0]: dpo-load-balance: [index:1 buckets:1 uRPF:1 to:[0:0]]
3163  *     [0] [@0]: dpo-drop ip6
3164  * 224.0.0.0/8
3165  *   unicast-ip4-chain
3166  *   [@0]: dpo-load-balance: [index:3 buckets:1 uRPF:3 to:[0:0]]
3167  *     [0] [@0]: dpo-drop ip6
3168  * 6.0.1.2/32
3169  *   unicast-ip4-chain
3170  *   [@0]: dpo-load-balance: [index:30 buckets:1 uRPF:29 to:[0:0]]
3171  *     [0] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3172  * 7.0.0.1/32
3173  *   unicast-ip4-chain
3174  *   [@0]: dpo-load-balance: [index:31 buckets:4 uRPF:30 to:[0:0]]
3175  *     [0] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3176  *     [1] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3177  *     [2] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3178  *     [3] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3179  * 240.0.0.0/8
3180  *   unicast-ip4-chain
3181  *   [@0]: dpo-load-balance: [index:2 buckets:1 uRPF:2 to:[0:0]]
3182  *     [0] [@0]: dpo-drop ip6
3183  * 255.255.255.255/32
3184  *   unicast-ip4-chain
3185  *   [@0]: dpo-load-balance: [index:4 buckets:1 uRPF:4 to:[0:0]]
3186  *     [0] [@0]: dpo-drop ip6
3187  * ipv4-VRF:7, fib_index 1, flow hash: dst sport dport proto
3188  * 0.0.0.0/0
3189  *   unicast-ip4-chain
3190  *   [@0]: dpo-load-balance: [index:12 buckets:1 uRPF:11 to:[0:0]]
3191  *     [0] [@0]: dpo-drop ip6
3192  * 0.0.0.0/32
3193  *   unicast-ip4-chain
3194  *   [@0]: dpo-load-balance: [index:13 buckets:1 uRPF:12 to:[0:0]]
3195  *     [0] [@0]: dpo-drop ip6
3196  * 172.16.1.0/24
3197  *   unicast-ip4-chain
3198  *   [@0]: dpo-load-balance: [index:17 buckets:1 uRPF:16 to:[0:0]]
3199  *     [0] [@4]: ipv4-glean: af_packet0
3200  * 172.16.1.1/32
3201  *   unicast-ip4-chain
3202  *   [@0]: dpo-load-balance: [index:18 buckets:1 uRPF:17 to:[1:84]]
3203  *     [0] [@2]: dpo-receive: 172.16.1.1 on af_packet0
3204  * 172.16.1.2/32
3205  *   unicast-ip4-chain
3206  *   [@0]: dpo-load-balance: [index:21 buckets:1 uRPF:20 to:[0:0]]
3207  *     [0] [@5]: ipv4 via 172.16.1.2 af_packet0: IP4: 02:fe:9e:70:7a:2b -> 26:a5:f6:9c:3a:36
3208  * 172.16.2.0/24
3209  *   unicast-ip4-chain
3210  *   [@0]: dpo-load-balance: [index:19 buckets:1 uRPF:18 to:[0:0]]
3211  *     [0] [@4]: ipv4-glean: af_packet1
3212  * 172.16.2.1/32
3213  *   unicast-ip4-chain
3214  *   [@0]: dpo-load-balance: [index:20 buckets:1 uRPF:19 to:[0:0]]
3215  *     [0] [@2]: dpo-receive: 172.16.2.1 on af_packet1
3216  * 224.0.0.0/8
3217  *   unicast-ip4-chain
3218  *   [@0]: dpo-load-balance: [index:15 buckets:1 uRPF:14 to:[0:0]]
3219  *     [0] [@0]: dpo-drop ip6
3220  * 240.0.0.0/8
3221  *   unicast-ip4-chain
3222  *   [@0]: dpo-load-balance: [index:14 buckets:1 uRPF:13 to:[0:0]]
3223  *     [0] [@0]: dpo-drop ip6
3224  * 255.255.255.255/32
3225  *   unicast-ip4-chain
3226  *   [@0]: dpo-load-balance: [index:16 buckets:1 uRPF:15 to:[0:0]]
3227  *     [0] [@0]: dpo-drop ip6
3228  * @cliexend
3229 ?*/
3230 /* *INDENT-OFF* */
3231 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) =
3232 {
3233   .path = "set ip flow-hash",
3234   .short_help =
3235   "set ip flow-hash table <table-id> [src] [dst] [sport] [dport] [proto] [reverse]",
3236   .function = set_ip_flow_hash_command_fn,
3237 };
3238 /* *INDENT-ON* */
3239
3240 #ifndef CLIB_MARCH_VARIANT
3241 int
3242 vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
3243                              u32 table_index)
3244 {
3245   vnet_main_t *vnm = vnet_get_main ();
3246   vnet_interface_main_t *im = &vnm->interface_main;
3247   ip4_main_t *ipm = &ip4_main;
3248   ip_lookup_main_t *lm = &ipm->lookup_main;
3249   vnet_classify_main_t *cm = &vnet_classify_main;
3250   ip4_address_t *if_addr;
3251
3252   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
3253     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
3254
3255   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
3256     return VNET_API_ERROR_NO_SUCH_ENTRY;
3257
3258   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
3259   lm->classify_table_index_by_sw_if_index[sw_if_index] = table_index;
3260
3261   if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);
3262
3263   if (NULL != if_addr)
3264     {
3265       fib_prefix_t pfx = {
3266         .fp_len = 32,
3267         .fp_proto = FIB_PROTOCOL_IP4,
3268         .fp_addr.ip4 = *if_addr,
3269       };
3270       u32 fib_index;
3271
3272       fib_index = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4,
3273                                                        sw_if_index);
3274
3275
3276       if (table_index != (u32) ~ 0)
3277         {
3278           dpo_id_t dpo = DPO_INVALID;
3279
3280           dpo_set (&dpo,
3281                    DPO_CLASSIFY,
3282                    DPO_PROTO_IP4,
3283                    classify_dpo_create (DPO_PROTO_IP4, table_index));
3284
3285           fib_table_entry_special_dpo_add (fib_index,
3286                                            &pfx,
3287                                            FIB_SOURCE_CLASSIFY,
3288                                            FIB_ENTRY_FLAG_NONE, &dpo);
3289           dpo_reset (&dpo);
3290         }
3291       else
3292         {
3293           fib_table_entry_special_remove (fib_index,
3294                                           &pfx, FIB_SOURCE_CLASSIFY);
3295         }
3296     }
3297
3298   return 0;
3299 }
3300 #endif
3301
3302 static clib_error_t *
3303 set_ip_classify_command_fn (vlib_main_t * vm,
3304                             unformat_input_t * input,
3305                             vlib_cli_command_t * cmd)
3306 {
3307   u32 table_index = ~0;
3308   int table_index_set = 0;
3309   u32 sw_if_index = ~0;
3310   int rv;
3311
3312   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3313     {
3314       if (unformat (input, "table-index %d", &table_index))
3315         table_index_set = 1;
3316       else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
3317                          vnet_get_main (), &sw_if_index))
3318         ;
3319       else
3320         break;
3321     }
3322
3323   if (table_index_set == 0)
3324     return clib_error_return (0, "classify table-index must be specified");
3325
3326   if (sw_if_index == ~0)
3327     return clib_error_return (0, "interface / subif must be specified");
3328
3329   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
3330
3331   switch (rv)
3332     {
3333     case 0:
3334       break;
3335
3336     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
3337       return clib_error_return (0, "No such interface");
3338
3339     case VNET_API_ERROR_NO_SUCH_ENTRY:
3340       return clib_error_return (0, "No such classifier table");
3341     }
3342   return 0;
3343 }
3344
3345 /*?
3346  * Assign a classification table to an interface. The classification
3347  * table is created using the '<em>classify table</em>' and '<em>classify session</em>'
3348  * commands. Once the table is created, use this command to filter packets
3349  * on an interface.
3350  *
3351  * @cliexpar
3352  * Example of how to assign a classification table to an interface:
3353  * @cliexcmd{set ip classify intfc GigabitEthernet2/0/0 table-index 1}
3354 ?*/
3355 /* *INDENT-OFF* */
3356 VLIB_CLI_COMMAND (set_ip_classify_command, static) =
3357 {
3358     .path = "set ip classify",
3359     .short_help =
3360     "set ip classify intfc <interface> table-index <classify-idx>",
3361     .function = set_ip_classify_command_fn,
3362 };
3363 /* *INDENT-ON* */
3364
3365 static clib_error_t *
3366 ip4_config (vlib_main_t * vm, unformat_input_t * input)
3367 {
3368   ip4_main_t *im = &ip4_main;
3369   uword heapsize = 0;
3370
3371   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
3372     {
3373       if (unformat (input, "heap-size %U", unformat_memory_size, &heapsize))
3374         ;
3375       else
3376         return clib_error_return (0,
3377                                   "invalid heap-size parameter `%U'",
3378                                   format_unformat_error, input);
3379     }
3380
3381   im->mtrie_heap_size = heapsize;
3382
3383   return 0;
3384 }
3385
3386 VLIB_EARLY_CONFIG_FUNCTION (ip4_config, "ip");
3387
3388 /*
3389  * fd.io coding-style-patch-verification: ON
3390  *
3391  * Local Variables:
3392  * eval: (c-set-style "gnu")
3393  * End:
3394  */