vxlan: improve to use the hardware offload
[vpp.git] / src / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ip/ip_frag.h>
43 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
44 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
45 #include <vnet/ppp/ppp.h>
46 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
47 #include <vnet/api_errno.h>     /* for API error numbers */
48 #include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
49 #include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
50 #include <vnet/fib/fib_urpf_list.h>     /* for FIB uRPF check */
51 #include <vnet/fib/ip4_fib.h>
52 #include <vnet/mfib/ip4_mfib.h>
53 #include <vnet/dpo/load_balance.h>
54 #include <vnet/dpo/load_balance_map.h>
55 #include <vnet/dpo/classify_dpo.h>
56 #include <vnet/mfib/mfib_table.h>       /* for mFIB table and entry creation */
57 #include <vnet/adj/adj_dp.h>
58 #include <vnet/pg/pg.h>
59
60 #include <vnet/ip/ip4_forward.h>
61 #include <vnet/interface_output.h>
62 #include <vnet/classify/vnet_classify.h>
63
64 /** @brief IPv4 lookup node.
65     @node ip4-lookup
66
67     This is the main IPv4 lookup dispatch node.
68
69     @param vm vlib_main_t corresponding to the current thread
70     @param node vlib_node_runtime_t
71     @param frame vlib_frame_t whose contents should be dispatched
72
73     @par Graph mechanics: buffer metadata, next index usage
74
75     @em Uses:
76     - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
77         - Indicates the @c sw_if_index value of the interface that the
78           packet was received on.
79     - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
80         - When the value is @c ~0 then the node performs a longest prefix
81           match (LPM) for the packet destination address in the FIB attached
82           to the receive interface.
83         - Otherwise perform LPM for the packet destination address in the
84           indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
85           value (0, 1, ...) and not a VRF id.
86
87     @em Sets:
88     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
89         - The lookup result adjacency index.
90
91     <em>Next Index:</em>
92     - Dispatches the packet to the node index found in
93       ip_adjacency_t @c adj->lookup_next_index
94       (where @c adj is the lookup result adjacency).
95 */
96 VLIB_NODE_FN (ip4_lookup_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
97                                 vlib_frame_t * frame)
98 {
99   return ip4_lookup_inline (vm, node, frame);
100 }
101
102 static u8 *format_ip4_lookup_trace (u8 * s, va_list * args);
103
104 /* *INDENT-OFF* */
105 VLIB_REGISTER_NODE (ip4_lookup_node) =
106 {
107   .name = "ip4-lookup",
108   .vector_size = sizeof (u32),
109   .format_trace = format_ip4_lookup_trace,
110   .n_next_nodes = IP_LOOKUP_N_NEXT,
111   .next_nodes = IP4_LOOKUP_NEXT_NODES,
112 };
113 /* *INDENT-ON* */
114
115 VLIB_NODE_FN (ip4_load_balance_node) (vlib_main_t * vm,
116                                       vlib_node_runtime_t * node,
117                                       vlib_frame_t * frame)
118 {
119   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
120   u32 n_left, *from;
121   u32 thread_index = vm->thread_index;
122   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
123   u16 nexts[VLIB_FRAME_SIZE], *next;
124
125   from = vlib_frame_vector_args (frame);
126   n_left = frame->n_vectors;
127   next = nexts;
128
129   vlib_get_buffers (vm, from, bufs, n_left);
130
131   while (n_left >= 4)
132     {
133       const load_balance_t *lb0, *lb1;
134       const ip4_header_t *ip0, *ip1;
135       u32 lbi0, hc0, lbi1, hc1;
136       const dpo_id_t *dpo0, *dpo1;
137
138       /* Prefetch next iteration. */
139       {
140         vlib_prefetch_buffer_header (b[2], LOAD);
141         vlib_prefetch_buffer_header (b[3], LOAD);
142
143         CLIB_PREFETCH (b[2]->data, sizeof (ip0[0]), LOAD);
144         CLIB_PREFETCH (b[3]->data, sizeof (ip0[0]), LOAD);
145       }
146
147       ip0 = vlib_buffer_get_current (b[0]);
148       ip1 = vlib_buffer_get_current (b[1]);
149       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
150       lbi1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
151
152       lb0 = load_balance_get (lbi0);
153       lb1 = load_balance_get (lbi1);
154
155       /*
156        * this node is for via FIBs we can re-use the hash value from the
157        * to node if present.
158        * We don't want to use the same hash value at each level in the recursion
159        * graph as that would lead to polarisation
160        */
161       hc0 = hc1 = 0;
162
163       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
164         {
165           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
166             {
167               hc0 = vnet_buffer (b[0])->ip.flow_hash =
168                 vnet_buffer (b[0])->ip.flow_hash >> 1;
169             }
170           else
171             {
172               hc0 = vnet_buffer (b[0])->ip.flow_hash =
173                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
174             }
175           dpo0 = load_balance_get_fwd_bucket
176             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
177         }
178       else
179         {
180           dpo0 = load_balance_get_bucket_i (lb0, 0);
181         }
182       if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
183         {
184           if (PREDICT_TRUE (vnet_buffer (b[1])->ip.flow_hash))
185             {
186               hc1 = vnet_buffer (b[1])->ip.flow_hash =
187                 vnet_buffer (b[1])->ip.flow_hash >> 1;
188             }
189           else
190             {
191               hc1 = vnet_buffer (b[1])->ip.flow_hash =
192                 ip4_compute_flow_hash (ip1, lb1->lb_hash_config);
193             }
194           dpo1 = load_balance_get_fwd_bucket
195             (lb1, (hc1 & (lb1->lb_n_buckets_minus_1)));
196         }
197       else
198         {
199           dpo1 = load_balance_get_bucket_i (lb1, 0);
200         }
201
202       next[0] = dpo0->dpoi_next_node;
203       next[1] = dpo1->dpoi_next_node;
204
205       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
206       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
207
208       vlib_increment_combined_counter
209         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
210       vlib_increment_combined_counter
211         (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b[1]));
212
213       b += 2;
214       next += 2;
215       n_left -= 2;
216     }
217
218   while (n_left > 0)
219     {
220       const load_balance_t *lb0;
221       const ip4_header_t *ip0;
222       const dpo_id_t *dpo0;
223       u32 lbi0, hc0;
224
225       ip0 = vlib_buffer_get_current (b[0]);
226       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
227
228       lb0 = load_balance_get (lbi0);
229
230       hc0 = 0;
231       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
232         {
233           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
234             {
235               hc0 = vnet_buffer (b[0])->ip.flow_hash =
236                 vnet_buffer (b[0])->ip.flow_hash >> 1;
237             }
238           else
239             {
240               hc0 = vnet_buffer (b[0])->ip.flow_hash =
241                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
242             }
243           dpo0 = load_balance_get_fwd_bucket
244             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
245         }
246       else
247         {
248           dpo0 = load_balance_get_bucket_i (lb0, 0);
249         }
250
251       next[0] = dpo0->dpoi_next_node;
252       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
253
254       vlib_increment_combined_counter
255         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
256
257       b += 1;
258       next += 1;
259       n_left -= 1;
260     }
261
262   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
263   if (node->flags & VLIB_NODE_FLAG_TRACE)
264     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
265
266   return frame->n_vectors;
267 }
268
269 /* *INDENT-OFF* */
270 VLIB_REGISTER_NODE (ip4_load_balance_node) =
271 {
272   .name = "ip4-load-balance",
273   .vector_size = sizeof (u32),
274   .sibling_of = "ip4-lookup",
275   .format_trace = format_ip4_lookup_trace,
276 };
277 /* *INDENT-ON* */
278
279 #ifndef CLIB_MARCH_VARIANT
280 /* get first interface address */
281 ip4_address_t *
282 ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
283                              ip_interface_address_t ** result_ia)
284 {
285   ip_lookup_main_t *lm = &im->lookup_main;
286   ip_interface_address_t *ia = 0;
287   ip4_address_t *result = 0;
288
289   /* *INDENT-OFF* */
290   foreach_ip_interface_address
291     (lm, ia, sw_if_index,
292      1 /* honor unnumbered */ ,
293      ({
294        ip4_address_t * a =
295          ip_interface_address_get_address (lm, ia);
296        result = a;
297        break;
298      }));
299   /* *INDENT-OFF* */
300   if (result_ia)
301     *result_ia = result ? ia : 0;
302   return result;
303 }
304 #endif
305
306 static void
307 ip4_add_subnet_bcast_route (u32 fib_index,
308                             fib_prefix_t *pfx,
309                             u32 sw_if_index)
310 {
311   vnet_sw_interface_flags_t iflags;
312
313   iflags = vnet_sw_interface_get_flags(vnet_get_main(), sw_if_index);
314
315   fib_table_entry_special_remove(fib_index,
316                                  pfx,
317                                  FIB_SOURCE_INTERFACE);
318
319   if (iflags & VNET_SW_INTERFACE_FLAG_DIRECTED_BCAST)
320     {
321       fib_table_entry_update_one_path (fib_index, pfx,
322                                        FIB_SOURCE_INTERFACE,
323                                        FIB_ENTRY_FLAG_NONE,
324                                        DPO_PROTO_IP4,
325                                        /* No next-hop address */
326                                        &ADJ_BCAST_ADDR,
327                                        sw_if_index,
328                                        // invalid FIB index
329                                        ~0,
330                                        1,
331                                        // no out-label stack
332                                        NULL,
333                                        FIB_ROUTE_PATH_FLAG_NONE);
334     }
335   else
336     {
337         fib_table_entry_special_add(fib_index,
338                                     pfx,
339                                     FIB_SOURCE_INTERFACE,
340                                     (FIB_ENTRY_FLAG_DROP |
341                                      FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
342     }
343 }
344
345 static void
346 ip4_add_interface_prefix_routes (ip4_main_t *im,
347                                  u32 sw_if_index,
348                                  u32 fib_index,
349                                  ip_interface_address_t * a)
350 {
351   ip_lookup_main_t *lm = &im->lookup_main;
352   ip_interface_prefix_t *if_prefix;
353   ip4_address_t *address = ip_interface_address_get_address (lm, a);
354
355   ip_interface_prefix_key_t key = {
356     .prefix = {
357       .fp_len = a->address_length,
358       .fp_proto = FIB_PROTOCOL_IP4,
359       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[a->address_length],
360     },
361     .sw_if_index = sw_if_index,
362   };
363
364   fib_prefix_t pfx_special = {
365     .fp_proto = FIB_PROTOCOL_IP4,
366   };
367
368   /* If prefix already set on interface, just increment ref count & return */
369   if_prefix = ip_get_interface_prefix (lm, &key);
370   if (if_prefix)
371     {
372       if_prefix->ref_count += 1;
373       return;
374     }
375
376   /* New prefix - allocate a pool entry, initialize it, add to the hash */
377   pool_get (lm->if_prefix_pool, if_prefix);
378   if_prefix->ref_count = 1;
379   if_prefix->src_ia_index = a - lm->if_address_pool;
380   clib_memcpy (&if_prefix->key, &key, sizeof (key));
381   mhash_set (&lm->prefix_to_if_prefix_index, &key,
382              if_prefix - lm->if_prefix_pool, 0 /* old value */);
383
384   pfx_special.fp_len = a->address_length;
385   pfx_special.fp_addr.ip4.as_u32 = address->as_u32;
386
387   /* set the glean route for the prefix */
388   fib_table_entry_update_one_path (fib_index, &pfx_special,
389                                    FIB_SOURCE_INTERFACE,
390                                    (FIB_ENTRY_FLAG_CONNECTED |
391                                     FIB_ENTRY_FLAG_ATTACHED),
392                                    DPO_PROTO_IP4,
393                                    /* No next-hop address */
394                                    NULL,
395                                    sw_if_index,
396                                    /* invalid FIB index */
397                                    ~0,
398                                    1,
399                                    /* no out-label stack */
400                                    NULL,
401                                    FIB_ROUTE_PATH_FLAG_NONE);
402
403   /* length <= 30 - add glean, drop first address, maybe drop bcast address */
404   if (a->address_length <= 30)
405     {
406       /* set a drop route for the base address of the prefix */
407       pfx_special.fp_len = 32;
408       pfx_special.fp_addr.ip4.as_u32 =
409         address->as_u32 & im->fib_masks[a->address_length];
410
411       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
412         fib_table_entry_special_add (fib_index, &pfx_special,
413                                      FIB_SOURCE_INTERFACE,
414                                      (FIB_ENTRY_FLAG_DROP |
415                                       FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
416
417       /* set a route for the broadcast address of the prefix */
418       pfx_special.fp_len = 32;
419       pfx_special.fp_addr.ip4.as_u32 =
420         address->as_u32 | ~im->fib_masks[a->address_length];
421       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
422         ip4_add_subnet_bcast_route (fib_index, &pfx_special, sw_if_index);
423
424
425     }
426   /* length == 31 - add an attached route for the other address */
427   else if (a->address_length == 31)
428     {
429       pfx_special.fp_len = 32;
430       pfx_special.fp_addr.ip4.as_u32 =
431         address->as_u32 ^ clib_host_to_net_u32(1);
432
433       fib_table_entry_update_one_path (fib_index, &pfx_special,
434                                        FIB_SOURCE_INTERFACE,
435                                        (FIB_ENTRY_FLAG_ATTACHED),
436                                        DPO_PROTO_IP4,
437                                        &pfx_special.fp_addr,
438                                        sw_if_index,
439                                        /* invalid FIB index */
440                                        ~0,
441                                        1,
442                                        NULL,
443                                        FIB_ROUTE_PATH_FLAG_NONE);
444     }
445 }
446
447 static void
448 ip4_add_interface_routes (u32 sw_if_index,
449                           ip4_main_t * im, u32 fib_index,
450                           ip_interface_address_t * a)
451 {
452   ip_lookup_main_t *lm = &im->lookup_main;
453   ip4_address_t *address = ip_interface_address_get_address (lm, a);
454   fib_prefix_t pfx = {
455     .fp_len = 32,
456     .fp_proto = FIB_PROTOCOL_IP4,
457     .fp_addr.ip4 = *address,
458   };
459
460   /* set special routes for the prefix if needed */
461   ip4_add_interface_prefix_routes (im, sw_if_index, fib_index, a);
462
463   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
464     {
465       u32 classify_table_index =
466         lm->classify_table_index_by_sw_if_index[sw_if_index];
467       if (classify_table_index != (u32) ~ 0)
468         {
469           dpo_id_t dpo = DPO_INVALID;
470
471           dpo_set (&dpo,
472                    DPO_CLASSIFY,
473                    DPO_PROTO_IP4,
474                    classify_dpo_create (DPO_PROTO_IP4, classify_table_index));
475
476           fib_table_entry_special_dpo_add (fib_index,
477                                            &pfx,
478                                            FIB_SOURCE_CLASSIFY,
479                                            FIB_ENTRY_FLAG_NONE, &dpo);
480           dpo_reset (&dpo);
481         }
482     }
483
484   fib_table_entry_update_one_path (fib_index, &pfx,
485                                    FIB_SOURCE_INTERFACE,
486                                    (FIB_ENTRY_FLAG_CONNECTED |
487                                     FIB_ENTRY_FLAG_LOCAL),
488                                    DPO_PROTO_IP4,
489                                    &pfx.fp_addr,
490                                    sw_if_index,
491                                    // invalid FIB index
492                                    ~0,
493                                    1, NULL,
494                                    FIB_ROUTE_PATH_FLAG_NONE);
495 }
496
497 static void
498 ip4_del_interface_prefix_routes (ip4_main_t * im,
499                                  u32 sw_if_index,
500                                  u32 fib_index,
501                                  ip4_address_t * address,
502                                  u32 address_length)
503 {
504   ip_lookup_main_t *lm = &im->lookup_main;
505   ip_interface_prefix_t *if_prefix;
506
507   ip_interface_prefix_key_t key = {
508     .prefix = {
509       .fp_len = address_length,
510       .fp_proto = FIB_PROTOCOL_IP4,
511       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[address_length],
512     },
513     .sw_if_index = sw_if_index,
514   };
515
516   fib_prefix_t pfx_special = {
517     .fp_len = 32,
518     .fp_proto = FIB_PROTOCOL_IP4,
519   };
520
521   if_prefix = ip_get_interface_prefix (lm, &key);
522   if (!if_prefix)
523     {
524       clib_warning ("Prefix not found while deleting %U",
525                     format_ip4_address_and_length, address, address_length);
526       return;
527     }
528
529   if_prefix->ref_count -= 1;
530
531   /*
532    * Routes need to be adjusted if deleting last intf addr in prefix
533    *
534    * We're done now otherwise
535    */
536   if (if_prefix->ref_count > 0)
537     return;
538
539   /* length <= 30, delete glean route, first address, last address */
540   if (address_length <= 30)
541     {
542       /* Less work to do in FIB if we remove the covered /32s first */
543
544       /* first address in prefix */
545       pfx_special.fp_addr.ip4.as_u32 =
546         address->as_u32 & im->fib_masks[address_length];
547       pfx_special.fp_len = 32;
548
549       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
550         fib_table_entry_special_remove (fib_index,
551                                         &pfx_special,
552                                         FIB_SOURCE_INTERFACE);
553
554       /* prefix broadcast address */
555       pfx_special.fp_addr.ip4.as_u32 =
556         address->as_u32 | ~im->fib_masks[address_length];
557       pfx_special.fp_len = 32;
558
559       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
560         fib_table_entry_special_remove (fib_index,
561                                         &pfx_special,
562                                         FIB_SOURCE_INTERFACE);
563     }
564   else if (address_length == 31)
565     {
566       /* length == 31, delete attached route for the other address */
567       pfx_special.fp_addr.ip4.as_u32 =
568         address->as_u32 ^ clib_host_to_net_u32(1);
569
570       fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
571     }
572
573   /* remove glean route for prefix */
574   pfx_special.fp_addr.ip4 = *address;
575   pfx_special.fp_len = address_length;
576   fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
577
578   mhash_unset (&lm->prefix_to_if_prefix_index, &key, 0 /* old_value */);
579   pool_put (lm->if_prefix_pool, if_prefix);
580 }
581
582 static void
583 ip4_del_interface_routes (u32 sw_if_index,
584                           ip4_main_t * im,
585                           u32 fib_index,
586                           ip4_address_t * address, u32 address_length)
587 {
588   fib_prefix_t pfx = {
589     .fp_len = 32,
590     .fp_proto = FIB_PROTOCOL_IP4,
591     .fp_addr.ip4 = *address,
592   };
593
594   fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
595
596   ip4_del_interface_prefix_routes (im, sw_if_index, fib_index,
597                                    address, address_length);
598 }
599
600 #ifndef CLIB_MARCH_VARIANT
601 void
602 ip4_sw_interface_enable_disable (u32 sw_if_index, u32 is_enable)
603 {
604   ip4_main_t *im = &ip4_main;
605   vnet_main_t *vnm = vnet_get_main ();
606   vnet_hw_interface_t *hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
607
608   vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
609
610   /*
611    * enable/disable only on the 1<->0 transition
612    */
613   if (is_enable)
614     {
615       if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
616         return;
617     }
618   else
619     {
620       ASSERT (im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
621       if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
622         return;
623     }
624   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
625                                !is_enable, 0, 0);
626
627
628   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
629                                sw_if_index, !is_enable, 0, 0);
630
631   if (is_enable)
632     hi->l3_if_count++;
633   else if (hi->l3_if_count)
634     hi->l3_if_count--;
635
636   {
637     ip4_enable_disable_interface_callback_t *cb;
638     vec_foreach (cb, im->enable_disable_interface_callbacks)
639       cb->function (im, cb->function_opaque, sw_if_index, is_enable);
640   }
641 }
642
/**
 * Add or delete an IPv4 address on a software interface.
 *
 * Steps, in order:
 *  1. fail with the error from vnet_sw_interface_supports_addressing()
 *     when it reports one;
 *  2. on add, reject the address (VNET_API_ERROR_ADDRESS_IN_USE) when it
 *     overlaps an address on any interface using the same FIB index.
 *     Two cases are tolerated: a different address of the same length on
 *     the same interface, and overlap with an address flagged stale;
 *  3. on delete, fail with VNET_API_ERROR_ADDRESS_NOT_FOUND_FOR_INTERFACE
 *     when the address is unknown, otherwise remove it;
 *  4. on add of an address that is already known:
 *       - stale, same interface: clear the stale flag and return;
 *       - stale, other interface: delete it there, then add it here;
 *       - not stale: fail with VNET_API_ERROR_DUPLICATE_IF_ADDRESS;
 *  5. enable/disable IPv4 and IPv4 mfib on the interface, add or delete
 *     the interface routes when the interface is admin up, and invoke the
 *     registered add/del address callbacks.
 *
 * @param vm             vlib main; only passed on to the recursive call
 * @param sw_if_index    interface to configure
 * @param address        address to add or delete
 * @param address_length prefix length of the address
 * @param is_del         non-zero to delete, zero to add
 * @return 0 on success, otherwise a clib_error_t describing the failure
 */
static clib_error_t *
ip4_add_del_interface_address_internal (vlib_main_t * vm,
                                        u32 sw_if_index,
                                        ip4_address_t * address,
                                        u32 address_length, u32 is_del)
{
  vnet_main_t *vnm = vnet_get_main ();
  ip4_main_t *im = &ip4_main;
  ip_lookup_main_t *lm = &im->lookup_main;
  clib_error_t *error = 0;
  u32 if_address_index;
  ip4_address_fib_t ip4_af, *addr_fib = 0;

  error = vnet_sw_interface_supports_addressing (vnm, sw_if_index);
  if (error)
    return error;

  /* addr_fib is a one-element vector holding the address and the FIB index
   * of the interface; it is freed at the done label */
  ip4_addr_fib_init (&ip4_af, address,
                     vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
  vec_add1 (addr_fib, ip4_af);

  /*
   * there is no support for adj-fib handling in the presence of overlapping
   * subnets on interfaces. Easy fix - disallow overlapping subnets, like
   * most routers do.
   */
  /* *INDENT-OFF* */
  if (!is_del)
    {
      /* When adding an address check that it does not conflict
         with an existing address on any interface in this table. */
      ip_interface_address_t *ia;
      vnet_sw_interface_t *sif;

      pool_foreach (sif, vnm->interface_main.sw_interfaces)
       {
          /* only interfaces sharing this interface's FIB index matter */
          if (im->fib_index_by_sw_if_index[sw_if_index] ==
              im->fib_index_by_sw_if_index[sif->sw_if_index])
            {
              foreach_ip_interface_address
                (&im->lookup_main, ia, sif->sw_if_index,
                 0 /* honor unnumbered */ ,
                 ({
                   ip4_address_t * x =
                     ip_interface_address_get_address
                     (&im->lookup_main, ia);

                   /* overlap in either direction: the new address falls in
                    * the existing prefix, or the existing address falls in
                    * the new prefix */
                   if (ip4_destination_matches_route
                       (im, address, x, ia->address_length) ||
                       ip4_destination_matches_route (im,
                                                      x,
                                                      address,
                                                      address_length))
                     {
                       /* an intf may have >1 addr from the same prefix */
                       if ((sw_if_index == sif->sw_if_index) &&
                           (ia->address_length == address_length) &&
                           (x->as_u32 != address->as_u32))
                         continue;

                       if (ia->flags & IP_INTERFACE_ADDRESS_FLAG_STALE)
                         /* if the address we're comparing against is stale
                          * then the CP has not added this one back yet, maybe
                          * it never will, so we have to assume it won't and
                          * ignore it. if it does add it back, then it will fail
                          * because this one is now present */
                         continue;

                       /* error if the length or intf was different */
                       vnm->api_errno = VNET_API_ERROR_ADDRESS_IN_USE;

                       error = clib_error_create
                         ("failed to add %U on %U which conflicts with %U for interface %U",
                          format_ip4_address_and_length, address,
                          address_length,
                          format_vnet_sw_if_index_name, vnm,
                          sw_if_index,
                          format_ip4_address_and_length, x,
                          ia->address_length,
                          format_vnet_sw_if_index_name, vnm,
                          sif->sw_if_index);
                       goto done;
                     }
                 }));
            }
      }
    }
  /* *INDENT-ON* */

  /* ~0 means the address is not known yet */
  if_address_index = ip_interface_address_find (lm, addr_fib, address_length);

  if (is_del)
    {
      if (~0 == if_address_index)
        {
          vnm->api_errno = VNET_API_ERROR_ADDRESS_NOT_FOUND_FOR_INTERFACE;
          error = clib_error_create ("%U not found for interface %U",
                                     lm->format_address_and_length,
                                     addr_fib, address_length,
                                     format_vnet_sw_if_index_name, vnm,
                                     sw_if_index);
          goto done;
        }

      error = ip_interface_address_del (lm, vnm, if_address_index, addr_fib,
                                        address_length, sw_if_index);
      if (error)
        goto done;
    }
  else
    {
      if (~0 != if_address_index)
        {
          /* the address already exists somewhere */
          ip_interface_address_t *ia;

          ia = pool_elt_at_index (lm->if_address_pool, if_address_index);

          if (ia->flags & IP_INTERFACE_ADDRESS_FLAG_STALE)
            {
              if (ia->sw_if_index == sw_if_index)
                {
                  /* re-adding an address during the replace action.
                   * consider this the update. clear the flag and
                   * we're done */
                  ia->flags &= ~IP_INTERFACE_ADDRESS_FLAG_STALE;
                  goto done;
                }
              else
                {
                  /* The prefix is moving from one interface to another.
                   * delete the stale and add the new */
                  /* NOTE(review): the value returned by this recursive
                   * delete is not checked - confirm that is intended */
                  ip4_add_del_interface_address_internal (vm,
                                                          ia->sw_if_index,
                                                          address,
                                                          address_length, 1);
                  /* ia referred to the entry that was just deleted */
                  ia = NULL;
                  error = ip_interface_address_add (lm, sw_if_index,
                                                    addr_fib, address_length,
                                                    &if_address_index);
                }
            }
          else
            {
              vnm->api_errno = VNET_API_ERROR_DUPLICATE_IF_ADDRESS;
              error = clib_error_create
                ("Prefix %U already found on interface %U",
                 lm->format_address_and_length, addr_fib, address_length,
                 format_vnet_sw_if_index_name, vnm, ia->sw_if_index);
            }
        }
      else
        error = ip_interface_address_add (lm, sw_if_index,
                                          addr_fib, address_length,
                                          &if_address_index);
    }

  if (error)
    goto done;

  /* each address holds one enable reference on the interface */
  ip4_sw_interface_enable_disable (sw_if_index, !is_del);
  ip4_mfib_interface_enable_disable (sw_if_index, !is_del);

  /* intf addr routes are added/deleted on admin up/down */
  if (vnet_sw_interface_is_admin_up (vnm, sw_if_index))
    {
      if (is_del)
        ip4_del_interface_routes (sw_if_index,
                                  im, ip4_af.fib_index, address,
                                  address_length);
      else
        ip4_add_interface_routes (sw_if_index,
                                  im, ip4_af.fib_index,
                                  pool_elt_at_index
                                  (lm->if_address_pool, if_address_index));
    }

  /* notify everyone registered for address add/del events */
  ip4_add_del_interface_address_callback_t *cb;
  vec_foreach (cb, im->add_del_interface_address_callbacks)
    cb->function (im, cb->function_opaque, sw_if_index,
                  address, address_length, if_address_index, is_del);

done:
  /* common exit: release the temporary vector on every path after setup */
  vec_free (addr_fib);
  return error;
}
828
829 clib_error_t *
830 ip4_add_del_interface_address (vlib_main_t * vm,
831                                u32 sw_if_index,
832                                ip4_address_t * address,
833                                u32 address_length, u32 is_del)
834 {
835   return ip4_add_del_interface_address_internal
836     (vm, sw_if_index, address, address_length, is_del);
837 }
838
/*
 * Walk the addresses configured on sw_if_index and call
 * ip4_add_subnet_bcast_route() for the subnet broadcast address of each
 * prefix that is /30 or shorter.
 *
 * sw_if_index - interface whose addresses are walked.
 * enable      - not referenced anywhere in this function body.
 *               NOTE(review): presumably ip4_add_subnet_bcast_route()
 *               derives the enabled state from the interface itself -
 *               confirm against its definition.
 */
void
ip4_directed_broadcast (u32 sw_if_index, u8 enable)
{
  ip_interface_address_t *ia;
  ip4_main_t *im;

  im = &ip4_main;

  /*
   * when directed broadcast is enabled, the subnet broadcast route will forward
   * packets using an adjacency with a broadcast MAC. otherwise it drops
   */
  /* *INDENT-OFF* */
  foreach_ip_interface_address(&im->lookup_main, ia,
                               sw_if_index, 0,
     ({
       /* prefixes longer than /30 are skipped */
       if (ia->address_length <= 30)
         {
           ip4_address_t *ipa;

           ipa = ip_interface_address_get_address (&im->lookup_main, ia);

           /* host route (/32) for the address with every host bit set,
            * i.e. the interface address OR'ed with the inverted prefix
            * mask */
           fib_prefix_t pfx = {
             .fp_len = 32,
             .fp_proto = FIB_PROTOCOL_IP4,
             .fp_addr = {
               .ip4.as_u32 = (ipa->as_u32 | ~im->fib_masks[ia->address_length]),
             },
           };

           ip4_add_subnet_bcast_route
             (fib_table_get_index_for_sw_if_index(FIB_PROTOCOL_IP4,
                                                  sw_if_index),
              &pfx, sw_if_index);
         }
     }));
  /* *INDENT-ON* */
}
877 #endif
878
879 static clib_error_t *
880 ip4_sw_interface_admin_up_down (vnet_main_t * vnm, u32 sw_if_index, u32 flags)
881 {
882   ip4_main_t *im = &ip4_main;
883   ip_interface_address_t *ia;
884   ip4_address_t *a;
885   u32 is_admin_up, fib_index;
886
887   vec_validate_init_empty (im->
888                            lookup_main.if_address_pool_index_by_sw_if_index,
889                            sw_if_index, ~0);
890
891   is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
892
893   fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
894
895   /* *INDENT-OFF* */
896   foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
897                                 0 /* honor unnumbered */,
898   ({
899     a = ip_interface_address_get_address (&im->lookup_main, ia);
900     if (is_admin_up)
901       ip4_add_interface_routes (sw_if_index,
902                                 im, fib_index,
903                                 ia);
904     else
905       ip4_del_interface_routes (sw_if_index,
906                                 im, fib_index,
907                                 a, ia->address_length);
908   }));
909   /* *INDENT-ON* */
910
911   return 0;
912 }
913
914 VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down);
915
/* Built-in ip4 unicast rx feature path definition */
/*
 * The "ip4-unicast" arc starts at ip4-input / ip4-input-no-checksum and is
 * declared to end at ip4-lookup.  The registrations below declare, via
 * .runs_before, this relative order:
 *   ip4-flow-classify -> ip4-inacl -> ip4-policer-classify ->
 *   ipsec4-input-feature -> vpath-input-ip4 -> ip4-vxlan-bypass -> ip4-lookup
 * with ip4-source-and-port-range-check-rx also placed before
 * ip4-policer-classify and ip4-not-enabled placed before ip4-lookup.
 */
/* *INDENT-OFF* */
VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
{
  .arc_name = "ip4-unicast",
  .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
  .last_in_arc = "ip4-lookup",
  .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
};

/* flow classification, ahead of the input ACL */
VNET_FEATURE_INIT (ip4_flow_classify, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-flow-classify",
  .runs_before = VNET_FEATURES ("ip4-inacl"),
};

/* input ACL, ahead of policer classification */
VNET_FEATURE_INIT (ip4_inacl, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-inacl",
  .runs_before = VNET_FEATURES ("ip4-policer-classify"),
};

/* rx source and port-range check, ahead of policer classification */
VNET_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-source-and-port-range-check-rx",
  .runs_before = VNET_FEATURES ("ip4-policer-classify"),
};

/* policer classification, ahead of the ipsec input feature */
VNET_FEATURE_INIT (ip4_policer_classify, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-policer-classify",
  .runs_before = VNET_FEATURES ("ipsec4-input-feature"),
};

/* ipsec input feature, ahead of vpath */
VNET_FEATURE_INIT (ip4_ipsec, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ipsec4-input-feature",
  .runs_before = VNET_FEATURES ("vpath-input-ip4"),
};

/* vpath input, ahead of the vxlan bypass */
VNET_FEATURE_INIT (ip4_vpath, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "vpath-input-ip4",
  .runs_before = VNET_FEATURES ("ip4-vxlan-bypass"),
};

/* vxlan bypass, ahead of the lookup */
VNET_FEATURE_INIT (ip4_vxlan_bypass, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-vxlan-bypass",
  .runs_before = VNET_FEATURES ("ip4-lookup"),
};

/* "ip4-not-enabled" feature; toggled per interface from
 * ip4_sw_interface_add_del() in this file */
VNET_FEATURE_INIT (ip4_not_enabled, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-not-enabled",
  .runs_before = VNET_FEATURES ("ip4-lookup"),
};

/* the lookup node itself: the arc's last feature */
VNET_FEATURE_INIT (ip4_lookup, static) =
{
  .arc_name = "ip4-unicast",
  .node_name = "ip4-lookup",
  .runs_before = 0,     /* not before any other features */
};
988
/* Built-in ip4 multicast rx feature path definition */
/*
 * The "ip4-multicast" arc starts at ip4-input / ip4-input-no-checksum and
 * is declared to end at ip4-mfib-forward-lookup.  vpath-input-ip4 and
 * ip4-not-enabled are both registered to run before that final lookup.
 */
VNET_FEATURE_ARC_INIT (ip4_multicast, static) =
{
  .arc_name = "ip4-multicast",
  .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
  .last_in_arc = "ip4-mfib-forward-lookup",
  .arc_index_ptr = &ip4_main.lookup_main.mcast_feature_arc_index,
};

/* vpath input on the multicast arc */
VNET_FEATURE_INIT (ip4_vpath_mc, static) =
{
  .arc_name = "ip4-multicast",
  .node_name = "vpath-input-ip4",
  .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
};

/* "ip4-not-enabled" on the multicast arc; toggled per interface from
 * ip4_sw_interface_add_del() in this file */
VNET_FEATURE_INIT (ip4_mc_not_enabled, static) =
{
  .arc_name = "ip4-multicast",
  .node_name = "ip4-not-enabled",
  .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
};

/* multicast forwarding lookup: the arc's last feature */
VNET_FEATURE_INIT (ip4_lookup_mc, static) =
{
  .arc_name = "ip4-multicast",
  .node_name = "ip4-mfib-forward-lookup",
  .runs_before = 0,     /* last feature */
};
1018
/* Source and port-range check ip4 tx feature path definition */
/*
 * The "ip4-output" arc starts at ip4-rewrite / ip4-midchain / ip4-dvr-dpo
 * and is declared to end at interface-output.  Declared relative order:
 *   ip4-source-and-port-range-check-tx -> ip4-outacl ->
 *   ipsec4-output-feature -> interface-output
 */
VNET_FEATURE_ARC_INIT (ip4_output, static) =
{
  .arc_name = "ip4-output",
  .start_nodes = VNET_FEATURES ("ip4-rewrite", "ip4-midchain", "ip4-dvr-dpo"),
  .last_in_arc = "interface-output",
  .arc_index_ptr = &ip4_main.lookup_main.output_feature_arc_index,
};

/* tx source and port-range check, ahead of the output ACL */
VNET_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) =
{
  .arc_name = "ip4-output",
  .node_name = "ip4-source-and-port-range-check-tx",
  .runs_before = VNET_FEATURES ("ip4-outacl"),
};

/* output ACL, ahead of the ipsec output feature */
VNET_FEATURE_INIT (ip4_outacl, static) =
{
  .arc_name = "ip4-output",
  .node_name = "ip4-outacl",
  .runs_before = VNET_FEATURES ("ipsec4-output-feature"),
};

/* ipsec output feature, ahead of interface-output */
VNET_FEATURE_INIT (ip4_ipsec_output, static) =
{
  .arc_name = "ip4-output",
  .node_name = "ipsec4-output-feature",
  .runs_before = VNET_FEATURES ("interface-output"),
};

/* Built-in ip4 tx feature path definition */
VNET_FEATURE_INIT (ip4_interface_output, static) =
{
  .arc_name = "ip4-output",
  .node_name = "interface-output",
  .runs_before = 0,     /* not before any other features */
};
/* *INDENT-ON* */
1057
1058 static clib_error_t *
1059 ip4_sw_interface_add_del (vnet_main_t * vnm, u32 sw_if_index, u32 is_add)
1060 {
1061   ip4_main_t *im = &ip4_main;
1062
1063   vec_validate_init_empty (im->fib_index_by_sw_if_index, sw_if_index, ~0);
1064   vec_validate_init_empty (im->mfib_index_by_sw_if_index, sw_if_index, ~0);
1065
1066   if (is_add)
1067     {
1068       /* Fill in lookup tables with default table (0). */
1069       im->fib_index_by_sw_if_index[sw_if_index] = 0;
1070       im->mfib_index_by_sw_if_index[sw_if_index] = 0;
1071     }
1072   else
1073     {
1074       ip4_main_t *im4 = &ip4_main;
1075       ip_lookup_main_t *lm4 = &im4->lookup_main;
1076       ip_interface_address_t *ia = 0;
1077       ip4_address_t *address;
1078       vlib_main_t *vm = vlib_get_main ();
1079
1080       vnet_sw_interface_update_unnumbered (sw_if_index, ~0, 0);
1081       /* *INDENT-OFF* */
1082       foreach_ip_interface_address (lm4, ia, sw_if_index, 0,
1083       ({
1084         address = ip_interface_address_get_address (lm4, ia);
1085         ip4_add_del_interface_address(vm, sw_if_index, address, ia->address_length, 1);
1086       }));
1087       /* *INDENT-ON* */
1088       ip4_mfib_interface_enable_disable (sw_if_index, 0);
1089     }
1090
1091   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
1092                                is_add, 0, 0);
1093
1094   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
1095                                sw_if_index, is_add, 0, 0);
1096
1097   return /* no error */ 0;
1098 }
1099
1100 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
1101
/* Global IP4 main.  Defined only when CLIB_MARCH_VARIANT is not set, so
 * the object is emitted once rather than in every march-variant build of
 * this file. */
#ifndef CLIB_MARCH_VARIANT
ip4_main_t ip4_main;
#endif /* CLIB_MARCH_VARIANT */
1106
/*
 * Init function for the IPv4 lookup layer.
 *
 * Runs the init functions it depends on, fills the prefix-length mask
 * table, initialises the lookup main, creates unicast and multicast table
 * id 0, hooks the packet-generator edit function for ip4-lookup and
 * builds the ARP request packet template.
 *
 * Returns the error from the first dependency init that fails, otherwise
 * the (null) value left in 'error' by the last successful check.
 */
static clib_error_t *
ip4_lookup_init (vlib_main_t * vm)
{
  ip4_main_t *im = &ip4_main;
  clib_error_t *error;
  uword i;

  /* dependencies must be initialised first; bail out on the first error */
  if ((error = vlib_call_init_function (vm, vnet_feature_init)))
    return error;
  if ((error = vlib_call_init_function (vm, ip4_mtrie_module_init)))
    return (error);
  if ((error = vlib_call_init_function (vm, fib_module_init)))
    return error;
  if ((error = vlib_call_init_function (vm, mfib_module_init)))
    return error;

  /* fib_masks[i] is the mask for prefix length i, stored in network byte
   * order.  NOTE(review): presumably pow2_mask (i) yields i low-order one
   * bits, so the shift leaves the top i bits set - confirm. */
  for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
    {
      u32 m;

      if (i < 32)
        m = pow2_mask (i) << (32 - i);
      else
        m = ~0;
      im->fib_masks[i] = clib_host_to_net_u32 (m);
    }

  ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);

  /* Create FIB with index 0 and table id of 0. */
  fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
                                     FIB_SOURCE_DEFAULT_ROUTE);
  mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
                                      MFIB_SOURCE_DEFAULT_ROUTE);

  /* packet generator: parse ip4 headers for streams aimed at ip4-lookup */
  {
    pg_node_t *pn;
    pn = pg_get_node (ip4_lookup_node.index);
    pn->unformat_edit = unformat_pg_ip4_header;
  }

  /* build the ARP request template: ethernet/IPv4, 6-byte L2 and 4-byte
   * L3 addresses, opcode "request"; all other fields stay zero */
  {
    ethernet_arp_header_t h;

    clib_memset (&h, 0, sizeof (h));

    /* _16 stores a 16-bit field in network byte order, _8 stores a byte */
#define _16(f,v) h.f = clib_host_to_net_u16 (v);
#define _8(f,v) h.f = v;
    _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
    _16 (l3_type, ETHERNET_TYPE_IP4);
    _8 (n_l2_address_bytes, 6);
    _8 (n_l3_address_bytes, 4);
    _16 (opcode, ETHERNET_ARP_OPCODE_request);
#undef _16
#undef _8

    vlib_packet_template_init (vm, &im->ip4_arp_request_packet_template,
                               /* data */ &h,
                               sizeof (h),
                               /* alloc chunk size */ 8,
                               "ip4 arp");
  }

  /* every failing init above returned early, so error is null here */
  return error;
}

VLIB_INIT_FUNCTION (ip4_lookup_init);
1174
/* Per-packet trace record filled in by ip4_forward_next_trace(). */
typedef struct
{
  /* Adjacency taken. */
  /* copied from the buffer's ip.adj_index[] slot selected by the caller */
  u32 dpo_index;
  /* copied from the buffer's ip.flow_hash */
  u32 flow_hash;
  /* the buffer's sw_if_index[VLIB_TX] when that is not ~0, otherwise the
   * fib index bound to the RX interface */
  u32 fib_index;

  /* Packet data, possibly *after* rewrite. */
  /* sized as 64 bytes minus one u32 */
  u8 packet_data[64 - 1 * sizeof (u32)];
}
ip4_forward_next_trace_t;
1186
1187 #ifndef CLIB_MARCH_VARIANT
1188 u8 *
1189 format_ip4_forward_next_trace (u8 * s, va_list * args)
1190 {
1191   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1192   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1193   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1194   u32 indent = format_get_indent (s);
1195   s = format (s, "%U%U",
1196               format_white_space, indent,
1197               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1198   return s;
1199 }
1200 #endif
1201
1202 static u8 *
1203 format_ip4_lookup_trace (u8 * s, va_list * args)
1204 {
1205   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1206   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1207   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1208   u32 indent = format_get_indent (s);
1209
1210   s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
1211               t->fib_index, t->dpo_index, t->flow_hash);
1212   s = format (s, "\n%U%U",
1213               format_white_space, indent,
1214               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1215   return s;
1216 }
1217
1218 static u8 *
1219 format_ip4_rewrite_trace (u8 * s, va_list * args)
1220 {
1221   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1222   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1223   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1224   u32 indent = format_get_indent (s);
1225
1226   s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
1227               t->fib_index, t->dpo_index, format_ip_adjacency,
1228               t->dpo_index, FORMAT_IP_ADJACENCY_NONE, t->flow_hash);
1229   s = format (s, "\n%U%U",
1230               format_white_space, indent,
1231               format_ip_adjacency_packet_data,
1232               t->packet_data, sizeof (t->packet_data));
1233   return s;
1234 }
1235
1236 #ifndef CLIB_MARCH_VARIANT
1237 /* Common trace function for all ip4-forward next nodes. */
1238 void
1239 ip4_forward_next_trace (vlib_main_t * vm,
1240                         vlib_node_runtime_t * node,
1241                         vlib_frame_t * frame, vlib_rx_or_tx_t which_adj_index)
1242 {
1243   u32 *from, n_left;
1244   ip4_main_t *im = &ip4_main;
1245
1246   n_left = frame->n_vectors;
1247   from = vlib_frame_vector_args (frame);
1248
1249   while (n_left >= 4)
1250     {
1251       u32 bi0, bi1;
1252       vlib_buffer_t *b0, *b1;
1253       ip4_forward_next_trace_t *t0, *t1;
1254
1255       /* Prefetch next iteration. */
1256       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1257       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1258
1259       bi0 = from[0];
1260       bi1 = from[1];
1261
1262       b0 = vlib_get_buffer (vm, bi0);
1263       b1 = vlib_get_buffer (vm, bi1);
1264
1265       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1266         {
1267           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1268           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1269           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1270           t0->fib_index =
1271             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1272              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1273             vec_elt (im->fib_index_by_sw_if_index,
1274                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1275
1276           clib_memcpy_fast (t0->packet_data,
1277                             vlib_buffer_get_current (b0),
1278                             sizeof (t0->packet_data));
1279         }
1280       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1281         {
1282           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1283           t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1284           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1285           t1->fib_index =
1286             (vnet_buffer (b1)->sw_if_index[VLIB_TX] !=
1287              (u32) ~ 0) ? vnet_buffer (b1)->sw_if_index[VLIB_TX] :
1288             vec_elt (im->fib_index_by_sw_if_index,
1289                      vnet_buffer (b1)->sw_if_index[VLIB_RX]);
1290           clib_memcpy_fast (t1->packet_data, vlib_buffer_get_current (b1),
1291                             sizeof (t1->packet_data));
1292         }
1293       from += 2;
1294       n_left -= 2;
1295     }
1296
1297   while (n_left >= 1)
1298     {
1299       u32 bi0;
1300       vlib_buffer_t *b0;
1301       ip4_forward_next_trace_t *t0;
1302
1303       bi0 = from[0];
1304
1305       b0 = vlib_get_buffer (vm, bi0);
1306
1307       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1308         {
1309           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1310           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1311           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1312           t0->fib_index =
1313             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1314              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1315             vec_elt (im->fib_index_by_sw_if_index,
1316                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1317           clib_memcpy_fast (t0->packet_data, vlib_buffer_get_current (b0),
1318                             sizeof (t0->packet_data));
1319         }
1320       from += 1;
1321       n_left -= 1;
1322     }
1323 }
1324
/* Compute TCP/UDP/ICMP4 checksum in software. */
/*
 * vm  - vlib main, passed through to ip_calculate_l4_checksum().
 * p0  - buffer holding the packet, passed through likewise.
 * ip0 - IPv4 header of the packet.
 *
 * Seeds the sum with the payload length, protocol and the source and
 * destination addresses, then returns whatever
 * ip_calculate_l4_checksum() yields for the payload.
 */
u16
ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
                              ip4_header_t * ip0)
{
  ip_csum_t sum0;
  u32 ip_header_length, payload_length_host_byte_order;

  /* Initialize checksum with ip header. */
  /* payload length = total length from the header minus the header bytes
   * (ip4_header_bytes accounts for the actual header size) */
  ip_header_length = ip4_header_bytes (ip0);
  payload_length_host_byte_order =
    clib_net_to_host_u16 (ip0->length) - ip_header_length;
  /* length and protocol combined in one word, converted to network order */
  sum0 =
    clib_host_to_net_u32 (payload_length_host_byte_order +
                          (ip0->protocol << 16));

  /* add the addresses: two u32 reads when uword is 32 bits wide,
   * otherwise a single u64 read starting at the source address */
  if (BITS (uword) == 32)
    {
      sum0 =
        ip_csum_with_carry (sum0,
                            clib_mem_unaligned (&ip0->src_address, u32));
      sum0 =
        ip_csum_with_carry (sum0,
                            clib_mem_unaligned (&ip0->dst_address, u32));
    }
  else
    sum0 =
      ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));

  return ip_calculate_l4_checksum (vm, p0, sum0,
                                   payload_length_host_byte_order, (u8 *) ip0,
                                   ip_header_length, NULL);
}
1358
1359 u32
1360 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1361 {
1362   ip4_header_t *ip0 = vlib_buffer_get_current (p0);
1363   udp_header_t *udp0;
1364   u16 sum16;
1365
1366   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1367           || ip0->protocol == IP_PROTOCOL_UDP);
1368
1369   udp0 = (void *) (ip0 + 1);
1370   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1371     {
1372       p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1373                     | VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
1374       return p0->flags;
1375     }
1376
1377   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1378
1379   p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1380                 | ((sum16 == 0) << VNET_BUFFER_F_LOG2_L4_CHECKSUM_CORRECT));
1381
1382   return p0->flags;
1383 }
1384 #endif
1385
/* Feature arc for locally destined packets: starts at the ip4-local node
 * and is declared to end at ip4-local-end-of-arc.  Its arc index is read
 * in this file through vnet_feat_arc_ip4_local.feature_arc_index. */
/* *INDENT-OFF* */
VNET_FEATURE_ARC_INIT (ip4_local) =
{
  .arc_name  = "ip4-local",
  .start_nodes = VNET_FEATURES ("ip4-local"),
  .last_in_arc = "ip4-local-end-of-arc",
};
/* *INDENT-ON* */
1394
1395 static inline void
1396 ip4_local_l4_csum_validate (vlib_main_t * vm, vlib_buffer_t * p,
1397                             ip4_header_t * ip, u8 is_udp, u8 * error,
1398                             u8 * good_tcp_udp)
1399 {
1400   u32 flags0;
1401   flags0 = ip4_tcp_udp_validate_checksum (vm, p);
1402   *good_tcp_udp = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0;
1403   if (is_udp)
1404     {
1405       udp_header_t *udp;
1406       u32 ip_len, udp_len;
1407       i32 len_diff;
1408       udp = ip4_next_header (ip);
1409       /* Verify UDP length. */
1410       ip_len = clib_net_to_host_u16 (ip->length);
1411       udp_len = clib_net_to_host_u16 (udp->length);
1412
1413       len_diff = ip_len - udp_len;
1414       *good_tcp_udp &= len_diff >= 0;
1415       *error = len_diff < 0 ? IP4_ERROR_UDP_LENGTH : *error;
1416     }
1417 }
1418
/*
 * Checksum-state predicates used by ip4-local.  Each takes a
 * vlib_buffer_t pointer expression.  Arguments and whole expansions are
 * parenthesised so that any pointer expression (e.g. "b + 1") can be
 * passed and the result can sit next to any operator.
 */

/* True when the buffer has the OFFLOAD flag and its offload flags request
 * a TCP or UDP checksum. */
#define ip4_local_csum_is_offloaded(_b)                                       \
  (((_b)->flags & VNET_BUFFER_F_OFFLOAD) &&                                   \
   (vnet_buffer (_b)->oflags &                                                \
    (VNET_BUFFER_OFFLOAD_F_TCP_CKSUM | VNET_BUFFER_OFFLOAD_F_UDP_CKSUM)))

/* True when the packet is TCP/UDP and its checksum has neither been
 * computed already nor been marked for offload. */
#define ip4_local_need_csum_check(is_tcp_udp, _b)                         \
    ((is_tcp_udp) &&                                                      \
     !(((_b)->flags & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED)                 \
        || ip4_local_csum_is_offloaded (_b)))

/* 1 when the checksum is flagged correct or is marked for offload,
 * otherwise 0. */
#define ip4_local_csum_is_valid(_b)                                       \
    ((((_b)->flags & VNET_BUFFER_F_L4_CHECKSUM_CORRECT)                   \
        || ip4_local_csum_is_offloaded (_b)) != 0)
1431
1432 static inline void
1433 ip4_local_check_l4_csum (vlib_main_t * vm, vlib_buffer_t * b,
1434                          ip4_header_t * ih, u8 * error)
1435 {
1436   u8 is_udp, is_tcp_udp, good_tcp_udp;
1437
1438   is_udp = ih->protocol == IP_PROTOCOL_UDP;
1439   is_tcp_udp = is_udp || ih->protocol == IP_PROTOCOL_TCP;
1440
1441   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp, b)))
1442     ip4_local_l4_csum_validate (vm, b, ih, is_udp, error, &good_tcp_udp);
1443   else
1444     good_tcp_udp = ip4_local_csum_is_valid (b);
1445
1446   ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1447   *error = (is_tcp_udp && !good_tcp_udp
1448             ? IP4_ERROR_TCP_CHECKSUM + is_udp : *error);
1449 }
1450
/*
 * L4 checksum check for a pair of packets.
 *
 * vm    - vlib main.
 * b     - two buffers.
 * ih    - the two matching IPv4 headers.
 * error - in/out array of two error codes; an entry becomes
 *         IP4_ERROR_TCP_CHECKSUM or IP4_ERROR_UDP_CHECKSUM when that
 *         packet is TCP/UDP and its checksum is not good.
 */
static inline void
ip4_local_check_l4_csum_x2 (vlib_main_t * vm, vlib_buffer_t ** b,
                            ip4_header_t ** ih, u8 * error)
{
  u8 is_udp[2], is_tcp_udp[2], good_tcp_udp[2];

  is_udp[0] = ih[0]->protocol == IP_PROTOCOL_UDP;
  is_udp[1] = ih[1]->protocol == IP_PROTOCOL_UDP;

  is_tcp_udp[0] = is_udp[0] || ih[0]->protocol == IP_PROTOCOL_TCP;
  is_tcp_udp[1] = is_udp[1] || ih[1]->protocol == IP_PROTOCOL_TCP;

  /* start from the verdict already recorded in the buffer flags */
  good_tcp_udp[0] = ip4_local_csum_is_valid (b[0]);
  good_tcp_udp[1] = ip4_local_csum_is_valid (b[1]);

  /* if either packet needs a software check, every TCP/UDP packet of the
   * pair is validated, including one that did not need it by itself */
  if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp[0], b[0])
                     || ip4_local_need_csum_check (is_tcp_udp[1], b[1])))
    {
      if (is_tcp_udp[0])
        ip4_local_l4_csum_validate (vm, b[0], ih[0], is_udp[0], &error[0],
                                    &good_tcp_udp[0]);
      if (is_tcp_udp[1])
        ip4_local_l4_csum_validate (vm, b[1], ih[1], is_udp[1], &error[1],
                                    &good_tcp_udp[1]);
    }

  /* "+ is_udp" selects the UDP code, which ip4_local_check_l4_csum()
   * asserts to be IP4_ERROR_TCP_CHECKSUM + 1 */
  error[0] = (is_tcp_udp[0] && !good_tcp_udp[0] ?
              IP4_ERROR_TCP_CHECKSUM + is_udp[0] : error[0]);
  error[1] = (is_tcp_udp[1] && !good_tcp_udp[1] ?
              IP4_ERROR_TCP_CHECKSUM + is_udp[1] : error[1]);
}
1482
1483 static inline void
1484 ip4_local_set_next_and_error (vlib_node_runtime_t * error_node,
1485                               vlib_buffer_t * b, u16 * next, u8 error,
1486                               u8 head_of_feature_arc)
1487 {
1488   u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index;
1489   u32 next_index;
1490
1491   *next = error != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : *next;
1492   b->error = error ? error_node->errors[error] : 0;
1493   if (head_of_feature_arc)
1494     {
1495       next_index = *next;
1496       if (PREDICT_TRUE (error == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
1497         {
1498           vnet_feature_arc_start (arc_index,
1499                                   vnet_buffer (b)->sw_if_index[VLIB_RX],
1500                                   &next_index, b);
1501           *next = next_index;
1502         }
1503     }
1504 }
1505
/* One-entry cache of the most recent source-address check, used by
 * ip4_local_check_src() and ip4_local_check_src_x2(). */
typedef struct
{
  /* source address of the packet that produced the cached result */
  ip4_address_t src;
  /* load-balance index returned by the lookup of that source */
  u32 lbi;
  /* error code that resulted for that packet */
  u8 error;
  /* non-zero until the first lookup has been done; forces a lookup */
  u8 first;
} ip4_local_last_check_t;
1513
/*
 * Source-address check for a single packet.
 *
 * b          - the buffer; its ip.fib_index and ip.adj_index[] are updated.
 * ip0        - IPv4 header of the packet.
 * last_check - one-entry cache of the previous check, updated on a miss.
 * error0     - in/out error code.
 */
static inline void
ip4_local_check_src (vlib_buffer_t * b, ip4_header_t * ip0,
                     ip4_local_last_check_t * last_check, u8 * error0)
{
  const dpo_id_t *dpo0;
  load_balance_t *lb0;
  u32 lbi0;

  /* a TX sw_if_index other than ~0 overrides the fib index used for the
   * source lookup */
  vnet_buffer (b)->ip.fib_index =
    vnet_buffer (b)->sw_if_index[VLIB_TX] != ~0 ?
    vnet_buffer (b)->sw_if_index[VLIB_TX] : vnet_buffer (b)->ip.fib_index;

  /*
   * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
   *  adjacency for the destination address (the local interface address).
   * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
   *  adjacency for the source address (the remote sender's address)
   */
  /* do the lookup when the source differs from the cached one, or when
   * nothing has been cached yet */
  if (PREDICT_TRUE (last_check->src.as_u32 != ip0->src_address.as_u32) ||
      last_check->first)
    {
      lbi0 = ip4_fib_forwarding_lookup (vnet_buffer (b)->ip.fib_index,
                                        &ip0->src_address);

      vnet_buffer (b)->ip.adj_index[VLIB_RX] =
        vnet_buffer (b)->ip.adj_index[VLIB_TX];
      vnet_buffer (b)->ip.adj_index[VLIB_TX] = lbi0;

      lb0 = load_balance_get (lbi0);
      dpo0 = load_balance_get_bucket_i (lb0, 0);

      /*
       * Must have a route to source otherwise we drop the packet.
       * ip4 broadcasts are accepted, e.g. to make dhcp client work
       *
       * The checks are:
       *  - the source is a receive => it's from us => bogus, do this
       *    first since it sets a different error code.
       *  - uRPF check for any route to source - accept if passes.
       *  - allow packets destined to the broadcast address from unknown sources
       */

      /* both checks only fire while no earlier error has been recorded */
      *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
                  && dpo0->dpoi_type == DPO_RECEIVE) ?
                 IP4_ERROR_SPOOFED_LOCAL_PACKETS : *error0);
      *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
                  && !fib_urpf_check_size (lb0->lb_urpf)
                  && ip0->dst_address.as_u32 != 0xFFFFFFFF) ?
                 IP4_ERROR_SRC_LOOKUP_MISS : *error0);

      /* remember this result for following packets from the same source */
      last_check->src.as_u32 = ip0->src_address.as_u32;
      last_check->lbi = lbi0;
      last_check->error = *error0;
      last_check->first = 0;
    }
  else
    {
      vnet_buffer (b)->ip.adj_index[VLIB_RX] =
        vnet_buffer (b)->ip.adj_index[VLIB_TX];
      vnet_buffer (b)->ip.adj_index[VLIB_TX] = last_check->lbi;
      /* NOTE(review): on a cache hit the incoming *error0 is replaced by
       * the cached value, whatever it held before (and the cached value
       * is whatever error the earlier packet ended up with) - confirm
       * this is intended. */
      *error0 = last_check->error;
    }
}
1577
/*
 * Source-address check for a pair of packets.
 *
 * b          - two buffers; ip.fib_index and ip.adj_index[] are updated.
 * ip         - the two matching IPv4 headers.
 * last_check - one-entry cache of the previous check; on a miss it is
 *              refreshed from the second packet of the pair.
 * error      - in/out array of two error codes.
 */
static inline void
ip4_local_check_src_x2 (vlib_buffer_t ** b, ip4_header_t ** ip,
                        ip4_local_last_check_t * last_check, u8 * error)
{
  const dpo_id_t *dpo[2];
  load_balance_t *lb[2];
  u32 not_last_hit;
  u32 lbi[2];

  /* non-zero when nothing is cached yet or either source address differs
   * from the cached one */
  not_last_hit = last_check->first;
  not_last_hit |= ip[0]->src_address.as_u32 ^ last_check->src.as_u32;
  not_last_hit |= ip[1]->src_address.as_u32 ^ last_check->src.as_u32;

  /* a TX sw_if_index other than ~0 overrides the fib index used for the
   * source lookup */
  vnet_buffer (b[0])->ip.fib_index =
    vnet_buffer (b[0])->sw_if_index[VLIB_TX] != ~0 ?
    vnet_buffer (b[0])->sw_if_index[VLIB_TX] :
    vnet_buffer (b[0])->ip.fib_index;

  vnet_buffer (b[1])->ip.fib_index =
    vnet_buffer (b[1])->sw_if_index[VLIB_TX] != ~0 ?
    vnet_buffer (b[1])->sw_if_index[VLIB_TX] :
    vnet_buffer (b[1])->ip.fib_index;

  /*
   * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
   *  adjacency for the destination address (the local interface address).
   * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
   *  adjacency for the source address (the remote sender's address)
   */
  if (PREDICT_TRUE (not_last_hit))
    {
      ip4_fib_forwarding_lookup_x2 (
        vnet_buffer (b[0])->ip.fib_index, vnet_buffer (b[1])->ip.fib_index,
        &ip[0]->src_address, &ip[1]->src_address, &lbi[0], &lbi[1]);

      vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
        vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
      vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = lbi[0];

      vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
        vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
      vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = lbi[1];

      lb[0] = load_balance_get (lbi[0]);
      lb[1] = load_balance_get (lbi[1]);

      dpo[0] = load_balance_get_bucket_i (lb[0], 0);
      dpo[1] = load_balance_get_bucket_i (lb[1], 0);

      /* same two checks as ip4_local_check_src(): a source resolving to a
       * receive DPO is flagged as spoofed; a source with no uRPF entry is
       * a lookup miss unless the destination is 255.255.255.255.  Both
       * only fire while no earlier error has been recorded. */
      error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
                   dpo[0]->dpoi_type == DPO_RECEIVE) ?
                  IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[0]);
      error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
                   !fib_urpf_check_size (lb[0]->lb_urpf) &&
                   ip[0]->dst_address.as_u32 != 0xFFFFFFFF)
                  ? IP4_ERROR_SRC_LOOKUP_MISS : error[0]);

      error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
                   dpo[1]->dpoi_type == DPO_RECEIVE) ?
                  IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[1]);
      error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
                   !fib_urpf_check_size (lb[1]->lb_urpf) &&
                   ip[1]->dst_address.as_u32 != 0xFFFFFFFF)
                  ? IP4_ERROR_SRC_LOOKUP_MISS : error[1]);

      /* cache the result of the second packet only */
      last_check->src.as_u32 = ip[1]->src_address.as_u32;
      last_check->lbi = lbi[1];
      last_check->error = error[1];
      last_check->first = 0;
    }
  else
    {
      vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
        vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
      vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = last_check->lbi;

      vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
        vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
      vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = last_check->lbi;

      /* NOTE(review): on a cache hit both incoming error values are
       * replaced by the cached one, whatever they held before - confirm
       * this is intended. */
      error[0] = last_check->error;
      error[1] = last_check->error;
    }
}
1662
/* Packet classes returned by ip4_local_classify().  L4 is value 0, so a
 * non-zero type means "not plain L4". */
enum ip_local_packet_type_e
{
  /* neither a fragment nor flagged as NATed */
  IP_LOCAL_PACKET_TYPE_L4,
  /* buffer carries VNET_BUFFER_F_IS_NATED */
  IP_LOCAL_PACKET_TYPE_NAT,
  /* IPv4 fragment, sent to reassembly */
  IP_LOCAL_PACKET_TYPE_FRAG,
};
1669
1670 /**
1671  * Determine packet type and next node.
1672  *
1673  * The expectation is that all packets that are not L4 will skip
1674  * checksums and source checks.
1675  */
1676 always_inline u8
1677 ip4_local_classify (vlib_buffer_t * b, ip4_header_t * ip, u16 * next)
1678 {
1679   ip_lookup_main_t *lm = &ip4_main.lookup_main;
1680
1681   if (PREDICT_FALSE (ip4_is_fragment (ip)))
1682     {
1683       *next = IP_LOCAL_NEXT_REASSEMBLY;
1684       return IP_LOCAL_PACKET_TYPE_FRAG;
1685     }
1686   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_IS_NATED))
1687     {
1688       *next = lm->local_next_by_ip_protocol[ip->protocol];
1689       return IP_LOCAL_PACKET_TYPE_NAT;
1690     }
1691
1692   *next = lm->local_next_by_ip_protocol[ip->protocol];
1693   return IP_LOCAL_PACKET_TYPE_L4;
1694 }
1695
1696 static inline uword
1697 ip4_local_inline (vlib_main_t * vm,
1698                   vlib_node_runtime_t * node,
1699                   vlib_frame_t * frame, int head_of_feature_arc)
1700 {
1701   u32 *from, n_left_from;
1702   vlib_node_runtime_t *error_node =
1703     vlib_node_get_runtime (vm, ip4_local_node.index);
1704   u16 nexts[VLIB_FRAME_SIZE], *next;
1705   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1706   ip4_header_t *ip[2];
1707   u8 error[2], pt[2];
1708
1709   ip4_local_last_check_t last_check = {
1710     /*
1711      * 0.0.0.0 can appear as the source address of an IP packet,
1712      * as can any other address, hence the need to use the 'first'
1713      * member to make sure the .lbi is initialised for the first
1714      * packet.
1715      */
1716     .src = {.as_u32 = 0},
1717     .lbi = ~0,
1718     .error = IP4_ERROR_UNKNOWN_PROTOCOL,
1719     .first = 1,
1720   };
1721
1722   from = vlib_frame_vector_args (frame);
1723   n_left_from = frame->n_vectors;
1724
1725   if (node->flags & VLIB_NODE_FLAG_TRACE)
1726     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1727
1728   vlib_get_buffers (vm, from, bufs, n_left_from);
1729   b = bufs;
1730   next = nexts;
1731
1732   while (n_left_from >= 6)
1733     {
1734       u8 not_batch = 0;
1735
1736       /* Prefetch next iteration. */
1737       {
1738         vlib_prefetch_buffer_header (b[4], LOAD);
1739         vlib_prefetch_buffer_header (b[5], LOAD);
1740
1741         clib_prefetch_load (b[4]->data);
1742         clib_prefetch_load (b[5]->data);
1743       }
1744
1745       error[0] = error[1] = IP4_ERROR_UNKNOWN_PROTOCOL;
1746
1747       ip[0] = vlib_buffer_get_current (b[0]);
1748       ip[1] = vlib_buffer_get_current (b[1]);
1749
1750       vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
1751       vnet_buffer (b[1])->l3_hdr_offset = b[1]->current_data;
1752
1753       pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
1754       pt[1] = ip4_local_classify (b[1], ip[1], &next[1]);
1755
1756       not_batch = pt[0] ^ pt[1];
1757
1758       if (head_of_feature_arc == 0 || (pt[0] && not_batch == 0))
1759         goto skip_checks;
1760
1761       if (PREDICT_TRUE (not_batch == 0))
1762         {
1763           ip4_local_check_l4_csum_x2 (vm, b, ip, error);
1764           ip4_local_check_src_x2 (b, ip, &last_check, error);
1765         }
1766       else
1767         {
1768           if (!pt[0])
1769             {
1770               ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
1771               ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
1772             }
1773           if (!pt[1])
1774             {
1775               ip4_local_check_l4_csum (vm, b[1], ip[1], &error[1]);
1776               ip4_local_check_src (b[1], ip[1], &last_check, &error[1]);
1777             }
1778         }
1779
1780     skip_checks:
1781
1782       ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
1783                                     head_of_feature_arc);
1784       ip4_local_set_next_and_error (error_node, b[1], &next[1], error[1],
1785                                     head_of_feature_arc);
1786
1787       b += 2;
1788       next += 2;
1789       n_left_from -= 2;
1790     }
1791
1792   while (n_left_from > 0)
1793     {
1794       error[0] = IP4_ERROR_UNKNOWN_PROTOCOL;
1795
1796       ip[0] = vlib_buffer_get_current (b[0]);
1797       vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
1798       pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
1799
1800       if (head_of_feature_arc == 0 || pt[0])
1801         goto skip_check;
1802
1803       ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
1804       ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
1805
1806     skip_check:
1807
1808       ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
1809                                     head_of_feature_arc);
1810
1811       b += 1;
1812       next += 1;
1813       n_left_from -= 1;
1814     }
1815
1816   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
1817   return frame->n_vectors;
1818 }
1819
1820 VLIB_NODE_FN (ip4_local_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
1821                                vlib_frame_t * frame)
1822 {
1823   return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ );
1824 }
1825
1826 /* *INDENT-OFF* */
1827 VLIB_REGISTER_NODE (ip4_local_node) =
1828 {
1829   .name = "ip4-local",
1830   .vector_size = sizeof (u32),
1831   .format_trace = format_ip4_forward_next_trace,
1832   .n_errors = IP4_N_ERROR,
1833   .error_strings = ip4_error_strings,
1834   .n_next_nodes = IP_LOCAL_N_NEXT,
1835   .next_nodes =
1836   {
1837     [IP_LOCAL_NEXT_DROP] = "ip4-drop",
1838     [IP_LOCAL_NEXT_PUNT] = "ip4-punt",
1839     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
1840     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
1841     [IP_LOCAL_NEXT_REASSEMBLY] = "ip4-full-reassembly",
1842   },
1843 };
1844 /* *INDENT-ON* */
1845
1846
1847 VLIB_NODE_FN (ip4_local_end_of_arc_node) (vlib_main_t * vm,
1848                                           vlib_node_runtime_t * node,
1849                                           vlib_frame_t * frame)
1850 {
1851   return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ );
1852 }
1853
1854 /* *INDENT-OFF* */
1855 VLIB_REGISTER_NODE (ip4_local_end_of_arc_node) = {
1856   .name = "ip4-local-end-of-arc",
1857   .vector_size = sizeof (u32),
1858
1859   .format_trace = format_ip4_forward_next_trace,
1860   .sibling_of = "ip4-local",
1861 };
1862
1863 VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = {
1864   .arc_name = "ip4-local",
1865   .node_name = "ip4-local-end-of-arc",
1866   .runs_before = 0, /* not before any other features */
1867 };
1868 /* *INDENT-ON* */
1869
1870 #ifndef CLIB_MARCH_VARIANT
1871 void
1872 ip4_register_protocol (u32 protocol, u32 node_index)
1873 {
1874   vlib_main_t *vm = vlib_get_main ();
1875   ip4_main_t *im = &ip4_main;
1876   ip_lookup_main_t *lm = &im->lookup_main;
1877
1878   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1879   lm->local_next_by_ip_protocol[protocol] =
1880     vlib_node_add_next (vm, ip4_local_node.index, node_index);
1881 }
1882
1883 void
1884 ip4_unregister_protocol (u32 protocol)
1885 {
1886   ip4_main_t *im = &ip4_main;
1887   ip_lookup_main_t *lm = &im->lookup_main;
1888
1889   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1890   lm->local_next_by_ip_protocol[protocol] = IP_LOCAL_NEXT_PUNT;
1891 }
1892 #endif
1893
1894 static clib_error_t *
1895 show_ip_local_command_fn (vlib_main_t * vm,
1896                           unformat_input_t * input, vlib_cli_command_t * cmd)
1897 {
1898   ip4_main_t *im = &ip4_main;
1899   ip_lookup_main_t *lm = &im->lookup_main;
1900   int i;
1901
1902   vlib_cli_output (vm, "Protocols handled by ip4_local");
1903   for (i = 0; i < ARRAY_LEN (lm->local_next_by_ip_protocol); i++)
1904     {
1905       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
1906         {
1907           u32 node_index = vlib_get_node (vm,
1908                                           ip4_local_node.index)->
1909             next_nodes[lm->local_next_by_ip_protocol[i]];
1910           vlib_cli_output (vm, "%U: %U", format_ip_protocol, i,
1911                            format_vlib_node_name, vm, node_index);
1912         }
1913     }
1914   return 0;
1915 }
1916
1917
1918
1919 /*?
1920  * Display the set of protocols handled by the local IPv4 stack.
1921  *
1922  * @cliexpar
1923  * Example of how to display local protocol table:
1924  * @cliexstart{show ip local}
1925  * Protocols handled by ip4_local
1926  * 1
1927  * 17
1928  * 47
1929  * @cliexend
1930 ?*/
1931 /* *INDENT-OFF* */
1932 VLIB_CLI_COMMAND (show_ip_local, static) =
1933 {
1934   .path = "show ip local",
1935   .function = show_ip_local_command_fn,
1936   .short_help = "show ip local",
1937 };
1938 /* *INDENT-ON* */
1939
/* Static next-node slots of the ip4 rewrite nodes. */
typedef enum
{
  IP4_REWRITE_NEXT_DROP = 0,        /* default disposition */
  IP4_REWRITE_NEXT_ICMP_ERROR = 1,  /* an ICMP error is to be generated */
  IP4_REWRITE_NEXT_FRAGMENT = 2,    /* packet needs IP fragmentation */
  IP4_REWRITE_N_NEXT                /* count of slots; keep last */
} ip4_rewrite_next_t;
1947
/**
 * Mask selecting the bits of an IPv4 destination address that are copied
 * into a multicast MAC address.  The address is handled as a raw u32
 * without byte swapping, so the constant depends on host byte order.
 */
#if !CLIB_ARCH_IS_BIG_ENDIAN
#define IP4_MCAST_ADDR_MASK 0xffff7f00
#else
#define IP4_MCAST_ADDR_MASK 0x007fffff
#endif
1957
1958 always_inline void
1959 ip4_mtu_check (vlib_buffer_t * b, u16 packet_len,
1960                u16 adj_packet_bytes, bool df, u16 * next,
1961                u8 is_midchain, u32 * error)
1962 {
1963   if (packet_len > adj_packet_bytes)
1964     {
1965       *error = IP4_ERROR_MTU_EXCEEDED;
1966       if (df)
1967         {
1968           icmp4_error_set_vnet_buffer
1969             (b, ICMP4_destination_unreachable,
1970              ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
1971              adj_packet_bytes);
1972           *next = IP4_REWRITE_NEXT_ICMP_ERROR;
1973         }
1974       else
1975         {
1976           /* IP fragmentation */
1977           ip_frag_set_vnet_buffer (b, adj_packet_bytes,
1978                                    (is_midchain ?
1979                                     IP_FRAG_NEXT_IP_REWRITE_MIDCHAIN :
1980                                     IP_FRAG_NEXT_IP_REWRITE), 0);
1981           *next = IP4_REWRITE_NEXT_FRAGMENT;
1982         }
1983     }
1984 }
1985
1986 /* increment TTL & update checksum.
1987    Works either endian, so no need for byte swap. */
1988 static_always_inline void
1989 ip4_ttl_inc (vlib_buffer_t * b, ip4_header_t * ip)
1990 {
1991   i32 ttl;
1992   u32 checksum;
1993   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
1994     return;
1995
1996   ttl = ip->ttl;
1997
1998   checksum = ip->checksum - clib_host_to_net_u16 (0x0100);
1999   checksum += checksum >= 0xffff;
2000
2001   ip->checksum = checksum;
2002   ttl += 1;
2003   ip->ttl = ttl;
2004
2005   ASSERT (ip4_header_checksum_is_valid (ip));
2006 }
2007
2008 /* Decrement TTL & update checksum.
2009    Works either endian, so no need for byte swap. */
2010 static_always_inline void
2011 ip4_ttl_and_checksum_check (vlib_buffer_t * b, ip4_header_t * ip, u16 * next,
2012                             u32 * error)
2013 {
2014   i32 ttl;
2015   u32 checksum;
2016   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
2017     return;
2018
2019   ttl = ip->ttl;
2020
2021   /* Input node should have reject packets with ttl 0. */
2022   ASSERT (ip->ttl > 0);
2023
2024   checksum = ip->checksum + clib_host_to_net_u16 (0x0100);
2025   checksum += checksum >= 0xffff;
2026
2027   ip->checksum = checksum;
2028   ttl -= 1;
2029   ip->ttl = ttl;
2030
2031   /*
2032    * If the ttl drops below 1 when forwarding, generate
2033    * an ICMP response.
2034    */
2035   if (PREDICT_FALSE (ttl <= 0))
2036     {
2037       *error = IP4_ERROR_TIME_EXPIRED;
2038       vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~ 0;
2039       icmp4_error_set_vnet_buffer (b, ICMP4_time_exceeded,
2040                                    ICMP4_time_exceeded_ttl_exceeded_in_transit,
2041                                    0);
2042       *next = IP4_REWRITE_NEXT_ICMP_ERROR;
2043     }
2044
2045   /* Verify checksum. */
2046   ASSERT (ip4_header_checksum_is_valid (ip) ||
2047           (vnet_buffer (b)->oflags & VNET_BUFFER_OFFLOAD_F_IP_CKSUM) ||
2048           (vnet_buffer (b)->oflags & VNET_BUFFER_OFFLOAD_F_OUTER_IP_CKSUM));
2049 }
2050
2051 always_inline uword
2052 ip4_rewrite_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
2053                     vlib_frame_t *frame, int do_counters, int is_midchain,
2054                     int is_mcast)
2055 {
2056   ip_lookup_main_t *lm = &ip4_main.lookup_main;
2057   u32 *from = vlib_frame_vector_args (frame);
2058   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
2059   u16 nexts[VLIB_FRAME_SIZE], *next;
2060   u32 n_left_from;
2061   vlib_node_runtime_t *error_node =
2062     vlib_node_get_runtime (vm, ip4_input_node.index);
2063
2064   n_left_from = frame->n_vectors;
2065   u32 thread_index = vm->thread_index;
2066
2067   vlib_get_buffers (vm, from, bufs, n_left_from);
2068   clib_memset_u16 (nexts, IP4_REWRITE_NEXT_DROP, n_left_from);
2069
2070 #if (CLIB_N_PREFETCHES >= 8)
2071   if (n_left_from >= 6)
2072     {
2073       int i;
2074       for (i = 2; i < 6; i++)
2075         vlib_prefetch_buffer_header (bufs[i], LOAD);
2076     }
2077
2078   next = nexts;
2079   b = bufs;
2080   while (n_left_from >= 8)
2081     {
2082       const ip_adjacency_t *adj0, *adj1;
2083       ip4_header_t *ip0, *ip1;
2084       u32 rw_len0, error0, adj_index0;
2085       u32 rw_len1, error1, adj_index1;
2086       u32 tx_sw_if_index0, tx_sw_if_index1;
2087       u8 *p;
2088
2089       if (is_midchain)
2090         {
2091           vlib_prefetch_buffer_header (b[6], LOAD);
2092           vlib_prefetch_buffer_header (b[7], LOAD);
2093         }
2094
2095       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2096       adj_index1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
2097
2098       /*
2099        * pre-fetch the per-adjacency counters
2100        */
2101       if (do_counters)
2102         {
2103           vlib_prefetch_combined_counter (&adjacency_counters,
2104                                           thread_index, adj_index0);
2105           vlib_prefetch_combined_counter (&adjacency_counters,
2106                                           thread_index, adj_index1);
2107         }
2108
2109       ip0 = vlib_buffer_get_current (b[0]);
2110       ip1 = vlib_buffer_get_current (b[1]);
2111
2112       error0 = error1 = IP4_ERROR_NONE;
2113
2114       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2115       ip4_ttl_and_checksum_check (b[1], ip1, next + 1, &error1);
2116
2117       /* Rewrite packet header and updates lengths. */
2118       adj0 = adj_get (adj_index0);
2119       adj1 = adj_get (adj_index1);
2120
2121       /* Worth pipelining. No guarantee that adj0,1 are hot... */
2122       rw_len0 = adj0[0].rewrite_header.data_bytes;
2123       rw_len1 = adj1[0].rewrite_header.data_bytes;
2124       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2125       vnet_buffer (b[1])->ip.save_rewrite_length = rw_len1;
2126
2127       p = vlib_buffer_get_current (b[2]);
2128       clib_prefetch_store (p - CLIB_CACHE_LINE_BYTES);
2129       clib_prefetch_load (p);
2130
2131       p = vlib_buffer_get_current (b[3]);
2132       clib_prefetch_store (p - CLIB_CACHE_LINE_BYTES);
2133       clib_prefetch_load (p);
2134
2135       /* Check MTU of outgoing interface. */
2136       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2137       u16 ip1_len = clib_net_to_host_u16 (ip1->length);
2138
2139       if (b[0]->flags & VNET_BUFFER_F_GSO)
2140         ip0_len = gso_mtu_sz (b[0]);
2141       if (b[1]->flags & VNET_BUFFER_F_GSO)
2142         ip1_len = gso_mtu_sz (b[1]);
2143
2144       ip4_mtu_check (b[0], ip0_len,
2145                      adj0[0].rewrite_header.max_l3_packet_bytes,
2146                      ip0->flags_and_fragment_offset &
2147                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2148                      next + 0, is_midchain, &error0);
2149       ip4_mtu_check (b[1], ip1_len,
2150                      adj1[0].rewrite_header.max_l3_packet_bytes,
2151                      ip1->flags_and_fragment_offset &
2152                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2153                      next + 1, is_midchain, &error1);
2154
2155       if (is_mcast)
2156         {
2157           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2158                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2159                     IP4_ERROR_SAME_INTERFACE : error0);
2160           error1 = ((adj1[0].rewrite_header.sw_if_index ==
2161                      vnet_buffer (b[1])->sw_if_index[VLIB_RX]) ?
2162                     IP4_ERROR_SAME_INTERFACE : error1);
2163         }
2164
2165       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2166        * to see the IP header */
2167       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2168         {
2169           u32 next_index = adj0[0].rewrite_header.next_index;
2170           vlib_buffer_advance (b[0], -(word) rw_len0);
2171
2172           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2173           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2174
2175           if (PREDICT_FALSE
2176               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2177             vnet_feature_arc_start_w_cfg_index (lm->output_feature_arc_index,
2178                                                 tx_sw_if_index0,
2179                                                 &next_index, b[0],
2180                                                 adj0->ia_cfg_index);
2181
2182           next[0] = next_index;
2183           if (is_midchain)
2184             vnet_calc_checksums_inline (vm, b[0], 1 /* is_ip4 */ ,
2185                                         0 /* is_ip6 */ );
2186         }
2187       else
2188         {
2189           b[0]->error = error_node->errors[error0];
2190           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2191             ip4_ttl_inc (b[0], ip0);
2192         }
2193       if (PREDICT_TRUE (error1 == IP4_ERROR_NONE))
2194         {
2195           u32 next_index = adj1[0].rewrite_header.next_index;
2196           vlib_buffer_advance (b[1], -(word) rw_len1);
2197
2198           tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
2199           vnet_buffer (b[1])->sw_if_index[VLIB_TX] = tx_sw_if_index1;
2200
2201           if (PREDICT_FALSE
2202               (adj1[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2203             vnet_feature_arc_start_w_cfg_index (lm->output_feature_arc_index,
2204                                                 tx_sw_if_index1,
2205                                                 &next_index, b[1],
2206                                                 adj1->ia_cfg_index);
2207           next[1] = next_index;
2208           if (is_midchain)
2209             vnet_calc_checksums_inline (vm, b[1], 1 /* is_ip4 */ ,
2210                                         0 /* is_ip6 */ );
2211         }
2212       else
2213         {
2214           b[1]->error = error_node->errors[error1];
2215           if (error1 == IP4_ERROR_MTU_EXCEEDED)
2216             ip4_ttl_inc (b[1], ip1);
2217         }
2218
2219       if (is_midchain)
2220         /* Guess we are only writing on ipv4 header. */
2221         vnet_rewrite_two_headers (adj0[0], adj1[0],
2222                                   ip0, ip1, sizeof (ip4_header_t));
2223       else
2224         /* Guess we are only writing on simple Ethernet header. */
2225         vnet_rewrite_two_headers (adj0[0], adj1[0],
2226                                   ip0, ip1, sizeof (ethernet_header_t));
2227
2228       if (do_counters)
2229         {
2230           if (error0 == IP4_ERROR_NONE)
2231             vlib_increment_combined_counter
2232               (&adjacency_counters,
2233                thread_index,
2234                adj_index0, 1,
2235                vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2236
2237           if (error1 == IP4_ERROR_NONE)
2238             vlib_increment_combined_counter
2239               (&adjacency_counters,
2240                thread_index,
2241                adj_index1, 1,
2242                vlib_buffer_length_in_chain (vm, b[1]) + rw_len1);
2243         }
2244
2245       if (is_midchain)
2246         {
2247           if (error0 == IP4_ERROR_NONE)
2248             adj_midchain_fixup (vm, adj0, b[0], VNET_LINK_IP4);
2249           if (error1 == IP4_ERROR_NONE)
2250             adj_midchain_fixup (vm, adj1, b[1], VNET_LINK_IP4);
2251         }
2252
2253       if (is_mcast)
2254         {
2255           /* copy bytes from the IP address into the MAC rewrite */
2256           if (error0 == IP4_ERROR_NONE)
2257             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2258                                         adj0->rewrite_header.dst_mcast_offset,
2259                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2260           if (error1 == IP4_ERROR_NONE)
2261             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2262                                         adj1->rewrite_header.dst_mcast_offset,
2263                                         &ip1->dst_address.as_u32, (u8 *) ip1);
2264         }
2265
2266       next += 2;
2267       b += 2;
2268       n_left_from -= 2;
2269     }
2270 #elif (CLIB_N_PREFETCHES >= 4)
2271   next = nexts;
2272   b = bufs;
2273   while (n_left_from >= 1)
2274     {
2275       ip_adjacency_t *adj0;
2276       ip4_header_t *ip0;
2277       u32 rw_len0, error0, adj_index0;
2278       u32 tx_sw_if_index0;
2279       u8 *p;
2280
2281       /* Prefetch next iteration */
2282       if (PREDICT_TRUE (n_left_from >= 4))
2283         {
2284           ip_adjacency_t *adj2;
2285           u32 adj_index2;
2286
2287           vlib_prefetch_buffer_header (b[3], LOAD);
2288           vlib_prefetch_buffer_data (b[2], LOAD);
2289
2290           /* Prefetch adj->rewrite_header */
2291           adj_index2 = vnet_buffer (b[2])->ip.adj_index[VLIB_TX];
2292           adj2 = adj_get (adj_index2);
2293           p = (u8 *) adj2;
2294           CLIB_PREFETCH (p + CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES,
2295                          LOAD);
2296         }
2297
2298       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2299
2300       /*
2301        * Prefetch the per-adjacency counters
2302        */
2303       if (do_counters)
2304         {
2305           vlib_prefetch_combined_counter (&adjacency_counters,
2306                                           thread_index, adj_index0);
2307         }
2308
2309       ip0 = vlib_buffer_get_current (b[0]);
2310
2311       error0 = IP4_ERROR_NONE;
2312
2313       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2314
2315       /* Rewrite packet header and updates lengths. */
2316       adj0 = adj_get (adj_index0);
2317
2318       /* Rewrite header was prefetched. */
2319       rw_len0 = adj0[0].rewrite_header.data_bytes;
2320       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2321
2322       /* Check MTU of outgoing interface. */
2323       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2324
2325       if (b[0]->flags & VNET_BUFFER_F_GSO)
2326         ip0_len = gso_mtu_sz (b[0]);
2327
2328       ip4_mtu_check (b[0], ip0_len,
2329                      adj0[0].rewrite_header.max_l3_packet_bytes,
2330                      ip0->flags_and_fragment_offset &
2331                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2332                      next + 0, is_midchain, &error0);
2333
2334       if (is_mcast)
2335         {
2336           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2337                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2338                     IP4_ERROR_SAME_INTERFACE : error0);
2339         }
2340
2341       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2342        * to see the IP header */
2343       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2344         {
2345           u32 next_index = adj0[0].rewrite_header.next_index;
2346           vlib_buffer_advance (b[0], -(word) rw_len0);
2347           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2348           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2349
2350           if (PREDICT_FALSE
2351               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2352             vnet_feature_arc_start_w_cfg_index (lm->output_feature_arc_index,
2353                                                 tx_sw_if_index0,
2354                                                 &next_index, b[0],
2355                                                 adj0->ia_cfg_index);
2356           next[0] = next_index;
2357
2358           if (is_midchain)
2359             {
2360               vnet_calc_checksums_inline (vm, b[0], 1 /* is_ip4 */ ,
2361                                           0 /* is_ip6 */ );
2362
2363               /* Guess we are only writing on ipv4 header. */
2364               vnet_rewrite_one_header (adj0[0], ip0, sizeof (ip4_header_t));
2365             }
2366           else
2367             /* Guess we are only writing on simple Ethernet header. */
2368             vnet_rewrite_one_header (adj0[0], ip0,
2369                                      sizeof (ethernet_header_t));
2370
2371           /*
2372            * Bump the per-adjacency counters
2373            */
2374           if (do_counters)
2375             vlib_increment_combined_counter
2376               (&adjacency_counters,
2377                thread_index,
2378                adj_index0, 1, vlib_buffer_length_in_chain (vm,
2379                                                            b[0]) + rw_len0);
2380
2381           if (is_midchain)
2382             adj_midchain_fixup (vm, adj0, b[0], VNET_LINK_IP4);
2383
2384           if (is_mcast)
2385             /* copy bytes from the IP address into the MAC rewrite */
2386             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2387                                         adj0->rewrite_header.dst_mcast_offset,
2388                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2389         }
2390       else
2391         {
2392           b[0]->error = error_node->errors[error0];
2393           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2394             ip4_ttl_inc (b[0], ip0);
2395         }
2396
2397       next += 1;
2398       b += 1;
2399       n_left_from -= 1;
2400     }
2401 #endif
2402
2403   while (n_left_from > 0)
2404     {
2405       ip_adjacency_t *adj0;
2406       ip4_header_t *ip0;
2407       u32 rw_len0, adj_index0, error0;
2408       u32 tx_sw_if_index0;
2409
2410       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2411
2412       adj0 = adj_get (adj_index0);
2413
2414       if (do_counters)
2415         vlib_prefetch_combined_counter (&adjacency_counters,
2416                                         thread_index, adj_index0);
2417
2418       ip0 = vlib_buffer_get_current (b[0]);
2419
2420       error0 = IP4_ERROR_NONE;
2421
2422       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2423
2424
2425       /* Update packet buffer attributes/set output interface. */
2426       rw_len0 = adj0[0].rewrite_header.data_bytes;
2427       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2428
2429       /* Check MTU of outgoing interface. */
2430       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2431       if (b[0]->flags & VNET_BUFFER_F_GSO)
2432         ip0_len = gso_mtu_sz (b[0]);
2433
2434       ip4_mtu_check (b[0], ip0_len,
2435                      adj0[0].rewrite_header.max_l3_packet_bytes,
2436                      ip0->flags_and_fragment_offset &
2437                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2438                      next + 0, is_midchain, &error0);
2439
2440       if (is_mcast)
2441         {
2442           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2443                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2444                     IP4_ERROR_SAME_INTERFACE : error0);
2445         }
2446
2447       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2448        * to see the IP header */
2449       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2450         {
2451           u32 next_index = adj0[0].rewrite_header.next_index;
2452           vlib_buffer_advance (b[0], -(word) rw_len0);
2453           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2454           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2455
2456           if (PREDICT_FALSE
2457               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2458             vnet_feature_arc_start_w_cfg_index (lm->output_feature_arc_index,
2459                                                 tx_sw_if_index0,
2460                                                 &next_index, b[0],
2461                                                 adj0->ia_cfg_index);
2462           next[0] = next_index;
2463
2464           if (is_midchain)
2465             {
2466               /* this acts on the packet that is about to be encapped */
2467               vnet_calc_checksums_inline (vm, b[0], 1 /* is_ip4 */ ,
2468                                           0 /* is_ip6 */ );
2469
2470               /* Guess we are only writing on ipv4 header. */
2471               vnet_rewrite_one_header (adj0[0], ip0, sizeof (ip4_header_t));
2472             }
2473           else
2474             /* Guess we are only writing on simple Ethernet header. */
2475             vnet_rewrite_one_header (adj0[0], ip0,
2476                                      sizeof (ethernet_header_t));
2477
2478           if (do_counters)
2479             vlib_increment_combined_counter
2480               (&adjacency_counters,
2481                thread_index, adj_index0, 1,
2482                vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2483
2484           if (is_midchain)
2485             adj_midchain_fixup (vm, adj0, b[0], VNET_LINK_IP4);
2486
2487           if (is_mcast)
2488             /* copy bytes from the IP address into the MAC rewrite */
2489             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2490                                         adj0->rewrite_header.dst_mcast_offset,
2491                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2492         }
2493       else
2494         {
2495           b[0]->error = error_node->errors[error0];
2496           /* undo the TTL decrement - we'll be back to do it again */
2497           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2498             ip4_ttl_inc (b[0], ip0);
2499         }
2500
2501       next += 1;
2502       b += 1;
2503       n_left_from -= 1;
2504     }
2505
2506
2507   /* Need to do trace after rewrites to pick up new packet data. */
2508   if (node->flags & VLIB_NODE_FLAG_TRACE)
2509     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2510
2511   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
2512   return frame->n_vectors;
2513 }
2514
2515 /** @brief IPv4 rewrite node.
2516     @node ip4-rewrite
2517
2518     This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
2519     header checksum, fetch the ip adjacency, check the outbound mtu,
2520     apply the adjacency rewrite, and send pkts to the adjacency
2521     rewrite header's rewrite_next_index.
2522
2523     @param vm vlib_main_t corresponding to the current thread
2524     @param node vlib_node_runtime_t
2525     @param frame vlib_frame_t whose contents should be dispatched
2526
2527     @par Graph mechanics: buffer metadata, next index usage
2528
2529     @em Uses:
2530     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
2531         - the rewrite adjacency index
2532     - <code>adj->lookup_next_index</code>
2533         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2534           the packet will be dropped.
2535     - <code>adj->rewrite_header</code>
2536         - Rewrite string length, rewrite string, next_index
2537
2538     @em Sets:
2539     - <code>b->current_data, b->current_length</code>
2540         - Updated net of applying the rewrite string
2541
2542     <em>Next Indices:</em>
2543     - <code> adj->rewrite_header.next_index </code>
2544       or @c ip4-drop
2545 */
2546
2547 VLIB_NODE_FN (ip4_rewrite_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2548                                  vlib_frame_t * frame)
2549 {
2550   if (adj_are_counters_enabled ())
2551     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2552   else
2553     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2554 }
2555
2556 VLIB_NODE_FN (ip4_rewrite_bcast_node) (vlib_main_t * vm,
2557                                        vlib_node_runtime_t * node,
2558                                        vlib_frame_t * frame)
2559 {
2560   if (adj_are_counters_enabled ())
2561     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2562   else
2563     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2564 }
2565
2566 VLIB_NODE_FN (ip4_midchain_node) (vlib_main_t * vm,
2567                                   vlib_node_runtime_t * node,
2568                                   vlib_frame_t * frame)
2569 {
2570   if (adj_are_counters_enabled ())
2571     return ip4_rewrite_inline (vm, node, frame, 1, 1, 0);
2572   else
2573     return ip4_rewrite_inline (vm, node, frame, 0, 1, 0);
2574 }
2575
2576 VLIB_NODE_FN (ip4_rewrite_mcast_node) (vlib_main_t * vm,
2577                                        vlib_node_runtime_t * node,
2578                                        vlib_frame_t * frame)
2579 {
2580   if (adj_are_counters_enabled ())
2581     return ip4_rewrite_inline (vm, node, frame, 1, 0, 1);
2582   else
2583     return ip4_rewrite_inline (vm, node, frame, 0, 0, 1);
2584 }
2585
2586 VLIB_NODE_FN (ip4_mcast_midchain_node) (vlib_main_t * vm,
2587                                         vlib_node_runtime_t * node,
2588                                         vlib_frame_t * frame)
2589 {
2590   if (adj_are_counters_enabled ())
2591     return ip4_rewrite_inline (vm, node, frame, 1, 1, 1);
2592   else
2593     return ip4_rewrite_inline (vm, node, frame, 0, 1, 1);
2594 }
2595
2596 /* *INDENT-OFF* */
2597 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2598   .name = "ip4-rewrite",
2599   .vector_size = sizeof (u32),
2600
2601   .format_trace = format_ip4_rewrite_trace,
2602
2603   .n_next_nodes = IP4_REWRITE_N_NEXT,
2604   .next_nodes = {
2605     [IP4_REWRITE_NEXT_DROP] = "ip4-drop",
2606     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2607     [IP4_REWRITE_NEXT_FRAGMENT] = "ip4-frag",
2608   },
2609 };
2610
2611 VLIB_REGISTER_NODE (ip4_rewrite_bcast_node) = {
2612   .name = "ip4-rewrite-bcast",
2613   .vector_size = sizeof (u32),
2614
2615   .format_trace = format_ip4_rewrite_trace,
2616   .sibling_of = "ip4-rewrite",
2617 };
2618
2619 VLIB_REGISTER_NODE (ip4_rewrite_mcast_node) = {
2620   .name = "ip4-rewrite-mcast",
2621   .vector_size = sizeof (u32),
2622
2623   .format_trace = format_ip4_rewrite_trace,
2624   .sibling_of = "ip4-rewrite",
2625 };
2626
2627 VLIB_REGISTER_NODE (ip4_mcast_midchain_node) = {
2628   .name = "ip4-mcast-midchain",
2629   .vector_size = sizeof (u32),
2630
2631   .format_trace = format_ip4_rewrite_trace,
2632   .sibling_of = "ip4-rewrite",
2633 };
2634
2635 VLIB_REGISTER_NODE (ip4_midchain_node) = {
2636   .name = "ip4-midchain",
2637   .vector_size = sizeof (u32),
2638   .format_trace = format_ip4_rewrite_trace,
2639   .sibling_of = "ip4-rewrite",
2640 };
2641 /* *INDENT-ON* */
2642
2643 static clib_error_t *
2644 set_ip_flow_hash_command_fn (vlib_main_t * vm,
2645                              unformat_input_t * input,
2646                              vlib_cli_command_t * cmd)
2647 {
2648   int matched = 0;
2649   u32 table_id = 0;
2650   u32 flow_hash_config = 0;
2651   int rv;
2652
2653   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
2654     {
2655       if (unformat (input, "table %d", &table_id))
2656         matched = 1;
2657 #define _(a, b, v)                                                            \
2658   else if (unformat (input, #a))                                              \
2659   {                                                                           \
2660     flow_hash_config |= v;                                                    \
2661     matched = 1;                                                              \
2662   }
2663       foreach_flow_hash_bit
2664 #undef _
2665         else
2666         break;
2667     }
2668
2669   if (matched == 0)
2670     return clib_error_return (0, "unknown input `%U'",
2671                               format_unformat_error, input);
2672
2673   rv = ip_flow_hash_set (AF_IP4, table_id, flow_hash_config);
2674   switch (rv)
2675     {
2676     case 0:
2677       break;
2678
2679     case VNET_API_ERROR_NO_SUCH_FIB:
2680       return clib_error_return (0, "no such FIB table %d", table_id);
2681
2682     default:
2683       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
2684       break;
2685     }
2686
2687   return 0;
2688 }
2689
2690 /*?
2691  * Configure the set of IPv4 fields used by the flow hash.
2692  *
2693  * @cliexpar
2694  * Example of how to set the flow hash on a given table:
2695  * @cliexcmd{set ip flow-hash table 7 dst sport dport proto}
2696  * Example of how to display the configured flow hash:
2697  * @cliexstart{show ip fib}
2698  * ipv4-VRF:0, fib_index 0, flow hash: src dst sport dport proto
2699  * 0.0.0.0/0
2700  *   unicast-ip4-chain
2701  *   [@0]: dpo-load-balance: [index:0 buckets:1 uRPF:0 to:[0:0]]
2702  *     [0] [@0]: dpo-drop ip6
2703  * 0.0.0.0/32
2704  *   unicast-ip4-chain
2705  *   [@0]: dpo-load-balance: [index:1 buckets:1 uRPF:1 to:[0:0]]
2706  *     [0] [@0]: dpo-drop ip6
2707  * 224.0.0.0/8
2708  *   unicast-ip4-chain
2709  *   [@0]: dpo-load-balance: [index:3 buckets:1 uRPF:3 to:[0:0]]
2710  *     [0] [@0]: dpo-drop ip6
2711  * 6.0.1.2/32
2712  *   unicast-ip4-chain
2713  *   [@0]: dpo-load-balance: [index:30 buckets:1 uRPF:29 to:[0:0]]
2714  *     [0] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
2715  * 7.0.0.1/32
2716  *   unicast-ip4-chain
2717  *   [@0]: dpo-load-balance: [index:31 buckets:4 uRPF:30 to:[0:0]]
2718  *     [0] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
2719  *     [1] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
2720  *     [2] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
2721  *     [3] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
2722  * 240.0.0.0/8
2723  *   unicast-ip4-chain
2724  *   [@0]: dpo-load-balance: [index:2 buckets:1 uRPF:2 to:[0:0]]
2725  *     [0] [@0]: dpo-drop ip6
2726  * 255.255.255.255/32
2727  *   unicast-ip4-chain
2728  *   [@0]: dpo-load-balance: [index:4 buckets:1 uRPF:4 to:[0:0]]
2729  *     [0] [@0]: dpo-drop ip6
2730  * ipv4-VRF:7, fib_index 1, flow hash: dst sport dport proto
2731  * 0.0.0.0/0
2732  *   unicast-ip4-chain
2733  *   [@0]: dpo-load-balance: [index:12 buckets:1 uRPF:11 to:[0:0]]
2734  *     [0] [@0]: dpo-drop ip6
2735  * 0.0.0.0/32
2736  *   unicast-ip4-chain
2737  *   [@0]: dpo-load-balance: [index:13 buckets:1 uRPF:12 to:[0:0]]
2738  *     [0] [@0]: dpo-drop ip6
2739  * 172.16.1.0/24
2740  *   unicast-ip4-chain
2741  *   [@0]: dpo-load-balance: [index:17 buckets:1 uRPF:16 to:[0:0]]
2742  *     [0] [@4]: ipv4-glean: af_packet0
2743  * 172.16.1.1/32
2744  *   unicast-ip4-chain
2745  *   [@0]: dpo-load-balance: [index:18 buckets:1 uRPF:17 to:[1:84]]
2746  *     [0] [@2]: dpo-receive: 172.16.1.1 on af_packet0
2747  * 172.16.1.2/32
2748  *   unicast-ip4-chain
2749  *   [@0]: dpo-load-balance: [index:21 buckets:1 uRPF:20 to:[0:0]]
2750  *     [0] [@5]: ipv4 via 172.16.1.2 af_packet0: IP4: 02:fe:9e:70:7a:2b -> 26:a5:f6:9c:3a:36
2751  * 172.16.2.0/24
2752  *   unicast-ip4-chain
2753  *   [@0]: dpo-load-balance: [index:19 buckets:1 uRPF:18 to:[0:0]]
2754  *     [0] [@4]: ipv4-glean: af_packet1
2755  * 172.16.2.1/32
2756  *   unicast-ip4-chain
2757  *   [@0]: dpo-load-balance: [index:20 buckets:1 uRPF:19 to:[0:0]]
2758  *     [0] [@2]: dpo-receive: 172.16.2.1 on af_packet1
2759  * 224.0.0.0/8
2760  *   unicast-ip4-chain
2761  *   [@0]: dpo-load-balance: [index:15 buckets:1 uRPF:14 to:[0:0]]
2762  *     [0] [@0]: dpo-drop ip6
2763  * 240.0.0.0/8
2764  *   unicast-ip4-chain
2765  *   [@0]: dpo-load-balance: [index:14 buckets:1 uRPF:13 to:[0:0]]
2766  *     [0] [@0]: dpo-drop ip6
2767  * 255.255.255.255/32
2768  *   unicast-ip4-chain
2769  *   [@0]: dpo-load-balance: [index:16 buckets:1 uRPF:15 to:[0:0]]
2770  *     [0] [@0]: dpo-drop ip6
2771  * @cliexend
2772 ?*/
2773 /* *INDENT-OFF* */
2774 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) =
2775 {
2776   .path = "set ip flow-hash",
2777   .short_help =
2778   "set ip flow-hash table <table-id> [src] [dst] [sport] [dport] [proto] [reverse]",
2779   .function = set_ip_flow_hash_command_fn,
2780 };
2781 /* *INDENT-ON* */
2782
2783 #ifndef CLIB_MARCH_VARIANT
2784 int
2785 vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
2786                              u32 table_index)
2787 {
2788   vnet_main_t *vnm = vnet_get_main ();
2789   vnet_interface_main_t *im = &vnm->interface_main;
2790   ip4_main_t *ipm = &ip4_main;
2791   ip_lookup_main_t *lm = &ipm->lookup_main;
2792   vnet_classify_main_t *cm = &vnet_classify_main;
2793   ip4_address_t *if_addr;
2794
2795   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
2796     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
2797
2798   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
2799     return VNET_API_ERROR_NO_SUCH_ENTRY;
2800
2801   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
2802   lm->classify_table_index_by_sw_if_index[sw_if_index] = table_index;
2803
2804   if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);
2805
2806   if (NULL != if_addr)
2807     {
2808       fib_prefix_t pfx = {
2809         .fp_len = 32,
2810         .fp_proto = FIB_PROTOCOL_IP4,
2811         .fp_addr.ip4 = *if_addr,
2812       };
2813       u32 fib_index;
2814
2815       fib_index = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4,
2816                                                        sw_if_index);
2817
2818
2819       if (table_index != (u32) ~ 0)
2820         {
2821           dpo_id_t dpo = DPO_INVALID;
2822
2823           dpo_set (&dpo,
2824                    DPO_CLASSIFY,
2825                    DPO_PROTO_IP4,
2826                    classify_dpo_create (DPO_PROTO_IP4, table_index));
2827
2828           fib_table_entry_special_dpo_add (fib_index,
2829                                            &pfx,
2830                                            FIB_SOURCE_CLASSIFY,
2831                                            FIB_ENTRY_FLAG_NONE, &dpo);
2832           dpo_reset (&dpo);
2833         }
2834       else
2835         {
2836           fib_table_entry_special_remove (fib_index,
2837                                           &pfx, FIB_SOURCE_CLASSIFY);
2838         }
2839     }
2840
2841   return 0;
2842 }
2843 #endif
2844
2845 static clib_error_t *
2846 set_ip_classify_command_fn (vlib_main_t * vm,
2847                             unformat_input_t * input,
2848                             vlib_cli_command_t * cmd)
2849 {
2850   u32 table_index = ~0;
2851   int table_index_set = 0;
2852   u32 sw_if_index = ~0;
2853   int rv;
2854
2855   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
2856     {
2857       if (unformat (input, "table-index %d", &table_index))
2858         table_index_set = 1;
2859       else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
2860                          vnet_get_main (), &sw_if_index))
2861         ;
2862       else
2863         break;
2864     }
2865
2866   if (table_index_set == 0)
2867     return clib_error_return (0, "classify table-index must be specified");
2868
2869   if (sw_if_index == ~0)
2870     return clib_error_return (0, "interface / subif must be specified");
2871
2872   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
2873
2874   switch (rv)
2875     {
2876     case 0:
2877       break;
2878
2879     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
2880       return clib_error_return (0, "No such interface");
2881
2882     case VNET_API_ERROR_NO_SUCH_ENTRY:
2883       return clib_error_return (0, "No such classifier table");
2884     }
2885   return 0;
2886 }
2887
2888 /*?
2889  * Assign a classification table to an interface. The classification
2890  * table is created using the '<em>classify table</em>' and '<em>classify session</em>'
2891  * commands. Once the table is created, use this command to filter packets
2892  * on an interface.
2893  *
2894  * @cliexpar
2895  * Example of how to assign a classification table to an interface:
2896  * @cliexcmd{set ip classify intfc GigabitEthernet2/0/0 table-index 1}
2897 ?*/
2898 /* *INDENT-OFF* */
2899 VLIB_CLI_COMMAND (set_ip_classify_command, static) =
2900 {
2901     .path = "set ip classify",
2902     .short_help =
2903     "set ip classify intfc <interface> table-index <classify-idx>",
2904     .function = set_ip_classify_command_fn,
2905 };
2906 /* *INDENT-ON* */
2907
2908 /*
2909  * fd.io coding-style-patch-verification: ON
2910  *
2911  * Local Variables:
2912  * eval: (c-set-style "gnu")
2913  * End:
2914  */