ip: Fix crash in ip address add on sub-int without exact-match
[vpp.git] / src / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ip/ip_frag.h>
43 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
44 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
45 #include <vnet/ppp/ppp.h>
46 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
47 #include <vnet/api_errno.h>     /* for API error numbers */
48 #include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
49 #include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
50 #include <vnet/fib/fib_urpf_list.h>     /* for FIB uRPF check */
51 #include <vnet/fib/ip4_fib.h>
52 #include <vnet/mfib/ip4_mfib.h>
53 #include <vnet/dpo/load_balance.h>
54 #include <vnet/dpo/load_balance_map.h>
55 #include <vnet/dpo/classify_dpo.h>
56 #include <vnet/mfib/mfib_table.h>       /* for mFIB table and entry creation */
57 #include <vnet/adj/adj_dp.h>
58 #include <vnet/pg/pg.h>
59
60 #include <vnet/ip/ip4_forward.h>
61 #include <vnet/interface_output.h>
62 #include <vnet/classify/vnet_classify.h>
63
64 /** @brief IPv4 lookup node.
65     @node ip4-lookup
66
67     This is the main IPv4 lookup dispatch node.
68
69     @param vm vlib_main_t corresponding to the current thread
70     @param node vlib_node_runtime_t
71     @param frame vlib_frame_t whose contents should be dispatched
72
73     @par Graph mechanics: buffer metadata, next index usage
74
75     @em Uses:
76     - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
77         - Indicates the @c sw_if_index value of the interface that the
78           packet was received on.
79     - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
80         - When the value is @c ~0 then the node performs a longest prefix
81           match (LPM) for the packet destination address in the FIB attached
82           to the receive interface.
83         - Otherwise perform LPM for the packet destination address in the
84           indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
85           value (0, 1, ...) and not a VRF id.
86
87     @em Sets:
88     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
89         - The lookup result adjacency index.
90
91     <em>Next Index:</em>
92     - Dispatches the packet to the node index found in
93       ip_adjacency_t @c adj->lookup_next_index
94       (where @c adj is the lookup result adjacency).
95 */
96 VLIB_NODE_FN (ip4_lookup_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
97                                 vlib_frame_t * frame)
98 {
99   return ip4_lookup_inline (vm, node, frame);
100 }
101
102 static u8 *format_ip4_lookup_trace (u8 * s, va_list * args);
103
104 /* *INDENT-OFF* */
105 VLIB_REGISTER_NODE (ip4_lookup_node) =
106 {
107   .name = "ip4-lookup",
108   .vector_size = sizeof (u32),
109   .format_trace = format_ip4_lookup_trace,
110   .n_next_nodes = IP_LOOKUP_N_NEXT,
111   .next_nodes = IP4_LOOKUP_NEXT_NODES,
112 };
113 /* *INDENT-ON* */
114
115 VLIB_NODE_FN (ip4_load_balance_node) (vlib_main_t * vm,
116                                       vlib_node_runtime_t * node,
117                                       vlib_frame_t * frame)
118 {
119   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
120   u32 n_left, *from;
121   u32 thread_index = vm->thread_index;
122   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
123   u16 nexts[VLIB_FRAME_SIZE], *next;
124
125   from = vlib_frame_vector_args (frame);
126   n_left = frame->n_vectors;
127   next = nexts;
128
129   vlib_get_buffers (vm, from, bufs, n_left);
130
131   while (n_left >= 4)
132     {
133       const load_balance_t *lb0, *lb1;
134       const ip4_header_t *ip0, *ip1;
135       u32 lbi0, hc0, lbi1, hc1;
136       const dpo_id_t *dpo0, *dpo1;
137
138       /* Prefetch next iteration. */
139       {
140         vlib_prefetch_buffer_header (b[2], LOAD);
141         vlib_prefetch_buffer_header (b[3], LOAD);
142
143         CLIB_PREFETCH (b[2]->data, sizeof (ip0[0]), LOAD);
144         CLIB_PREFETCH (b[3]->data, sizeof (ip0[0]), LOAD);
145       }
146
147       ip0 = vlib_buffer_get_current (b[0]);
148       ip1 = vlib_buffer_get_current (b[1]);
149       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
150       lbi1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
151
152       lb0 = load_balance_get (lbi0);
153       lb1 = load_balance_get (lbi1);
154
155       /*
156        * this node is for via FIBs we can re-use the hash value from the
157        * to node if present.
158        * We don't want to use the same hash value at each level in the recursion
159        * graph as that would lead to polarisation
160        */
161       hc0 = hc1 = 0;
162
163       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
164         {
165           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
166             {
167               hc0 = vnet_buffer (b[0])->ip.flow_hash =
168                 vnet_buffer (b[0])->ip.flow_hash >> 1;
169             }
170           else
171             {
172               hc0 = vnet_buffer (b[0])->ip.flow_hash =
173                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
174             }
175           dpo0 = load_balance_get_fwd_bucket
176             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
177         }
178       else
179         {
180           dpo0 = load_balance_get_bucket_i (lb0, 0);
181         }
182       if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
183         {
184           if (PREDICT_TRUE (vnet_buffer (b[1])->ip.flow_hash))
185             {
186               hc1 = vnet_buffer (b[1])->ip.flow_hash =
187                 vnet_buffer (b[1])->ip.flow_hash >> 1;
188             }
189           else
190             {
191               hc1 = vnet_buffer (b[1])->ip.flow_hash =
192                 ip4_compute_flow_hash (ip1, lb1->lb_hash_config);
193             }
194           dpo1 = load_balance_get_fwd_bucket
195             (lb1, (hc1 & (lb1->lb_n_buckets_minus_1)));
196         }
197       else
198         {
199           dpo1 = load_balance_get_bucket_i (lb1, 0);
200         }
201
202       next[0] = dpo0->dpoi_next_node;
203       next[1] = dpo1->dpoi_next_node;
204
205       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
206       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
207
208       vlib_increment_combined_counter
209         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
210       vlib_increment_combined_counter
211         (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b[1]));
212
213       b += 2;
214       next += 2;
215       n_left -= 2;
216     }
217
218   while (n_left > 0)
219     {
220       const load_balance_t *lb0;
221       const ip4_header_t *ip0;
222       const dpo_id_t *dpo0;
223       u32 lbi0, hc0;
224
225       ip0 = vlib_buffer_get_current (b[0]);
226       lbi0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
227
228       lb0 = load_balance_get (lbi0);
229
230       hc0 = 0;
231       if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
232         {
233           if (PREDICT_TRUE (vnet_buffer (b[0])->ip.flow_hash))
234             {
235               hc0 = vnet_buffer (b[0])->ip.flow_hash =
236                 vnet_buffer (b[0])->ip.flow_hash >> 1;
237             }
238           else
239             {
240               hc0 = vnet_buffer (b[0])->ip.flow_hash =
241                 ip4_compute_flow_hash (ip0, lb0->lb_hash_config);
242             }
243           dpo0 = load_balance_get_fwd_bucket
244             (lb0, (hc0 & (lb0->lb_n_buckets_minus_1)));
245         }
246       else
247         {
248           dpo0 = load_balance_get_bucket_i (lb0, 0);
249         }
250
251       next[0] = dpo0->dpoi_next_node;
252       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
253
254       vlib_increment_combined_counter
255         (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b[0]));
256
257       b += 1;
258       next += 1;
259       n_left -= 1;
260     }
261
262   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
263   if (node->flags & VLIB_NODE_FLAG_TRACE)
264     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
265
266   return frame->n_vectors;
267 }
268
269 /* *INDENT-OFF* */
270 VLIB_REGISTER_NODE (ip4_load_balance_node) =
271 {
272   .name = "ip4-load-balance",
273   .vector_size = sizeof (u32),
274   .sibling_of = "ip4-lookup",
275   .format_trace = format_ip4_lookup_trace,
276 };
277 /* *INDENT-ON* */
278
279 #ifndef CLIB_MARCH_VARIANT
280 /* get first interface address */
281 ip4_address_t *
282 ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
283                              ip_interface_address_t ** result_ia)
284 {
285   ip_lookup_main_t *lm = &im->lookup_main;
286   ip_interface_address_t *ia = 0;
287   ip4_address_t *result = 0;
288
289   /* *INDENT-OFF* */
290   foreach_ip_interface_address
291     (lm, ia, sw_if_index,
292      1 /* honor unnumbered */ ,
293      ({
294        ip4_address_t * a =
295          ip_interface_address_get_address (lm, ia);
296        result = a;
297        break;
298      }));
299   /* *INDENT-OFF* */
300   if (result_ia)
301     *result_ia = result ? ia : 0;
302   return result;
303 }
304 #endif
305
306 static void
307 ip4_add_subnet_bcast_route (u32 fib_index,
308                             fib_prefix_t *pfx,
309                             u32 sw_if_index)
310 {
311   vnet_sw_interface_flags_t iflags;
312
313   iflags = vnet_sw_interface_get_flags(vnet_get_main(), sw_if_index);
314
315   fib_table_entry_special_remove(fib_index,
316                                  pfx,
317                                  FIB_SOURCE_INTERFACE);
318
319   if (iflags & VNET_SW_INTERFACE_FLAG_DIRECTED_BCAST)
320     {
321       fib_table_entry_update_one_path (fib_index, pfx,
322                                        FIB_SOURCE_INTERFACE,
323                                        FIB_ENTRY_FLAG_NONE,
324                                        DPO_PROTO_IP4,
325                                        /* No next-hop address */
326                                        &ADJ_BCAST_ADDR,
327                                        sw_if_index,
328                                        // invalid FIB index
329                                        ~0,
330                                        1,
331                                        // no out-label stack
332                                        NULL,
333                                        FIB_ROUTE_PATH_FLAG_NONE);
334     }
335   else
336     {
337         fib_table_entry_special_add(fib_index,
338                                     pfx,
339                                     FIB_SOURCE_INTERFACE,
340                                     (FIB_ENTRY_FLAG_DROP |
341                                      FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
342     }
343 }
344
345 static void
346 ip4_add_interface_prefix_routes (ip4_main_t *im,
347                                  u32 sw_if_index,
348                                  u32 fib_index,
349                                  ip_interface_address_t * a)
350 {
351   ip_lookup_main_t *lm = &im->lookup_main;
352   ip_interface_prefix_t *if_prefix;
353   ip4_address_t *address = ip_interface_address_get_address (lm, a);
354
355   ip_interface_prefix_key_t key = {
356     .prefix = {
357       .fp_len = a->address_length,
358       .fp_proto = FIB_PROTOCOL_IP4,
359       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[a->address_length],
360     },
361     .sw_if_index = sw_if_index,
362   };
363
364   fib_prefix_t pfx_special = {
365     .fp_proto = FIB_PROTOCOL_IP4,
366   };
367
368   /* If prefix already set on interface, just increment ref count & return */
369   if_prefix = ip_get_interface_prefix (lm, &key);
370   if (if_prefix)
371     {
372       if_prefix->ref_count += 1;
373       return;
374     }
375
376   /* New prefix - allocate a pool entry, initialize it, add to the hash */
377   pool_get (lm->if_prefix_pool, if_prefix);
378   if_prefix->ref_count = 1;
379   if_prefix->src_ia_index = a - lm->if_address_pool;
380   clib_memcpy (&if_prefix->key, &key, sizeof (key));
381   mhash_set (&lm->prefix_to_if_prefix_index, &key,
382              if_prefix - lm->if_prefix_pool, 0 /* old value */);
383
384   pfx_special.fp_len = a->address_length;
385   pfx_special.fp_addr.ip4.as_u32 = address->as_u32;
386
387   /* set the glean route for the prefix */
388   fib_table_entry_update_one_path (fib_index, &pfx_special,
389                                    FIB_SOURCE_INTERFACE,
390                                    (FIB_ENTRY_FLAG_CONNECTED |
391                                     FIB_ENTRY_FLAG_ATTACHED),
392                                    DPO_PROTO_IP4,
393                                    /* No next-hop address */
394                                    NULL,
395                                    sw_if_index,
396                                    /* invalid FIB index */
397                                    ~0,
398                                    1,
399                                    /* no out-label stack */
400                                    NULL,
401                                    FIB_ROUTE_PATH_FLAG_NONE);
402
403   /* length <= 30 - add glean, drop first address, maybe drop bcast address */
404   if (a->address_length <= 30)
405     {
406       /* set a drop route for the base address of the prefix */
407       pfx_special.fp_len = 32;
408       pfx_special.fp_addr.ip4.as_u32 =
409         address->as_u32 & im->fib_masks[a->address_length];
410
411       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
412         fib_table_entry_special_add (fib_index, &pfx_special,
413                                      FIB_SOURCE_INTERFACE,
414                                      (FIB_ENTRY_FLAG_DROP |
415                                       FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT));
416
417       /* set a route for the broadcast address of the prefix */
418       pfx_special.fp_len = 32;
419       pfx_special.fp_addr.ip4.as_u32 =
420         address->as_u32 | ~im->fib_masks[a->address_length];
421       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
422         ip4_add_subnet_bcast_route (fib_index, &pfx_special, sw_if_index);
423
424
425     }
426   /* length == 31 - add an attached route for the other address */
427   else if (a->address_length == 31)
428     {
429       pfx_special.fp_len = 32;
430       pfx_special.fp_addr.ip4.as_u32 =
431         address->as_u32 ^ clib_host_to_net_u32(1);
432
433       fib_table_entry_update_one_path (fib_index, &pfx_special,
434                                        FIB_SOURCE_INTERFACE,
435                                        (FIB_ENTRY_FLAG_ATTACHED),
436                                        DPO_PROTO_IP4,
437                                        &pfx_special.fp_addr,
438                                        sw_if_index,
439                                        /* invalid FIB index */
440                                        ~0,
441                                        1,
442                                        NULL,
443                                        FIB_ROUTE_PATH_FLAG_NONE);
444     }
445 }
446
447 static void
448 ip4_add_interface_routes (u32 sw_if_index,
449                           ip4_main_t * im, u32 fib_index,
450                           ip_interface_address_t * a)
451 {
452   ip_lookup_main_t *lm = &im->lookup_main;
453   ip4_address_t *address = ip_interface_address_get_address (lm, a);
454   fib_prefix_t pfx = {
455     .fp_len = 32,
456     .fp_proto = FIB_PROTOCOL_IP4,
457     .fp_addr.ip4 = *address,
458   };
459
460   /* set special routes for the prefix if needed */
461   ip4_add_interface_prefix_routes (im, sw_if_index, fib_index, a);
462
463   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
464     {
465       u32 classify_table_index =
466         lm->classify_table_index_by_sw_if_index[sw_if_index];
467       if (classify_table_index != (u32) ~ 0)
468         {
469           dpo_id_t dpo = DPO_INVALID;
470
471           dpo_set (&dpo,
472                    DPO_CLASSIFY,
473                    DPO_PROTO_IP4,
474                    classify_dpo_create (DPO_PROTO_IP4, classify_table_index));
475
476           fib_table_entry_special_dpo_add (fib_index,
477                                            &pfx,
478                                            FIB_SOURCE_CLASSIFY,
479                                            FIB_ENTRY_FLAG_NONE, &dpo);
480           dpo_reset (&dpo);
481         }
482     }
483
484   fib_table_entry_update_one_path (fib_index, &pfx,
485                                    FIB_SOURCE_INTERFACE,
486                                    (FIB_ENTRY_FLAG_CONNECTED |
487                                     FIB_ENTRY_FLAG_LOCAL),
488                                    DPO_PROTO_IP4,
489                                    &pfx.fp_addr,
490                                    sw_if_index,
491                                    // invalid FIB index
492                                    ~0,
493                                    1, NULL,
494                                    FIB_ROUTE_PATH_FLAG_NONE);
495 }
496
497 static void
498 ip4_del_interface_prefix_routes (ip4_main_t * im,
499                                  u32 sw_if_index,
500                                  u32 fib_index,
501                                  ip4_address_t * address,
502                                  u32 address_length)
503 {
504   ip_lookup_main_t *lm = &im->lookup_main;
505   ip_interface_prefix_t *if_prefix;
506
507   ip_interface_prefix_key_t key = {
508     .prefix = {
509       .fp_len = address_length,
510       .fp_proto = FIB_PROTOCOL_IP4,
511       .fp_addr.ip4.as_u32 = address->as_u32 & im->fib_masks[address_length],
512     },
513     .sw_if_index = sw_if_index,
514   };
515
516   fib_prefix_t pfx_special = {
517     .fp_len = 32,
518     .fp_proto = FIB_PROTOCOL_IP4,
519   };
520
521   if_prefix = ip_get_interface_prefix (lm, &key);
522   if (!if_prefix)
523     {
524       clib_warning ("Prefix not found while deleting %U",
525                     format_ip4_address_and_length, address, address_length);
526       return;
527     }
528
529   if_prefix->ref_count -= 1;
530
531   /*
532    * Routes need to be adjusted if deleting last intf addr in prefix
533    *
534    * We're done now otherwise
535    */
536   if (if_prefix->ref_count > 0)
537     return;
538
539   /* length <= 30, delete glean route, first address, last address */
540   if (address_length <= 30)
541     {
542       /* Less work to do in FIB if we remove the covered /32s first */
543
544       /* first address in prefix */
545       pfx_special.fp_addr.ip4.as_u32 =
546         address->as_u32 & im->fib_masks[address_length];
547       pfx_special.fp_len = 32;
548
549       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
550         fib_table_entry_special_remove (fib_index,
551                                         &pfx_special,
552                                         FIB_SOURCE_INTERFACE);
553
554       /* prefix broadcast address */
555       pfx_special.fp_addr.ip4.as_u32 =
556         address->as_u32 | ~im->fib_masks[address_length];
557       pfx_special.fp_len = 32;
558
559       if (pfx_special.fp_addr.ip4.as_u32 != address->as_u32)
560         fib_table_entry_special_remove (fib_index,
561                                         &pfx_special,
562                                         FIB_SOURCE_INTERFACE);
563     }
564   else if (address_length == 31)
565     {
566       /* length == 31, delete attached route for the other address */
567       pfx_special.fp_addr.ip4.as_u32 =
568         address->as_u32 ^ clib_host_to_net_u32(1);
569
570       fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
571     }
572
573   /* remove glean route for prefix */
574   pfx_special.fp_addr.ip4 = *address;
575   pfx_special.fp_len = address_length;
576   fib_table_entry_delete (fib_index, &pfx_special, FIB_SOURCE_INTERFACE);
577
578   mhash_unset (&lm->prefix_to_if_prefix_index, &key, 0 /* old_value */);
579   pool_put (lm->if_prefix_pool, if_prefix);
580 }
581
582 static void
583 ip4_del_interface_routes (u32 sw_if_index,
584                           ip4_main_t * im,
585                           u32 fib_index,
586                           ip4_address_t * address, u32 address_length)
587 {
588   fib_prefix_t pfx = {
589     .fp_len = 32,
590     .fp_proto = FIB_PROTOCOL_IP4,
591     .fp_addr.ip4 = *address,
592   };
593
594   fib_table_entry_delete (fib_index, &pfx, FIB_SOURCE_INTERFACE);
595
596   ip4_del_interface_prefix_routes (im, sw_if_index, fib_index,
597                                    address, address_length);
598 }
599
600 #ifndef CLIB_MARCH_VARIANT
601 void
602 ip4_sw_interface_enable_disable (u32 sw_if_index, u32 is_enable)
603 {
604   ip4_main_t *im = &ip4_main;
605   vnet_main_t *vnm = vnet_get_main ();
606   vnet_hw_interface_t *hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
607
608   vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
609
610   /*
611    * enable/disable only on the 1<->0 transition
612    */
613   if (is_enable)
614     {
615       if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
616         return;
617     }
618   else
619     {
620       ASSERT (im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
621       if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
622         return;
623     }
624   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
625                                !is_enable, 0, 0);
626
627
628   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
629                                sw_if_index, !is_enable, 0, 0);
630
631   if (is_enable)
632     hi->l3_if_count++;
633   else if (hi->l3_if_count)
634     hi->l3_if_count--;
635
636   {
637     ip4_enable_disable_interface_callback_t *cb;
638     vec_foreach (cb, im->enable_disable_interface_callbacks)
639       cb->function (im, cb->function_opaque, sw_if_index, is_enable);
640   }
641 }
642
643 static clib_error_t *
644 ip4_add_del_interface_address_internal (vlib_main_t * vm,
645                                         u32 sw_if_index,
646                                         ip4_address_t * address,
647                                         u32 address_length, u32 is_del)
648 {
649   vnet_main_t *vnm = vnet_get_main ();
650   ip4_main_t *im = &ip4_main;
651   ip_lookup_main_t *lm = &im->lookup_main;
652   clib_error_t *error = 0;
653   u32 if_address_index;
654   ip4_address_fib_t ip4_af, *addr_fib = 0;
655
656   error = vnet_sw_interface_supports_addressing (vnm, sw_if_index);
657   if (error)
658     return error;
659
660   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
661   ip4_addr_fib_init (&ip4_af, address,
662                      vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
663   vec_add1 (addr_fib, ip4_af);
664
665   /*
666    * there is no support for adj-fib handling in the presence of overlapping
667    * subnets on interfaces. Easy fix - disallow overlapping subnets, like
668    * most routers do.
669    */
670   /* *INDENT-OFF* */
671   if (!is_del)
672     {
673       /* When adding an address check that it does not conflict
674          with an existing address on any interface in this table. */
675       ip_interface_address_t *ia;
676       vnet_sw_interface_t *sif;
677
678       pool_foreach (sif, vnm->interface_main.sw_interfaces)
679        {
680           if (im->fib_index_by_sw_if_index[sw_if_index] ==
681               im->fib_index_by_sw_if_index[sif->sw_if_index])
682             {
683               foreach_ip_interface_address
684                 (&im->lookup_main, ia, sif->sw_if_index,
685                  0 /* honor unnumbered */ ,
686                  ({
687                    ip4_address_t * x =
688                      ip_interface_address_get_address
689                      (&im->lookup_main, ia);
690
691                    if (ip4_destination_matches_route
692                        (im, address, x, ia->address_length) ||
693                        ip4_destination_matches_route (im,
694                                                       x,
695                                                       address,
696                                                       address_length))
697                      {
698                        /* an intf may have >1 addr from the same prefix */
699                        if ((sw_if_index == sif->sw_if_index) &&
700                            (ia->address_length == address_length) &&
701                            (x->as_u32 != address->as_u32))
702                          continue;
703
704                        if (ia->flags & IP_INTERFACE_ADDRESS_FLAG_STALE)
705                          /* if the address we're comparing against is stale
706                           * then the CP has not added this one back yet, maybe
707                           * it never will, so we have to assume it won't and
708                           * ignore it. if it does add it back, then it will fail
709                           * because this one is now present */
710                          continue;
711
712                        /* error if the length or intf was different */
713                        vnm->api_errno = VNET_API_ERROR_ADDRESS_IN_USE;
714
715                        error = clib_error_create
716                          ("failed to add %U on %U which conflicts with %U for interface %U",
717                           format_ip4_address_and_length, address,
718                           address_length,
719                           format_vnet_sw_if_index_name, vnm,
720                           sw_if_index,
721                           format_ip4_address_and_length, x,
722                           ia->address_length,
723                           format_vnet_sw_if_index_name, vnm,
724                           sif->sw_if_index);
725                        goto done;
726                      }
727                  }));
728             }
729       }
730     }
731   /* *INDENT-ON* */
732
733   if_address_index = ip_interface_address_find (lm, addr_fib, address_length);
734
735   if (is_del)
736     {
737       if (~0 == if_address_index)
738         {
739           vnm->api_errno = VNET_API_ERROR_ADDRESS_NOT_FOUND_FOR_INTERFACE;
740           error = clib_error_create ("%U not found for interface %U",
741                                      lm->format_address_and_length,
742                                      addr_fib, address_length,
743                                      format_vnet_sw_if_index_name, vnm,
744                                      sw_if_index);
745           goto done;
746         }
747
748       error = ip_interface_address_del (lm, vnm, if_address_index, addr_fib,
749                                         address_length, sw_if_index);
750       if (error)
751         goto done;
752     }
753   else
754     {
755       if (~0 != if_address_index)
756         {
757           ip_interface_address_t *ia;
758
759           ia = pool_elt_at_index (lm->if_address_pool, if_address_index);
760
761           if (ia->flags & IP_INTERFACE_ADDRESS_FLAG_STALE)
762             {
763               if (ia->sw_if_index == sw_if_index)
764                 {
765                   /* re-adding an address during the replace action.
766                    * consdier this the update. clear the flag and
767                    * we're done */
768                   ia->flags &= ~IP_INTERFACE_ADDRESS_FLAG_STALE;
769                   goto done;
770                 }
771               else
772                 {
773                   /* The prefix is moving from one interface to another.
774                    * delete the stale and add the new */
775                   ip4_add_del_interface_address_internal (vm,
776                                                           ia->sw_if_index,
777                                                           address,
778                                                           address_length, 1);
779                   ia = NULL;
780                   error = ip_interface_address_add (lm, sw_if_index,
781                                                     addr_fib, address_length,
782                                                     &if_address_index);
783                 }
784             }
785           else
786             {
787               vnm->api_errno = VNET_API_ERROR_DUPLICATE_IF_ADDRESS;
788               error = clib_error_create
789                 ("Prefix %U already found on interface %U",
790                  lm->format_address_and_length, addr_fib, address_length,
791                  format_vnet_sw_if_index_name, vnm, ia->sw_if_index);
792             }
793         }
794       else
795         error = ip_interface_address_add (lm, sw_if_index,
796                                           addr_fib, address_length,
797                                           &if_address_index);
798     }
799
800   if (error)
801     goto done;
802
803   ip4_sw_interface_enable_disable (sw_if_index, !is_del);
804   ip4_mfib_interface_enable_disable (sw_if_index, !is_del);
805
806   /* intf addr routes are added/deleted on admin up/down */
807   if (vnet_sw_interface_is_admin_up (vnm, sw_if_index))
808     {
809       if (is_del)
810         ip4_del_interface_routes (sw_if_index,
811                                   im, ip4_af.fib_index, address,
812                                   address_length);
813       else
814         ip4_add_interface_routes (sw_if_index,
815                                   im, ip4_af.fib_index,
816                                   pool_elt_at_index
817                                   (lm->if_address_pool, if_address_index));
818     }
819
820   ip4_add_del_interface_address_callback_t *cb;
821   vec_foreach (cb, im->add_del_interface_address_callbacks)
822     cb->function (im, cb->function_opaque, sw_if_index,
823                   address, address_length, if_address_index, is_del);
824
825 done:
826   vec_free (addr_fib);
827   return error;
828 }
829
830 clib_error_t *
831 ip4_add_del_interface_address (vlib_main_t * vm,
832                                u32 sw_if_index,
833                                ip4_address_t * address,
834                                u32 address_length, u32 is_del)
835 {
836   return ip4_add_del_interface_address_internal
837     (vm, sw_if_index, address, address_length, is_del);
838 }
839
840 void
841 ip4_directed_broadcast (u32 sw_if_index, u8 enable)
842 {
843   ip_interface_address_t *ia;
844   ip4_main_t *im;
845
846   im = &ip4_main;
847
848   /*
849    * when directed broadcast is enabled, the subnet braodcast route will forward
850    * packets using an adjacency with a broadcast MAC. otherwise it drops
851    */
852   /* *INDENT-OFF* */
853   foreach_ip_interface_address(&im->lookup_main, ia,
854                                sw_if_index, 0,
855      ({
856        if (ia->address_length <= 30)
857          {
858            ip4_address_t *ipa;
859
860            ipa = ip_interface_address_get_address (&im->lookup_main, ia);
861
862            fib_prefix_t pfx = {
863              .fp_len = 32,
864              .fp_proto = FIB_PROTOCOL_IP4,
865              .fp_addr = {
866                .ip4.as_u32 = (ipa->as_u32 | ~im->fib_masks[ia->address_length]),
867              },
868            };
869
870            ip4_add_subnet_bcast_route
871              (fib_table_get_index_for_sw_if_index(FIB_PROTOCOL_IP4,
872                                                   sw_if_index),
873               &pfx, sw_if_index);
874          }
875      }));
876   /* *INDENT-ON* */
877 }
878 #endif
879
880 static clib_error_t *
881 ip4_sw_interface_admin_up_down (vnet_main_t * vnm, u32 sw_if_index, u32 flags)
882 {
883   ip4_main_t *im = &ip4_main;
884   ip_interface_address_t *ia;
885   ip4_address_t *a;
886   u32 is_admin_up, fib_index;
887
888   /* Fill in lookup tables with default table (0). */
889   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
890
891   vec_validate_init_empty (im->
892                            lookup_main.if_address_pool_index_by_sw_if_index,
893                            sw_if_index, ~0);
894
895   is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
896
897   fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
898
899   /* *INDENT-OFF* */
900   foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
901                                 0 /* honor unnumbered */,
902   ({
903     a = ip_interface_address_get_address (&im->lookup_main, ia);
904     if (is_admin_up)
905       ip4_add_interface_routes (sw_if_index,
906                                 im, fib_index,
907                                 ia);
908     else
909       ip4_del_interface_routes (sw_if_index,
910                                 im, fib_index,
911                                 a, ia->address_length);
912   }));
913   /* *INDENT-ON* */
914
915   return 0;
916 }
917
918 VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down);
919
920 /* Built-in ip4 unicast rx feature path definition */
/*
 * The "ip4-unicast" arc is declared with start nodes "ip4-input" and
 * "ip4-input-no-checksum" and with "ip4-lookup" as its last node; its arc
 * index is stored through ip4_main.lookup_main.ucast_feature_arc_index.
 *
 * The runs_before entries below give this relative ordering:
 *   ip4-flow-classify -> ip4-inacl -> ip4-policer-classify
 *     -> ipsec4-input-feature -> vpath-input-ip4 -> ip4-vxlan-bypass
 *     -> ip4-lookup
 * with ip4-source-and-port-range-check-rx also placed before
 * ip4-policer-classify, and ip4-not-enabled placed before ip4-lookup.
 */
921 /* *INDENT-OFF* */
922 VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
923 {
924   .arc_name = "ip4-unicast",
925   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
926   .last_in_arc = "ip4-lookup",
927   .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
928 };
929
/* Flow classification: ordered ahead of the input ACL. */
930 VNET_FEATURE_INIT (ip4_flow_classify, static) =
931 {
932   .arc_name = "ip4-unicast",
933   .node_name = "ip4-flow-classify",
934   .runs_before = VNET_FEATURES ("ip4-inacl"),
935 };
936
/* Input ACL: ordered ahead of policer classification. */
937 VNET_FEATURE_INIT (ip4_inacl, static) =
938 {
939   .arc_name = "ip4-unicast",
940   .node_name = "ip4-inacl",
941   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
942 };
943
/* RX source/port-range check: ordered ahead of policer classification. */
944 VNET_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) =
945 {
946   .arc_name = "ip4-unicast",
947   .node_name = "ip4-source-and-port-range-check-rx",
948   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
949 };
950
/* Policer classification: ordered ahead of the IPsec input feature. */
951 VNET_FEATURE_INIT (ip4_policer_classify, static) =
952 {
953   .arc_name = "ip4-unicast",
954   .node_name = "ip4-policer-classify",
955   .runs_before = VNET_FEATURES ("ipsec4-input-feature"),
956 };
957
/* IPsec input: ordered ahead of vpath. */
958 VNET_FEATURE_INIT (ip4_ipsec, static) =
959 {
960   .arc_name = "ip4-unicast",
961   .node_name = "ipsec4-input-feature",
962   .runs_before = VNET_FEATURES ("vpath-input-ip4"),
963 };
964
/* vpath input: ordered ahead of the VXLAN bypass. */
965 VNET_FEATURE_INIT (ip4_vpath, static) =
966 {
967   .arc_name = "ip4-unicast",
968   .node_name = "vpath-input-ip4",
969   .runs_before = VNET_FEATURES ("ip4-vxlan-bypass"),
970 };
971
/* VXLAN bypass: ordered ahead of the final lookup. */
972 VNET_FEATURE_INIT (ip4_vxlan_bypass, static) =
973 {
974   .arc_name = "ip4-unicast",
975   .node_name = "ip4-vxlan-bypass",
976   .runs_before = VNET_FEATURES ("ip4-lookup"),
977 };
978
/*
 * "ip4-not-enabled": toggled per interface by ip4_sw_interface_add_del()
 * in this file (enabled on interface add, disabled on delete).
 */
979 VNET_FEATURE_INIT (ip4_not_enabled, static) =
980 {
981   .arc_name = "ip4-unicast",
982   .node_name = "ip4-not-enabled",
983   .runs_before = VNET_FEATURES ("ip4-lookup"),
984 };
985
/* Terminal feature of the arc (matches .last_in_arc above). */
986 VNET_FEATURE_INIT (ip4_lookup, static) =
987 {
988   .arc_name = "ip4-unicast",
989   .node_name = "ip4-lookup",
990   .runs_before = 0,     /* not before any other features */
991 };
992
993 /* Built-in ip4 multicast rx feature path definition */
/*
 * The "ip4-multicast" arc is declared with the same start nodes as the
 * unicast arc ("ip4-input", "ip4-input-no-checksum") and with
 * "ip4-mfib-forward-lookup" as its last node; its arc index is stored
 * through ip4_main.lookup_main.mcast_feature_arc_index.
 */
994 VNET_FEATURE_ARC_INIT (ip4_multicast, static) =
995 {
996   .arc_name = "ip4-multicast",
997   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
998   .last_in_arc = "ip4-mfib-forward-lookup",
999   .arc_index_ptr = &ip4_main.lookup_main.mcast_feature_arc_index,
1000 };
1001
/* vpath input on the multicast arc: ordered ahead of the mfib lookup. */
1002 VNET_FEATURE_INIT (ip4_vpath_mc, static) =
1003 {
1004   .arc_name = "ip4-multicast",
1005   .node_name = "vpath-input-ip4",
1006   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
1007 };
1008
/*
 * "ip4-not-enabled" on the multicast arc: toggled per interface by
 * ip4_sw_interface_add_del() in this file.
 */
1009 VNET_FEATURE_INIT (ip4_mc_not_enabled, static) =
1010 {
1011   .arc_name = "ip4-multicast",
1012   .node_name = "ip4-not-enabled",
1013   .runs_before = VNET_FEATURES ("ip4-mfib-forward-lookup"),
1014 };
1015
/* Terminal feature of the arc (matches .last_in_arc above). */
1016 VNET_FEATURE_INIT (ip4_lookup_mc, static) =
1017 {
1018   .arc_name = "ip4-multicast",
1019   .node_name = "ip4-mfib-forward-lookup",
1020   .runs_before = 0,     /* last feature */
1021 };
1022
1023 /* Source and port-range check ip4 tx feature path definition */
/*
 * The "ip4-output" arc is declared with start nodes "ip4-rewrite",
 * "ip4-midchain" and "ip4-dvr-dpo" and with "interface-output" as its last
 * node; its arc index is stored through
 * ip4_main.lookup_main.output_feature_arc_index.
 *
 * The runs_before entries below give this relative ordering:
 *   ip4-source-and-port-range-check-tx -> ip4-outacl
 *     -> ipsec4-output-feature -> interface-output
 */
1024 VNET_FEATURE_ARC_INIT (ip4_output, static) =
1025 {
1026   .arc_name = "ip4-output",
1027   .start_nodes = VNET_FEATURES ("ip4-rewrite", "ip4-midchain", "ip4-dvr-dpo"),
1028   .last_in_arc = "interface-output",
1029   .arc_index_ptr = &ip4_main.lookup_main.output_feature_arc_index,
1030 };
1031
/* TX source/port-range check: ordered ahead of the output ACL. */
1032 VNET_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) =
1033 {
1034   .arc_name = "ip4-output",
1035   .node_name = "ip4-source-and-port-range-check-tx",
1036   .runs_before = VNET_FEATURES ("ip4-outacl"),
1037 };
1038
/* Output ACL: ordered ahead of the IPsec output feature. */
1039 VNET_FEATURE_INIT (ip4_outacl, static) =
1040 {
1041   .arc_name = "ip4-output",
1042   .node_name = "ip4-outacl",
1043   .runs_before = VNET_FEATURES ("ipsec4-output-feature"),
1044 };
1045
/* IPsec output: ordered ahead of interface-output. */
1046 VNET_FEATURE_INIT (ip4_ipsec_output, static) =
1047 {
1048   .arc_name = "ip4-output",
1049   .node_name = "ipsec4-output-feature",
1050   .runs_before = VNET_FEATURES ("interface-output"),
1051 };
1052
1053 /* Built-in ip4 tx feature path definition */
/* Terminal feature of the arc (matches .last_in_arc above). */
1054 VNET_FEATURE_INIT (ip4_interface_output, static) =
1055 {
1056   .arc_name = "ip4-output",
1057   .node_name = "interface-output",
1058   .runs_before = 0,     /* not before any other features */
1059 };
1060 /* *INDENT-ON* */
1061
1062 static clib_error_t *
1063 ip4_sw_interface_add_del (vnet_main_t * vnm, u32 sw_if_index, u32 is_add)
1064 {
1065   ip4_main_t *im = &ip4_main;
1066
1067   /* Fill in lookup tables with default table (0). */
1068   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
1069   vec_validate (im->mfib_index_by_sw_if_index, sw_if_index);
1070
1071   if (!is_add)
1072     {
1073       ip4_main_t *im4 = &ip4_main;
1074       ip_lookup_main_t *lm4 = &im4->lookup_main;
1075       ip_interface_address_t *ia = 0;
1076       ip4_address_t *address;
1077       vlib_main_t *vm = vlib_get_main ();
1078
1079       vnet_sw_interface_update_unnumbered (sw_if_index, ~0, 0);
1080       /* *INDENT-OFF* */
1081       foreach_ip_interface_address (lm4, ia, sw_if_index, 0,
1082       ({
1083         address = ip_interface_address_get_address (lm4, ia);
1084         ip4_add_del_interface_address(vm, sw_if_index, address, ia->address_length, 1);
1085       }));
1086       /* *INDENT-ON* */
1087       ip4_mfib_interface_enable_disable (sw_if_index, 0);
1088     }
1089
1090   vnet_feature_enable_disable ("ip4-unicast", "ip4-not-enabled", sw_if_index,
1091                                is_add, 0, 0);
1092
1093   vnet_feature_enable_disable ("ip4-multicast", "ip4-not-enabled",
1094                                sw_if_index, is_add, 0, 0);
1095
1096   return /* no error */ 0;
1097 }
1098
1099 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
1100
1101 /* Global IP4 main: the ip4_main_t instance referenced as &ip4_main throughout this file. */
/* The definition is emitted only when CLIB_MARCH_VARIANT is not defined. */
1102 #ifndef CLIB_MARCH_VARIANT
1103 ip4_main_t ip4_main;
1104 #endif /* CLIB_MARCH_VARIANT */
1105
1106 static clib_error_t *
1107 ip4_lookup_init (vlib_main_t * vm)
1108 {
1109   ip4_main_t *im = &ip4_main;
1110   clib_error_t *error;
1111   uword i;
1112
1113   if ((error = vlib_call_init_function (vm, vnet_feature_init)))
1114     return error;
1115   if ((error = vlib_call_init_function (vm, ip4_mtrie_module_init)))
1116     return (error);
1117   if ((error = vlib_call_init_function (vm, fib_module_init)))
1118     return error;
1119   if ((error = vlib_call_init_function (vm, mfib_module_init)))
1120     return error;
1121
1122   for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
1123     {
1124       u32 m;
1125
1126       if (i < 32)
1127         m = pow2_mask (i) << (32 - i);
1128       else
1129         m = ~0;
1130       im->fib_masks[i] = clib_host_to_net_u32 (m);
1131     }
1132
1133   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
1134
1135   /* Create FIB with index 0 and table id of 0. */
1136   fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
1137                                      FIB_SOURCE_DEFAULT_ROUTE);
1138   mfib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, 0,
1139                                       MFIB_SOURCE_DEFAULT_ROUTE);
1140
1141   {
1142     pg_node_t *pn;
1143     pn = pg_get_node (ip4_lookup_node.index);
1144     pn->unformat_edit = unformat_pg_ip4_header;
1145   }
1146
1147   {
1148     ethernet_arp_header_t h;
1149
1150     clib_memset (&h, 0, sizeof (h));
1151
1152 #define _16(f,v) h.f = clib_host_to_net_u16 (v);
1153 #define _8(f,v) h.f = v;
1154     _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
1155     _16 (l3_type, ETHERNET_TYPE_IP4);
1156     _8 (n_l2_address_bytes, 6);
1157     _8 (n_l3_address_bytes, 4);
1158     _16 (opcode, ETHERNET_ARP_OPCODE_request);
1159 #undef _16
1160 #undef _8
1161
1162     vlib_packet_template_init (vm, &im->ip4_arp_request_packet_template,
1163                                /* data */ &h,
1164                                sizeof (h),
1165                                /* alloc chunk size */ 8,
1166                                "ip4 arp");
1167   }
1168
1169   return error;
1170 }
1171
1172 VLIB_INIT_FUNCTION (ip4_lookup_init);
1173
/*
 * Per-packet trace record written by ip4_forward_next_trace() and rendered
 * by the format_ip4_*_trace() functions in this file.
 */
1174 typedef struct
1175 {
1176   /* Adjacency taken: copied from the buffer's ip.adj_index[which_adj_index]. */
1177   u32 dpo_index;
/* Copied from the buffer's ip.flow_hash. */
1178   u32 flow_hash;
/*
 * The buffer's sw_if_index[VLIB_TX] when that is not ~0, otherwise the fib
 * index of the RX interface.  format_ip4_rewrite_trace() prints this field
 * under the label "tx_sw_if_index".
 */
1179   u32 fib_index;
1180
1181   /* Packet data, possibly *after* rewrite. */
/* Sized as 64 bytes minus one u32. */
1182   u8 packet_data[64 - 1 * sizeof (u32)];
1183 }
1184 ip4_forward_next_trace_t;
1185
1186 #ifndef CLIB_MARCH_VARIANT
1187 u8 *
1188 format_ip4_forward_next_trace (u8 * s, va_list * args)
1189 {
1190   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1191   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1192   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1193   u32 indent = format_get_indent (s);
1194   s = format (s, "%U%U",
1195               format_white_space, indent,
1196               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1197   return s;
1198 }
1199 #endif
1200
1201 static u8 *
1202 format_ip4_lookup_trace (u8 * s, va_list * args)
1203 {
1204   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1205   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1206   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1207   u32 indent = format_get_indent (s);
1208
1209   s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
1210               t->fib_index, t->dpo_index, t->flow_hash);
1211   s = format (s, "\n%U%U",
1212               format_white_space, indent,
1213               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1214   return s;
1215 }
1216
1217 static u8 *
1218 format_ip4_rewrite_trace (u8 * s, va_list * args)
1219 {
1220   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1221   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1222   ip4_forward_next_trace_t *t = va_arg (*args, ip4_forward_next_trace_t *);
1223   u32 indent = format_get_indent (s);
1224
1225   s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
1226               t->fib_index, t->dpo_index, format_ip_adjacency,
1227               t->dpo_index, FORMAT_IP_ADJACENCY_NONE, t->flow_hash);
1228   s = format (s, "\n%U%U",
1229               format_white_space, indent,
1230               format_ip_adjacency_packet_data,
1231               t->packet_data, sizeof (t->packet_data));
1232   return s;
1233 }
1234
1235 #ifndef CLIB_MARCH_VARIANT
1236 /* Common trace function for all ip4-forward next nodes. */
1237 void
1238 ip4_forward_next_trace (vlib_main_t * vm,
1239                         vlib_node_runtime_t * node,
1240                         vlib_frame_t * frame, vlib_rx_or_tx_t which_adj_index)
1241 {
1242   u32 *from, n_left;
1243   ip4_main_t *im = &ip4_main;
1244
1245   n_left = frame->n_vectors;
1246   from = vlib_frame_vector_args (frame);
1247
1248   while (n_left >= 4)
1249     {
1250       u32 bi0, bi1;
1251       vlib_buffer_t *b0, *b1;
1252       ip4_forward_next_trace_t *t0, *t1;
1253
1254       /* Prefetch next iteration. */
1255       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1256       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1257
1258       bi0 = from[0];
1259       bi1 = from[1];
1260
1261       b0 = vlib_get_buffer (vm, bi0);
1262       b1 = vlib_get_buffer (vm, bi1);
1263
1264       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1265         {
1266           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1267           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1268           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1269           t0->fib_index =
1270             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1271              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1272             vec_elt (im->fib_index_by_sw_if_index,
1273                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1274
1275           clib_memcpy_fast (t0->packet_data,
1276                             vlib_buffer_get_current (b0),
1277                             sizeof (t0->packet_data));
1278         }
1279       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1280         {
1281           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1282           t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1283           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1284           t1->fib_index =
1285             (vnet_buffer (b1)->sw_if_index[VLIB_TX] !=
1286              (u32) ~ 0) ? vnet_buffer (b1)->sw_if_index[VLIB_TX] :
1287             vec_elt (im->fib_index_by_sw_if_index,
1288                      vnet_buffer (b1)->sw_if_index[VLIB_RX]);
1289           clib_memcpy_fast (t1->packet_data, vlib_buffer_get_current (b1),
1290                             sizeof (t1->packet_data));
1291         }
1292       from += 2;
1293       n_left -= 2;
1294     }
1295
1296   while (n_left >= 1)
1297     {
1298       u32 bi0;
1299       vlib_buffer_t *b0;
1300       ip4_forward_next_trace_t *t0;
1301
1302       bi0 = from[0];
1303
1304       b0 = vlib_get_buffer (vm, bi0);
1305
1306       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1307         {
1308           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1309           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1310           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1311           t0->fib_index =
1312             (vnet_buffer (b0)->sw_if_index[VLIB_TX] !=
1313              (u32) ~ 0) ? vnet_buffer (b0)->sw_if_index[VLIB_TX] :
1314             vec_elt (im->fib_index_by_sw_if_index,
1315                      vnet_buffer (b0)->sw_if_index[VLIB_RX]);
1316           clib_memcpy_fast (t0->packet_data, vlib_buffer_get_current (b0),
1317                             sizeof (t0->packet_data));
1318         }
1319       from += 1;
1320       n_left -= 1;
1321     }
1322 }
1323
1324 /* Compute TCP/UDP/ICMP4 checksum in software. */
1325 u16
1326 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
1327                               ip4_header_t * ip0)
1328 {
1329   ip_csum_t sum0;
1330   u32 ip_header_length, payload_length_host_byte_order;
1331
1332   /* Initialize checksum with ip header. */
1333   ip_header_length = ip4_header_bytes (ip0);
1334   payload_length_host_byte_order =
1335     clib_net_to_host_u16 (ip0->length) - ip_header_length;
1336   sum0 =
1337     clib_host_to_net_u32 (payload_length_host_byte_order +
1338                           (ip0->protocol << 16));
1339
1340   if (BITS (uword) == 32)
1341     {
1342       sum0 =
1343         ip_csum_with_carry (sum0,
1344                             clib_mem_unaligned (&ip0->src_address, u32));
1345       sum0 =
1346         ip_csum_with_carry (sum0,
1347                             clib_mem_unaligned (&ip0->dst_address, u32));
1348     }
1349   else
1350     sum0 =
1351       ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
1352
1353   return ip_calculate_l4_checksum (vm, p0, sum0,
1354                                    payload_length_host_byte_order, (u8 *) ip0,
1355                                    ip_header_length, NULL);
1356 }
1357
1358 u32
1359 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1360 {
1361   ip4_header_t *ip0 = vlib_buffer_get_current (p0);
1362   udp_header_t *udp0;
1363   u16 sum16;
1364
1365   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1366           || ip0->protocol == IP_PROTOCOL_UDP);
1367
1368   udp0 = (void *) (ip0 + 1);
1369   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1370     {
1371       p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1372                     | VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
1373       return p0->flags;
1374     }
1375
1376   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1377
1378   p0->flags |= (VNET_BUFFER_F_L4_CHECKSUM_COMPUTED
1379                 | ((sum16 == 0) << VNET_BUFFER_F_LOG2_L4_CHECKSUM_CORRECT));
1380
1381   return p0->flags;
1382 }
1383 #endif
1384
1385 /* *INDENT-OFF* */
/*
 * The "ip4-local" arc is declared with "ip4-local" as its start node and
 * "ip4-local-end-of-arc" as its last node.  No arc_index_ptr is given;
 * ip4_local_set_next_and_error() in this file reads the index from
 * vnet_feat_arc_ip4_local.feature_arc_index instead.
 */
1386 VNET_FEATURE_ARC_INIT (ip4_local) =
1387 {
1388   .arc_name  = "ip4-local",
1389   .start_nodes = VNET_FEATURES ("ip4-local"),
1390   .last_in_arc = "ip4-local-end-of-arc",
1391 };
1392 /* *INDENT-ON* */
1393
1394 static inline void
1395 ip4_local_l4_csum_validate (vlib_main_t * vm, vlib_buffer_t * p,
1396                             ip4_header_t * ip, u8 is_udp, u8 * error,
1397                             u8 * good_tcp_udp)
1398 {
1399   u32 flags0;
1400   flags0 = ip4_tcp_udp_validate_checksum (vm, p);
1401   *good_tcp_udp = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0;
1402   if (is_udp)
1403     {
1404       udp_header_t *udp;
1405       u32 ip_len, udp_len;
1406       i32 len_diff;
1407       udp = ip4_next_header (ip);
1408       /* Verify UDP length. */
1409       ip_len = clib_net_to_host_u16 (ip->length);
1410       udp_len = clib_net_to_host_u16 (udp->length);
1411
1412       len_diff = ip_len - udp_len;
1413       *good_tcp_udp &= len_diff >= 0;
1414       *error = len_diff < 0 ? IP4_ERROR_UDP_LENGTH : *error;
1415     }
1416 }
1417
/*
 * Checksum-state predicates on a buffer pointer (_b).
 *
 * Every macro argument and every expansion is fully parenthesised so the
 * macros can be given arbitrary pointer expressions and used inside larger
 * expressions without precedence surprises.
 */

/* Non-zero when the buffer is flagged OFFLOAD with a TCP or UDP checksum
 * offload bit set in its oflags. */
#define ip4_local_csum_is_offloaded(_b)                                       \
  ((((_b)->flags) & VNET_BUFFER_F_OFFLOAD) &&                                 \
   (vnet_buffer (_b)->oflags &                                                \
    (VNET_BUFFER_OFFLOAD_F_TCP_CKSUM | VNET_BUFFER_OFFLOAD_F_UDP_CKSUM)))

/* 1 when the packet is TCP/UDP and its checksum has been neither computed
 * already nor offloaded, i.e. software validation is required. */
#define ip4_local_need_csum_check(is_tcp_udp, _b)                             \
  ((is_tcp_udp) &&                                                            \
   !((((_b)->flags) & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) ||                  \
     ip4_local_csum_is_offloaded (_b)))

/* 1 when the checksum is already marked correct or is offloaded, else 0. */
#define ip4_local_csum_is_valid(_b)                                           \
  (((((_b)->flags) & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) ||                    \
    ip4_local_csum_is_offloaded (_b)) != 0)
1430
1431 static inline void
1432 ip4_local_check_l4_csum (vlib_main_t * vm, vlib_buffer_t * b,
1433                          ip4_header_t * ih, u8 * error)
1434 {
1435   u8 is_udp, is_tcp_udp, good_tcp_udp;
1436
1437   is_udp = ih->protocol == IP_PROTOCOL_UDP;
1438   is_tcp_udp = is_udp || ih->protocol == IP_PROTOCOL_TCP;
1439
1440   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp, b)))
1441     ip4_local_l4_csum_validate (vm, b, ih, is_udp, error, &good_tcp_udp);
1442   else
1443     good_tcp_udp = ip4_local_csum_is_valid (b);
1444
1445   ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1446   *error = (is_tcp_udp && !good_tcp_udp
1447             ? IP4_ERROR_TCP_CHECKSUM + is_udp : *error);
1448 }
1449
1450 static inline void
1451 ip4_local_check_l4_csum_x2 (vlib_main_t * vm, vlib_buffer_t ** b,
1452                             ip4_header_t ** ih, u8 * error)
1453 {
1454   u8 is_udp[2], is_tcp_udp[2], good_tcp_udp[2];
1455
1456   is_udp[0] = ih[0]->protocol == IP_PROTOCOL_UDP;
1457   is_udp[1] = ih[1]->protocol == IP_PROTOCOL_UDP;
1458
1459   is_tcp_udp[0] = is_udp[0] || ih[0]->protocol == IP_PROTOCOL_TCP;
1460   is_tcp_udp[1] = is_udp[1] || ih[1]->protocol == IP_PROTOCOL_TCP;
1461
1462   good_tcp_udp[0] = ip4_local_csum_is_valid (b[0]);
1463   good_tcp_udp[1] = ip4_local_csum_is_valid (b[1]);
1464
1465   if (PREDICT_FALSE (ip4_local_need_csum_check (is_tcp_udp[0], b[0])
1466                      || ip4_local_need_csum_check (is_tcp_udp[1], b[1])))
1467     {
1468       if (is_tcp_udp[0])
1469         ip4_local_l4_csum_validate (vm, b[0], ih[0], is_udp[0], &error[0],
1470                                     &good_tcp_udp[0]);
1471       if (is_tcp_udp[1])
1472         ip4_local_l4_csum_validate (vm, b[1], ih[1], is_udp[1], &error[1],
1473                                     &good_tcp_udp[1]);
1474     }
1475
1476   error[0] = (is_tcp_udp[0] && !good_tcp_udp[0] ?
1477               IP4_ERROR_TCP_CHECKSUM + is_udp[0] : error[0]);
1478   error[1] = (is_tcp_udp[1] && !good_tcp_udp[1] ?
1479               IP4_ERROR_TCP_CHECKSUM + is_udp[1] : error[1]);
1480 }
1481
1482 static inline void
1483 ip4_local_set_next_and_error (vlib_node_runtime_t * error_node,
1484                               vlib_buffer_t * b, u16 * next, u8 error,
1485                               u8 head_of_feature_arc)
1486 {
1487   u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index;
1488   u32 next_index;
1489
1490   *next = error != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : *next;
1491   b->error = error ? error_node->errors[error] : 0;
1492   if (head_of_feature_arc)
1493     {
1494       next_index = *next;
1495       if (PREDICT_TRUE (error == (u8) IP4_ERROR_UNKNOWN_PROTOCOL))
1496         {
1497           vnet_feature_arc_start (arc_index,
1498                                   vnet_buffer (b)->sw_if_index[VLIB_RX],
1499                                   &next_index, b);
1500           *next = next_index;
1501         }
1502     }
1503 }
1504
1505 typedef struct
1506 {
1507   ip4_address_t src;
1508   u32 lbi;
1509   u8 error;
1510   u8 first;
1511 } ip4_local_last_check_t;
1512
1513 static inline void
1514 ip4_local_check_src (vlib_buffer_t * b, ip4_header_t * ip0,
1515                      ip4_local_last_check_t * last_check, u8 * error0)
1516 {
1517   const dpo_id_t *dpo0;
1518   load_balance_t *lb0;
1519   u32 lbi0;
1520
1521   vnet_buffer (b)->ip.fib_index =
1522     vnet_buffer (b)->sw_if_index[VLIB_TX] != ~0 ?
1523     vnet_buffer (b)->sw_if_index[VLIB_TX] : vnet_buffer (b)->ip.fib_index;
1524
1525   /*
1526    * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
1527    *  adjacency for the destination address (the local interface address).
1528    * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
1529    *  adjacency for the source address (the remote sender's address)
1530    */
1531   if (PREDICT_TRUE (last_check->src.as_u32 != ip0->src_address.as_u32) ||
1532       last_check->first)
1533     {
1534       lbi0 = ip4_fib_forwarding_lookup (vnet_buffer (b)->ip.fib_index,
1535                                         &ip0->src_address);
1536
1537       vnet_buffer (b)->ip.adj_index[VLIB_RX] =
1538         vnet_buffer (b)->ip.adj_index[VLIB_TX];
1539       vnet_buffer (b)->ip.adj_index[VLIB_TX] = lbi0;
1540
1541       lb0 = load_balance_get (lbi0);
1542       dpo0 = load_balance_get_bucket_i (lb0, 0);
1543
1544       /*
1545        * Must have a route to source otherwise we drop the packet.
1546        * ip4 broadcasts are accepted, e.g. to make dhcp client work
1547        *
1548        * The checks are:
1549        *  - the source is a recieve => it's from us => bogus, do this
1550        *    first since it sets a different error code.
1551        *  - uRPF check for any route to source - accept if passes.
1552        *  - allow packets destined to the broadcast address from unknown sources
1553        */
1554
1555       *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1556                   && dpo0->dpoi_type == DPO_RECEIVE) ?
1557                  IP4_ERROR_SPOOFED_LOCAL_PACKETS : *error0);
1558       *error0 = ((*error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1559                   && !fib_urpf_check_size (lb0->lb_urpf)
1560                   && ip0->dst_address.as_u32 != 0xFFFFFFFF) ?
1561                  IP4_ERROR_SRC_LOOKUP_MISS : *error0);
1562
1563       last_check->src.as_u32 = ip0->src_address.as_u32;
1564       last_check->lbi = lbi0;
1565       last_check->error = *error0;
1566       last_check->first = 0;
1567     }
1568   else
1569     {
1570       vnet_buffer (b)->ip.adj_index[VLIB_RX] =
1571         vnet_buffer (b)->ip.adj_index[VLIB_TX];
1572       vnet_buffer (b)->ip.adj_index[VLIB_TX] = last_check->lbi;
1573       *error0 = last_check->error;
1574     }
1575 }
1576
1577 static inline void
1578 ip4_local_check_src_x2 (vlib_buffer_t ** b, ip4_header_t ** ip,
1579                         ip4_local_last_check_t * last_check, u8 * error)
1580 {
1581   const dpo_id_t *dpo[2];
1582   load_balance_t *lb[2];
1583   u32 not_last_hit;
1584   u32 lbi[2];
1585
1586   not_last_hit = last_check->first;
1587   not_last_hit |= ip[0]->src_address.as_u32 ^ last_check->src.as_u32;
1588   not_last_hit |= ip[1]->src_address.as_u32 ^ last_check->src.as_u32;
1589
1590   vnet_buffer (b[0])->ip.fib_index =
1591     vnet_buffer (b[0])->sw_if_index[VLIB_TX] != ~0 ?
1592     vnet_buffer (b[0])->sw_if_index[VLIB_TX] :
1593     vnet_buffer (b[0])->ip.fib_index;
1594
1595   vnet_buffer (b[1])->ip.fib_index =
1596     vnet_buffer (b[1])->sw_if_index[VLIB_TX] != ~0 ?
1597     vnet_buffer (b[1])->sw_if_index[VLIB_TX] :
1598     vnet_buffer (b[1])->ip.fib_index;
1599
1600   /*
1601    * vnet_buffer()->ip.adj_index[VLIB_RX] will be set to the index of the
1602    *  adjacency for the destination address (the local interface address).
1603    * vnet_buffer()->ip.adj_index[VLIB_TX] will be set to the index of the
1604    *  adjacency for the source address (the remote sender's address)
1605    */
1606   if (PREDICT_TRUE (not_last_hit))
1607     {
1608       ip4_fib_forwarding_lookup_x2 (
1609         vnet_buffer (b[0])->ip.fib_index, vnet_buffer (b[1])->ip.fib_index,
1610         &ip[0]->src_address, &ip[1]->src_address, &lbi[0], &lbi[1]);
1611
1612       vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
1613         vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
1614       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = lbi[0];
1615
1616       vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
1617         vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
1618       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = lbi[1];
1619
1620       lb[0] = load_balance_get (lbi[0]);
1621       lb[1] = load_balance_get (lbi[1]);
1622
1623       dpo[0] = load_balance_get_bucket_i (lb[0], 0);
1624       dpo[1] = load_balance_get_bucket_i (lb[1], 0);
1625
1626       error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1627                    dpo[0]->dpoi_type == DPO_RECEIVE) ?
1628                   IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[0]);
1629       error[0] = ((error[0] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1630                    !fib_urpf_check_size (lb[0]->lb_urpf) &&
1631                    ip[0]->dst_address.as_u32 != 0xFFFFFFFF)
1632                   ? IP4_ERROR_SRC_LOOKUP_MISS : error[0]);
1633
1634       error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1635                    dpo[1]->dpoi_type == DPO_RECEIVE) ?
1636                   IP4_ERROR_SPOOFED_LOCAL_PACKETS : error[1]);
1637       error[1] = ((error[1] == IP4_ERROR_UNKNOWN_PROTOCOL &&
1638                    !fib_urpf_check_size (lb[1]->lb_urpf) &&
1639                    ip[1]->dst_address.as_u32 != 0xFFFFFFFF)
1640                   ? IP4_ERROR_SRC_LOOKUP_MISS : error[1]);
1641
1642       last_check->src.as_u32 = ip[1]->src_address.as_u32;
1643       last_check->lbi = lbi[1];
1644       last_check->error = error[1];
1645       last_check->first = 0;
1646     }
1647   else
1648     {
1649       vnet_buffer (b[0])->ip.adj_index[VLIB_RX] =
1650         vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
1651       vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = last_check->lbi;
1652
1653       vnet_buffer (b[1])->ip.adj_index[VLIB_RX] =
1654         vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
1655       vnet_buffer (b[1])->ip.adj_index[VLIB_TX] = last_check->lbi;
1656
1657       error[0] = last_check->error;
1658       error[1] = last_check->error;
1659     }
1660 }
1661
/*
 * Packet classes returned by ip4_local_classify().  L4 is the first
 * enumerator and therefore 0; ip4_local_inline() relies on this when it
 * tests a packet type for non-zero to decide whether checks are skipped.
 */
1662 enum ip_local_packet_type_e
1663 {
/* Neither a fragment nor flagged as NATed. */
1664   IP_LOCAL_PACKET_TYPE_L4,
/* Buffer carries VNET_BUFFER_F_IS_NATED. */
1665   IP_LOCAL_PACKET_TYPE_NAT,
/* ip4_is_fragment() is true; next node is reassembly. */
1666   IP_LOCAL_PACKET_TYPE_FRAG,
1667 };
1668
1669 /**
1670  * Determine packet type and next node.
1671  *
1672  * The expectation is that all packets that are not L4 will skip
1673  * checksums and source checks.
1674  */
1675 always_inline u8
1676 ip4_local_classify (vlib_buffer_t * b, ip4_header_t * ip, u16 * next)
1677 {
1678   ip_lookup_main_t *lm = &ip4_main.lookup_main;
1679
1680   if (PREDICT_FALSE (ip4_is_fragment (ip)))
1681     {
1682       *next = IP_LOCAL_NEXT_REASSEMBLY;
1683       return IP_LOCAL_PACKET_TYPE_FRAG;
1684     }
1685   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_IS_NATED))
1686     {
1687       *next = lm->local_next_by_ip_protocol[ip->protocol];
1688       return IP_LOCAL_PACKET_TYPE_NAT;
1689     }
1690
1691   *next = lm->local_next_by_ip_protocol[ip->protocol];
1692   return IP_LOCAL_PACKET_TYPE_L4;
1693 }
1694
1695 static inline uword
1696 ip4_local_inline (vlib_main_t * vm,
1697                   vlib_node_runtime_t * node,
1698                   vlib_frame_t * frame, int head_of_feature_arc)
1699 {
1700   u32 *from, n_left_from;
1701   vlib_node_runtime_t *error_node =
1702     vlib_node_get_runtime (vm, ip4_local_node.index);
1703   u16 nexts[VLIB_FRAME_SIZE], *next;
1704   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
1705   ip4_header_t *ip[2];
1706   u8 error[2], pt[2];
1707
1708   ip4_local_last_check_t last_check = {
1709     /*
1710      * 0.0.0.0 can appear as the source address of an IP packet,
1711      * as can any other address, hence the need to use the 'first'
1712      * member to make sure the .lbi is initialised for the first
1713      * packet.
1714      */
1715     .src = {.as_u32 = 0},
1716     .lbi = ~0,
1717     .error = IP4_ERROR_UNKNOWN_PROTOCOL,
1718     .first = 1,
1719   };
1720
1721   from = vlib_frame_vector_args (frame);
1722   n_left_from = frame->n_vectors;
1723
1724   if (node->flags & VLIB_NODE_FLAG_TRACE)
1725     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1726
1727   vlib_get_buffers (vm, from, bufs, n_left_from);
1728   b = bufs;
1729   next = nexts;
1730
1731   while (n_left_from >= 6)
1732     {
1733       u8 not_batch = 0;
1734
1735       /* Prefetch next iteration. */
1736       {
1737         vlib_prefetch_buffer_header (b[4], LOAD);
1738         vlib_prefetch_buffer_header (b[5], LOAD);
1739
1740         clib_prefetch_load (b[4]->data);
1741         clib_prefetch_load (b[5]->data);
1742       }
1743
1744       error[0] = error[1] = IP4_ERROR_UNKNOWN_PROTOCOL;
1745
1746       ip[0] = vlib_buffer_get_current (b[0]);
1747       ip[1] = vlib_buffer_get_current (b[1]);
1748
1749       vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
1750       vnet_buffer (b[1])->l3_hdr_offset = b[1]->current_data;
1751
1752       pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
1753       pt[1] = ip4_local_classify (b[1], ip[1], &next[1]);
1754
1755       not_batch = pt[0] ^ pt[1];
1756
1757       if (head_of_feature_arc == 0 || (pt[0] && not_batch == 0))
1758         goto skip_checks;
1759
1760       if (PREDICT_TRUE (not_batch == 0))
1761         {
1762           ip4_local_check_l4_csum_x2 (vm, b, ip, error);
1763           ip4_local_check_src_x2 (b, ip, &last_check, error);
1764         }
1765       else
1766         {
1767           if (!pt[0])
1768             {
1769               ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
1770               ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
1771             }
1772           if (!pt[1])
1773             {
1774               ip4_local_check_l4_csum (vm, b[1], ip[1], &error[1]);
1775               ip4_local_check_src (b[1], ip[1], &last_check, &error[1]);
1776             }
1777         }
1778
1779     skip_checks:
1780
1781       ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
1782                                     head_of_feature_arc);
1783       ip4_local_set_next_and_error (error_node, b[1], &next[1], error[1],
1784                                     head_of_feature_arc);
1785
1786       b += 2;
1787       next += 2;
1788       n_left_from -= 2;
1789     }
1790
1791   while (n_left_from > 0)
1792     {
1793       error[0] = IP4_ERROR_UNKNOWN_PROTOCOL;
1794
1795       ip[0] = vlib_buffer_get_current (b[0]);
1796       vnet_buffer (b[0])->l3_hdr_offset = b[0]->current_data;
1797       pt[0] = ip4_local_classify (b[0], ip[0], &next[0]);
1798
1799       if (head_of_feature_arc == 0 || pt[0])
1800         goto skip_check;
1801
1802       ip4_local_check_l4_csum (vm, b[0], ip[0], &error[0]);
1803       ip4_local_check_src (b[0], ip[0], &last_check, &error[0]);
1804
1805     skip_check:
1806
1807       ip4_local_set_next_and_error (error_node, b[0], &next[0], error[0],
1808                                     head_of_feature_arc);
1809
1810       b += 1;
1811       next += 1;
1812       n_left_from -= 1;
1813     }
1814
1815   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
1816   return frame->n_vectors;
1817 }
1818
1819 VLIB_NODE_FN (ip4_local_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
1820                                vlib_frame_t * frame)
1821 {
1822   return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ );
1823 }
1824
1825 /* *INDENT-OFF* */
1826 VLIB_REGISTER_NODE (ip4_local_node) =
1827 {
1828   .name = "ip4-local",
1829   .vector_size = sizeof (u32),
1830   .format_trace = format_ip4_forward_next_trace,
1831   .n_errors = IP4_N_ERROR,
1832   .error_strings = ip4_error_strings,
1833   .n_next_nodes = IP_LOCAL_N_NEXT,
1834   .next_nodes =
1835   {
1836     [IP_LOCAL_NEXT_DROP] = "ip4-drop",
1837     [IP_LOCAL_NEXT_PUNT] = "ip4-punt",
1838     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
1839     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
1840     [IP_LOCAL_NEXT_REASSEMBLY] = "ip4-full-reassembly",
1841   },
1842 };
1843 /* *INDENT-ON* */
1844
1845
1846 VLIB_NODE_FN (ip4_local_end_of_arc_node) (vlib_main_t * vm,
1847                                           vlib_node_runtime_t * node,
1848                                           vlib_frame_t * frame)
1849 {
1850   return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ );
1851 }
1852
1853 /* *INDENT-OFF* */
1854 VLIB_REGISTER_NODE (ip4_local_end_of_arc_node) = {
1855   .name = "ip4-local-end-of-arc",
1856   .vector_size = sizeof (u32),
1857
1858   .format_trace = format_ip4_forward_next_trace,
1859   .sibling_of = "ip4-local",
1860 };
1861
1862 VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = {
1863   .arc_name = "ip4-local",
1864   .node_name = "ip4-local-end-of-arc",
1865   .runs_before = 0, /* not before any other features */
1866 };
1867 /* *INDENT-ON* */
1868
1869 #ifndef CLIB_MARCH_VARIANT
1870 void
1871 ip4_register_protocol (u32 protocol, u32 node_index)
1872 {
1873   vlib_main_t *vm = vlib_get_main ();
1874   ip4_main_t *im = &ip4_main;
1875   ip_lookup_main_t *lm = &im->lookup_main;
1876
1877   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1878   lm->local_next_by_ip_protocol[protocol] =
1879     vlib_node_add_next (vm, ip4_local_node.index, node_index);
1880 }
1881
1882 void
1883 ip4_unregister_protocol (u32 protocol)
1884 {
1885   ip4_main_t *im = &ip4_main;
1886   ip_lookup_main_t *lm = &im->lookup_main;
1887
1888   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1889   lm->local_next_by_ip_protocol[protocol] = IP_LOCAL_NEXT_PUNT;
1890 }
1891 #endif
1892
1893 static clib_error_t *
1894 show_ip_local_command_fn (vlib_main_t * vm,
1895                           unformat_input_t * input, vlib_cli_command_t * cmd)
1896 {
1897   ip4_main_t *im = &ip4_main;
1898   ip_lookup_main_t *lm = &im->lookup_main;
1899   int i;
1900
1901   vlib_cli_output (vm, "Protocols handled by ip4_local");
1902   for (i = 0; i < ARRAY_LEN (lm->local_next_by_ip_protocol); i++)
1903     {
1904       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
1905         {
1906           u32 node_index = vlib_get_node (vm,
1907                                           ip4_local_node.index)->
1908             next_nodes[lm->local_next_by_ip_protocol[i]];
1909           vlib_cli_output (vm, "%U: %U", format_ip_protocol, i,
1910                            format_vlib_node_name, vm, node_index);
1911         }
1912     }
1913   return 0;
1914 }
1915
1916
1917
1918 /*?
1919  * Display the set of protocols handled by the local IPv4 stack.
1920  *
1921  * @cliexpar
1922  * Example of how to display local protocol table:
1923  * @cliexstart{show ip local}
1924  * Protocols handled by ip4_local
1925  * 1
1926  * 17
1927  * 47
1928  * @cliexend
1929 ?*/
1930 /* *INDENT-OFF* */
1931 VLIB_CLI_COMMAND (show_ip_local, static) =
1932 {
1933   .path = "show ip local",
1934   .function = show_ip_local_command_fn,
1935   .short_help = "show ip local",
1936 };
1937 /* *INDENT-ON* */
1938
/* Fixed next-node slots of the ip4 rewrite nodes. */
typedef enum
{
  IP4_REWRITE_NEXT_DROP = 0,
  IP4_REWRITE_NEXT_ICMP_ERROR = 1,
  IP4_REWRITE_NEXT_FRAGMENT = 2,
  IP4_REWRITE_N_NEXT = 3,	/* Last: number of fixed next slots */
} ip4_rewrite_next_t;
1946
/**
 * The bits of an IPv4 address (as held in memory, i.e. network byte
 * order) that are kept when constructing a multicast MAC address.
 * The constant depends on host endianness because the address is
 * read as a native 32-bit word.
 */
#if CLIB_ARCH_IS_BIG_ENDIAN
/* big-endian host */
#define IP4_MCAST_ADDR_MASK 0x007fffff
#else
/* little-endian host: the same bits, byte-swapped */
#define IP4_MCAST_ADDR_MASK 0xffff7f00
#endif
1956
1957 always_inline void
1958 ip4_mtu_check (vlib_buffer_t * b, u16 packet_len,
1959                u16 adj_packet_bytes, bool df, u16 * next,
1960                u8 is_midchain, u32 * error)
1961 {
1962   if (packet_len > adj_packet_bytes)
1963     {
1964       *error = IP4_ERROR_MTU_EXCEEDED;
1965       if (df)
1966         {
1967           icmp4_error_set_vnet_buffer
1968             (b, ICMP4_destination_unreachable,
1969              ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
1970              adj_packet_bytes);
1971           *next = IP4_REWRITE_NEXT_ICMP_ERROR;
1972         }
1973       else
1974         {
1975           /* IP fragmentation */
1976           ip_frag_set_vnet_buffer (b, adj_packet_bytes,
1977                                    (is_midchain ?
1978                                     IP_FRAG_NEXT_IP_REWRITE_MIDCHAIN :
1979                                     IP_FRAG_NEXT_IP_REWRITE), 0);
1980           *next = IP4_REWRITE_NEXT_FRAGMENT;
1981         }
1982     }
1983 }
1984
1985 /* increment TTL & update checksum.
1986    Works either endian, so no need for byte swap. */
1987 static_always_inline void
1988 ip4_ttl_inc (vlib_buffer_t * b, ip4_header_t * ip)
1989 {
1990   i32 ttl;
1991   u32 checksum;
1992   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
1993     return;
1994
1995   ttl = ip->ttl;
1996
1997   checksum = ip->checksum - clib_host_to_net_u16 (0x0100);
1998   checksum += checksum >= 0xffff;
1999
2000   ip->checksum = checksum;
2001   ttl += 1;
2002   ip->ttl = ttl;
2003
2004   ASSERT (ip4_header_checksum_is_valid (ip));
2005 }
2006
2007 /* Decrement TTL & update checksum.
2008    Works either endian, so no need for byte swap. */
2009 static_always_inline void
2010 ip4_ttl_and_checksum_check (vlib_buffer_t * b, ip4_header_t * ip, u16 * next,
2011                             u32 * error)
2012 {
2013   i32 ttl;
2014   u32 checksum;
2015   if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED))
2016     return;
2017
2018   ttl = ip->ttl;
2019
2020   /* Input node should have reject packets with ttl 0. */
2021   ASSERT (ip->ttl > 0);
2022
2023   checksum = ip->checksum + clib_host_to_net_u16 (0x0100);
2024   checksum += checksum >= 0xffff;
2025
2026   ip->checksum = checksum;
2027   ttl -= 1;
2028   ip->ttl = ttl;
2029
2030   /*
2031    * If the ttl drops below 1 when forwarding, generate
2032    * an ICMP response.
2033    */
2034   if (PREDICT_FALSE (ttl <= 0))
2035     {
2036       *error = IP4_ERROR_TIME_EXPIRED;
2037       vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~ 0;
2038       icmp4_error_set_vnet_buffer (b, ICMP4_time_exceeded,
2039                                    ICMP4_time_exceeded_ttl_exceeded_in_transit,
2040                                    0);
2041       *next = IP4_REWRITE_NEXT_ICMP_ERROR;
2042     }
2043
2044   /* Verify checksum. */
2045   ASSERT (ip4_header_checksum_is_valid (ip) ||
2046           (vnet_buffer (b)->oflags & VNET_BUFFER_OFFLOAD_F_IP_CKSUM));
2047 }
2048
2049 always_inline uword
2050 ip4_rewrite_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
2051                     vlib_frame_t *frame, int do_counters, int is_midchain,
2052                     int is_mcast)
2053 {
2054   ip_lookup_main_t *lm = &ip4_main.lookup_main;
2055   u32 *from = vlib_frame_vector_args (frame);
2056   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
2057   u16 nexts[VLIB_FRAME_SIZE], *next;
2058   u32 n_left_from;
2059   vlib_node_runtime_t *error_node =
2060     vlib_node_get_runtime (vm, ip4_input_node.index);
2061
2062   n_left_from = frame->n_vectors;
2063   u32 thread_index = vm->thread_index;
2064
2065   vlib_get_buffers (vm, from, bufs, n_left_from);
2066   clib_memset_u16 (nexts, IP4_REWRITE_NEXT_DROP, n_left_from);
2067
2068 #if (CLIB_N_PREFETCHES >= 8)
2069   if (n_left_from >= 6)
2070     {
2071       int i;
2072       for (i = 2; i < 6; i++)
2073         vlib_prefetch_buffer_header (bufs[i], LOAD);
2074     }
2075
2076   next = nexts;
2077   b = bufs;
2078   while (n_left_from >= 8)
2079     {
2080       const ip_adjacency_t *adj0, *adj1;
2081       ip4_header_t *ip0, *ip1;
2082       u32 rw_len0, error0, adj_index0;
2083       u32 rw_len1, error1, adj_index1;
2084       u32 tx_sw_if_index0, tx_sw_if_index1;
2085       u8 *p;
2086
2087       if (is_midchain)
2088         {
2089           vlib_prefetch_buffer_header (b[6], LOAD);
2090           vlib_prefetch_buffer_header (b[7], LOAD);
2091         }
2092
2093       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2094       adj_index1 = vnet_buffer (b[1])->ip.adj_index[VLIB_TX];
2095
2096       /*
2097        * pre-fetch the per-adjacency counters
2098        */
2099       if (do_counters)
2100         {
2101           vlib_prefetch_combined_counter (&adjacency_counters,
2102                                           thread_index, adj_index0);
2103           vlib_prefetch_combined_counter (&adjacency_counters,
2104                                           thread_index, adj_index1);
2105         }
2106
2107       ip0 = vlib_buffer_get_current (b[0]);
2108       ip1 = vlib_buffer_get_current (b[1]);
2109
2110       error0 = error1 = IP4_ERROR_NONE;
2111
2112       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2113       ip4_ttl_and_checksum_check (b[1], ip1, next + 1, &error1);
2114
2115       /* Rewrite packet header and updates lengths. */
2116       adj0 = adj_get (adj_index0);
2117       adj1 = adj_get (adj_index1);
2118
2119       /* Worth pipelining. No guarantee that adj0,1 are hot... */
2120       rw_len0 = adj0[0].rewrite_header.data_bytes;
2121       rw_len1 = adj1[0].rewrite_header.data_bytes;
2122       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2123       vnet_buffer (b[1])->ip.save_rewrite_length = rw_len1;
2124
2125       p = vlib_buffer_get_current (b[2]);
2126       clib_prefetch_store (p - CLIB_CACHE_LINE_BYTES);
2127       clib_prefetch_load (p);
2128
2129       p = vlib_buffer_get_current (b[3]);
2130       clib_prefetch_store (p - CLIB_CACHE_LINE_BYTES);
2131       clib_prefetch_load (p);
2132
2133       /* Check MTU of outgoing interface. */
2134       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2135       u16 ip1_len = clib_net_to_host_u16 (ip1->length);
2136
2137       if (b[0]->flags & VNET_BUFFER_F_GSO)
2138         ip0_len = gso_mtu_sz (b[0]);
2139       if (b[1]->flags & VNET_BUFFER_F_GSO)
2140         ip1_len = gso_mtu_sz (b[1]);
2141
2142       ip4_mtu_check (b[0], ip0_len,
2143                      adj0[0].rewrite_header.max_l3_packet_bytes,
2144                      ip0->flags_and_fragment_offset &
2145                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2146                      next + 0, is_midchain, &error0);
2147       ip4_mtu_check (b[1], ip1_len,
2148                      adj1[0].rewrite_header.max_l3_packet_bytes,
2149                      ip1->flags_and_fragment_offset &
2150                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2151                      next + 1, is_midchain, &error1);
2152
2153       if (is_mcast)
2154         {
2155           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2156                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2157                     IP4_ERROR_SAME_INTERFACE : error0);
2158           error1 = ((adj1[0].rewrite_header.sw_if_index ==
2159                      vnet_buffer (b[1])->sw_if_index[VLIB_RX]) ?
2160                     IP4_ERROR_SAME_INTERFACE : error1);
2161         }
2162
2163       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2164        * to see the IP header */
2165       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2166         {
2167           u32 next_index = adj0[0].rewrite_header.next_index;
2168           vlib_buffer_advance (b[0], -(word) rw_len0);
2169
2170           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2171           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2172
2173           if (PREDICT_FALSE
2174               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2175             vnet_feature_arc_start_w_cfg_index (lm->output_feature_arc_index,
2176                                                 tx_sw_if_index0,
2177                                                 &next_index, b[0],
2178                                                 adj0->ia_cfg_index);
2179
2180           next[0] = next_index;
2181           if (is_midchain)
2182             vnet_calc_checksums_inline (vm, b[0], 1 /* is_ip4 */ ,
2183                                         0 /* is_ip6 */ );
2184         }
2185       else
2186         {
2187           b[0]->error = error_node->errors[error0];
2188           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2189             ip4_ttl_inc (b[0], ip0);
2190         }
2191       if (PREDICT_TRUE (error1 == IP4_ERROR_NONE))
2192         {
2193           u32 next_index = adj1[0].rewrite_header.next_index;
2194           vlib_buffer_advance (b[1], -(word) rw_len1);
2195
2196           tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
2197           vnet_buffer (b[1])->sw_if_index[VLIB_TX] = tx_sw_if_index1;
2198
2199           if (PREDICT_FALSE
2200               (adj1[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2201             vnet_feature_arc_start_w_cfg_index (lm->output_feature_arc_index,
2202                                                 tx_sw_if_index1,
2203                                                 &next_index, b[1],
2204                                                 adj1->ia_cfg_index);
2205           next[1] = next_index;
2206           if (is_midchain)
2207             vnet_calc_checksums_inline (vm, b[1], 1 /* is_ip4 */ ,
2208                                         0 /* is_ip6 */ );
2209         }
2210       else
2211         {
2212           b[1]->error = error_node->errors[error1];
2213           if (error1 == IP4_ERROR_MTU_EXCEEDED)
2214             ip4_ttl_inc (b[1], ip1);
2215         }
2216
2217       if (is_midchain)
2218         /* Guess we are only writing on ipv4 header. */
2219         vnet_rewrite_two_headers (adj0[0], adj1[0],
2220                                   ip0, ip1, sizeof (ip4_header_t));
2221       else
2222         /* Guess we are only writing on simple Ethernet header. */
2223         vnet_rewrite_two_headers (adj0[0], adj1[0],
2224                                   ip0, ip1, sizeof (ethernet_header_t));
2225
2226       if (do_counters)
2227         {
2228           if (error0 == IP4_ERROR_NONE)
2229             vlib_increment_combined_counter
2230               (&adjacency_counters,
2231                thread_index,
2232                adj_index0, 1,
2233                vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2234
2235           if (error1 == IP4_ERROR_NONE)
2236             vlib_increment_combined_counter
2237               (&adjacency_counters,
2238                thread_index,
2239                adj_index1, 1,
2240                vlib_buffer_length_in_chain (vm, b[1]) + rw_len1);
2241         }
2242
2243       if (is_midchain)
2244         {
2245           if (error0 == IP4_ERROR_NONE)
2246             adj_midchain_fixup (vm, adj0, b[0], VNET_LINK_IP4);
2247           if (error1 == IP4_ERROR_NONE)
2248             adj_midchain_fixup (vm, adj1, b[1], VNET_LINK_IP4);
2249         }
2250
2251       if (is_mcast)
2252         {
2253           /* copy bytes from the IP address into the MAC rewrite */
2254           if (error0 == IP4_ERROR_NONE)
2255             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2256                                         adj0->rewrite_header.dst_mcast_offset,
2257                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2258           if (error1 == IP4_ERROR_NONE)
2259             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2260                                         adj1->rewrite_header.dst_mcast_offset,
2261                                         &ip1->dst_address.as_u32, (u8 *) ip1);
2262         }
2263
2264       next += 2;
2265       b += 2;
2266       n_left_from -= 2;
2267     }
2268 #elif (CLIB_N_PREFETCHES >= 4)
2269   next = nexts;
2270   b = bufs;
2271   while (n_left_from >= 1)
2272     {
2273       ip_adjacency_t *adj0;
2274       ip4_header_t *ip0;
2275       u32 rw_len0, error0, adj_index0;
2276       u32 tx_sw_if_index0;
2277       u8 *p;
2278
2279       /* Prefetch next iteration */
2280       if (PREDICT_TRUE (n_left_from >= 4))
2281         {
2282           ip_adjacency_t *adj2;
2283           u32 adj_index2;
2284
2285           vlib_prefetch_buffer_header (b[3], LOAD);
2286           vlib_prefetch_buffer_data (b[2], LOAD);
2287
2288           /* Prefetch adj->rewrite_header */
2289           adj_index2 = vnet_buffer (b[2])->ip.adj_index[VLIB_TX];
2290           adj2 = adj_get (adj_index2);
2291           p = (u8 *) adj2;
2292           CLIB_PREFETCH (p + CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES,
2293                          LOAD);
2294         }
2295
2296       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2297
2298       /*
2299        * Prefetch the per-adjacency counters
2300        */
2301       if (do_counters)
2302         {
2303           vlib_prefetch_combined_counter (&adjacency_counters,
2304                                           thread_index, adj_index0);
2305         }
2306
2307       ip0 = vlib_buffer_get_current (b[0]);
2308
2309       error0 = IP4_ERROR_NONE;
2310
2311       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2312
2313       /* Rewrite packet header and updates lengths. */
2314       adj0 = adj_get (adj_index0);
2315
2316       /* Rewrite header was prefetched. */
2317       rw_len0 = adj0[0].rewrite_header.data_bytes;
2318       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2319
2320       /* Check MTU of outgoing interface. */
2321       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2322
2323       if (b[0]->flags & VNET_BUFFER_F_GSO)
2324         ip0_len = gso_mtu_sz (b[0]);
2325
2326       ip4_mtu_check (b[0], ip0_len,
2327                      adj0[0].rewrite_header.max_l3_packet_bytes,
2328                      ip0->flags_and_fragment_offset &
2329                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2330                      next + 0, is_midchain, &error0);
2331
2332       if (is_mcast)
2333         {
2334           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2335                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2336                     IP4_ERROR_SAME_INTERFACE : error0);
2337         }
2338
2339       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2340        * to see the IP header */
2341       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2342         {
2343           u32 next_index = adj0[0].rewrite_header.next_index;
2344           vlib_buffer_advance (b[0], -(word) rw_len0);
2345           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2346           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2347
2348           if (PREDICT_FALSE
2349               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2350             vnet_feature_arc_start_w_cfg_index (lm->output_feature_arc_index,
2351                                                 tx_sw_if_index0,
2352                                                 &next_index, b[0],
2353                                                 adj0->ia_cfg_index);
2354           next[0] = next_index;
2355
2356           if (is_midchain)
2357             {
2358               vnet_calc_checksums_inline (vm, b[0], 1 /* is_ip4 */ ,
2359                                           0 /* is_ip6 */ );
2360
2361               /* Guess we are only writing on ipv4 header. */
2362               vnet_rewrite_one_header (adj0[0], ip0, sizeof (ip4_header_t));
2363             }
2364           else
2365             /* Guess we are only writing on simple Ethernet header. */
2366             vnet_rewrite_one_header (adj0[0], ip0,
2367                                      sizeof (ethernet_header_t));
2368
2369           /*
2370            * Bump the per-adjacency counters
2371            */
2372           if (do_counters)
2373             vlib_increment_combined_counter
2374               (&adjacency_counters,
2375                thread_index,
2376                adj_index0, 1, vlib_buffer_length_in_chain (vm,
2377                                                            b[0]) + rw_len0);
2378
2379           if (is_midchain)
2380             adj_midchain_fixup (vm, adj0, b[0], VNET_LINK_IP4);
2381
2382           if (is_mcast)
2383             /* copy bytes from the IP address into the MAC rewrite */
2384             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2385                                         adj0->rewrite_header.dst_mcast_offset,
2386                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2387         }
2388       else
2389         {
2390           b[0]->error = error_node->errors[error0];
2391           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2392             ip4_ttl_inc (b[0], ip0);
2393         }
2394
2395       next += 1;
2396       b += 1;
2397       n_left_from -= 1;
2398     }
2399 #endif
2400
2401   while (n_left_from > 0)
2402     {
2403       ip_adjacency_t *adj0;
2404       ip4_header_t *ip0;
2405       u32 rw_len0, adj_index0, error0;
2406       u32 tx_sw_if_index0;
2407
2408       adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
2409
2410       adj0 = adj_get (adj_index0);
2411
2412       if (do_counters)
2413         vlib_prefetch_combined_counter (&adjacency_counters,
2414                                         thread_index, adj_index0);
2415
2416       ip0 = vlib_buffer_get_current (b[0]);
2417
2418       error0 = IP4_ERROR_NONE;
2419
2420       ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
2421
2422
2423       /* Update packet buffer attributes/set output interface. */
2424       rw_len0 = adj0[0].rewrite_header.data_bytes;
2425       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
2426
2427       /* Check MTU of outgoing interface. */
2428       u16 ip0_len = clib_net_to_host_u16 (ip0->length);
2429       if (b[0]->flags & VNET_BUFFER_F_GSO)
2430         ip0_len = gso_mtu_sz (b[0]);
2431
2432       ip4_mtu_check (b[0], ip0_len,
2433                      adj0[0].rewrite_header.max_l3_packet_bytes,
2434                      ip0->flags_and_fragment_offset &
2435                      clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
2436                      next + 0, is_midchain, &error0);
2437
2438       if (is_mcast)
2439         {
2440           error0 = ((adj0[0].rewrite_header.sw_if_index ==
2441                      vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
2442                     IP4_ERROR_SAME_INTERFACE : error0);
2443         }
2444
2445       /* Don't adjust the buffer for ttl issue; icmp-error node wants
2446        * to see the IP header */
2447       if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
2448         {
2449           u32 next_index = adj0[0].rewrite_header.next_index;
2450           vlib_buffer_advance (b[0], -(word) rw_len0);
2451           tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2452           vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2453
2454           if (PREDICT_FALSE
2455               (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
2456             vnet_feature_arc_start_w_cfg_index (lm->output_feature_arc_index,
2457                                                 tx_sw_if_index0,
2458                                                 &next_index, b[0],
2459                                                 adj0->ia_cfg_index);
2460           next[0] = next_index;
2461
2462           if (is_midchain)
2463             {
2464               /* this acts on the packet that is about to be encapped */
2465               vnet_calc_checksums_inline (vm, b[0], 1 /* is_ip4 */ ,
2466                                           0 /* is_ip6 */ );
2467
2468               /* Guess we are only writing on ipv4 header. */
2469               vnet_rewrite_one_header (adj0[0], ip0, sizeof (ip4_header_t));
2470             }
2471           else
2472             /* Guess we are only writing on simple Ethernet header. */
2473             vnet_rewrite_one_header (adj0[0], ip0,
2474                                      sizeof (ethernet_header_t));
2475
2476           if (do_counters)
2477             vlib_increment_combined_counter
2478               (&adjacency_counters,
2479                thread_index, adj_index0, 1,
2480                vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
2481
2482           if (is_midchain)
2483             adj_midchain_fixup (vm, adj0, b[0], VNET_LINK_IP4);
2484
2485           if (is_mcast)
2486             /* copy bytes from the IP address into the MAC rewrite */
2487             vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
2488                                         adj0->rewrite_header.dst_mcast_offset,
2489                                         &ip0->dst_address.as_u32, (u8 *) ip0);
2490         }
2491       else
2492         {
2493           b[0]->error = error_node->errors[error0];
2494           /* undo the TTL decrement - we'll be back to do it again */
2495           if (error0 == IP4_ERROR_MTU_EXCEEDED)
2496             ip4_ttl_inc (b[0], ip0);
2497         }
2498
2499       next += 1;
2500       b += 1;
2501       n_left_from -= 1;
2502     }
2503
2504
2505   /* Need to do trace after rewrites to pick up new packet data. */
2506   if (node->flags & VLIB_NODE_FLAG_TRACE)
2507     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2508
2509   vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
2510   return frame->n_vectors;
2511 }
2512
2513 /** @brief IPv4 rewrite node.
2514     @node ip4-rewrite
2515
2516     This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
2517     header checksum, fetch the ip adjacency, check the outbound mtu,
2518     apply the adjacency rewrite, and send pkts to the adjacency
2519     rewrite header's rewrite_next_index.
2520
2521     @param vm vlib_main_t corresponding to the current thread
2522     @param node vlib_node_runtime_t
2523     @param frame vlib_frame_t whose contents should be dispatched
2524
2525     @par Graph mechanics: buffer metadata, next index usage
2526
2527     @em Uses:
2528     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
2529         - the rewrite adjacency index
2530     - <code>adj->lookup_next_index</code>
2531         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2532           the packet will be dropped.
2533     - <code>adj->rewrite_header</code>
2534         - Rewrite string length, rewrite string, next_index
2535
2536     @em Sets:
2537     - <code>b->current_data, b->current_length</code>
2538         - Updated net of applying the rewrite string
2539
2540     <em>Next Indices:</em>
2541     - <code> adj->rewrite_header.next_index </code>
2542       or @c ip4-drop
2543 */
2544
2545 VLIB_NODE_FN (ip4_rewrite_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2546                                  vlib_frame_t * frame)
2547 {
2548   if (adj_are_counters_enabled ())
2549     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2550   else
2551     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2552 }
2553
2554 VLIB_NODE_FN (ip4_rewrite_bcast_node) (vlib_main_t * vm,
2555                                        vlib_node_runtime_t * node,
2556                                        vlib_frame_t * frame)
2557 {
2558   if (adj_are_counters_enabled ())
2559     return ip4_rewrite_inline (vm, node, frame, 1, 0, 0);
2560   else
2561     return ip4_rewrite_inline (vm, node, frame, 0, 0, 0);
2562 }
2563
2564 VLIB_NODE_FN (ip4_midchain_node) (vlib_main_t * vm,
2565                                   vlib_node_runtime_t * node,
2566                                   vlib_frame_t * frame)
2567 {
2568   if (adj_are_counters_enabled ())
2569     return ip4_rewrite_inline (vm, node, frame, 1, 1, 0);
2570   else
2571     return ip4_rewrite_inline (vm, node, frame, 0, 1, 0);
2572 }
2573
2574 VLIB_NODE_FN (ip4_rewrite_mcast_node) (vlib_main_t * vm,
2575                                        vlib_node_runtime_t * node,
2576                                        vlib_frame_t * frame)
2577 {
2578   if (adj_are_counters_enabled ())
2579     return ip4_rewrite_inline (vm, node, frame, 1, 0, 1);
2580   else
2581     return ip4_rewrite_inline (vm, node, frame, 0, 0, 1);
2582 }
2583
2584 VLIB_NODE_FN (ip4_mcast_midchain_node) (vlib_main_t * vm,
2585                                         vlib_node_runtime_t * node,
2586                                         vlib_frame_t * frame)
2587 {
2588   if (adj_are_counters_enabled ())
2589     return ip4_rewrite_inline (vm, node, frame, 1, 1, 1);
2590   else
2591     return ip4_rewrite_inline (vm, node, frame, 0, 1, 1);
2592 }
2593
2594 /* *INDENT-OFF* */
2595 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2596   .name = "ip4-rewrite",
2597   .vector_size = sizeof (u32),
2598
2599   .format_trace = format_ip4_rewrite_trace,
2600
2601   .n_next_nodes = IP4_REWRITE_N_NEXT,
2602   .next_nodes = {
2603     [IP4_REWRITE_NEXT_DROP] = "ip4-drop",
2604     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2605     [IP4_REWRITE_NEXT_FRAGMENT] = "ip4-frag",
2606   },
2607 };
2608
2609 VLIB_REGISTER_NODE (ip4_rewrite_bcast_node) = {
2610   .name = "ip4-rewrite-bcast",
2611   .vector_size = sizeof (u32),
2612
2613   .format_trace = format_ip4_rewrite_trace,
2614   .sibling_of = "ip4-rewrite",
2615 };
2616
2617 VLIB_REGISTER_NODE (ip4_rewrite_mcast_node) = {
2618   .name = "ip4-rewrite-mcast",
2619   .vector_size = sizeof (u32),
2620
2621   .format_trace = format_ip4_rewrite_trace,
2622   .sibling_of = "ip4-rewrite",
2623 };
2624
2625 VLIB_REGISTER_NODE (ip4_mcast_midchain_node) = {
2626   .name = "ip4-mcast-midchain",
2627   .vector_size = sizeof (u32),
2628
2629   .format_trace = format_ip4_rewrite_trace,
2630   .sibling_of = "ip4-rewrite",
2631 };
2632
2633 VLIB_REGISTER_NODE (ip4_midchain_node) = {
2634   .name = "ip4-midchain",
2635   .vector_size = sizeof (u32),
2636   .format_trace = format_ip4_rewrite_trace,
2637   .sibling_of = "ip4-rewrite",
2638 };
2639 /* *INDENT-ON* */
2640
2641 static clib_error_t *
2642 set_ip_flow_hash_command_fn (vlib_main_t * vm,
2643                              unformat_input_t * input,
2644                              vlib_cli_command_t * cmd)
2645 {
2646   int matched = 0;
2647   u32 table_id = 0;
2648   u32 flow_hash_config = 0;
2649   int rv;
2650
2651   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
2652     {
2653       if (unformat (input, "table %d", &table_id))
2654         matched = 1;
2655 #define _(a, b, v)                                                            \
2656   else if (unformat (input, #a))                                              \
2657   {                                                                           \
2658     flow_hash_config |= v;                                                    \
2659     matched = 1;                                                              \
2660   }
2661       foreach_flow_hash_bit
2662 #undef _
2663         else
2664         break;
2665     }
2666
2667   if (matched == 0)
2668     return clib_error_return (0, "unknown input `%U'",
2669                               format_unformat_error, input);
2670
2671   rv = ip_flow_hash_set (AF_IP4, table_id, flow_hash_config);
2672   switch (rv)
2673     {
2674     case 0:
2675       break;
2676
2677     case VNET_API_ERROR_NO_SUCH_FIB:
2678       return clib_error_return (0, "no such FIB table %d", table_id);
2679
2680     default:
2681       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
2682       break;
2683     }
2684
2685   return 0;
2686 }
2687
2688 /*?
2689  * Configure the set of IPv4 fields used by the flow hash.
2690  *
2691  * @cliexpar
2692  * Example of how to set the flow hash on a given table:
2693  * @cliexcmd{set ip flow-hash table 7 dst sport dport proto}
2694  * Example of how to display the configured flow hash:
2695  * @cliexstart{show ip fib}
2696  * ipv4-VRF:0, fib_index 0, flow hash: src dst sport dport proto
2697  * 0.0.0.0/0
2698  *   unicast-ip4-chain
2699  *   [@0]: dpo-load-balance: [index:0 buckets:1 uRPF:0 to:[0:0]]
2700  *     [0] [@0]: dpo-drop ip6
2701  * 0.0.0.0/32
2702  *   unicast-ip4-chain
2703  *   [@0]: dpo-load-balance: [index:1 buckets:1 uRPF:1 to:[0:0]]
2704  *     [0] [@0]: dpo-drop ip6
2705  * 224.0.0.0/8
2706  *   unicast-ip4-chain
2707  *   [@0]: dpo-load-balance: [index:3 buckets:1 uRPF:3 to:[0:0]]
2708  *     [0] [@0]: dpo-drop ip6
2709  * 6.0.1.2/32
2710  *   unicast-ip4-chain
2711  *   [@0]: dpo-load-balance: [index:30 buckets:1 uRPF:29 to:[0:0]]
2712  *     [0] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
2713  * 7.0.0.1/32
2714  *   unicast-ip4-chain
2715  *   [@0]: dpo-load-balance: [index:31 buckets:4 uRPF:30 to:[0:0]]
2716  *     [0] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
2717  *     [1] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
2718  *     [2] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
2719  *     [3] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
2720  * 240.0.0.0/8
2721  *   unicast-ip4-chain
2722  *   [@0]: dpo-load-balance: [index:2 buckets:1 uRPF:2 to:[0:0]]
2723  *     [0] [@0]: dpo-drop ip6
2724  * 255.255.255.255/32
2725  *   unicast-ip4-chain
2726  *   [@0]: dpo-load-balance: [index:4 buckets:1 uRPF:4 to:[0:0]]
2727  *     [0] [@0]: dpo-drop ip6
2728  * ipv4-VRF:7, fib_index 1, flow hash: dst sport dport proto
2729  * 0.0.0.0/0
2730  *   unicast-ip4-chain
2731  *   [@0]: dpo-load-balance: [index:12 buckets:1 uRPF:11 to:[0:0]]
2732  *     [0] [@0]: dpo-drop ip6
2733  * 0.0.0.0/32
2734  *   unicast-ip4-chain
2735  *   [@0]: dpo-load-balance: [index:13 buckets:1 uRPF:12 to:[0:0]]
2736  *     [0] [@0]: dpo-drop ip6
2737  * 172.16.1.0/24
2738  *   unicast-ip4-chain
2739  *   [@0]: dpo-load-balance: [index:17 buckets:1 uRPF:16 to:[0:0]]
2740  *     [0] [@4]: ipv4-glean: af_packet0
2741  * 172.16.1.1/32
2742  *   unicast-ip4-chain
2743  *   [@0]: dpo-load-balance: [index:18 buckets:1 uRPF:17 to:[1:84]]
2744  *     [0] [@2]: dpo-receive: 172.16.1.1 on af_packet0
2745  * 172.16.1.2/32
2746  *   unicast-ip4-chain
2747  *   [@0]: dpo-load-balance: [index:21 buckets:1 uRPF:20 to:[0:0]]
2748  *     [0] [@5]: ipv4 via 172.16.1.2 af_packet0: IP4: 02:fe:9e:70:7a:2b -> 26:a5:f6:9c:3a:36
2749  * 172.16.2.0/24
2750  *   unicast-ip4-chain
2751  *   [@0]: dpo-load-balance: [index:19 buckets:1 uRPF:18 to:[0:0]]
2752  *     [0] [@4]: ipv4-glean: af_packet1
2753  * 172.16.2.1/32
2754  *   unicast-ip4-chain
2755  *   [@0]: dpo-load-balance: [index:20 buckets:1 uRPF:19 to:[0:0]]
2756  *     [0] [@2]: dpo-receive: 172.16.2.1 on af_packet1
2757  * 224.0.0.0/8
2758  *   unicast-ip4-chain
2759  *   [@0]: dpo-load-balance: [index:15 buckets:1 uRPF:14 to:[0:0]]
2760  *     [0] [@0]: dpo-drop ip6
2761  * 240.0.0.0/8
2762  *   unicast-ip4-chain
2763  *   [@0]: dpo-load-balance: [index:14 buckets:1 uRPF:13 to:[0:0]]
2764  *     [0] [@0]: dpo-drop ip6
2765  * 255.255.255.255/32
2766  *   unicast-ip4-chain
2767  *   [@0]: dpo-load-balance: [index:16 buckets:1 uRPF:15 to:[0:0]]
2768  *     [0] [@0]: dpo-drop ip6
2769  * @cliexend
2770 ?*/
2771 /* *INDENT-OFF* */
2772 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) =
2773 {
2774   .path = "set ip flow-hash",
2775   .short_help =
2776   "set ip flow-hash table <table-id> [src] [dst] [sport] [dport] [proto] [reverse]",
2777   .function = set_ip_flow_hash_command_fn,
2778 };
2779 /* *INDENT-ON* */
2780
2781 #ifndef CLIB_MARCH_VARIANT
2782 int
2783 vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
2784                              u32 table_index)
2785 {
2786   vnet_main_t *vnm = vnet_get_main ();
2787   vnet_interface_main_t *im = &vnm->interface_main;
2788   ip4_main_t *ipm = &ip4_main;
2789   ip_lookup_main_t *lm = &ipm->lookup_main;
2790   vnet_classify_main_t *cm = &vnet_classify_main;
2791   ip4_address_t *if_addr;
2792
2793   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
2794     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
2795
2796   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
2797     return VNET_API_ERROR_NO_SUCH_ENTRY;
2798
2799   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
2800   lm->classify_table_index_by_sw_if_index[sw_if_index] = table_index;
2801
2802   if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);
2803
2804   if (NULL != if_addr)
2805     {
2806       fib_prefix_t pfx = {
2807         .fp_len = 32,
2808         .fp_proto = FIB_PROTOCOL_IP4,
2809         .fp_addr.ip4 = *if_addr,
2810       };
2811       u32 fib_index;
2812
2813       fib_index = fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP4,
2814                                                        sw_if_index);
2815
2816
2817       if (table_index != (u32) ~ 0)
2818         {
2819           dpo_id_t dpo = DPO_INVALID;
2820
2821           dpo_set (&dpo,
2822                    DPO_CLASSIFY,
2823                    DPO_PROTO_IP4,
2824                    classify_dpo_create (DPO_PROTO_IP4, table_index));
2825
2826           fib_table_entry_special_dpo_add (fib_index,
2827                                            &pfx,
2828                                            FIB_SOURCE_CLASSIFY,
2829                                            FIB_ENTRY_FLAG_NONE, &dpo);
2830           dpo_reset (&dpo);
2831         }
2832       else
2833         {
2834           fib_table_entry_special_remove (fib_index,
2835                                           &pfx, FIB_SOURCE_CLASSIFY);
2836         }
2837     }
2838
2839   return 0;
2840 }
2841 #endif
2842
2843 static clib_error_t *
2844 set_ip_classify_command_fn (vlib_main_t * vm,
2845                             unformat_input_t * input,
2846                             vlib_cli_command_t * cmd)
2847 {
2848   u32 table_index = ~0;
2849   int table_index_set = 0;
2850   u32 sw_if_index = ~0;
2851   int rv;
2852
2853   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
2854     {
2855       if (unformat (input, "table-index %d", &table_index))
2856         table_index_set = 1;
2857       else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
2858                          vnet_get_main (), &sw_if_index))
2859         ;
2860       else
2861         break;
2862     }
2863
2864   if (table_index_set == 0)
2865     return clib_error_return (0, "classify table-index must be specified");
2866
2867   if (sw_if_index == ~0)
2868     return clib_error_return (0, "interface / subif must be specified");
2869
2870   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
2871
2872   switch (rv)
2873     {
2874     case 0:
2875       break;
2876
2877     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
2878       return clib_error_return (0, "No such interface");
2879
2880     case VNET_API_ERROR_NO_SUCH_ENTRY:
2881       return clib_error_return (0, "No such classifier table");
2882     }
2883   return 0;
2884 }
2885
2886 /*?
2887  * Assign a classification table to an interface. The classification
2888  * table is created using the '<em>classify table</em>' and '<em>classify session</em>'
2889  * commands. Once the table is created, use this command to filter packets
2890  * on an interface.
2891  *
2892  * @cliexpar
2893  * Example of how to assign a classification table to an interface:
2894  * @cliexcmd{set ip classify intfc GigabitEthernet2/0/0 table-index 1}
2895 ?*/
2896 /* *INDENT-OFF* */
2897 VLIB_CLI_COMMAND (set_ip_classify_command, static) =
2898 {
2899     .path = "set ip classify",
2900     .short_help =
2901     "set ip classify intfc <interface> table-index <classify-idx>",
2902     .function = set_ip_classify_command_fn,
2903 };
2904 /* *INDENT-ON* */
2905
2906 /*
2907  * fd.io coding-style-patch-verification: ON
2908  *
2909  * Local Variables:
2910  * eval: (c-set-style "gnu")
2911  * End:
2912  */