43a894075dd8041b462050fbda44ef178676539f
[vpp.git] / vnet / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
43 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
44 #include <vnet/ppp/ppp.h>
45 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
46 #include <vnet/api_errno.h>     /* for API error numbers */
47 #include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
48 #include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
49 #include <vnet/fib/fib_urpf_list.h> /* for FIB uRPF check */
50 #include <vnet/fib/ip4_fib.h>
51 #include <vnet/dpo/load_balance.h>
52 #include <vnet/dpo/classify_dpo.h>
53
54 /**
55  * @file
56  * @brief IPv4 Forwarding.
57  *
58  * This file contains the source code for IPv4 forwarding.
59  */
60
61 void
62 ip4_forward_next_trace (vlib_main_t * vm,
63                         vlib_node_runtime_t * node,
64                         vlib_frame_t * frame,
65                         vlib_rx_or_tx_t which_adj_index);
66
67 always_inline uword
68 ip4_lookup_inline (vlib_main_t * vm,
69                    vlib_node_runtime_t * node,
70                    vlib_frame_t * frame,
71                    int lookup_for_responses_to_locally_received_packets)
72 {
73   ip4_main_t * im = &ip4_main;
74   vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
75   u32 n_left_from, n_left_to_next, * from, * to_next;
76   ip_lookup_next_t next;
77   u32 cpu_index = os_get_cpu_number();
78
79   from = vlib_frame_vector_args (frame);
80   n_left_from = frame->n_vectors;
81   next = node->cached_next_index;
82
83   while (n_left_from > 0)
84     {
85       vlib_get_next_frame (vm, node, next,
86                            to_next, n_left_to_next);
87
88       while (n_left_from >= 8 && n_left_to_next >= 4)
89         {
90           vlib_buffer_t * p0, * p1, * p2, * p3;
91           ip4_header_t * ip0, * ip1, * ip2, * ip3;
92           __attribute__((unused)) tcp_header_t * tcp0, * tcp1, * tcp2, * tcp3;
93           ip_lookup_next_t next0, next1, next2, next3;
94           const load_balance_t * lb0, * lb1, * lb2, * lb3;
95           ip4_fib_mtrie_t * mtrie0, * mtrie1, * mtrie2, * mtrie3;
96           ip4_fib_mtrie_leaf_t leaf0, leaf1, leaf2, leaf3;
97           ip4_address_t * dst_addr0, *dst_addr1, *dst_addr2, *dst_addr3;
98           __attribute__((unused)) u32 pi0, fib_index0, lb_index0, is_tcp_udp0;
99           __attribute__((unused)) u32 pi1, fib_index1, lb_index1, is_tcp_udp1;
100           __attribute__((unused)) u32 pi2, fib_index2, lb_index2, is_tcp_udp2;
101           __attribute__((unused)) u32 pi3, fib_index3, lb_index3, is_tcp_udp3;
102           flow_hash_config_t flow_hash_config0, flow_hash_config1;
103           flow_hash_config_t flow_hash_config2, flow_hash_config3;
104           u32 hash_c0, hash_c1, hash_c2, hash_c3;
105           const dpo_id_t *dpo0, *dpo1, *dpo2, *dpo3;
106
107           /* Prefetch next iteration. */
108           {
109             vlib_buffer_t * p4, * p5, * p6, * p7;
110
111             p4 = vlib_get_buffer (vm, from[4]);
112             p5 = vlib_get_buffer (vm, from[5]);
113             p6 = vlib_get_buffer (vm, from[6]);
114             p7 = vlib_get_buffer (vm, from[7]);
115
116             vlib_prefetch_buffer_header (p4, LOAD);
117             vlib_prefetch_buffer_header (p5, LOAD);
118             vlib_prefetch_buffer_header (p6, LOAD);
119             vlib_prefetch_buffer_header (p7, LOAD);
120
121             CLIB_PREFETCH (p4->data, sizeof (ip0[0]), LOAD);
122             CLIB_PREFETCH (p5->data, sizeof (ip0[0]), LOAD);
123             CLIB_PREFETCH (p6->data, sizeof (ip0[0]), LOAD);
124             CLIB_PREFETCH (p7->data, sizeof (ip0[0]), LOAD);
125           }
126
127           pi0 = to_next[0] = from[0];
128           pi1 = to_next[1] = from[1];
129           pi2 = to_next[2] = from[2];
130           pi3 = to_next[3] = from[3];
131
132           from += 4;
133           to_next += 4;
134           n_left_to_next -= 4;
135           n_left_from -= 4;
136
137           p0 = vlib_get_buffer (vm, pi0);
138           p1 = vlib_get_buffer (vm, pi1);
139           p2 = vlib_get_buffer (vm, pi2);
140           p3 = vlib_get_buffer (vm, pi3);
141
142           ip0 = vlib_buffer_get_current (p0);
143           ip1 = vlib_buffer_get_current (p1);
144           ip2 = vlib_buffer_get_current (p2);
145           ip3 = vlib_buffer_get_current (p3);
146
147           dst_addr0 = &ip0->dst_address;
148           dst_addr1 = &ip1->dst_address;
149           dst_addr2 = &ip2->dst_address;
150           dst_addr3 = &ip3->dst_address;
151
152           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
153           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
154           fib_index2 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p2)->sw_if_index[VLIB_RX]);
155           fib_index3 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p3)->sw_if_index[VLIB_RX]);
156           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
157             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
158           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
159             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
160           fib_index2 = (vnet_buffer(p2)->sw_if_index[VLIB_TX] == (u32)~0) ?
161             fib_index2 : vnet_buffer(p2)->sw_if_index[VLIB_TX];
162           fib_index3 = (vnet_buffer(p3)->sw_if_index[VLIB_TX] == (u32)~0) ?
163             fib_index3 : vnet_buffer(p3)->sw_if_index[VLIB_TX];
164
165
166           if (! lookup_for_responses_to_locally_received_packets)
167             {
168               mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
169               mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
170               mtrie2 = &ip4_fib_get (fib_index2)->mtrie;
171               mtrie3 = &ip4_fib_get (fib_index3)->mtrie;
172
173               leaf0 = leaf1 = leaf2 = leaf3 = IP4_FIB_MTRIE_LEAF_ROOT;
174
175               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
176               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 0);
177               leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 0);
178               leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 0);
179             }
180
181           tcp0 = (void *) (ip0 + 1);
182           tcp1 = (void *) (ip1 + 1);
183           tcp2 = (void *) (ip2 + 1);
184           tcp3 = (void *) (ip3 + 1);
185
186           is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
187                          || ip0->protocol == IP_PROTOCOL_UDP);
188           is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP
189                          || ip1->protocol == IP_PROTOCOL_UDP);
190           is_tcp_udp2 = (ip2->protocol == IP_PROTOCOL_TCP
191                          || ip2->protocol == IP_PROTOCOL_UDP);
192           is_tcp_udp3 = (ip1->protocol == IP_PROTOCOL_TCP
193                          || ip1->protocol == IP_PROTOCOL_UDP);
194
195           if (! lookup_for_responses_to_locally_received_packets)
196             {
197               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
198               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 1);
199               leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 1);
200               leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 1);
201             }
202
203           if (! lookup_for_responses_to_locally_received_packets)
204             {
205               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
206               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
207               leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 2);
208               leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 2);
209             }
210
211           if (! lookup_for_responses_to_locally_received_packets)
212             {
213               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
214               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
215               leaf2 = ip4_fib_mtrie_lookup_step (mtrie2, leaf2, dst_addr2, 3);
216               leaf3 = ip4_fib_mtrie_lookup_step (mtrie3, leaf3, dst_addr3, 3);
217             }
218
219           if (lookup_for_responses_to_locally_received_packets)
220             {
221               lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
222               lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
223               lb_index2 = vnet_buffer (p2)->ip.adj_index[VLIB_RX];
224               lb_index3 = vnet_buffer (p3)->ip.adj_index[VLIB_RX];
225             }
226           else
227             {
228               /* Handle default route. */
229               leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
230               leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
231               leaf2 = (leaf2 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie2->default_leaf : leaf2);
232               leaf3 = (leaf3 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie3->default_leaf : leaf3);
233               lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
234               lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
235               lb_index2 = ip4_fib_mtrie_leaf_get_adj_index (leaf2);
236               lb_index3 = ip4_fib_mtrie_leaf_get_adj_index (leaf3);
237             }
238
239           lb0 = load_balance_get (lb_index0);
240           lb1 = load_balance_get (lb_index1);
241           lb2 = load_balance_get (lb_index2);
242           lb3 = load_balance_get (lb_index3);
243
244           /* Use flow hash to compute multipath adjacency. */
245           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
246           hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
247           hash_c2 = vnet_buffer (p2)->ip.flow_hash = 0;
248           hash_c3 = vnet_buffer (p3)->ip.flow_hash = 0;
249           if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
250             {
251               flow_hash_config0 = lb0->lb_hash_config;
252               hash_c0 = vnet_buffer (p0)->ip.flow_hash =
253                 ip4_compute_flow_hash (ip0, flow_hash_config0);
254             }
255           if (PREDICT_FALSE(lb1->lb_n_buckets > 1))
256             {
257               flow_hash_config1 = lb1->lb_hash_config;
258               hash_c1 = vnet_buffer (p1)->ip.flow_hash =
259                 ip4_compute_flow_hash (ip1, flow_hash_config1);
260             }
261           if (PREDICT_FALSE (lb2->lb_n_buckets > 1))
262             {
263               flow_hash_config2 = lb2->lb_hash_config;
264               hash_c2 = vnet_buffer (p2)->ip.flow_hash =
265                 ip4_compute_flow_hash (ip2, flow_hash_config2);
266             }
267           if (PREDICT_FALSE(lb3->lb_n_buckets > 1))
268             {
269               flow_hash_config3 = lb3->lb_hash_config;
270               hash_c3 = vnet_buffer (p3)->ip.flow_hash =
271                 ip4_compute_flow_hash (ip3, flow_hash_config3);
272             }
273
274           ASSERT (lb0->lb_n_buckets > 0);
275           ASSERT (is_pow2 (lb0->lb_n_buckets));
276           ASSERT (lb1->lb_n_buckets > 0);
277           ASSERT (is_pow2 (lb1->lb_n_buckets));
278           ASSERT (lb2->lb_n_buckets > 0);
279           ASSERT (is_pow2 (lb2->lb_n_buckets));
280           ASSERT (lb3->lb_n_buckets > 0);
281           ASSERT (is_pow2 (lb3->lb_n_buckets));
282
283           dpo0 = load_balance_get_bucket_i(lb0,
284                                            (hash_c0 &
285                                             (lb0->lb_n_buckets_minus_1)));
286           dpo1 = load_balance_get_bucket_i(lb1,
287                                            (hash_c1 &
288                                             (lb1->lb_n_buckets_minus_1)));
289           dpo2 = load_balance_get_bucket_i(lb2,
290                                            (hash_c2 &
291                                             (lb2->lb_n_buckets_minus_1)));
292           dpo3 = load_balance_get_bucket_i(lb3,
293                                            (hash_c3 &
294                                             (lb3->lb_n_buckets_minus_1)));
295
296           next0 = dpo0->dpoi_next_node;
297           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
298           next1 = dpo1->dpoi_next_node;
299           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
300           next2 = dpo2->dpoi_next_node;
301           vnet_buffer (p2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index;
302           next3 = dpo3->dpoi_next_node;
303           vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;
304
305           vlib_increment_combined_counter
306               (cm, cpu_index, lb_index0, 1,
307                vlib_buffer_length_in_chain (vm, p0)
308                + sizeof(ethernet_header_t));
309           vlib_increment_combined_counter
310               (cm, cpu_index, lb_index1, 1,
311                vlib_buffer_length_in_chain (vm, p1)
312                + sizeof(ethernet_header_t));
313           vlib_increment_combined_counter
314               (cm, cpu_index, lb_index2, 1,
315                vlib_buffer_length_in_chain (vm, p2)
316                + sizeof(ethernet_header_t));
317           vlib_increment_combined_counter
318               (cm, cpu_index, lb_index3, 1,
319                vlib_buffer_length_in_chain (vm, p3)
320                + sizeof(ethernet_header_t));
321
322           vlib_validate_buffer_enqueue_x4 (vm, node, next,
323                                            to_next, n_left_to_next,
324                                            pi0, pi1, pi2, pi3,
325                                            next0, next1, next2, next3);
326         }
327
328       while (n_left_from > 0 && n_left_to_next > 0)
329         {
330           vlib_buffer_t * p0;
331           ip4_header_t * ip0;
332           __attribute__((unused)) tcp_header_t * tcp0;
333           ip_lookup_next_t next0;
334           const load_balance_t *lb0;
335           ip4_fib_mtrie_t * mtrie0;
336           ip4_fib_mtrie_leaf_t leaf0;
337           ip4_address_t * dst_addr0;
338           __attribute__((unused)) u32 pi0, fib_index0, is_tcp_udp0, lbi0;
339           flow_hash_config_t flow_hash_config0;
340           const dpo_id_t *dpo0;
341           u32 hash_c0;
342
343           pi0 = from[0];
344           to_next[0] = pi0;
345
346           p0 = vlib_get_buffer (vm, pi0);
347
348           ip0 = vlib_buffer_get_current (p0);
349
350           dst_addr0 = &ip0->dst_address;
351
352           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
353           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
354             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
355
356           if (! lookup_for_responses_to_locally_received_packets)
357             {
358               mtrie0 = &ip4_fib_get( fib_index0)->mtrie;
359
360               leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
361
362               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
363             }
364
365           tcp0 = (void *) (ip0 + 1);
366
367           is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
368                          || ip0->protocol == IP_PROTOCOL_UDP);
369
370           if (! lookup_for_responses_to_locally_received_packets)
371             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
372
373           if (! lookup_for_responses_to_locally_received_packets)
374             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
375
376           if (! lookup_for_responses_to_locally_received_packets)
377             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
378
379           if (lookup_for_responses_to_locally_received_packets)
380             lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
381           else
382             {
383               /* Handle default route. */
384               leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
385               lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
386             }
387
388           lb0 = load_balance_get (lbi0);
389
390           /* Use flow hash to compute multipath adjacency. */
391           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
392           if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
393             {
394               flow_hash_config0 = lb0->lb_hash_config;
395
396               hash_c0 = vnet_buffer (p0)->ip.flow_hash =
397                 ip4_compute_flow_hash (ip0, flow_hash_config0);
398             }
399
400           ASSERT (lb0->lb_n_buckets > 0);
401           ASSERT (is_pow2 (lb0->lb_n_buckets));
402
403           dpo0 = load_balance_get_bucket_i(lb0,
404                                            (hash_c0 &
405                                             (lb0->lb_n_buckets_minus_1)));
406
407           next0 = dpo0->dpoi_next_node;
408           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
409
410           vlib_increment_combined_counter
411               (cm, cpu_index, lbi0, 1,
412                vlib_buffer_length_in_chain (vm, p0));
413
414           from += 1;
415           to_next += 1;
416           n_left_to_next -= 1;
417           n_left_from -= 1;
418
419           if (PREDICT_FALSE (next0 != next))
420             {
421               n_left_to_next += 1;
422               vlib_put_next_frame (vm, node, next, n_left_to_next);
423               next = next0;
424               vlib_get_next_frame (vm, node, next,
425                                    to_next, n_left_to_next);
426               to_next[0] = pi0;
427               to_next += 1;
428               n_left_to_next -= 1;
429             }
430         }
431
432       vlib_put_next_frame (vm, node, next, n_left_to_next);
433     }
434
435   if (node->flags & VLIB_NODE_FLAG_TRACE)
436     ip4_forward_next_trace(vm, node, frame, VLIB_TX);
437
438   return frame->n_vectors;
439 }
440
441 /** @brief IPv4 lookup node.
442     @node ip4-lookup
443
444     This is the main IPv4 lookup dispatch node.
445
446     @param vm vlib_main_t corresponding to the current thread
447     @param node vlib_node_runtime_t
448     @param frame vlib_frame_t whose contents should be dispatched
449
450     @par Graph mechanics: buffer metadata, next index usage
451
452     @em Uses:
453     - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
454         - Indicates the @c sw_if_index value of the interface that the
455           packet was received on.
456     - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
457         - When the value is @c ~0 then the node performs a longest prefix
458           match (LPM) for the packet destination address in the FIB attached
459           to the receive interface.
460         - Otherwise perform LPM for the packet destination address in the
461           indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
462           value (0, 1, ...) and not a VRF id.
463
464     @em Sets:
465     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
466         - The lookup result adjacency index.
467
468     <em>Next Index:</em>
469     - Dispatches the packet to the node index found in
470       ip_adjacency_t @c adj->lookup_next_index
471       (where @c adj is the lookup result adjacency).
472 */
473 static uword
474 ip4_lookup (vlib_main_t * vm,
475             vlib_node_runtime_t * node,
476             vlib_frame_t * frame)
477 {
478   return ip4_lookup_inline (vm, node, frame,
479                             /* lookup_for_responses_to_locally_received_packets */ 0);
480
481 }
482
483 static u8 * format_ip4_lookup_trace (u8 * s, va_list * args);
484
485 VLIB_REGISTER_NODE (ip4_lookup_node) = {
486   .function = ip4_lookup,
487   .name = "ip4-lookup",
488   .vector_size = sizeof (u32),
489
490   .format_trace = format_ip4_lookup_trace,
491   .n_next_nodes = IP_LOOKUP_N_NEXT,
492   .next_nodes = IP4_LOOKUP_NEXT_NODES,
493 };
494
495 VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_node, ip4_lookup)
496
497 always_inline uword
498 ip4_load_balance (vlib_main_t * vm,
499                   vlib_node_runtime_t * node,
500                   vlib_frame_t * frame)
501 {
502   vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters;
503   u32 n_left_from, n_left_to_next, * from, * to_next;
504   ip_lookup_next_t next;
505   u32 cpu_index = os_get_cpu_number();
506
507   from = vlib_frame_vector_args (frame);
508   n_left_from = frame->n_vectors;
509   next = node->cached_next_index;
510
511   if (node->flags & VLIB_NODE_FLAG_TRACE)
512       ip4_forward_next_trace(vm, node, frame, VLIB_TX);
513
514   while (n_left_from > 0)
515     {
516       vlib_get_next_frame (vm, node, next,
517                            to_next, n_left_to_next);
518
519
520       while (n_left_from >= 4 && n_left_to_next >= 2)
521         {
522           ip_lookup_next_t next0, next1;
523           const load_balance_t *lb0, *lb1;
524           vlib_buffer_t * p0, *p1;
525           u32 pi0, lbi0, hc0, pi1, lbi1, hc1;
526           const ip4_header_t *ip0, *ip1;
527           const dpo_id_t *dpo0, *dpo1;
528
529           /* Prefetch next iteration. */
530           {
531             vlib_buffer_t * p2, * p3;
532
533             p2 = vlib_get_buffer (vm, from[2]);
534             p3 = vlib_get_buffer (vm, from[3]);
535
536             vlib_prefetch_buffer_header (p2, STORE);
537             vlib_prefetch_buffer_header (p3, STORE);
538
539             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
540             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
541           }
542
543           pi0 = to_next[0] = from[0];
544           pi1 = to_next[1] = from[1];
545
546           from += 2;
547           n_left_from -= 2;
548           to_next += 2;
549           n_left_to_next -= 2;
550
551           p0 = vlib_get_buffer (vm, pi0);
552           p1 = vlib_get_buffer (vm, pi1);
553
554           ip0 = vlib_buffer_get_current (p0);
555           ip1 = vlib_buffer_get_current (p1);
556           lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
557           lbi1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX];
558
559           lb0 = load_balance_get(lbi0);
560           lb1 = load_balance_get(lbi1);
561
562           /*
563            * this node is for via FIBs we can re-use the hash value from the
564            * to node if present.
565            * We don't want to use the same hash value at each level in the recursion
566            * graph as that would lead to polarisation
567            */
568           hc0 = vnet_buffer (p0)->ip.flow_hash = 0;
569           hc1 = vnet_buffer (p1)->ip.flow_hash = 0;
570
571           if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
572           {
573               if (PREDICT_TRUE (vnet_buffer(p0)->ip.flow_hash))
574               {
575                   hc0 = vnet_buffer(p0)->ip.flow_hash = vnet_buffer(p0)->ip.flow_hash >> 1;
576               }
577               else
578               {
579                   hc0 = vnet_buffer(p0)->ip.flow_hash = ip4_compute_flow_hash(ip0, hc0);
580               }
581           }
582           if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
583           {
584               if (PREDICT_TRUE (vnet_buffer(p1)->ip.flow_hash))
585               {
586                   hc1 = vnet_buffer(p1)->ip.flow_hash = vnet_buffer(p1)->ip.flow_hash >> 1;
587               }
588               else
589               {
590                   hc1 = vnet_buffer(p1)->ip.flow_hash = ip4_compute_flow_hash(ip1, hc1);
591               }
592           }
593
594           dpo0 = load_balance_get_bucket_i(lb0, hc0 & (lb0->lb_n_buckets_minus_1));
595           dpo1 = load_balance_get_bucket_i(lb1, hc1 & (lb1->lb_n_buckets_minus_1));
596
597           next0 = dpo0->dpoi_next_node;
598           next1 = dpo1->dpoi_next_node;
599
600           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
601           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
602
603           vlib_increment_combined_counter
604               (cm, cpu_index, lbi0, 1,
605                vlib_buffer_length_in_chain (vm, p0));
606           vlib_increment_combined_counter
607               (cm, cpu_index, lbi1, 1,
608                vlib_buffer_length_in_chain (vm, p1));
609
610           vlib_validate_buffer_enqueue_x2 (vm, node, next,
611                                            to_next, n_left_to_next,
612                                            pi0, pi1, next0, next1);
613        }
614
615       while (n_left_from > 0 && n_left_to_next > 0)
616         {
617           ip_lookup_next_t next0;
618           const load_balance_t *lb0;
619           vlib_buffer_t * p0;
620           u32 pi0, lbi0, hc0;
621           const ip4_header_t *ip0;
622           const dpo_id_t *dpo0;
623
624           pi0 = from[0];
625           to_next[0] = pi0;
626           from += 1;
627           to_next += 1;
628           n_left_to_next -= 1;
629           n_left_from -= 1;
630
631           p0 = vlib_get_buffer (vm, pi0);
632
633           ip0 = vlib_buffer_get_current (p0);
634           lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
635
636           lb0 = load_balance_get(lbi0);
637
638           hc0 = vnet_buffer (p0)->ip.flow_hash = 0;
639           if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
640           {
641               if (PREDICT_TRUE (vnet_buffer(p0)->ip.flow_hash))
642               {
643                   hc0 = vnet_buffer(p0)->ip.flow_hash = vnet_buffer(p0)->ip.flow_hash >> 1;
644               }
645               else
646               {
647                   hc0 = vnet_buffer(p0)->ip.flow_hash = ip4_compute_flow_hash(ip0, hc0);
648               }
649           }
650
651           dpo0 = load_balance_get_bucket_i(lb0, hc0 & (lb0->lb_n_buckets_minus_1));
652
653           next0 = dpo0->dpoi_next_node;
654           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
655
656           vlib_increment_combined_counter
657               (cm, cpu_index, lbi0, 1,
658                vlib_buffer_length_in_chain (vm, p0));
659
660           vlib_validate_buffer_enqueue_x1 (vm, node, next,
661                                            to_next, n_left_to_next,
662                                            pi0, next0);
663         }
664
665       vlib_put_next_frame (vm, node, next, n_left_to_next);
666     }
667
668   return frame->n_vectors;
669 }
670
671 VLIB_REGISTER_NODE (ip4_load_balance_node) = {
672   .function = ip4_load_balance,
673   .name = "ip4-load-balance",
674   .vector_size = sizeof (u32),
675   .sibling_of = "ip4-lookup",
676
677   .format_trace = format_ip4_lookup_trace,
678 };
679
680 VLIB_NODE_FUNCTION_MULTIARCH (ip4_load_balance_node, ip4_load_balance)
681
682 /* get first interface address */
683 ip4_address_t *
684 ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
685                              ip_interface_address_t ** result_ia)
686 {
687   ip_lookup_main_t * lm = &im->lookup_main;
688   ip_interface_address_t * ia = 0;
689   ip4_address_t * result = 0;
690
691   foreach_ip_interface_address (lm, ia, sw_if_index,
692                                 1 /* honor unnumbered */,
693   ({
694     ip4_address_t * a = ip_interface_address_get_address (lm, ia);
695     result = a;
696     break;
697   }));
698   if (result_ia)
699     *result_ia = result ? ia : 0;
700   return result;
701 }
702
703 static void
704 ip4_add_interface_routes (u32 sw_if_index,
705                           ip4_main_t * im, u32 fib_index,
706                           ip_interface_address_t * a)
707 {
708   ip_lookup_main_t * lm = &im->lookup_main;
709   ip4_address_t * address = ip_interface_address_get_address (lm, a);
710   fib_prefix_t pfx = {
711       .fp_len = a->address_length,
712       .fp_proto = FIB_PROTOCOL_IP4,
713       .fp_addr.ip4 = *address,
714   };
715
716   a->neighbor_probe_adj_index = ~0;
717
718   if (pfx.fp_len < 32)
719   {
720       fib_node_index_t fei;
721
722       fei = fib_table_entry_update_one_path(fib_index,
723                                             &pfx,
724                                             FIB_SOURCE_INTERFACE,
725                                             (FIB_ENTRY_FLAG_CONNECTED |
726                                              FIB_ENTRY_FLAG_ATTACHED),
727                                             FIB_PROTOCOL_IP4,
728                                             NULL, /* No next-hop address */
729                                             sw_if_index,
730                                             ~0, // invalid FIB index
731                                             1,
732                                             NULL, // no out-label stack
733                                             FIB_ROUTE_PATH_FLAG_NONE);
734       a->neighbor_probe_adj_index = fib_entry_get_adj(fei);
735   }
736
737   pfx.fp_len = 32;
738
739   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
740   {
741       u32 classify_table_index =
742           lm->classify_table_index_by_sw_if_index [sw_if_index];
743       if (classify_table_index != (u32) ~0)
744       {
745           dpo_id_t dpo = DPO_INVALID;
746
747           dpo_set(&dpo,
748                   DPO_CLASSIFY,
749                   DPO_PROTO_IP4,
750                   classify_dpo_create(DPO_PROTO_IP4,
751                                       classify_table_index));
752
753           fib_table_entry_special_dpo_add(fib_index,
754                                           &pfx,
755                                           FIB_SOURCE_CLASSIFY,
756                                           FIB_ENTRY_FLAG_NONE,
757                                           &dpo);
758           dpo_reset(&dpo);
759       }
760   }
761
762   fib_table_entry_update_one_path(fib_index,
763                                   &pfx,
764                                   FIB_SOURCE_INTERFACE,
765                                   (FIB_ENTRY_FLAG_CONNECTED |
766                                    FIB_ENTRY_FLAG_LOCAL),
767                                   FIB_PROTOCOL_IP4,
768                                   &pfx.fp_addr,
769                                   sw_if_index,
770                                   ~0, // invalid FIB index
771                                   1,
772                                   NULL, // no out-label stack
773                                   FIB_ROUTE_PATH_FLAG_NONE);
774 }
775
776 static void
777 ip4_del_interface_routes (ip4_main_t * im,
778                           u32 fib_index,
779                           ip4_address_t * address,
780                           u32 address_length)
781 {
782     fib_prefix_t pfx = {
783         .fp_len = address_length,
784         .fp_proto = FIB_PROTOCOL_IP4,
785         .fp_addr.ip4 = *address,
786     };
787
788     if (pfx.fp_len < 32)
789     {
790         fib_table_entry_delete(fib_index,
791                                &pfx,
792                                FIB_SOURCE_INTERFACE);
793     }
794
795     pfx.fp_len = 32;
796     fib_table_entry_delete(fib_index,
797                            &pfx,
798                            FIB_SOURCE_INTERFACE);
799 }
800
801 void
802 ip4_sw_interface_enable_disable (u32 sw_if_index,
803                                  u32 is_enable)
804 {
805   ip4_main_t * im = &ip4_main;
806
807   vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
808
809   /*
810    * enable/disable only on the 1<->0 transition
811    */
812   if (is_enable)
813     {
814       if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
815         return;
816     }
817   else
818     {
819       ASSERT(im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
820       if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
821         return;
822     }
823   vnet_feature_enable_disable ("ip4-unicast", "ip4-lookup", sw_if_index,
824                                is_enable, 0, 0);
825
826   vnet_feature_enable_disable ("ip4-multicast", "ip4-lookup-multicast", sw_if_index,
827                                is_enable, 0, 0);
828
829 }
830
831 static clib_error_t *
832 ip4_add_del_interface_address_internal (vlib_main_t * vm,
833                                         u32 sw_if_index,
834                                         ip4_address_t * address,
835                                         u32 address_length,
836                                         u32 is_del)
837 {
838   vnet_main_t * vnm = vnet_get_main();
839   ip4_main_t * im = &ip4_main;
840   ip_lookup_main_t * lm = &im->lookup_main;
841   clib_error_t * error = 0;
842   u32 if_address_index, elts_before;
843   ip4_address_fib_t ip4_af, * addr_fib = 0;
844
845   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
846   ip4_addr_fib_init (&ip4_af, address,
847                      vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
848   vec_add1 (addr_fib, ip4_af);
849
850   /* FIXME-LATER
851    * there is no support for adj-fib handling in the presence of overlapping
852    * subnets on interfaces. Easy fix - disallow overlapping subnets, like
853    * most routers do.
854    */
855   if (! is_del)
856     {
857       /* When adding an address check that it does not conflict
858          with an existing address. */
859       ip_interface_address_t * ia;
860       foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
861                                     0 /* honor unnumbered */,
862       ({
863         ip4_address_t * x = ip_interface_address_get_address (&im->lookup_main, ia);
864
865         if (ip4_destination_matches_route (im, address, x, ia->address_length)
866             || ip4_destination_matches_route (im, x, address, address_length))
867           return clib_error_create ("failed to add %U which conflicts with %U for interface %U",
868                                     format_ip4_address_and_length, address, address_length,
869                                     format_ip4_address_and_length, x, ia->address_length,
870                                     format_vnet_sw_if_index_name, vnm, sw_if_index);
871        }));
872     }
873
874   elts_before = pool_elts (lm->if_address_pool);
875
876   error = ip_interface_address_add_del
877     (lm,
878      sw_if_index,
879      addr_fib,
880      address_length,
881      is_del,
882      &if_address_index);
883   if (error)
884     goto done;
885
886   ip4_sw_interface_enable_disable(sw_if_index, !is_del);
887
888   if (is_del)
889       ip4_del_interface_routes (im, ip4_af.fib_index, address,
890                                 address_length);
891   else
892       ip4_add_interface_routes (sw_if_index,
893                                 im, ip4_af.fib_index,
894                                 pool_elt_at_index
895                                 (lm->if_address_pool, if_address_index));
896
897   /* If pool did not grow/shrink: add duplicate address. */
898   if (elts_before != pool_elts (lm->if_address_pool))
899     {
900       ip4_add_del_interface_address_callback_t * cb;
901       vec_foreach (cb, im->add_del_interface_address_callbacks)
902         cb->function (im, cb->function_opaque, sw_if_index,
903                       address, address_length,
904                       if_address_index,
905                       is_del);
906     }
907
908  done:
909   vec_free (addr_fib);
910   return error;
911 }
912
913 clib_error_t *
914 ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
915                                ip4_address_t * address, u32 address_length,
916                                u32 is_del)
917 {
918   return ip4_add_del_interface_address_internal
919     (vm, sw_if_index, address, address_length,
920      is_del);
921 }
922
923 /* Built-in ip4 unicast rx feature path definition */
924 VNET_FEATURE_ARC_INIT (ip4_unicast, static) =
925 {
926   .arc_name  = "ip4-unicast",
927   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
928   .arc_index_ptr = &ip4_main.lookup_main.ucast_feature_arc_index,
929 };
930
931 VNET_FEATURE_INIT (ip4_flow_classify, static) = {
932   .arc_name = "ip4-unicast",
933   .node_name = "ip4-flow-classify",
934   .runs_before = VNET_FEATURES ("ip4-inacl"),
935 };
936
937 VNET_FEATURE_INIT (ip4_inacl, static) = {
938   .arc_name = "ip4-unicast",
939   .node_name = "ip4-inacl",
940   .runs_before = VNET_FEATURES ("ip4-source-check-via-rx"),
941 };
942
943 VNET_FEATURE_INIT (ip4_source_check_1, static) = {
944   .arc_name = "ip4-unicast",
945   .node_name = "ip4-source-check-via-rx",
946   .runs_before = VNET_FEATURES ("ip4-source-check-via-any"),
947 };
948
949 VNET_FEATURE_INIT (ip4_source_check_2, static) = {
950   .arc_name = "ip4-unicast",
951   .node_name = "ip4-source-check-via-any",
952   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
953 };
954
955 VNET_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) = {
956   .arc_name = "ip4-unicast",
957   .node_name = "ip4-source-and-port-range-check-rx",
958   .runs_before = VNET_FEATURES ("ip4-policer-classify"),
959 };
960
961 VNET_FEATURE_INIT (ip4_policer_classify, static) = {
962   .arc_name = "ip4-unicast",
963   .node_name = "ip4-policer-classify",
964   .runs_before = VNET_FEATURES ("ipsec-input-ip4"),
965 };
966
967 VNET_FEATURE_INIT (ip4_ipsec, static) = {
968   .arc_name = "ip4-unicast",
969   .node_name = "ipsec-input-ip4",
970   .runs_before = VNET_FEATURES ("vpath-input-ip4"),
971 };
972
973 VNET_FEATURE_INIT (ip4_vpath, static) = {
974   .arc_name = "ip4-unicast",
975   .node_name = "vpath-input-ip4",
976   .runs_before = VNET_FEATURES ("ip4-vxlan-bypass"),
977 };
978
979 VNET_FEATURE_INIT (ip4_vxlan_bypass, static) = {
980   .arc_name = "ip4-unicast",
981   .node_name = "ip4-vxlan-bypass",
982   .runs_before = VNET_FEATURES ("ip4-lookup"),
983 };
984
985 VNET_FEATURE_INIT (ip4_lookup, static) = {
986   .arc_name = "ip4-unicast",
987   .node_name = "ip4-lookup",
988   .runs_before = VNET_FEATURES ("ip4-drop"),
989 };
990
991 VNET_FEATURE_INIT (ip4_drop, static) = {
992   .arc_name = "ip4-unicast",
993   .node_name = "ip4-drop",
994   .runs_before = 0, /* not before any other features */
995 };
996
997
998 /* Built-in ip4 multicast rx feature path definition */
999 VNET_FEATURE_ARC_INIT (ip4_multicast, static) =
1000 {
1001   .arc_name  = "ip4-multicast",
1002   .start_nodes = VNET_FEATURES ("ip4-input", "ip4-input-no-checksum"),
1003   .arc_index_ptr = &ip4_main.lookup_main.mcast_feature_arc_index,
1004 };
1005
1006 VNET_FEATURE_INIT (ip4_vpath_mc, static) = {
1007   .arc_name = "ip4-multicast",
1008   .node_name = "vpath-input-ip4",
1009   .runs_before = VNET_FEATURES ("ip4-lookup-multicast"),
1010 };
1011
1012 VNET_FEATURE_INIT (ip4_lookup_mc, static) = {
1013   .arc_name = "ip4-multicast",
1014   .node_name = "ip4-lookup-multicast",
1015   .runs_before = VNET_FEATURES ("ip4-drop"),
1016 };
1017
1018 VNET_FEATURE_INIT (ip4_mc_drop, static) = {
1019   .arc_name = "ip4-multicast",
1020   .node_name = "ip4-drop",
1021   .runs_before = 0, /* last feature */
1022 };
1023
1024 /* Source and port-range check ip4 tx feature path definition */
1025 VNET_FEATURE_ARC_INIT (ip4_output, static) =
1026 {
1027   .arc_name  = "ip4-output",
1028   .start_nodes = VNET_FEATURES ("ip4-rewrite", "ip4-midchain"),
1029   .arc_index_ptr = &ip4_main.lookup_main.output_feature_arc_index,
1030 };
1031
1032 VNET_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) = {
1033   .arc_name = "ip4-output",
1034   .node_name = "ip4-source-and-port-range-check-tx",
1035   .runs_before = VNET_FEATURES ("ipsec-output-ip4"),
1036 };
1037
1038 VNET_FEATURE_INIT (ip4_ipsec_output, static) = {
1039   .arc_name = "ip4-output",
1040   .node_name = "ipsec-output-ip4",
1041   .runs_before = VNET_FEATURES ("interface-output"),
1042 };
1043
1044 /* Built-in ip4 tx feature path definition */
1045 VNET_FEATURE_INIT (ip4_interface_output, static) = {
1046   .arc_name = "ip4-output",
1047   .node_name = "interface-output",
1048   .runs_before = 0, /* not before any other features */
1049 };
1050
1051
1052 static clib_error_t *
1053 ip4_sw_interface_add_del (vnet_main_t * vnm,
1054                           u32 sw_if_index,
1055                           u32 is_add)
1056 {
1057   ip4_main_t * im = &ip4_main;
1058
1059   /* Fill in lookup tables with default table (0). */
1060   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
1061
1062   vnet_feature_enable_disable ("ip4-unicast", "ip4-drop", sw_if_index,
1063                                is_add, 0, 0);
1064
1065   vnet_feature_enable_disable ("ip4-multicast", "ip4-drop", sw_if_index,
1066                                is_add, 0, 0);
1067
1068   vnet_feature_enable_disable ("ip4-output", "interface-output", sw_if_index,
1069                                is_add, 0, 0);
1070
1071   return /* no error */ 0;
1072 }
1073
1074 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
1075
1076 /* Global IP4 main. */
1077 ip4_main_t ip4_main;
1078
1079 clib_error_t *
1080 ip4_lookup_init (vlib_main_t * vm)
1081 {
1082   ip4_main_t * im = &ip4_main;
1083   clib_error_t * error;
1084   uword i;
1085
1086   if ((error = vlib_call_init_function (vm, vnet_feature_init)))
1087     return error;
1088
1089   for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
1090     {
1091       u32 m;
1092
1093       if (i < 32)
1094         m = pow2_mask (i) << (32 - i);
1095       else
1096         m = ~0;
1097       im->fib_masks[i] = clib_host_to_net_u32 (m);
1098     }
1099
1100   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
1101
1102   /* Create FIB with index 0 and table id of 0. */
1103   fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP4, 0);
1104
1105   {
1106     pg_node_t * pn;
1107     pn = pg_get_node (ip4_lookup_node.index);
1108     pn->unformat_edit = unformat_pg_ip4_header;
1109   }
1110
1111   {
1112     ethernet_arp_header_t h;
1113
1114     memset (&h, 0, sizeof (h));
1115
1116     /* Set target ethernet address to all zeros. */
1117     memset (h.ip4_over_ethernet[1].ethernet, 0, sizeof (h.ip4_over_ethernet[1].ethernet));
1118
1119 #define _16(f,v) h.f = clib_host_to_net_u16 (v);
1120 #define _8(f,v) h.f = v;
1121     _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
1122     _16 (l3_type, ETHERNET_TYPE_IP4);
1123     _8 (n_l2_address_bytes, 6);
1124     _8 (n_l3_address_bytes, 4);
1125     _16 (opcode, ETHERNET_ARP_OPCODE_request);
1126 #undef _16
1127 #undef _8
1128
1129     vlib_packet_template_init (vm,
1130                                &im->ip4_arp_request_packet_template,
1131                                /* data */ &h,
1132                                sizeof (h),
1133                                /* alloc chunk size */ 8,
1134                                "ip4 arp");
1135   }
1136
1137   return error;
1138 }
1139
1140 VLIB_INIT_FUNCTION (ip4_lookup_init);
1141
1142 typedef struct {
1143   /* Adjacency taken. */
1144   u32 dpo_index;
1145   u32 flow_hash;
1146   u32 fib_index;
1147
1148   /* Packet data, possibly *after* rewrite. */
1149   u8 packet_data[64 - 1*sizeof(u32)];
1150 } ip4_forward_next_trace_t;
1151
1152 u8 * format_ip4_forward_next_trace (u8 * s, va_list * args)
1153 {
1154   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1155   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1156   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1157   uword indent = format_get_indent (s);
1158   s = format (s, "%U%U",
1159               format_white_space, indent,
1160               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1161   return s;
1162 }
1163
1164 static u8 * format_ip4_lookup_trace (u8 * s, va_list * args)
1165 {
1166   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1167   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1168   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1169   uword indent = format_get_indent (s);
1170
1171   s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
1172               t->fib_index, t->dpo_index, t->flow_hash);
1173   s = format (s, "\n%U%U",
1174               format_white_space, indent,
1175               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1176   return s;
1177 }
1178
1179 static u8 * format_ip4_rewrite_trace (u8 * s, va_list * args)
1180 {
1181   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1182   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1183   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1184   vnet_main_t * vnm = vnet_get_main();
1185   uword indent = format_get_indent (s);
1186
1187   s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
1188               t->fib_index, t->dpo_index, format_ip_adjacency,
1189               t->dpo_index, FORMAT_IP_ADJACENCY_NONE,
1190               t->flow_hash);
1191   s = format (s, "\n%U%U",
1192               format_white_space, indent,
1193               format_ip_adjacency_packet_data,
1194               vnm, t->dpo_index,
1195               t->packet_data, sizeof (t->packet_data));
1196   return s;
1197 }
1198
1199 /* Common trace function for all ip4-forward next nodes. */
1200 void
1201 ip4_forward_next_trace (vlib_main_t * vm,
1202                         vlib_node_runtime_t * node,
1203                         vlib_frame_t * frame,
1204                         vlib_rx_or_tx_t which_adj_index)
1205 {
1206   u32 * from, n_left;
1207   ip4_main_t * im = &ip4_main;
1208
1209   n_left = frame->n_vectors;
1210   from = vlib_frame_vector_args (frame);
1211
1212   while (n_left >= 4)
1213     {
1214       u32 bi0, bi1;
1215       vlib_buffer_t * b0, * b1;
1216       ip4_forward_next_trace_t * t0, * t1;
1217
1218       /* Prefetch next iteration. */
1219       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1220       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1221
1222       bi0 = from[0];
1223       bi1 = from[1];
1224
1225       b0 = vlib_get_buffer (vm, bi0);
1226       b1 = vlib_get_buffer (vm, bi1);
1227
1228       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1229         {
1230           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1231           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1232           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1233           t0->fib_index = (vnet_buffer(b0)->sw_if_index[VLIB_TX] != (u32)~0) ?
1234               vnet_buffer(b0)->sw_if_index[VLIB_TX] :
1235               vec_elt (im->fib_index_by_sw_if_index,
1236                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
1237
1238           clib_memcpy (t0->packet_data,
1239                   vlib_buffer_get_current (b0),
1240                   sizeof (t0->packet_data));
1241         }
1242       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1243         {
1244           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1245           t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1246           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1247           t1->fib_index = (vnet_buffer(b1)->sw_if_index[VLIB_TX] != (u32)~0) ?
1248               vnet_buffer(b1)->sw_if_index[VLIB_TX] :
1249               vec_elt (im->fib_index_by_sw_if_index,
1250                        vnet_buffer(b1)->sw_if_index[VLIB_RX]);
1251           clib_memcpy (t1->packet_data,
1252                   vlib_buffer_get_current (b1),
1253                   sizeof (t1->packet_data));
1254         }
1255       from += 2;
1256       n_left -= 2;
1257     }
1258
1259   while (n_left >= 1)
1260     {
1261       u32 bi0;
1262       vlib_buffer_t * b0;
1263       ip4_forward_next_trace_t * t0;
1264
1265       bi0 = from[0];
1266
1267       b0 = vlib_get_buffer (vm, bi0);
1268
1269       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1270         {
1271           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1272           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1273           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1274           t0->fib_index = (vnet_buffer(b0)->sw_if_index[VLIB_TX] != (u32)~0) ?
1275               vnet_buffer(b0)->sw_if_index[VLIB_TX] :
1276               vec_elt (im->fib_index_by_sw_if_index,
1277                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
1278           clib_memcpy (t0->packet_data,
1279                   vlib_buffer_get_current (b0),
1280                   sizeof (t0->packet_data));
1281         }
1282       from += 1;
1283       n_left -= 1;
1284     }
1285 }
1286
1287 static uword
1288 ip4_drop_or_punt (vlib_main_t * vm,
1289                   vlib_node_runtime_t * node,
1290                   vlib_frame_t * frame,
1291                   ip4_error_t error_code)
1292 {
1293   u32 * buffers = vlib_frame_vector_args (frame);
1294   uword n_packets = frame->n_vectors;
1295
1296   vlib_error_drop_buffers (vm, node,
1297                            buffers,
1298                            /* stride */ 1,
1299                            n_packets,
1300                            /* next */ 0,
1301                            ip4_input_node.index,
1302                            error_code);
1303
1304   if (node->flags & VLIB_NODE_FLAG_TRACE)
1305     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1306
1307   return n_packets;
1308 }
1309
1310 static uword
1311 ip4_drop (vlib_main_t * vm,
1312           vlib_node_runtime_t * node,
1313           vlib_frame_t * frame)
1314 { return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_DROP); }
1315
1316 static uword
1317 ip4_punt (vlib_main_t * vm,
1318           vlib_node_runtime_t * node,
1319           vlib_frame_t * frame)
1320 { return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_PUNT); }
1321
1322 VLIB_REGISTER_NODE (ip4_drop_node,static) = {
1323   .function = ip4_drop,
1324   .name = "ip4-drop",
1325   .vector_size = sizeof (u32),
1326
1327   .format_trace = format_ip4_forward_next_trace,
1328
1329   .n_next_nodes = 1,
1330   .next_nodes = {
1331     [0] = "error-drop",
1332   },
1333 };
1334
1335 VLIB_NODE_FUNCTION_MULTIARCH (ip4_drop_node, ip4_drop)
1336
1337 VLIB_REGISTER_NODE (ip4_punt_node,static) = {
1338   .function = ip4_punt,
1339   .name = "ip4-punt",
1340   .vector_size = sizeof (u32),
1341
1342   .format_trace = format_ip4_forward_next_trace,
1343
1344   .n_next_nodes = 1,
1345   .next_nodes = {
1346     [0] = "error-punt",
1347   },
1348 };
1349
1350 VLIB_NODE_FUNCTION_MULTIARCH (ip4_punt_node, ip4_punt)
1351
1352 /* Compute TCP/UDP/ICMP4 checksum in software. */
1353 u16
1354 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
1355                               ip4_header_t * ip0)
1356 {
1357   ip_csum_t sum0;
1358   u32 ip_header_length, payload_length_host_byte_order;
1359   u32 n_this_buffer, n_bytes_left;
1360   u16 sum16;
1361   void * data_this_buffer;
1362
1363   /* Initialize checksum with ip header. */
1364   ip_header_length = ip4_header_bytes (ip0);
1365   payload_length_host_byte_order = clib_net_to_host_u16 (ip0->length) - ip_header_length;
1366   sum0 = clib_host_to_net_u32 (payload_length_host_byte_order + (ip0->protocol << 16));
1367
1368   if (BITS (uword) == 32)
1369     {
1370       sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u32));
1371       sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32));
1372     }
1373   else
1374     sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
1375
1376   n_bytes_left = n_this_buffer = payload_length_host_byte_order;
1377   data_this_buffer = (void *) ip0 + ip_header_length;
1378   if (n_this_buffer + ip_header_length > p0->current_length)
1379     n_this_buffer = p0->current_length > ip_header_length ? p0->current_length - ip_header_length : 0;
1380   while (1)
1381     {
1382       sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
1383       n_bytes_left -= n_this_buffer;
1384       if (n_bytes_left == 0)
1385         break;
1386
1387       ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT);
1388       p0 = vlib_get_buffer (vm, p0->next_buffer);
1389       data_this_buffer = vlib_buffer_get_current (p0);
1390       n_this_buffer = p0->current_length;
1391     }
1392
1393   sum16 = ~ ip_csum_fold (sum0);
1394
1395   return sum16;
1396 }
1397
1398 u32
1399 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1400 {
1401   ip4_header_t * ip0 = vlib_buffer_get_current (p0);
1402   udp_header_t * udp0;
1403   u16 sum16;
1404
1405   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1406           || ip0->protocol == IP_PROTOCOL_UDP);
1407
1408   udp0 = (void *) (ip0 + 1);
1409   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1410     {
1411       p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
1412                     | IP_BUFFER_L4_CHECKSUM_CORRECT);
1413       return p0->flags;
1414     }
1415
1416   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1417
1418   p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
1419                 | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT));
1420
1421   return p0->flags;
1422 }
1423
1424 static uword
1425 ip4_local (vlib_main_t * vm,
1426            vlib_node_runtime_t * node,
1427            vlib_frame_t * frame)
1428 {
1429   ip4_main_t * im = &ip4_main;
1430   ip_lookup_main_t * lm = &im->lookup_main;
1431   ip_local_next_t next_index;
1432   u32 * from, * to_next, n_left_from, n_left_to_next;
1433   vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
1434
1435   from = vlib_frame_vector_args (frame);
1436   n_left_from = frame->n_vectors;
1437   next_index = node->cached_next_index;
1438
1439   if (node->flags & VLIB_NODE_FLAG_TRACE)
1440     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1441
1442   while (n_left_from > 0)
1443     {
1444       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1445
1446       while (n_left_from >= 4 && n_left_to_next >= 2)
1447         {
1448           vlib_buffer_t * p0, * p1;
1449           ip4_header_t * ip0, * ip1;
1450           udp_header_t * udp0, * udp1;
1451           ip4_fib_mtrie_t * mtrie0, * mtrie1;
1452           ip4_fib_mtrie_leaf_t leaf0, leaf1;
1453           const dpo_id_t *dpo0, *dpo1;
1454           const load_balance_t *lb0, *lb1;
1455           u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, lbi0;
1456           u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, lbi1;
1457           i32 len_diff0, len_diff1;
1458           u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
1459           u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1;
1460           u8 enqueue_code;
1461
1462           pi0 = to_next[0] = from[0];
1463           pi1 = to_next[1] = from[1];
1464           from += 2;
1465           n_left_from -= 2;
1466           to_next += 2;
1467           n_left_to_next -= 2;
1468
1469           p0 = vlib_get_buffer (vm, pi0);
1470           p1 = vlib_get_buffer (vm, pi1);
1471
1472           ip0 = vlib_buffer_get_current (p0);
1473           ip1 = vlib_buffer_get_current (p1);
1474
1475           vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data;
1476           vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data;
1477
1478           fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
1479                                 vnet_buffer (p0)->sw_if_index[VLIB_RX]);
1480           fib_index1 = vec_elt (im->fib_index_by_sw_if_index,
1481                                 vnet_buffer(p1)->sw_if_index[VLIB_RX]);
1482
1483           mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
1484           mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
1485
1486           leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
1487
1488           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
1489           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 0);
1490
1491           /* Treat IP frag packets as "experimental" protocol for now
1492              until support of IP frag reassembly is implemented */
1493           proto0 = ip4_is_fragment(ip0) ? 0xfe : ip0->protocol;
1494           proto1 = ip4_is_fragment(ip1) ? 0xfe : ip1->protocol;
1495           is_udp0 = proto0 == IP_PROTOCOL_UDP;
1496           is_udp1 = proto1 == IP_PROTOCOL_UDP;
1497           is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
1498           is_tcp_udp1 = is_udp1 || proto1 == IP_PROTOCOL_TCP;
1499
1500           flags0 = p0->flags;
1501           flags1 = p1->flags;
1502
1503           good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1504           good_tcp_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1505
1506           udp0 = ip4_next_header (ip0);
1507           udp1 = ip4_next_header (ip1);
1508
1509           /* Don't verify UDP checksum for packets with explicit zero checksum. */
1510           good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1511           good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
1512
1513           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
1514           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1);
1515
1516           /* Verify UDP length. */
1517           ip_len0 = clib_net_to_host_u16 (ip0->length);
1518           ip_len1 = clib_net_to_host_u16 (ip1->length);
1519           udp_len0 = clib_net_to_host_u16 (udp0->length);
1520           udp_len1 = clib_net_to_host_u16 (udp1->length);
1521
1522           len_diff0 = ip_len0 - udp_len0;
1523           len_diff1 = ip_len1 - udp_len1;
1524
1525           len_diff0 = is_udp0 ? len_diff0 : 0;
1526           len_diff1 = is_udp1 ? len_diff1 : 0;
1527
1528           if (PREDICT_FALSE (! (is_tcp_udp0 & is_tcp_udp1
1529                                 & good_tcp_udp0 & good_tcp_udp1)))
1530             {
1531               if (is_tcp_udp0)
1532                 {
1533                   if (is_tcp_udp0
1534                       && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1535                     flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
1536                   good_tcp_udp0 =
1537                     (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1538                   good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1539                 }
1540               if (is_tcp_udp1)
1541                 {
1542                   if (is_tcp_udp1
1543                       && ! (flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1544                     flags1 = ip4_tcp_udp_validate_checksum (vm, p1);
1545                   good_tcp_udp1 =
1546                     (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1547                   good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
1548                 }
1549             }
1550
1551           good_tcp_udp0 &= len_diff0 >= 0;
1552           good_tcp_udp1 &= len_diff1 >= 0;
1553
1554           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1555           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2);
1556
1557           error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL;
1558
1559           error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
1560           error1 = len_diff1 < 0 ? IP4_ERROR_UDP_LENGTH : error1;
1561
1562           ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1563           error0 = (is_tcp_udp0 && ! good_tcp_udp0
1564                     ? IP4_ERROR_TCP_CHECKSUM + is_udp0
1565                     : error0);
1566           error1 = (is_tcp_udp1 && ! good_tcp_udp1
1567                     ? IP4_ERROR_TCP_CHECKSUM + is_udp1
1568                     : error1);
1569
1570           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1571           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
1572           leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
1573           leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
1574
1575           vnet_buffer (p0)->ip.adj_index[VLIB_RX] = lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1576           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;
1577
1578           vnet_buffer (p1)->ip.adj_index[VLIB_RX] = lbi1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
1579           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = lbi1;
1580
1581           lb0 = load_balance_get(lbi0);
1582           lb1 = load_balance_get(lbi1);
1583           dpo0 = load_balance_get_bucket_i(lb0, 0);
1584           dpo1 = load_balance_get_bucket_i(lb1, 0);
1585
1586           /*
1587            * Must have a route to source otherwise we drop the packet.
1588            * ip4 broadcasts are accepted, e.g. to make dhcp client work
1589            *
1590            * The checks are:
1591            *  - the source is a recieve => it's from us => bogus, do this
1592            *    first since it sets a different error code.
1593            *  - uRPF check for any route to source - accept if passes.
1594            *  - allow packets destined to the broadcast address from unknown sources
1595            */
1596           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1597                      dpo0->dpoi_type == DPO_RECEIVE) ?
1598                     IP4_ERROR_SPOOFED_LOCAL_PACKETS :
1599                     error0);
1600           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1601                      !fib_urpf_check_size(lb0->lb_urpf) &&
1602                      ip0->dst_address.as_u32 != 0xFFFFFFFF)
1603                     ? IP4_ERROR_SRC_LOOKUP_MISS
1604                     : error0);
1605           error1 = ((error1 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1606                      dpo1->dpoi_type == DPO_RECEIVE) ?
1607                     IP4_ERROR_SPOOFED_LOCAL_PACKETS :
1608                     error1);
1609           error1 = ((error1 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1610                      !fib_urpf_check_size(lb1->lb_urpf) &&
1611                      ip1->dst_address.as_u32 != 0xFFFFFFFF)
1612                     ? IP4_ERROR_SRC_LOOKUP_MISS
1613                     : error1);
1614
1615           next0 = lm->local_next_by_ip_protocol[proto0];
1616           next1 = lm->local_next_by_ip_protocol[proto1];
1617
1618           next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
1619           next1 = error1 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next1;
1620
1621           p0->error = error0 ? error_node->errors[error0] : 0;
1622           p1->error = error1 ? error_node->errors[error1] : 0;
1623
1624           enqueue_code = (next0 != next_index) + 2*(next1 != next_index);
1625
1626           if (PREDICT_FALSE (enqueue_code != 0))
1627             {
1628               switch (enqueue_code)
1629                 {
1630                 case 1:
1631                   /* A B A */
1632                   to_next[-2] = pi1;
1633                   to_next -= 1;
1634                   n_left_to_next += 1;
1635                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
1636                   break;
1637
1638                 case 2:
1639                   /* A A B */
1640                   to_next -= 1;
1641                   n_left_to_next += 1;
1642                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
1643                   break;
1644
1645                 case 3:
1646                   /* A B B or A B C */
1647                   to_next -= 2;
1648                   n_left_to_next += 2;
1649                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
1650                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
1651                   if (next0 == next1)
1652                     {
1653                       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1654                       next_index = next1;
1655                       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1656                     }
1657                   break;
1658                 }
1659             }
1660         }
1661
1662       while (n_left_from > 0 && n_left_to_next > 0)
1663         {
1664           vlib_buffer_t * p0;
1665           ip4_header_t * ip0;
1666           udp_header_t * udp0;
1667           ip4_fib_mtrie_t * mtrie0;
1668           ip4_fib_mtrie_leaf_t leaf0;
1669           u32 pi0, next0, ip_len0, udp_len0, flags0, fib_index0, lbi0;
1670           i32 len_diff0;
1671           u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
1672           load_balance_t *lb0;
1673           const dpo_id_t *dpo0;
1674
1675           pi0 = to_next[0] = from[0];
1676           from += 1;
1677           n_left_from -= 1;
1678           to_next += 1;
1679           n_left_to_next -= 1;
1680
1681           p0 = vlib_get_buffer (vm, pi0);
1682
1683           ip0 = vlib_buffer_get_current (p0);
1684
1685           vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data;
1686
1687           fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
1688                                 vnet_buffer(p0)->sw_if_index[VLIB_RX]);
1689
1690           mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
1691
1692           leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
1693
1694           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
1695
1696           /* Treat IP frag packets as "experimental" protocol for now
1697              until support of IP frag reassembly is implemented */
1698           proto0 = ip4_is_fragment(ip0) ? 0xfe : ip0->protocol;
1699           is_udp0 = proto0 == IP_PROTOCOL_UDP;
1700           is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
1701
1702           flags0 = p0->flags;
1703
1704           good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1705
1706           udp0 = ip4_next_header (ip0);
1707
1708           /* Don't verify UDP checksum for packets with explicit zero checksum. */
1709           good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1710
1711           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
1712
1713           /* Verify UDP length. */
1714           ip_len0 = clib_net_to_host_u16 (ip0->length);
1715           udp_len0 = clib_net_to_host_u16 (udp0->length);
1716
1717           len_diff0 = ip_len0 - udp_len0;
1718
1719           len_diff0 = is_udp0 ? len_diff0 : 0;
1720
1721           if (PREDICT_FALSE (! (is_tcp_udp0 & good_tcp_udp0)))
1722             {
1723               if (is_tcp_udp0)
1724                 {
1725                   if (is_tcp_udp0
1726                       && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1727                     flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
1728                   good_tcp_udp0 =
1729                     (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1730                   good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1731                 }
1732             }
1733
1734           good_tcp_udp0 &= len_diff0 >= 0;
1735
1736           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1737
1738           error0 = IP4_ERROR_UNKNOWN_PROTOCOL;
1739
1740           error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
1741
1742           ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1743           error0 = (is_tcp_udp0 && ! good_tcp_udp0
1744                     ? IP4_ERROR_TCP_CHECKSUM + is_udp0
1745                     : error0);
1746
1747           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1748           leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
1749
1750           lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1751           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;
1752
1753           lb0 = load_balance_get(lbi0);
1754           dpo0 = load_balance_get_bucket_i(lb0, 0);
1755
1756           vnet_buffer (p0)->ip.adj_index[VLIB_TX] =
1757               vnet_buffer (p0)->ip.adj_index[VLIB_RX] = lbi0;
1758
1759           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1760                      dpo0->dpoi_type == DPO_RECEIVE) ?
1761                     IP4_ERROR_SPOOFED_LOCAL_PACKETS :
1762                     error0);
1763           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1764                      !fib_urpf_check_size(lb0->lb_urpf) &&
1765                      ip0->dst_address.as_u32 != 0xFFFFFFFF)
1766                     ? IP4_ERROR_SRC_LOOKUP_MISS
1767                     : error0);
1768
1769           next0 = lm->local_next_by_ip_protocol[proto0];
1770
1771           next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
1772
1773           p0->error = error0? error_node->errors[error0] : 0;
1774
1775           if (PREDICT_FALSE (next0 != next_index))
1776             {
1777               n_left_to_next += 1;
1778               vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1779
1780               next_index = next0;
1781               vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1782               to_next[0] = pi0;
1783               to_next += 1;
1784               n_left_to_next -= 1;
1785             }
1786         }
1787
1788       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1789     }
1790
1791   return frame->n_vectors;
1792 }
1793
1794 VLIB_REGISTER_NODE (ip4_local_node,static) = {
1795   .function = ip4_local,
1796   .name = "ip4-local",
1797   .vector_size = sizeof (u32),
1798
1799   .format_trace = format_ip4_forward_next_trace,
1800
1801   .n_next_nodes = IP_LOCAL_N_NEXT,
1802   .next_nodes = {
1803     [IP_LOCAL_NEXT_DROP] = "error-drop",
1804     [IP_LOCAL_NEXT_PUNT] = "error-punt",
1805     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
1806     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
1807   },
1808 };
1809
1810 VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local)
1811
1812 void ip4_register_protocol (u32 protocol, u32 node_index)
1813 {
1814   vlib_main_t * vm = vlib_get_main();
1815   ip4_main_t * im = &ip4_main;
1816   ip_lookup_main_t * lm = &im->lookup_main;
1817
1818   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1819   lm->local_next_by_ip_protocol[protocol] = vlib_node_add_next (vm, ip4_local_node.index, node_index);
1820 }
1821
1822 static clib_error_t *
1823 show_ip_local_command_fn (vlib_main_t * vm,
1824                           unformat_input_t * input,
1825                          vlib_cli_command_t * cmd)
1826 {
1827   ip4_main_t * im = &ip4_main;
1828   ip_lookup_main_t * lm = &im->lookup_main;
1829   int i;
1830
1831   vlib_cli_output (vm, "Protocols handled by ip4_local");
1832   for (i = 0; i < ARRAY_LEN(lm->local_next_by_ip_protocol); i++)
1833     {
1834       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
1835         vlib_cli_output (vm, "%d", i);
1836     }
1837   return 0;
1838 }
1839
1840
1841
1842 /*?
1843  * Display the set of protocols handled by the local IPv4 stack.
1844  *
1845  * @cliexpar
1846  * Example of how to display local protocol table:
1847  * @cliexstart{show ip local}
1848  * Protocols handled by ip4_local
1849  * 1
1850  * 17
1851  * 47
1852  * @cliexend
1853 ?*/
1854 /* *INDENT-OFF* */
1855 VLIB_CLI_COMMAND (show_ip_local, static) = {
1856   .path = "show ip local",
1857   .function = show_ip_local_command_fn,
1858   .short_help = "show ip local",
1859 };
1860 /* *INDENT-ON* */
1861
1862 always_inline uword
1863 ip4_arp_inline (vlib_main_t * vm,
1864                 vlib_node_runtime_t * node,
1865                 vlib_frame_t * frame,
1866                 int is_glean)
1867 {
1868   vnet_main_t * vnm = vnet_get_main();
1869   ip4_main_t * im = &ip4_main;
1870   ip_lookup_main_t * lm = &im->lookup_main;
1871   u32 * from, * to_next_drop;
1872   uword n_left_from, n_left_to_next_drop, next_index;
1873   static f64 time_last_seed_change = -1e100;
1874   static u32 hash_seeds[3];
1875   static uword hash_bitmap[256 / BITS (uword)];
1876   f64 time_now;
1877
1878   if (node->flags & VLIB_NODE_FLAG_TRACE)
1879     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1880
1881   time_now = vlib_time_now (vm);
1882   if (time_now - time_last_seed_change > 1e-3)
1883     {
1884       uword i;
1885       u32 * r = clib_random_buffer_get_data (&vm->random_buffer,
1886                                              sizeof (hash_seeds));
1887       for (i = 0; i < ARRAY_LEN (hash_seeds); i++)
1888         hash_seeds[i] = r[i];
1889
1890       /* Mark all hash keys as been no-seen before. */
1891       for (i = 0; i < ARRAY_LEN (hash_bitmap); i++)
1892         hash_bitmap[i] = 0;
1893
1894       time_last_seed_change = time_now;
1895     }
1896
1897   from = vlib_frame_vector_args (frame);
1898   n_left_from = frame->n_vectors;
1899   next_index = node->cached_next_index;
1900   if (next_index == IP4_ARP_NEXT_DROP)
1901     next_index = IP4_ARP_N_NEXT; /* point to first interface */
1902
1903   while (n_left_from > 0)
1904     {
1905       vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
1906                            to_next_drop, n_left_to_next_drop);
1907
1908       while (n_left_from > 0 && n_left_to_next_drop > 0)
1909         {
1910           u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
1911           ip_adjacency_t * adj0;
1912           vlib_buffer_t * p0;
1913           ip4_header_t * ip0;
1914           uword bm0;
1915
1916           pi0 = from[0];
1917
1918           p0 = vlib_get_buffer (vm, pi0);
1919
1920           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
1921           adj0 = ip_get_adjacency (lm, adj_index0);
1922           ip0 = vlib_buffer_get_current (p0);
1923
1924           a0 = hash_seeds[0];
1925           b0 = hash_seeds[1];
1926           c0 = hash_seeds[2];
1927
1928           sw_if_index0 = adj0->rewrite_header.sw_if_index;
1929           vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
1930
1931           if (is_glean)
1932           {
1933               /*
1934                * this is the Glean case, so we are ARPing for the
1935                * packet's destination
1936                */
1937               a0 ^= ip0->dst_address.data_u32;
1938           }
1939           else
1940           {
1941               a0 ^= adj0->sub_type.nbr.next_hop.ip4.data_u32;
1942           }
1943           b0 ^= sw_if_index0;
1944
1945           hash_v3_finalize32 (a0, b0, c0);
1946
1947           c0 &= BITS (hash_bitmap) - 1;
1948           c0 = c0 / BITS (uword);
1949           m0 = (uword) 1 << (c0 % BITS (uword));
1950
1951           bm0 = hash_bitmap[c0];
1952           drop0 = (bm0 & m0) != 0;
1953
1954           /* Mark it as seen. */
1955           hash_bitmap[c0] = bm0 | m0;
1956
1957           from += 1;
1958           n_left_from -= 1;
1959           to_next_drop[0] = pi0;
1960           to_next_drop += 1;
1961           n_left_to_next_drop -= 1;
1962
1963           p0->error = node->errors[drop0 ? IP4_ARP_ERROR_DROP : IP4_ARP_ERROR_REQUEST_SENT];
1964
1965           /*
1966            * the adj has been updated to a rewrite but the node the DPO that got
1967            * us here hasn't - yet. no big deal. we'll drop while we wait.
1968            */
1969           if (IP_LOOKUP_NEXT_REWRITE == adj0->lookup_next_index)
1970             continue;
1971
1972           if (drop0)
1973             continue;
1974
1975           /*
1976            * Can happen if the control-plane is programming tables
1977            * with traffic flowing; at least that's today's lame excuse.
1978            */
1979           if ((is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN) ||
1980               (!is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP))
1981           {
1982             p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
1983           }
1984           else
1985           /* Send ARP request. */
1986           {
1987             u32 bi0 = 0;
1988             vlib_buffer_t * b0;
1989             ethernet_arp_header_t * h0;
1990             vnet_hw_interface_t * hw_if0;
1991
1992             h0 = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi0);
1993
1994             /* Add rewrite/encap string for ARP packet. */
1995             vnet_rewrite_one_header (adj0[0], h0, sizeof (ethernet_header_t));
1996
1997             hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
1998
1999             /* Src ethernet address in ARP header. */
2000             clib_memcpy (h0->ip4_over_ethernet[0].ethernet, hw_if0->hw_address,
2001                     sizeof (h0->ip4_over_ethernet[0].ethernet));
2002
2003             if (is_glean)
2004             {
2005                 /* The interface's source address is stashed in the Glean Adj */
2006                 h0->ip4_over_ethernet[0].ip4 = adj0->sub_type.glean.receive_addr.ip4;
2007
2008                 /* Copy in destination address we are requesting. This is the
2009                 * glean case, so it's the packet's destination.*/
2010                 h0->ip4_over_ethernet[1].ip4.data_u32 = ip0->dst_address.data_u32;
2011             }
2012             else
2013             {
2014                 /* Src IP address in ARP header. */
2015                 if (ip4_src_address_for_packet(lm, sw_if_index0,
2016                                                &h0->ip4_over_ethernet[0].ip4))
2017                 {
2018                     /* No source address available */
2019                     p0->error = node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
2020                     vlib_buffer_free(vm, &bi0, 1);
2021                     continue;
2022                 }
2023
2024                 /* Copy in destination address we are requesting from the
2025                    incomplete adj */
2026                 h0->ip4_over_ethernet[1].ip4.data_u32 =
2027                     adj0->sub_type.nbr.next_hop.ip4.as_u32;
2028             }
2029
2030             vlib_buffer_copy_trace_flag (vm, p0, bi0);
2031             b0 = vlib_get_buffer (vm, bi0);
2032             vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
2033
2034             vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
2035
2036             vlib_set_next_frame_buffer (vm, node, adj0->rewrite_header.next_index, bi0);
2037           }
2038         }
2039
2040       vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
2041     }
2042
2043   return frame->n_vectors;
2044 }
2045
2046 static uword
2047 ip4_arp (vlib_main_t * vm,
2048          vlib_node_runtime_t * node,
2049          vlib_frame_t * frame)
2050 {
2051     return (ip4_arp_inline(vm, node, frame, 0));
2052 }
2053
2054 static uword
2055 ip4_glean (vlib_main_t * vm,
2056            vlib_node_runtime_t * node,
2057            vlib_frame_t * frame)
2058 {
2059     return (ip4_arp_inline(vm, node, frame, 1));
2060 }
2061
2062 static char * ip4_arp_error_strings[] = {
2063   [IP4_ARP_ERROR_DROP] = "address overflow drops",
2064   [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
2065   [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
2066   [IP4_ARP_ERROR_REPLICATE_DROP] = "ARP replication completed",
2067   [IP4_ARP_ERROR_REPLICATE_FAIL] = "ARP replication failed",
2068   [IP4_ARP_ERROR_NO_SOURCE_ADDRESS] = "no source address for ARP request",
2069 };
2070
2071 VLIB_REGISTER_NODE (ip4_arp_node) = {
2072   .function = ip4_arp,
2073   .name = "ip4-arp",
2074   .vector_size = sizeof (u32),
2075
2076   .format_trace = format_ip4_forward_next_trace,
2077
2078   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
2079   .error_strings = ip4_arp_error_strings,
2080
2081   .n_next_nodes = IP4_ARP_N_NEXT,
2082   .next_nodes = {
2083     [IP4_ARP_NEXT_DROP] = "error-drop",
2084   },
2085 };
2086
2087 VLIB_REGISTER_NODE (ip4_glean_node) = {
2088   .function = ip4_glean,
2089   .name = "ip4-glean",
2090   .vector_size = sizeof (u32),
2091
2092   .format_trace = format_ip4_forward_next_trace,
2093
2094   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
2095   .error_strings = ip4_arp_error_strings,
2096
2097   .n_next_nodes = IP4_ARP_N_NEXT,
2098   .next_nodes = {
2099     [IP4_ARP_NEXT_DROP] = "error-drop",
2100   },
2101 };
2102
2103 #define foreach_notrace_ip4_arp_error           \
2104 _(DROP)                                         \
2105 _(REQUEST_SENT)                                 \
2106 _(REPLICATE_DROP)                               \
2107 _(REPLICATE_FAIL)
2108
2109 clib_error_t * arp_notrace_init (vlib_main_t * vm)
2110 {
2111   vlib_node_runtime_t *rt =
2112     vlib_node_get_runtime (vm, ip4_arp_node.index);
2113
2114   /* don't trace ARP request packets */
2115 #define _(a)                                    \
2116     vnet_pcap_drop_trace_filter_add_del         \
2117         (rt->errors[IP4_ARP_ERROR_##a],         \
2118          1 /* is_add */);
2119     foreach_notrace_ip4_arp_error;
2120 #undef _
2121   return 0;
2122 }
2123
2124 VLIB_INIT_FUNCTION(arp_notrace_init);
2125
2126
2127 /* Send an ARP request to see if given destination is reachable on given interface. */
2128 clib_error_t *
2129 ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index)
2130 {
2131   vnet_main_t * vnm = vnet_get_main();
2132   ip4_main_t * im = &ip4_main;
2133   ethernet_arp_header_t * h;
2134   ip4_address_t * src;
2135   ip_interface_address_t * ia;
2136   ip_adjacency_t * adj;
2137   vnet_hw_interface_t * hi;
2138   vnet_sw_interface_t * si;
2139   vlib_buffer_t * b;
2140   u32 bi = 0;
2141
2142   si = vnet_get_sw_interface (vnm, sw_if_index);
2143
2144   if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
2145     {
2146       return clib_error_return (0, "%U: interface %U down",
2147                                 format_ip4_address, dst,
2148                                 format_vnet_sw_if_index_name, vnm,
2149                                 sw_if_index);
2150     }
2151
2152   src = ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
2153   if (! src)
2154     {
2155       vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
2156       return clib_error_return
2157         (0, "no matching interface address for destination %U (interface %U)",
2158          format_ip4_address, dst,
2159          format_vnet_sw_if_index_name, vnm, sw_if_index);
2160     }
2161
2162   adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index);
2163
2164   h = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi);
2165
2166   hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
2167
2168   clib_memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address, sizeof (h->ip4_over_ethernet[0].ethernet));
2169
2170   h->ip4_over_ethernet[0].ip4 = src[0];
2171   h->ip4_over_ethernet[1].ip4 = dst[0];
2172
2173   b = vlib_get_buffer (vm, bi);
2174   vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
2175
2176   /* Add encapsulation string for software interface (e.g. ethernet header). */
2177   vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
2178   vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
2179
2180   {
2181     vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index);
2182     u32 * to_next = vlib_frame_vector_args (f);
2183     to_next[0] = bi;
2184     f->n_vectors = 1;
2185     vlib_put_frame_to_node (vm, hi->output_node_index, f);
2186   }
2187
2188   return /* no error */ 0;
2189 }
2190
/* Next indices used by ip4_rewrite_inline for packets it cannot forward. */
typedef enum {
  IP4_REWRITE_NEXT_DROP = 0,        /* initial next; kept on error */
  IP4_REWRITE_NEXT_ICMP_ERROR = 1,  /* selected when the TTL expires */
} ip4_rewrite_next_t;
2195
2196 always_inline uword
2197 ip4_rewrite_inline (vlib_main_t * vm,
2198                     vlib_node_runtime_t * node,
2199                     vlib_frame_t * frame,
2200                     int is_midchain)
2201 {
2202   ip_lookup_main_t * lm = &ip4_main.lookup_main;
2203   u32 * from = vlib_frame_vector_args (frame);
2204   u32 n_left_from, n_left_to_next, * to_next, next_index;
2205   vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
2206
2207   n_left_from = frame->n_vectors;
2208   next_index = node->cached_next_index;
2209   u32 cpu_index = os_get_cpu_number();
2210
2211   while (n_left_from > 0)
2212     {
2213       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2214
2215       while (n_left_from >= 4 && n_left_to_next >= 2)
2216         {
2217           ip_adjacency_t * adj0, * adj1;
2218           vlib_buffer_t * p0, * p1;
2219           ip4_header_t * ip0, * ip1;
2220           u32 pi0, rw_len0, next0, error0, checksum0, adj_index0;
2221           u32 pi1, rw_len1, next1, error1, checksum1, adj_index1;
2222           u32 tx_sw_if_index0, tx_sw_if_index1;
2223
2224           /* Prefetch next iteration. */
2225           {
2226             vlib_buffer_t * p2, * p3;
2227
2228             p2 = vlib_get_buffer (vm, from[2]);
2229             p3 = vlib_get_buffer (vm, from[3]);
2230
2231             vlib_prefetch_buffer_header (p2, STORE);
2232             vlib_prefetch_buffer_header (p3, STORE);
2233
2234             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
2235             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
2236           }
2237
2238           pi0 = to_next[0] = from[0];
2239           pi1 = to_next[1] = from[1];
2240
2241           from += 2;
2242           n_left_from -= 2;
2243           to_next += 2;
2244           n_left_to_next -= 2;
2245
2246           p0 = vlib_get_buffer (vm, pi0);
2247           p1 = vlib_get_buffer (vm, pi1);
2248
2249           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
2250           adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX];
2251
2252           /* We should never rewrite a pkt using the MISS adjacency */
2253           ASSERT(adj_index0 && adj_index1);
2254
2255           ip0 = vlib_buffer_get_current (p0);
2256           ip1 = vlib_buffer_get_current (p1);
2257
2258           error0 = error1 = IP4_ERROR_NONE;
2259           next0 = next1 = IP4_REWRITE_NEXT_DROP;
2260
2261           /* Decrement TTL & update checksum.
2262              Works either endian, so no need for byte swap. */
2263           if (PREDICT_TRUE(!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
2264             {
2265                 i32 ttl0 = ip0->ttl;
2266
2267               /* Input node should have reject packets with ttl 0. */
2268               ASSERT (ip0->ttl > 0);
2269
2270               checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
2271               checksum0 += checksum0 >= 0xffff;
2272
2273               ip0->checksum = checksum0;
2274               ttl0 -= 1;
2275               ip0->ttl = ttl0;
2276
2277               /*
2278                * If the ttl drops below 1 when forwarding, generate
2279                * an ICMP response.
2280                */
2281               if (PREDICT_FALSE(ttl0 <= 0))
2282                 {
2283                   error0 = IP4_ERROR_TIME_EXPIRED;
2284                   vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32)~0;
2285                   icmp4_error_set_vnet_buffer(p0, ICMP4_time_exceeded,
2286                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2287                   next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
2288                 }
2289
2290               /* Verify checksum. */
2291               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2292             }
2293           else
2294             {
2295               p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
2296             }
2297           if (PREDICT_TRUE(!(p1->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
2298             {
2299               i32 ttl1 = ip1->ttl;
2300
2301               /* Input node should have reject packets with ttl 0. */
2302               ASSERT (ip1->ttl > 0);
2303
2304               checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100);
2305               checksum1 += checksum1 >= 0xffff;
2306
2307               ip1->checksum = checksum1;
2308               ttl1 -= 1;
2309               ip1->ttl = ttl1;
2310
2311               /*
2312                * If the ttl drops below 1 when forwarding, generate
2313                * an ICMP response.
2314                */
2315               if (PREDICT_FALSE(ttl1 <= 0))
2316                 {
2317                   error1 = IP4_ERROR_TIME_EXPIRED;
2318                   vnet_buffer (p1)->sw_if_index[VLIB_TX] = (u32)~0;
2319                   icmp4_error_set_vnet_buffer(p1, ICMP4_time_exceeded,
2320                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2321                   next1 = IP4_REWRITE_NEXT_ICMP_ERROR;
2322                 }
2323
2324               /* Verify checksum. */
2325               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2326               ASSERT (ip1->checksum == ip4_header_checksum (ip1));
2327             }
2328           else
2329             {
2330               p1->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
2331             }
2332
2333           /* Rewrite packet header and updates lengths. */
2334           adj0 = ip_get_adjacency (lm, adj_index0);
2335           adj1 = ip_get_adjacency (lm, adj_index1);
2336
2337           /* Worth pipelining. No guarantee that adj0,1 are hot... */
2338           rw_len0 = adj0[0].rewrite_header.data_bytes;
2339           rw_len1 = adj1[0].rewrite_header.data_bytes;
2340           vnet_buffer(p0)->ip.save_rewrite_length = rw_len0;
2341           vnet_buffer(p1)->ip.save_rewrite_length = rw_len1;
2342
2343           /* Check MTU of outgoing interface. */
2344           error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes
2345                     ? IP4_ERROR_MTU_EXCEEDED
2346                     : error0);
2347           error1 = (vlib_buffer_length_in_chain (vm, p1) > adj1[0].rewrite_header.max_l3_packet_bytes
2348                     ? IP4_ERROR_MTU_EXCEEDED
2349                     : error1);
2350
2351           next0 = (error0 == IP4_ERROR_NONE)
2352             ? adj0[0].rewrite_header.next_index : next0;
2353
2354           next1 = (error1 == IP4_ERROR_NONE)
2355             ? adj1[0].rewrite_header.next_index : next1;
2356
2357           /*
2358            * We've already accounted for an ethernet_header_t elsewhere
2359            */
2360           if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
2361               vlib_increment_combined_counter
2362                   (&adjacency_counters,
2363                    cpu_index, adj_index0,
2364                    /* packet increment */ 0,
2365                    /* byte increment */ rw_len0-sizeof(ethernet_header_t));
2366
2367           if (PREDICT_FALSE (rw_len1 > sizeof(ethernet_header_t)))
2368               vlib_increment_combined_counter
2369                   (&adjacency_counters,
2370                    cpu_index, adj_index1,
2371                    /* packet increment */ 0,
2372                    /* byte increment */ rw_len1-sizeof(ethernet_header_t));
2373
2374           /* Don't adjust the buffer for ttl issue; icmp-error node wants
2375            * to see the IP header */
2376           if (PREDICT_TRUE(error0 == IP4_ERROR_NONE))
2377             {
2378               p0->current_data -= rw_len0;
2379               p0->current_length += rw_len0;
2380               tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2381               vnet_buffer (p0)->sw_if_index[VLIB_TX] =
2382                   tx_sw_if_index0;
2383
2384               vnet_feature_arc_start(lm->output_feature_arc_index,
2385                                      tx_sw_if_index0, &next0, p0);
2386             }
2387           if (PREDICT_TRUE(error1 == IP4_ERROR_NONE))
2388             {
2389               p1->current_data -= rw_len1;
2390               p1->current_length += rw_len1;
2391
2392               tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
2393               vnet_buffer (p1)->sw_if_index[VLIB_TX] =
2394                   tx_sw_if_index1;
2395
2396               vnet_feature_arc_start(lm->output_feature_arc_index,
2397                                      tx_sw_if_index1, &next1, p1);
2398             }
2399
2400           /* Guess we are only writing on simple Ethernet header. */
2401           vnet_rewrite_two_headers (adj0[0], adj1[0],
2402                                     ip0, ip1,
2403                                     sizeof (ethernet_header_t));
2404
2405           if (is_midchain)
2406           {
2407               adj0->sub_type.midchain.fixup_func(vm, adj0, p0);
2408               adj1->sub_type.midchain.fixup_func(vm, adj1, p1);
2409           }
2410
2411           vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
2412                                            to_next, n_left_to_next,
2413                                            pi0, pi1, next0, next1);
2414         }
2415
2416       while (n_left_from > 0 && n_left_to_next > 0)
2417         {
2418           ip_adjacency_t * adj0;
2419           vlib_buffer_t * p0;
2420           ip4_header_t * ip0;
2421           u32 pi0, rw_len0, adj_index0, next0, error0, checksum0;
2422           u32 tx_sw_if_index0;
2423
2424           pi0 = to_next[0] = from[0];
2425
2426           p0 = vlib_get_buffer (vm, pi0);
2427
2428           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
2429
2430           /* We should never rewrite a pkt using the MISS adjacency */
2431           ASSERT(adj_index0);
2432
2433           adj0 = ip_get_adjacency (lm, adj_index0);
2434
2435           ip0 = vlib_buffer_get_current (p0);
2436
2437           error0 = IP4_ERROR_NONE;
2438           next0 = IP4_REWRITE_NEXT_DROP;            /* drop on error */
2439
2440           /* Decrement TTL & update checksum. */
2441           if (PREDICT_TRUE(!(p0->flags & VNET_BUFFER_LOCALLY_ORIGINATED)))
2442             {
2443               i32 ttl0 = ip0->ttl;
2444
2445               checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
2446
2447               checksum0 += checksum0 >= 0xffff;
2448
2449               ip0->checksum = checksum0;
2450
2451               ASSERT (ip0->ttl > 0);
2452
2453               ttl0 -= 1;
2454
2455               ip0->ttl = ttl0;
2456
2457               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2458
2459               if (PREDICT_FALSE(ttl0 <= 0))
2460                 {
2461                   /*
2462                    * If the ttl drops below 1 when forwarding, generate
2463                    * an ICMP response.
2464                    */
2465                   error0 = IP4_ERROR_TIME_EXPIRED;
2466                   next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
2467                   vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32)~0;
2468                   icmp4_error_set_vnet_buffer(p0, ICMP4_time_exceeded,
2469                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2470                 }
2471             }
2472           else
2473             {
2474               p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED;
2475             }
2476
2477           /* Guess we are only writing on simple Ethernet header. */
2478           vnet_rewrite_one_header (adj0[0], ip0,
2479                                    sizeof (ethernet_header_t));
2480
2481           /* Update packet buffer attributes/set output interface. */
2482           rw_len0 = adj0[0].rewrite_header.data_bytes;
2483           vnet_buffer(p0)->ip.save_rewrite_length = rw_len0;
2484
2485           if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
2486               vlib_increment_combined_counter
2487                   (&adjacency_counters,
2488                    cpu_index, adj_index0,
2489                    /* packet increment */ 0,
2490                    /* byte increment */ rw_len0-sizeof(ethernet_header_t));
2491
2492           /* Check MTU of outgoing interface. */
2493           error0 = (vlib_buffer_length_in_chain (vm, p0)
2494                     > adj0[0].rewrite_header.max_l3_packet_bytes
2495                     ? IP4_ERROR_MTU_EXCEEDED
2496                     : error0);
2497
2498           p0->error = error_node->errors[error0];
2499
2500           /* Don't adjust the buffer for ttl issue; icmp-error node wants
2501            * to see the IP headerr */
2502           if (PREDICT_TRUE(error0 == IP4_ERROR_NONE))
2503             {
2504               p0->current_data -= rw_len0;
2505               p0->current_length += rw_len0;
2506               tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2507
2508               vnet_buffer (p0)->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2509               next0 = adj0[0].rewrite_header.next_index;
2510
2511               if (is_midchain)
2512                 {
2513                   adj0->sub_type.midchain.fixup_func(vm, adj0, p0);
2514                 }
2515
2516               vnet_feature_arc_start(lm->output_feature_arc_index,
2517                                      tx_sw_if_index0, &next0, p0);
2518
2519             }
2520
2521           from += 1;
2522           n_left_from -= 1;
2523           to_next += 1;
2524           n_left_to_next -= 1;
2525
2526           vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
2527                                            to_next, n_left_to_next,
2528                                            pi0, next0);
2529         }
2530
2531       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2532     }
2533
2534   /* Need to do trace after rewrites to pick up new packet data. */
2535   if (node->flags & VLIB_NODE_FLAG_TRACE)
2536     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2537
2538   return frame->n_vectors;
2539 }
2540
2541
2542 /** @brief IPv4 rewrite node.
2543     @node ip4-rewrite
2544
2545     This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
2546     header checksum, fetch the ip adjacency, check the outbound mtu,
2547     apply the adjacency rewrite, and send pkts to the adjacency
2548     rewrite header's rewrite_next_index.
2549
2550     @param vm vlib_main_t corresponding to the current thread
2551     @param node vlib_node_runtime_t
2552     @param frame vlib_frame_t whose contents should be dispatched
2553
2554     @par Graph mechanics: buffer metadata, next index usage
2555
2556     @em Uses:
2557     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
2558         - the rewrite adjacency index
2559     - <code>adj->lookup_next_index</code>
2560         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2561           the packet will be dropped.
2562     - <code>adj->rewrite_header</code>
2563         - Rewrite string length, rewrite string, next_index
2564
2565     @em Sets:
2566     - <code>b->current_data, b->current_length</code>
2567         - Updated net of applying the rewrite string
2568
2569     <em>Next Indices:</em>
2570     - <code> adj->rewrite_header.next_index </code>
2571       or @c error-drop
2572 */
2573 static uword
2574 ip4_rewrite (vlib_main_t * vm,
2575              vlib_node_runtime_t * node,
2576              vlib_frame_t * frame)
2577 {
2578   return ip4_rewrite_inline (vm, node, frame, 0);
2579 }
2580
2581 static uword
2582 ip4_midchain (vlib_main_t * vm,
2583               vlib_node_runtime_t * node,
2584               vlib_frame_t * frame)
2585 {
2586   return ip4_rewrite_inline (vm, node, frame, 1);
2587 }
2588
2589
2590 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2591   .function = ip4_rewrite,
2592   .name = "ip4-rewrite",
2593   .vector_size = sizeof (u32),
2594
2595   .format_trace = format_ip4_rewrite_trace,
2596
2597   .n_next_nodes = 2,
2598   .next_nodes = {
2599     [IP4_REWRITE_NEXT_DROP] = "error-drop",
2600     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2601   },
2602 };
2603
2604 VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_node, ip4_rewrite)
2605
2606 VLIB_REGISTER_NODE (ip4_midchain_node) = {
2607   .function = ip4_midchain,
2608   .name = "ip4-midchain",
2609   .vector_size = sizeof (u32),
2610
2611   .format_trace = format_ip4_forward_next_trace,
2612
2613   .sibling_of = "ip4-rewrite",
2614 };
2615
2616 VLIB_NODE_FUNCTION_MULTIARCH (ip4_midchain_node, ip4_midchain)
2617
2618 static clib_error_t *
2619 add_del_interface_table (vlib_main_t * vm,
2620                          unformat_input_t * input,
2621                          vlib_cli_command_t * cmd)
2622 {
2623   vnet_main_t * vnm = vnet_get_main();
2624   clib_error_t * error = 0;
2625   u32 sw_if_index, table_id;
2626
2627   sw_if_index = ~0;
2628
2629   if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
2630     {
2631       error = clib_error_return (0, "unknown interface `%U'",
2632                                  format_unformat_error, input);
2633       goto done;
2634     }
2635
2636   if (unformat (input, "%d", &table_id))
2637     ;
2638   else
2639     {
2640       error = clib_error_return (0, "expected table id `%U'",
2641                                  format_unformat_error, input);
2642       goto done;
2643     }
2644
2645   {
2646     ip4_main_t * im = &ip4_main;
2647     u32 fib_index;
2648
2649     fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4,
2650                                                    table_id);
2651
2652     //
2653     // FIXME-LATER
2654     //  changing an interface's table has consequences for any connecteds
2655     //  and adj-fibs already installed.
2656     //
2657     vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
2658     im->fib_index_by_sw_if_index[sw_if_index] = fib_index;
2659   }
2660
2661  done:
2662   return error;
2663 }
2664
2665 /*?
2666  * Place the indicated interface into the supplied IPv4 FIB table (also known
2667  * as a VRF). If the FIB table does not exist, this command creates it. To
2668  * display the current IPv4 FIB table, use the command '<em>show ip fib</em>'.
2669  * FIB table will only be displayed if a route has been added to the table, or
2670  * an IP Address is assigned to an interface in the table (which adds a route
2671  * automatically).
2672  *
2673  * @note IP addresses added after setting the interface IP table end up in
2674  * the indicated FIB table. If the IP address is added prior to adding the
2675  * interface to the FIB table, it will NOT be part of the FIB table. Predictable
2676  * but potentially counter-intuitive results occur if you provision interface
2677  * addresses in multiple FIBs. Upon RX, packets will be processed in the last
2678  * IP table ID provisioned. It might be marginally useful to evade source RPF
2679  * drops to put an interface address into multiple FIBs.
2680  *
2681  * @cliexpar
2682  * Example of how to add an interface to an IPv4 FIB table (where 2 is the table-id):
2683  * @cliexcmd{set interface ip table GigabitEthernet2/0/0 2}
2684  ?*/
2685 /* *INDENT-OFF* */
2686 VLIB_CLI_COMMAND (set_interface_ip_table_command, static) = {
2687   .path = "set interface ip table",
2688   .function = add_del_interface_table,
2689   .short_help = "set interface ip table <interface> <table-id>",
2690 };
2691 /* *INDENT-ON* */
2692
2693
2694 static uword
2695 ip4_lookup_multicast (vlib_main_t * vm,
2696                       vlib_node_runtime_t * node,
2697                       vlib_frame_t * frame)
2698 {
2699   ip4_main_t * im = &ip4_main;
2700   vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
2701   u32 n_left_from, n_left_to_next, * from, * to_next;
2702   ip_lookup_next_t next;
2703   u32 cpu_index = os_get_cpu_number();
2704
2705   from = vlib_frame_vector_args (frame);
2706   n_left_from = frame->n_vectors;
2707   next = node->cached_next_index;
2708
2709   while (n_left_from > 0)
2710     {
2711       vlib_get_next_frame (vm, node, next,
2712                            to_next, n_left_to_next);
2713
2714       while (n_left_from >= 4 && n_left_to_next >= 2)
2715         {
2716           vlib_buffer_t * p0, * p1;
2717           u32 pi0, pi1, lb_index0, lb_index1, wrong_next;
2718           ip_lookup_next_t next0, next1;
2719           ip4_header_t * ip0, * ip1;
2720           u32 fib_index0, fib_index1;
2721           const dpo_id_t *dpo0, *dpo1;
2722           const load_balance_t * lb0, * lb1;
2723
2724           /* Prefetch next iteration. */
2725           {
2726             vlib_buffer_t * p2, * p3;
2727
2728             p2 = vlib_get_buffer (vm, from[2]);
2729             p3 = vlib_get_buffer (vm, from[3]);
2730
2731             vlib_prefetch_buffer_header (p2, LOAD);
2732             vlib_prefetch_buffer_header (p3, LOAD);
2733
2734             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
2735             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
2736           }
2737
2738           pi0 = to_next[0] = from[0];
2739           pi1 = to_next[1] = from[1];
2740
2741           p0 = vlib_get_buffer (vm, pi0);
2742           p1 = vlib_get_buffer (vm, pi1);
2743
2744           ip0 = vlib_buffer_get_current (p0);
2745           ip1 = vlib_buffer_get_current (p1);
2746
2747           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
2748           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
2749           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
2750             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
2751           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
2752             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
2753
2754           lb_index0 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0),
2755                                                &ip0->dst_address);
2756           lb_index1 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index1),
2757                                                &ip1->dst_address);
2758
2759           lb0 = load_balance_get (lb_index0);
2760           lb1 = load_balance_get (lb_index1);
2761
2762           ASSERT (lb0->lb_n_buckets > 0);
2763           ASSERT (is_pow2 (lb0->lb_n_buckets));
2764           ASSERT (lb1->lb_n_buckets > 0);
2765           ASSERT (is_pow2 (lb1->lb_n_buckets));
2766
2767           vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash
2768               (ip0, lb0->lb_hash_config);
2769
2770           vnet_buffer (p1)->ip.flow_hash = ip4_compute_flow_hash
2771               (ip1, lb1->lb_hash_config);
2772
2773           dpo0 = load_balance_get_bucket_i(lb0,
2774                                            (vnet_buffer (p0)->ip.flow_hash &
2775                                             (lb0->lb_n_buckets_minus_1)));
2776           dpo1 = load_balance_get_bucket_i(lb1,
2777                                            (vnet_buffer (p1)->ip.flow_hash &
2778                                             (lb1->lb_n_buckets_minus_1)));
2779
2780           next0 = dpo0->dpoi_next_node;
2781           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
2782           next1 = dpo1->dpoi_next_node;
2783           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
2784
2785           if (1) /* $$$$$$ HACK FIXME */
2786           vlib_increment_combined_counter
2787               (cm, cpu_index, lb_index0, 1,
2788                vlib_buffer_length_in_chain (vm, p0));
2789           if (1) /* $$$$$$ HACK FIXME */
2790           vlib_increment_combined_counter
2791               (cm, cpu_index, lb_index1, 1,
2792                vlib_buffer_length_in_chain (vm, p1));
2793
2794           from += 2;
2795           to_next += 2;
2796           n_left_to_next -= 2;
2797           n_left_from -= 2;
2798
2799           wrong_next = (next0 != next) + 2*(next1 != next);
2800           if (PREDICT_FALSE (wrong_next != 0))
2801             {
2802               switch (wrong_next)
2803                 {
2804                 case 1:
2805                   /* A B A */
2806                   to_next[-2] = pi1;
2807                   to_next -= 1;
2808                   n_left_to_next += 1;
2809                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
2810                   break;
2811
2812                 case 2:
2813                   /* A A B */
2814                   to_next -= 1;
2815                   n_left_to_next += 1;
2816                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
2817                   break;
2818
2819                 case 3:
2820                   /* A B C */
2821                   to_next -= 2;
2822                   n_left_to_next += 2;
2823                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
2824                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
2825                   if (next0 == next1)
2826                     {
2827                       /* A B B */
2828                       vlib_put_next_frame (vm, node, next, n_left_to_next);
2829                       next = next1;
2830                       vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
2831                     }
2832                 }
2833             }
2834         }
2835
2836       while (n_left_from > 0 && n_left_to_next > 0)
2837         {
2838           vlib_buffer_t * p0;
2839           ip4_header_t * ip0;
2840           u32 pi0, lb_index0;
2841           ip_lookup_next_t next0;
2842           u32 fib_index0;
2843           const dpo_id_t *dpo0;
2844           const load_balance_t * lb0;
2845
2846           pi0 = from[0];
2847           to_next[0] = pi0;
2848
2849           p0 = vlib_get_buffer (vm, pi0);
2850
2851           ip0 = vlib_buffer_get_current (p0);
2852
2853           fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
2854                                 vnet_buffer (p0)->sw_if_index[VLIB_RX]);
2855           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
2856               fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
2857
2858           lb_index0 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0),
2859                                                &ip0->dst_address);
2860
2861           lb0 = load_balance_get (lb_index0);
2862
2863           ASSERT (lb0->lb_n_buckets > 0);
2864           ASSERT (is_pow2 (lb0->lb_n_buckets));
2865
2866           vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash
2867               (ip0, lb0->lb_hash_config);
2868
2869           dpo0 = load_balance_get_bucket_i(lb0,
2870                                            (vnet_buffer (p0)->ip.flow_hash &
2871                                             (lb0->lb_n_buckets_minus_1)));
2872
2873           next0 = dpo0->dpoi_next_node;
2874           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
2875
2876           if (1) /* $$$$$$ HACK FIXME */
2877               vlib_increment_combined_counter
2878                   (cm, cpu_index, lb_index0, 1,
2879                    vlib_buffer_length_in_chain (vm, p0));
2880
2881           from += 1;
2882           to_next += 1;
2883           n_left_to_next -= 1;
2884           n_left_from -= 1;
2885
2886           if (PREDICT_FALSE (next0 != next))
2887             {
2888               n_left_to_next += 1;
2889               vlib_put_next_frame (vm, node, next, n_left_to_next);
2890               next = next0;
2891               vlib_get_next_frame (vm, node, next,
2892                                    to_next, n_left_to_next);
2893               to_next[0] = pi0;
2894               to_next += 1;
2895               n_left_to_next -= 1;
2896             }
2897         }
2898
2899       vlib_put_next_frame (vm, node, next, n_left_to_next);
2900     }
2901
2902   if (node->flags & VLIB_NODE_FLAG_TRACE)
2903       ip4_forward_next_trace(vm, node, frame, VLIB_TX);
2904
2905   return frame->n_vectors;
2906 }
2907
2908 VLIB_REGISTER_NODE (ip4_lookup_multicast_node,static) = {
2909   .function = ip4_lookup_multicast,
2910   .name = "ip4-lookup-multicast",
2911   .vector_size = sizeof (u32),
2912   .sibling_of = "ip4-lookup",
2913   .format_trace = format_ip4_lookup_trace,
2914
2915   .n_next_nodes = 0,
2916 };
2917
2918 VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_multicast_node, ip4_lookup_multicast)
2919
2920 VLIB_REGISTER_NODE (ip4_multicast_node,static) = {
2921   .function = ip4_drop,
2922   .name = "ip4-multicast",
2923   .vector_size = sizeof (u32),
2924
2925   .format_trace = format_ip4_forward_next_trace,
2926
2927   .n_next_nodes = 1,
2928   .next_nodes = {
2929     [0] = "error-drop",
2930   },
2931 };
2932
2933 int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0)
2934 {
2935   ip4_fib_mtrie_t * mtrie0;
2936   ip4_fib_mtrie_leaf_t leaf0;
2937   u32 lbi0;
2938
2939   mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
2940
2941   leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
2942   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 0);
2943   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 1);
2944   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
2945   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);
2946
2947   /* Handle default route. */
2948   leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
2949
2950   lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
2951
2952   return lbi0 == ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0), a);
2953 }
2954
2955 static clib_error_t *
2956 test_lookup_command_fn (vlib_main_t * vm,
2957                         unformat_input_t * input,
2958                         vlib_cli_command_t * cmd)
2959 {
2960   ip4_fib_t *fib;
2961   u32 table_id = 0;
2962   f64 count = 1;
2963   u32 n;
2964   int i;
2965   ip4_address_t ip4_base_address;
2966   u64 errors = 0;
2967
2968   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
2969       if (unformat (input, "table %d", &table_id))
2970       {
2971           /* Make sure the entry exists. */
2972           fib = ip4_fib_get(table_id);
2973           if ((fib) && (fib->index != table_id))
2974               return clib_error_return (0, "<fib-index> %d does not exist",
2975                                         table_id);
2976       }
2977       else if (unformat (input, "count %f", &count))
2978         ;
2979
2980       else if (unformat (input, "%U",
2981                          unformat_ip4_address, &ip4_base_address))
2982         ;
2983       else
2984         return clib_error_return (0, "unknown input `%U'",
2985                                   format_unformat_error, input);
2986   }
2987
2988   n = count;
2989
2990   for (i = 0; i < n; i++)
2991     {
2992       if (!ip4_lookup_validate (&ip4_base_address, table_id))
2993         errors++;
2994
2995       ip4_base_address.as_u32 =
2996         clib_host_to_net_u32 (1 +
2997                               clib_net_to_host_u32 (ip4_base_address.as_u32));
2998     }
2999
3000   if (errors)
3001     vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
3002   else
3003     vlib_cli_output (vm, "No errors in %d lookups\n", n);
3004
3005   return 0;
3006 }
3007
3008 /*?
3009  * Perform a lookup of an IPv4 Address (or range of addresses) in the
3010  * given FIB table to determine if there is a conflict with the
3011  * adjacency table. The fib-id can be determined by using the
3012  * '<em>show ip fib</em>' command. If fib-id is not entered, default value
3013  * of 0 is used.
3014  *
3015  * @todo This command uses fib-id, other commands use table-id (not
3016  * just a name, they are different indexes). Would like to change this
3017  * to table-id for consistency.
3018  *
3019  * @cliexpar
3020  * Example of how to run the test lookup command:
3021  * @cliexstart{test lookup 172.16.1.1 table 1 count 2}
3022  * No errors in 2 lookups
3023  * @cliexend
3024 ?*/
3025 /* *INDENT-OFF* */
3026 VLIB_CLI_COMMAND (lookup_test_command, static) = {
3027     .path = "test lookup",
3028     .short_help = "test lookup <ipv4-addr> [table <fib-id>] [count <nn>]",
3029     .function = test_lookup_command_fn,
3030 };
3031 /* *INDENT-ON* */
3032
3033 int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
3034 {
3035   ip4_main_t * im4 = &ip4_main;
3036   ip4_fib_t * fib;
3037   uword * p = hash_get (im4->fib_index_by_table_id, table_id);
3038
3039   if (p == 0)
3040     return VNET_API_ERROR_NO_SUCH_FIB;
3041
3042   fib = ip4_fib_get (p[0]);
3043
3044   fib->flow_hash_config = flow_hash_config;
3045   return 0;
3046 }
3047
3048 static clib_error_t *
3049 set_ip_flow_hash_command_fn (vlib_main_t * vm,
3050                              unformat_input_t * input,
3051                              vlib_cli_command_t * cmd)
3052 {
3053   int matched = 0;
3054   u32 table_id = 0;
3055   u32 flow_hash_config = 0;
3056   int rv;
3057
3058   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
3059     if (unformat (input, "table %d", &table_id))
3060       matched = 1;
3061 #define _(a,v) \
3062     else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
3063     foreach_flow_hash_bit
3064 #undef _
3065     else break;
3066   }
3067
3068   if (matched == 0)
3069     return clib_error_return (0, "unknown input `%U'",
3070                               format_unformat_error, input);
3071
3072   rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
3073   switch (rv)
3074     {
3075     case 0:
3076       break;
3077
3078     case VNET_API_ERROR_NO_SUCH_FIB:
3079       return clib_error_return (0, "no such FIB table %d", table_id);
3080
3081     default:
3082       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
3083       break;
3084     }
3085
3086   return 0;
3087 }
3088
3089 /*?
3090  * Configure the set of IPv4 fields used by the flow hash.
3091  *
3092  * @cliexpar
3093  * Example of how to set the flow hash on a given table:
3094  * @cliexcmd{set ip flow-hash table 7 dst sport dport proto}
3095  * Example of how to display the configured flow hash:
3096  * @cliexstart{show ip fib}
3097  * ipv4-VRF:0, fib_index 0, flow hash: src dst sport dport proto
3098  * 0.0.0.0/0
3099  *   unicast-ip4-chain
3100  *   [@0]: dpo-load-balance: [index:0 buckets:1 uRPF:0 to:[0:0]]
3101  *     [0] [@0]: dpo-drop ip6
3102  * 0.0.0.0/32
3103  *   unicast-ip4-chain
3104  *   [@0]: dpo-load-balance: [index:1 buckets:1 uRPF:1 to:[0:0]]
3105  *     [0] [@0]: dpo-drop ip6
3106  * 224.0.0.0/8
3107  *   unicast-ip4-chain
3108  *   [@0]: dpo-load-balance: [index:3 buckets:1 uRPF:3 to:[0:0]]
3109  *     [0] [@0]: dpo-drop ip6
3110  * 6.0.1.2/32
3111  *   unicast-ip4-chain
3112  *   [@0]: dpo-load-balance: [index:30 buckets:1 uRPF:29 to:[0:0]]
3113  *     [0] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3114  * 7.0.0.1/32
3115  *   unicast-ip4-chain
3116  *   [@0]: dpo-load-balance: [index:31 buckets:4 uRPF:30 to:[0:0]]
3117  *     [0] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3118  *     [1] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3119  *     [2] [@3]: arp-ipv4: via 6.0.0.2 af_packet0
3120  *     [3] [@3]: arp-ipv4: via 6.0.0.1 af_packet0
3121  * 240.0.0.0/8
3122  *   unicast-ip4-chain
3123  *   [@0]: dpo-load-balance: [index:2 buckets:1 uRPF:2 to:[0:0]]
3124  *     [0] [@0]: dpo-drop ip6
3125  * 255.255.255.255/32
3126  *   unicast-ip4-chain
3127  *   [@0]: dpo-load-balance: [index:4 buckets:1 uRPF:4 to:[0:0]]
3128  *     [0] [@0]: dpo-drop ip6
3129  * ipv4-VRF:7, fib_index 1, flow hash: dst sport dport proto
3130  * 0.0.0.0/0
3131  *   unicast-ip4-chain
3132  *   [@0]: dpo-load-balance: [index:12 buckets:1 uRPF:11 to:[0:0]]
3133  *     [0] [@0]: dpo-drop ip6
3134  * 0.0.0.0/32
3135  *   unicast-ip4-chain
3136  *   [@0]: dpo-load-balance: [index:13 buckets:1 uRPF:12 to:[0:0]]
3137  *     [0] [@0]: dpo-drop ip6
3138  * 172.16.1.0/24
3139  *   unicast-ip4-chain
3140  *   [@0]: dpo-load-balance: [index:17 buckets:1 uRPF:16 to:[0:0]]
3141  *     [0] [@4]: ipv4-glean: af_packet0
3142  * 172.16.1.1/32
3143  *   unicast-ip4-chain
3144  *   [@0]: dpo-load-balance: [index:18 buckets:1 uRPF:17 to:[1:84]]
3145  *     [0] [@2]: dpo-receive: 172.16.1.1 on af_packet0
3146  * 172.16.1.2/32
3147  *   unicast-ip4-chain
3148  *   [@0]: dpo-load-balance: [index:21 buckets:1 uRPF:20 to:[0:0]]
3149  *     [0] [@5]: ipv4 via 172.16.1.2 af_packet0: IP4: 02:fe:9e:70:7a:2b -> 26:a5:f6:9c:3a:36
3150  * 172.16.2.0/24
3151  *   unicast-ip4-chain
3152  *   [@0]: dpo-load-balance: [index:19 buckets:1 uRPF:18 to:[0:0]]
3153  *     [0] [@4]: ipv4-glean: af_packet1
3154  * 172.16.2.1/32
3155  *   unicast-ip4-chain
3156  *   [@0]: dpo-load-balance: [index:20 buckets:1 uRPF:19 to:[0:0]]
3157  *     [0] [@2]: dpo-receive: 172.16.2.1 on af_packet1
3158  * 224.0.0.0/8
3159  *   unicast-ip4-chain
3160  *   [@0]: dpo-load-balance: [index:15 buckets:1 uRPF:14 to:[0:0]]
3161  *     [0] [@0]: dpo-drop ip6
3162  * 240.0.0.0/8
3163  *   unicast-ip4-chain
3164  *   [@0]: dpo-load-balance: [index:14 buckets:1 uRPF:13 to:[0:0]]
3165  *     [0] [@0]: dpo-drop ip6
3166  * 255.255.255.255/32
3167  *   unicast-ip4-chain
3168  *   [@0]: dpo-load-balance: [index:16 buckets:1 uRPF:15 to:[0:0]]
3169  *     [0] [@0]: dpo-drop ip6
3170  * @cliexend
3171 ?*/
3172 /* *INDENT-OFF* */
3173 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) = {
3174   .path = "set ip flow-hash",
3175   .short_help =
3176   "set ip flow-hash table <table-id> [src] [dst] [sport] [dport] [proto] [reverse]",
3177   .function = set_ip_flow_hash_command_fn,
3178 };
3179 /* *INDENT-ON* */
3180
3181 int vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
3182                                  u32 table_index)
3183 {
3184   vnet_main_t * vnm = vnet_get_main();
3185   vnet_interface_main_t * im = &vnm->interface_main;
3186   ip4_main_t * ipm = &ip4_main;
3187   ip_lookup_main_t * lm = &ipm->lookup_main;
3188   vnet_classify_main_t * cm = &vnet_classify_main;
3189   ip4_address_t *if_addr;
3190
3191   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
3192     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
3193
3194   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
3195     return VNET_API_ERROR_NO_SUCH_ENTRY;
3196
3197   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
3198   lm->classify_table_index_by_sw_if_index [sw_if_index] = table_index;
3199
3200   if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);
3201
3202   if (NULL != if_addr)
3203   {
3204       fib_prefix_t pfx = {
3205           .fp_len = 32,
3206           .fp_proto = FIB_PROTOCOL_IP4,
3207           .fp_addr.ip4 = *if_addr,
3208       };
3209       u32 fib_index;
3210
3211       fib_index = fib_table_get_index_for_sw_if_index(FIB_PROTOCOL_IP4,
3212                                                       sw_if_index);
3213
3214
3215       if (table_index != (u32) ~0)
3216       {
3217           dpo_id_t dpo = DPO_INVALID;
3218
3219           dpo_set(&dpo,
3220                   DPO_CLASSIFY,
3221                   DPO_PROTO_IP4,
3222                   classify_dpo_create(DPO_PROTO_IP4, table_index));
3223
3224           fib_table_entry_special_dpo_add(fib_index,
3225                                           &pfx,
3226                                           FIB_SOURCE_CLASSIFY,
3227                                           FIB_ENTRY_FLAG_NONE,
3228                                           &dpo);
3229           dpo_reset(&dpo);
3230       }
3231       else
3232       {
3233           fib_table_entry_special_remove(fib_index,
3234                                          &pfx,
3235                                          FIB_SOURCE_CLASSIFY);
3236       }
3237   }
3238
3239   return 0;
3240 }
3241
3242 static clib_error_t *
3243 set_ip_classify_command_fn (vlib_main_t * vm,
3244                             unformat_input_t * input,
3245                             vlib_cli_command_t * cmd)
3246 {
3247   u32 table_index = ~0;
3248   int table_index_set = 0;
3249   u32 sw_if_index = ~0;
3250   int rv;
3251
3252   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
3253     if (unformat (input, "table-index %d", &table_index))
3254       table_index_set = 1;
3255     else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
3256                        vnet_get_main(), &sw_if_index))
3257       ;
3258     else
3259       break;
3260   }
3261
3262   if (table_index_set == 0)
3263     return clib_error_return (0, "classify table-index must be specified");
3264
3265   if (sw_if_index == ~0)
3266     return clib_error_return (0, "interface / subif must be specified");
3267
3268   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
3269
3270   switch (rv)
3271     {
3272     case 0:
3273       break;
3274
3275     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
3276       return clib_error_return (0, "No such interface");
3277
3278     case VNET_API_ERROR_NO_SUCH_ENTRY:
3279       return clib_error_return (0, "No such classifier table");
3280     }
3281   return 0;
3282 }
3283
3284 /*?
3285  * Assign a classification table to an interface. The classification
3286  * table is created using the '<em>classify table</em>' and '<em>classify session</em>'
3287  * commands. Once the table is created, use this command to filter packets
3288  * on an interface.
3289  *
3290  * @cliexpar
3291  * Example of how to assign a classification table to an interface:
3292  * @cliexcmd{set ip classify intfc GigabitEthernet2/0/0 table-index 1}
3293 ?*/
3294 /* *INDENT-OFF* */
3295 VLIB_CLI_COMMAND (set_ip_classify_command, static) = {
3296     .path = "set ip classify",
3297     .short_help =
3298     "set ip classify intfc <interface> table-index <classify-idx>",
3299     .function = set_ip_classify_command_fn,
3300 };
3301 /* *INDENT-ON* */