unicast RPF for FIB2.0
[vpp.git] / vnet / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
43 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
44 #include <vnet/ppp/ppp.h>
45 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
46 #include <vnet/api_errno.h>     /* for API error numbers */
47 #include <vnet/fib/fib_table.h> /* for FIB table and entry creation */
48 #include <vnet/fib/fib_entry.h> /* for FIB table and entry creation */
49 #include <vnet/fib/fib_urpf_list.h> /* for FIB uRPF check */
50 #include <vnet/fib/ip4_fib.h>
51 #include <vnet/dpo/load_balance.h>
52 #include <vnet/dpo/classify_dpo.h>
53
54 void
55 ip4_forward_next_trace (vlib_main_t * vm,
56                         vlib_node_runtime_t * node,
57                         vlib_frame_t * frame,
58                         vlib_rx_or_tx_t which_adj_index);
59
60 always_inline uword
61 ip4_lookup_inline (vlib_main_t * vm,
62                    vlib_node_runtime_t * node,
63                    vlib_frame_t * frame,
64                    int lookup_for_responses_to_locally_received_packets)
65 {
66   ip4_main_t * im = &ip4_main;
67   vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
68   u32 n_left_from, n_left_to_next, * from, * to_next;
69   ip_lookup_next_t next;
70   u32 cpu_index = os_get_cpu_number();
71
72   from = vlib_frame_vector_args (frame);
73   n_left_from = frame->n_vectors;
74   next = node->cached_next_index;
75
76   while (n_left_from > 0)
77     {
78       vlib_get_next_frame (vm, node, next,
79                            to_next, n_left_to_next);
80
81       while (n_left_from >= 4 && n_left_to_next >= 2)
82         {
83           vlib_buffer_t * p0, * p1;
84           ip4_header_t * ip0, * ip1;
85           __attribute__((unused)) tcp_header_t * tcp0, * tcp1;
86           ip_lookup_next_t next0, next1;
87           const load_balance_t * lb0, * lb1;
88           ip4_fib_mtrie_t * mtrie0, * mtrie1;
89           ip4_fib_mtrie_leaf_t leaf0, leaf1;
90           ip4_address_t * dst_addr0, *dst_addr1;
91           __attribute__((unused)) u32 pi0, fib_index0, lb_index0, is_tcp_udp0;
92           __attribute__((unused)) u32 pi1, fib_index1, lb_index1, is_tcp_udp1;
93           flow_hash_config_t flow_hash_config0, flow_hash_config1;
94           u32 hash_c0, hash_c1;
95           u32 wrong_next;
96           const dpo_id_t *dpo0, *dpo1;
97
98           /* Prefetch next iteration. */
99           {
100             vlib_buffer_t * p2, * p3;
101
102             p2 = vlib_get_buffer (vm, from[2]);
103             p3 = vlib_get_buffer (vm, from[3]);
104
105             vlib_prefetch_buffer_header (p2, LOAD);
106             vlib_prefetch_buffer_header (p3, LOAD);
107
108             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
109             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
110           }
111
112           pi0 = to_next[0] = from[0];
113           pi1 = to_next[1] = from[1];
114
115           p0 = vlib_get_buffer (vm, pi0);
116           p1 = vlib_get_buffer (vm, pi1);
117
118           ip0 = vlib_buffer_get_current (p0);
119           ip1 = vlib_buffer_get_current (p1);
120
121           dst_addr0 = &ip0->dst_address;
122           dst_addr1 = &ip1->dst_address;
123
124           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
125           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
126           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
127             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
128           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
129             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
130
131
132           if (! lookup_for_responses_to_locally_received_packets)
133             {
134               mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
135               mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
136
137               leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
138
139               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
140               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 0);
141             }
142
143           tcp0 = (void *) (ip0 + 1);
144           tcp1 = (void *) (ip1 + 1);
145
146           is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
147                          || ip0->protocol == IP_PROTOCOL_UDP);
148           is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP
149                          || ip1->protocol == IP_PROTOCOL_UDP);
150
151           if (! lookup_for_responses_to_locally_received_packets)
152             {
153               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
154               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 1);
155             }
156
157           if (! lookup_for_responses_to_locally_received_packets)
158             {
159               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
160               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
161             }
162
163           if (! lookup_for_responses_to_locally_received_packets)
164             {
165               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
166               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
167             }
168
169           if (lookup_for_responses_to_locally_received_packets)
170             {
171               lb_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
172               lb_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
173             }
174           else
175             {
176               /* Handle default route. */
177               leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
178               leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
179
180               lb_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
181               lb_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
182             }
183
184           lb0 = load_balance_get (lb_index0);
185           lb1 = load_balance_get (lb_index1);
186
187           /* Use flow hash to compute multipath adjacency. */
188           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
189           hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
190           if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
191             {
192               flow_hash_config0 = lb0->lb_hash_config;
193               hash_c0 = vnet_buffer (p0)->ip.flow_hash =
194                 ip4_compute_flow_hash (ip0, flow_hash_config0);
195             }
196           if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
197             {
198               flow_hash_config1 = lb1->lb_hash_config;
199               hash_c1 = vnet_buffer (p1)->ip.flow_hash =
200                 ip4_compute_flow_hash (ip1, flow_hash_config1);
201             }
202
203           ASSERT (lb0->lb_n_buckets > 0);
204           ASSERT (is_pow2 (lb0->lb_n_buckets));
205           ASSERT (lb1->lb_n_buckets > 0);
206           ASSERT (is_pow2 (lb1->lb_n_buckets));
207
208           dpo0 = load_balance_get_bucket_i(lb0,
209                                            (hash_c0 &
210                                             (lb0->lb_n_buckets_minus_1)));
211           dpo1 = load_balance_get_bucket_i(lb1,
212                                            (hash_c1 &
213                                             (lb0->lb_n_buckets_minus_1)));
214
215           next0 = dpo0->dpoi_next_node;
216           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
217           next1 = dpo1->dpoi_next_node;
218           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
219
220           vlib_increment_combined_counter
221               (cm, cpu_index, lb_index0, 1,
222                vlib_buffer_length_in_chain (vm, p0)
223                + sizeof(ethernet_header_t));
224           vlib_increment_combined_counter
225               (cm, cpu_index, lb_index1, 1,
226                vlib_buffer_length_in_chain (vm, p1)
227                + sizeof(ethernet_header_t));
228
229           from += 2;
230           to_next += 2;
231           n_left_to_next -= 2;
232           n_left_from -= 2;
233
234           wrong_next = (next0 != next) + 2*(next1 != next);
235           if (PREDICT_FALSE (wrong_next != 0))
236             {
237               switch (wrong_next)
238                 {
239                 case 1:
240                   /* A B A */
241                   to_next[-2] = pi1;
242                   to_next -= 1;
243                   n_left_to_next += 1;
244                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
245                   break;
246
247                 case 2:
248                   /* A A B */
249                   to_next -= 1;
250                   n_left_to_next += 1;
251                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
252                   break;
253
254                 case 3:
255                   /* A B C */
256                   to_next -= 2;
257                   n_left_to_next += 2;
258                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
259                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
260                   if (next0 == next1)
261                     {
262                       /* A B B */
263                       vlib_put_next_frame (vm, node, next, n_left_to_next);
264                       next = next1;
265                       vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
266                     }
267                 }
268             }
269         }
270     
271       while (n_left_from > 0 && n_left_to_next > 0)
272         {
273           vlib_buffer_t * p0;
274           ip4_header_t * ip0;
275           __attribute__((unused)) tcp_header_t * tcp0;
276           ip_lookup_next_t next0;
277           const load_balance_t *lb0;
278           ip4_fib_mtrie_t * mtrie0;
279           ip4_fib_mtrie_leaf_t leaf0;
280           ip4_address_t * dst_addr0;
281           __attribute__((unused)) u32 pi0, fib_index0, is_tcp_udp0, lbi0;
282           flow_hash_config_t flow_hash_config0;
283           const dpo_id_t *dpo0;
284           u32 hash_c0;
285
286           pi0 = from[0];
287           to_next[0] = pi0;
288
289           p0 = vlib_get_buffer (vm, pi0);
290
291           ip0 = vlib_buffer_get_current (p0);
292
293           dst_addr0 = &ip0->dst_address;
294
295           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
296           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
297             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
298
299           if (! lookup_for_responses_to_locally_received_packets)
300             {
301               mtrie0 = &ip4_fib_get( fib_index0)->mtrie;
302
303               leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
304
305               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
306             }
307
308           tcp0 = (void *) (ip0 + 1);
309
310           is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
311                          || ip0->protocol == IP_PROTOCOL_UDP);
312
313           if (! lookup_for_responses_to_locally_received_packets)
314             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
315
316           if (! lookup_for_responses_to_locally_received_packets)
317             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
318
319           if (! lookup_for_responses_to_locally_received_packets)
320             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
321
322           if (lookup_for_responses_to_locally_received_packets)
323             lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
324           else
325             {
326               /* Handle default route. */
327               leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
328               lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
329             }
330
331           lb0 = load_balance_get (lbi0);
332
333           /* Use flow hash to compute multipath adjacency. */
334           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
335           if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
336             {
337               flow_hash_config0 = lb0->lb_hash_config;
338
339               hash_c0 = vnet_buffer (p0)->ip.flow_hash = 
340                 ip4_compute_flow_hash (ip0, flow_hash_config0);
341             }
342
343           ASSERT (lb0->lb_n_buckets > 0);
344           ASSERT (is_pow2 (lb0->lb_n_buckets));
345
346           dpo0 = load_balance_get_bucket_i(lb0,
347                                            (hash_c0 &
348                                             (lb0->lb_n_buckets_minus_1)));
349
350           next0 = dpo0->dpoi_next_node;
351           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
352
353           vlib_increment_combined_counter 
354               (cm, cpu_index, lbi0, 1,
355                vlib_buffer_length_in_chain (vm, p0));
356
357           from += 1;
358           to_next += 1;
359           n_left_to_next -= 1;
360           n_left_from -= 1;
361
362           if (PREDICT_FALSE (next0 != next))
363             {
364               n_left_to_next += 1;
365               vlib_put_next_frame (vm, node, next, n_left_to_next);
366               next = next0;
367               vlib_get_next_frame (vm, node, next,
368                                    to_next, n_left_to_next);
369               to_next[0] = pi0;
370               to_next += 1;
371               n_left_to_next -= 1;
372             }
373         }
374
375       vlib_put_next_frame (vm, node, next, n_left_to_next);
376     }
377
378   if (node->flags & VLIB_NODE_FLAG_TRACE)
379     ip4_forward_next_trace(vm, node, frame, VLIB_TX);
380
381   return frame->n_vectors;
382 }
383
384 /** @brief IPv4 lookup node.
385     @node ip4-lookup
386
387     This is the main IPv4 lookup dispatch node.
388
389     @param vm vlib_main_t corresponding to the current thread
390     @param node vlib_node_runtime_t
391     @param frame vlib_frame_t whose contents should be dispatched
392
393     @par Graph mechanics: buffer metadata, next index usage
394
395     @em Uses:
396     - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
397         - Indicates the @c sw_if_index value of the interface that the
398           packet was received on.
399     - <code>vnet_buffer(b)->sw_if_index[VLIB_TX]</code>
400         - When the value is @c ~0 then the node performs a longest prefix
401           match (LPM) for the packet destination address in the FIB attached
402           to the receive interface.
403         - Otherwise perform LPM for the packet destination address in the
404           indicated FIB. In this case <code>[VLIB_TX]</code> is a FIB index
405           value (0, 1, ...) and not a VRF id.
406
407     @em Sets:
408     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
409         - The lookup result adjacency index.
410
411     <em>Next Index:</em>
412     - Dispatches the packet to the node index found in
413       ip_adjacency_t @c adj->lookup_next_index
414       (where @c adj is the lookup result adjacency).
415 */
416 static uword
417 ip4_lookup (vlib_main_t * vm,
418             vlib_node_runtime_t * node,
419             vlib_frame_t * frame)
420 {
421   return ip4_lookup_inline (vm, node, frame,
422                             /* lookup_for_responses_to_locally_received_packets */ 0);
423
424 }
425
426 static u8 * format_ip4_lookup_trace (u8 * s, va_list * args);
427
428 VLIB_REGISTER_NODE (ip4_lookup_node) = {
429   .function = ip4_lookup,
430   .name = "ip4-lookup",
431   .vector_size = sizeof (u32),
432
433   .format_trace = format_ip4_lookup_trace,
434   .n_next_nodes = IP_LOOKUP_N_NEXT,
435   .next_nodes = IP4_LOOKUP_NEXT_NODES,
436 };
437
438 VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_node, ip4_lookup)
439
440 always_inline uword
441 ip4_load_balance (vlib_main_t * vm,
442                   vlib_node_runtime_t * node,
443                   vlib_frame_t * frame)
444 {
445   vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters;
446   u32 n_left_from, n_left_to_next, * from, * to_next;
447   ip_lookup_next_t next;
448   u32 cpu_index = os_get_cpu_number();
449
450   from = vlib_frame_vector_args (frame);
451   n_left_from = frame->n_vectors;
452   next = node->cached_next_index;
453
454   if (node->flags & VLIB_NODE_FLAG_TRACE)
455       ip4_forward_next_trace(vm, node, frame, VLIB_TX);
456
457   while (n_left_from > 0)
458     {
459       vlib_get_next_frame (vm, node, next,
460                            to_next, n_left_to_next);
461
462     
463       while (n_left_from > 0 && n_left_to_next > 0)
464         {
465           ip_lookup_next_t next0;
466           const load_balance_t *lb0;
467           vlib_buffer_t * p0;
468           u32 pi0, lbi0, hc0;
469           const ip4_header_t *ip0;
470           const dpo_id_t *dpo0;
471
472           pi0 = from[0];
473           to_next[0] = pi0;
474
475           p0 = vlib_get_buffer (vm, pi0);
476
477           ip0 = vlib_buffer_get_current (p0);
478           lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
479
480           lb0 = load_balance_get(lbi0);
481           hc0 = lb0->lb_hash_config;
482           vnet_buffer(p0)->ip.flow_hash = ip4_compute_flow_hash(ip0, hc0);
483
484           dpo0 = load_balance_get_bucket_i(lb0, 
485                                            vnet_buffer(p0)->ip.flow_hash &
486                                            (lb0->lb_n_buckets_minus_1));
487
488           next0 = dpo0->dpoi_next_node;
489           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
490
491           vlib_increment_combined_counter 
492               (cm, cpu_index, lbi0, 1,
493                vlib_buffer_length_in_chain (vm, p0));
494
495           from += 1;
496           to_next += 1;
497           n_left_to_next -= 1;
498           n_left_from -= 1;
499
500           if (PREDICT_FALSE (next0 != next))
501             {
502               n_left_to_next += 1;
503               vlib_put_next_frame (vm, node, next, n_left_to_next);
504               next = next0;
505               vlib_get_next_frame (vm, node, next,
506                                    to_next, n_left_to_next);
507               to_next[0] = pi0;
508               to_next += 1;
509               n_left_to_next -= 1;
510             }
511         }
512
513       vlib_put_next_frame (vm, node, next, n_left_to_next);
514     }
515
516   return frame->n_vectors;
517 }
518
519 static u8 * format_ip4_forward_next_trace (u8 * s, va_list * args);
520
521 VLIB_REGISTER_NODE (ip4_load_balance_node) = {
522   .function = ip4_load_balance,
523   .name = "ip4-load-balance",
524   .vector_size = sizeof (u32),
525   .sibling_of = "ip4-lookup",
526
527   .format_trace = format_ip4_forward_next_trace,
528 };
529
530 VLIB_NODE_FUNCTION_MULTIARCH (ip4_load_balance_node, ip4_load_balance)
531
532 /* get first interface address */
533 ip4_address_t *
534 ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
535                              ip_interface_address_t ** result_ia)
536 {
537   ip_lookup_main_t * lm = &im->lookup_main;
538   ip_interface_address_t * ia = 0;
539   ip4_address_t * result = 0;
540
541   foreach_ip_interface_address (lm, ia, sw_if_index, 
542                                 1 /* honor unnumbered */,
543   ({
544     ip4_address_t * a = ip_interface_address_get_address (lm, ia);
545     result = a;
546     break;
547   }));
548   if (result_ia)
549     *result_ia = result ? ia : 0;
550   return result;
551 }
552
553 static void
554 ip4_add_interface_routes (u32 sw_if_index,
555                           ip4_main_t * im, u32 fib_index,
556                           ip_interface_address_t * a)
557 {
558   ip_lookup_main_t * lm = &im->lookup_main;
559   ip4_address_t * address = ip_interface_address_get_address (lm, a);
560   fib_prefix_t pfx = {
561       .fp_len = a->address_length,
562       .fp_proto = FIB_PROTOCOL_IP4,
563       .fp_addr.ip4 = *address,
564   };
565
566   a->neighbor_probe_adj_index = ~0;
567
568   if (pfx.fp_len < 32)
569   {
570       fib_node_index_t fei;
571
572       fei = fib_table_entry_update_one_path(fib_index,
573                                             &pfx,
574                                             FIB_SOURCE_INTERFACE,
575                                             (FIB_ENTRY_FLAG_CONNECTED |
576                                              FIB_ENTRY_FLAG_ATTACHED),
577                                             FIB_PROTOCOL_IP4,
578                                             NULL, /* No next-hop address */
579                                             sw_if_index,
580                                             ~0, // invalid FIB index
581                                             1,
582                                             MPLS_LABEL_INVALID,
583                                             FIB_ROUTE_PATH_FLAG_NONE);
584       a->neighbor_probe_adj_index = fib_entry_get_adj(fei);
585   }
586
587   pfx.fp_len = 32;
588
589   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
590   {
591       u32 classify_table_index =
592           lm->classify_table_index_by_sw_if_index [sw_if_index];
593       if (classify_table_index != (u32) ~0)
594       {
595           dpo_id_t dpo = DPO_NULL;
596
597           dpo_set(&dpo,
598                   DPO_CLASSIFY,
599                   DPO_PROTO_IP4,
600                   classify_dpo_create(FIB_PROTOCOL_IP4,
601                                       classify_table_index));
602
603           fib_table_entry_special_dpo_add(fib_index,
604                                           &pfx,
605                                           FIB_SOURCE_CLASSIFY,
606                                           FIB_ENTRY_FLAG_NONE,
607                                           &dpo);
608           dpo_reset(&dpo);
609       }
610   }
611
612   fib_table_entry_update_one_path(fib_index,
613                                   &pfx,
614                                   FIB_SOURCE_INTERFACE,
615                                   (FIB_ENTRY_FLAG_CONNECTED |
616                                    FIB_ENTRY_FLAG_LOCAL),
617                                   FIB_PROTOCOL_IP4,
618                                   &pfx.fp_addr,
619                                   sw_if_index,
620                                   ~0, // invalid FIB index
621                                   1,
622                                   MPLS_LABEL_INVALID,
623                                   FIB_ROUTE_PATH_FLAG_NONE);
624 }
625
626 static void
627 ip4_del_interface_routes (ip4_main_t * im,
628                           u32 fib_index,
629                           ip4_address_t * address,
630                           u32 address_length)
631 {
632     fib_prefix_t pfx = {
633         .fp_len = address_length,
634         .fp_proto = FIB_PROTOCOL_IP4,
635         .fp_addr.ip4 = *address,
636     };
637
638     if (pfx.fp_len < 32)
639     {
640         fib_table_entry_delete(fib_index,
641                                &pfx,
642                                FIB_SOURCE_INTERFACE);
643     }
644
645     pfx.fp_len = 32;
646     fib_table_entry_delete(fib_index,
647                            &pfx,
648                            FIB_SOURCE_INTERFACE);
649 }
650
651 void
652 ip4_sw_interface_enable_disable (u32 sw_if_index,
653                                  u32 is_enable)
654 {
655   vlib_main_t * vm = vlib_get_main();
656   ip4_main_t * im = &ip4_main;
657   ip_lookup_main_t * lm = &im->lookup_main;
658   u32 ci, cast;
659   u32 lookup_feature_index;
660
661   vec_validate_init_empty (im->ip_enabled_by_sw_if_index, sw_if_index, 0);
662
663   /*
664    * enable/disable only on the 1<->0 transition
665    */
666   if (is_enable)
667     {
668       if (1 != ++im->ip_enabled_by_sw_if_index[sw_if_index])
669         return;
670     }
671   else
672     {
673       ASSERT(im->ip_enabled_by_sw_if_index[sw_if_index] > 0);
674       if (0 != --im->ip_enabled_by_sw_if_index[sw_if_index])
675         return;
676     }
677
678   for (cast = 0; cast <= VNET_IP_RX_MULTICAST_FEAT; cast++)
679     {
680       ip_config_main_t * cm = &lm->feature_config_mains[cast];
681       vnet_config_main_t * vcm = &cm->config_main;
682
683       vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
684       ci = cm->config_index_by_sw_if_index[sw_if_index];
685
686       if (cast == VNET_IP_RX_UNICAST_FEAT)
687         lookup_feature_index = im->ip4_unicast_rx_feature_lookup;
688       else
689         lookup_feature_index = im->ip4_multicast_rx_feature_lookup;
690
691       if (is_enable)
692         ci = vnet_config_add_feature (vm, vcm,
693                                       ci,
694                                       lookup_feature_index,
695                                       /* config data */ 0,
696                                       /* # bytes of config data */ 0);
697       else
698         ci = vnet_config_del_feature (vm, vcm,
699                                       ci,
700                                       lookup_feature_index,
701                                       /* config data */ 0,
702                                       /* # bytes of config data */ 0);
703       cm->config_index_by_sw_if_index[sw_if_index] = ci;
704     }
705 }
706
707 static clib_error_t *
708 ip4_add_del_interface_address_internal (vlib_main_t * vm,
709                                         u32 sw_if_index,
710                                         ip4_address_t * address,
711                                         u32 address_length,
712                                         u32 is_del)
713 {
714   vnet_main_t * vnm = vnet_get_main();
715   ip4_main_t * im = &ip4_main;
716   ip_lookup_main_t * lm = &im->lookup_main;
717   clib_error_t * error = 0;
718   u32 if_address_index, elts_before;
719   ip4_address_fib_t ip4_af, * addr_fib = 0;
720
721   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
722   ip4_addr_fib_init (&ip4_af, address,
723                      vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
724   vec_add1 (addr_fib, ip4_af);
725
726   /* FIXME-LATER
727    * there is no support for adj-fib handling in the presence of overlapping
728    * subnets on interfaces. Easy fix - disallow overlapping subnets, like
729    * most routers do.
730    */
731   if (! is_del)
732     {
733       /* When adding an address check that it does not conflict
734          with an existing address. */
735       ip_interface_address_t * ia;
736       foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index, 
737                                     0 /* honor unnumbered */,
738       ({
739         ip4_address_t * x = ip_interface_address_get_address (&im->lookup_main, ia);
740
741         if (ip4_destination_matches_route (im, address, x, ia->address_length)
742             || ip4_destination_matches_route (im, x, address, address_length))
743           return clib_error_create ("failed to add %U which conflicts with %U for interface %U",
744                                     format_ip4_address_and_length, address, address_length,
745                                     format_ip4_address_and_length, x, ia->address_length,
746                                     format_vnet_sw_if_index_name, vnm, sw_if_index);
747        }));
748     }
749
750   elts_before = pool_elts (lm->if_address_pool);
751
752   error = ip_interface_address_add_del
753     (lm,
754      sw_if_index,
755      addr_fib,
756      address_length,
757      is_del,
758      &if_address_index);
759   if (error)
760     goto done;
761   
762   ip4_sw_interface_enable_disable(sw_if_index, !is_del);
763
764   if (is_del)
765       ip4_del_interface_routes (im, ip4_af.fib_index, address,
766                                 address_length);
767   else
768       ip4_add_interface_routes (sw_if_index,
769                                 im, ip4_af.fib_index,
770                                 pool_elt_at_index 
771                                 (lm->if_address_pool, if_address_index));
772
773   /* If pool did not grow/shrink: add duplicate address. */
774   if (elts_before != pool_elts (lm->if_address_pool))
775     {
776       ip4_add_del_interface_address_callback_t * cb;
777       vec_foreach (cb, im->add_del_interface_address_callbacks)
778         cb->function (im, cb->function_opaque, sw_if_index,
779                       address, address_length,
780                       if_address_index,
781                       is_del);
782     }
783
784  done:
785   vec_free (addr_fib);
786   return error;
787 }
788
789 clib_error_t *
790 ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
791                                ip4_address_t * address, u32 address_length,
792                                u32 is_del)
793 {
794   return ip4_add_del_interface_address_internal
795     (vm, sw_if_index, address, address_length,
796      is_del);
797 }
798
799 /* Built-in ip4 unicast rx feature path definition */
800 VNET_IP4_UNICAST_FEATURE_INIT (ip4_inacl, static) = {
801   .node_name = "ip4-inacl", 
802   .runs_before = ORDER_CONSTRAINTS {"ip4-source-check-via-rx", 0},
803   .feature_index = &ip4_main.ip4_unicast_rx_feature_check_access,
804 };
805
806 VNET_IP4_UNICAST_FEATURE_INIT (ip4_source_check_1, static) = {
807   .node_name = "ip4-source-check-via-rx",
808   .runs_before = ORDER_CONSTRAINTS {"ip4-source-check-via-any", 0},
809   .feature_index = 
810   &ip4_main.ip4_unicast_rx_feature_source_reachable_via_rx,
811 };
812
813 VNET_IP4_UNICAST_FEATURE_INIT (ip4_source_check_2, static) = {
814   .node_name = "ip4-source-check-via-any",
815   .runs_before = ORDER_CONSTRAINTS {"ip4-policer-classify", 0},
816   .feature_index = 
817   &ip4_main.ip4_unicast_rx_feature_source_reachable_via_any,
818 };
819
820 VNET_IP4_UNICAST_FEATURE_INIT (ip4_source_and_port_range_check_rx, static) = {
821   .node_name = "ip4-source-and-port-range-check-rx",
822   .runs_before = ORDER_CONSTRAINTS {"ip4-policer-classify", 0},
823   .feature_index =
824   &ip4_main.ip4_unicast_rx_feature_source_and_port_range_check,
825 };
826
827 VNET_IP4_UNICAST_FEATURE_INIT (ip4_policer_classify, static) = {
828   .node_name = "ip4-policer-classify",
829   .runs_before = ORDER_CONSTRAINTS {"ipsec-input-ip4", 0},
830   .feature_index =
831   &ip4_main.ip4_unicast_rx_feature_policer_classify,
832 };
833
834 VNET_IP4_UNICAST_FEATURE_INIT (ip4_ipsec, static) = {
835   .node_name = "ipsec-input-ip4",
836   .runs_before = ORDER_CONSTRAINTS {"vpath-input-ip4", 0},
837   .feature_index = &ip4_main.ip4_unicast_rx_feature_ipsec,
838 };
839
840 VNET_IP4_UNICAST_FEATURE_INIT (ip4_vpath, static) = {
841   .node_name = "vpath-input-ip4",
842   .runs_before = ORDER_CONSTRAINTS {"ip4-lookup", 0},
843   .feature_index = &ip4_main.ip4_unicast_rx_feature_vpath,
844 };
845
846 VNET_IP4_UNICAST_FEATURE_INIT (ip4_lookup, static) = {
847   .node_name = "ip4-lookup",
848   .runs_before = ORDER_CONSTRAINTS {"ip4-drop", 0},
849   .feature_index = &ip4_main.ip4_unicast_rx_feature_lookup,
850 };
851
852 VNET_IP4_UNICAST_FEATURE_INIT (ip4_drop, static) = {
853   .node_name = "ip4-drop",
854   .runs_before = 0, /* not before any other features */
855   .feature_index = &ip4_main.ip4_unicast_rx_feature_drop,
856 };
857
858
859 /* Built-in ip4 multicast rx feature path definition */
860 VNET_IP4_MULTICAST_FEATURE_INIT (ip4_vpath_mc, static) = {
861   .node_name = "vpath-input-ip4",
862   .runs_before = ORDER_CONSTRAINTS {"ip4-lookup-multicast", 0},
863   .feature_index = &ip4_main.ip4_multicast_rx_feature_vpath,
864 };
865
866 VNET_IP4_MULTICAST_FEATURE_INIT (ip4_lookup_mc, static) = {
867   .node_name = "ip4-lookup-multicast",
868   .runs_before = ORDER_CONSTRAINTS {"ip4-drop", 0},
869   .feature_index = &ip4_main.ip4_multicast_rx_feature_lookup,
870 };
871
872 VNET_IP4_MULTICAST_FEATURE_INIT (ip4_mc_drop, static) = {
873   .node_name = "ip4-drop",
874   .runs_before = 0, /* last feature */
875   .feature_index = &ip4_main.ip4_multicast_rx_feature_drop,
876 };
877
/*
 * Names of the graph nodes at which the feature paths start.
 * ip4_feature_init passes the rx list for every cast below
 * VNET_IP_TX_FEAT and the tx list for the remaining casts.
 */
static char * rx_feature_start_nodes[] =
{
  "ip4-input",
  "ip4-input-no-checksum",
};

static char * tx_feature_start_nodes[] =
{
  "ip4-rewrite-transit",
  "ip4-midchain",
};
886
887 /* Source and port-range check ip4 tx feature path definition */
888 VNET_IP4_TX_FEATURE_INIT (ip4_source_and_port_range_check_tx, static) = {
889   .node_name = "ip4-source-and-port-range-check-tx",
890   .runs_before = ORDER_CONSTRAINTS {"interface-output", 0},
891   .feature_index =
892   &ip4_main.ip4_unicast_tx_feature_source_and_port_range_check,
893
894 };
895
896 /* Built-in ip4 tx feature path definition */
897 VNET_IP4_TX_FEATURE_INIT (interface_output, static) = {
898   .node_name = "interface-output",
899   .runs_before = 0, /* not before any other features */
900   .feature_index = &ip4_main.ip4_tx_feature_interface_output,
901 };
902
903 static clib_error_t *
904 ip4_feature_init (vlib_main_t * vm, ip4_main_t * im)
905 {
906   ip_lookup_main_t * lm = &im->lookup_main;
907   clib_error_t * error;
908   vnet_cast_t cast;
909   ip_config_main_t * cm;
910   vnet_config_main_t * vcm;
911   char **feature_start_nodes;
912   int feature_start_len;
913
914   for (cast = 0; cast < VNET_N_IP_FEAT; cast++)
915     {
916       cm = &lm->feature_config_mains[cast];
917       vcm = &cm->config_main;
918
919       if (cast < VNET_IP_TX_FEAT)
920         {
921           feature_start_nodes = rx_feature_start_nodes;
922           feature_start_len = ARRAY_LEN(rx_feature_start_nodes);
923         }
924       else
925         {
926           feature_start_nodes = tx_feature_start_nodes;
927           feature_start_len = ARRAY_LEN(tx_feature_start_nodes);
928         }
929       
930       if ((error = ip_feature_init_cast (vm, cm, vcm, 
931                                          feature_start_nodes,
932                                          feature_start_len,
933                                          im->next_feature[cast],
934                                          &im->feature_nodes[cast])))
935         return error;
936     }
937
938   return 0;
939 }
940
941 static clib_error_t *
942 ip4_sw_interface_add_del (vnet_main_t * vnm,
943                           u32 sw_if_index,
944                           u32 is_add)
945 {
946   vlib_main_t * vm = vnm->vlib_main;
947   ip4_main_t * im = &ip4_main;
948   ip_lookup_main_t * lm = &im->lookup_main;
949   u32 ci, cast;
950   u32 feature_index;
951
952   /* Fill in lookup tables with default table (0). */
953   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
954
955   for (cast = 0; cast < VNET_N_IP_FEAT; cast++)
956     {
957       ip_config_main_t * cm = &lm->feature_config_mains[cast];
958       vnet_config_main_t * vcm = &cm->config_main;
959
960       vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
961       ci = cm->config_index_by_sw_if_index[sw_if_index];
962
963       if (cast == VNET_IP_RX_UNICAST_FEAT)
964         feature_index = im->ip4_unicast_rx_feature_drop;
965       else if (cast == VNET_IP_RX_MULTICAST_FEAT)
966         feature_index = im->ip4_multicast_rx_feature_drop;
967       else
968         feature_index = im->ip4_tx_feature_interface_output;
969
970       if (is_add)
971         ci = vnet_config_add_feature (vm, vcm, 
972                                       ci,
973                                       feature_index,
974                                       /* config data */ 0,
975                                       /* # bytes of config data */ 0);
976       else
977         {
978           ci = vnet_config_del_feature (vm, vcm, ci,
979                                         feature_index,
980                                         /* config data */ 0,
981                                         /* # bytes of config data */ 0);
982           if (vec_len(im->ip_enabled_by_sw_if_index) > sw_if_index)
983               im->ip_enabled_by_sw_if_index[sw_if_index] = 0;
984         }
985       cm->config_index_by_sw_if_index[sw_if_index] = ci;
986       /*
987        * note: do not update the tx feature count here.
988        */
989     }
990
991   return /* no error */ 0;
992 }
993
994 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
995
/*
 * Global IP4 main.
 *
 * The single ip4_main_t instance; the functions and feature
 * registrations in this file reach it through &ip4_main.
 */
ip4_main_t ip4_main;
998
999 clib_error_t *
1000 ip4_lookup_init (vlib_main_t * vm)
1001 {
1002   ip4_main_t * im = &ip4_main;
1003   clib_error_t * error;
1004   uword i;
1005
1006   for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
1007     {
1008       u32 m;
1009
1010       if (i < 32)
1011         m = pow2_mask (i) << (32 - i);
1012       else 
1013         m = ~0;
1014       im->fib_masks[i] = clib_host_to_net_u32 (m);
1015     }
1016
1017   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
1018
1019   /* Create FIB with index 0 and table id of 0. */
1020   fib_table_find_or_create_and_lock(FIB_PROTOCOL_IP4, 0);
1021
1022   {
1023     pg_node_t * pn;
1024     pn = pg_get_node (ip4_lookup_node.index);
1025     pn->unformat_edit = unformat_pg_ip4_header;
1026   }
1027
1028   {
1029     ethernet_arp_header_t h;
1030
1031     memset (&h, 0, sizeof (h));
1032
1033     /* Set target ethernet address to all zeros. */
1034     memset (h.ip4_over_ethernet[1].ethernet, 0, sizeof (h.ip4_over_ethernet[1].ethernet));
1035
1036 #define _16(f,v) h.f = clib_host_to_net_u16 (v);
1037 #define _8(f,v) h.f = v;
1038     _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
1039     _16 (l3_type, ETHERNET_TYPE_IP4);
1040     _8 (n_l2_address_bytes, 6);
1041     _8 (n_l3_address_bytes, 4);
1042     _16 (opcode, ETHERNET_ARP_OPCODE_request);
1043 #undef _16
1044 #undef _8
1045
1046     vlib_packet_template_init (vm,
1047                                &im->ip4_arp_request_packet_template,
1048                                /* data */ &h,
1049                                sizeof (h),
1050                                /* alloc chunk size */ 8,
1051                                "ip4 arp");
1052   }
1053
1054   error = ip4_feature_init (vm, im);
1055
1056   return error;
1057 }
1058
1059 VLIB_INIT_FUNCTION (ip4_lookup_init);
1060
1061 typedef struct {
1062   /* Adjacency taken. */
1063   u32 dpo_index;
1064   u32 flow_hash;
1065   u32 fib_index;
1066
1067   /* Packet data, possibly *after* rewrite. */
1068   u8 packet_data[64 - 1*sizeof(u32)];
1069 } ip4_forward_next_trace_t;
1070
1071 static u8 * format_ip4_forward_next_trace (u8 * s, va_list * args)
1072 {
1073   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1074   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1075   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1076   uword indent = format_get_indent (s);
1077   s = format (s, "%U%U",
1078               format_white_space, indent,
1079               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1080   return s;
1081 }
1082
1083 static u8 * format_ip4_lookup_trace (u8 * s, va_list * args)
1084 {
1085   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1086   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1087   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1088   uword indent = format_get_indent (s);
1089
1090   s = format (s, "fib %d dpo-idx %d flow hash: 0x%08x",
1091               t->fib_index, t->dpo_index, t->flow_hash);
1092   s = format (s, "\n%U%U",
1093               format_white_space, indent,
1094               format_ip4_header, t->packet_data, sizeof (t->packet_data));
1095   return s;
1096 }
1097
1098 static u8 * format_ip4_rewrite_trace (u8 * s, va_list * args)
1099 {
1100   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1101   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1102   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1103   vnet_main_t * vnm = vnet_get_main();
1104   uword indent = format_get_indent (s);
1105
1106   s = format (s, "tx_sw_if_index %d dpo-idx %d : %U flow hash: 0x%08x",
1107               t->fib_index, t->dpo_index, format_ip_adjacency,
1108               vnm, t->dpo_index, FORMAT_IP_ADJACENCY_NONE,
1109               t->flow_hash);
1110   s = format (s, "\n%U%U",
1111               format_white_space, indent,
1112               format_ip_adjacency_packet_data,
1113               vnm, t->dpo_index,
1114               t->packet_data, sizeof (t->packet_data));
1115   return s;
1116 }
1117
1118 /* Common trace function for all ip4-forward next nodes. */
1119 void
1120 ip4_forward_next_trace (vlib_main_t * vm,
1121                         vlib_node_runtime_t * node,
1122                         vlib_frame_t * frame,
1123                         vlib_rx_or_tx_t which_adj_index)
1124 {
1125   u32 * from, n_left;
1126   ip4_main_t * im = &ip4_main;
1127
1128   n_left = frame->n_vectors;
1129   from = vlib_frame_vector_args (frame);
1130   
1131   while (n_left >= 4)
1132     {
1133       u32 bi0, bi1;
1134       vlib_buffer_t * b0, * b1;
1135       ip4_forward_next_trace_t * t0, * t1;
1136
1137       /* Prefetch next iteration. */
1138       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1139       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1140
1141       bi0 = from[0];
1142       bi1 = from[1];
1143
1144       b0 = vlib_get_buffer (vm, bi0);
1145       b1 = vlib_get_buffer (vm, bi1);
1146
1147       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1148         {
1149           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1150           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1151           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1152           t0->fib_index = (vnet_buffer(b0)->sw_if_index[VLIB_TX] != (u32)~0) ?
1153               vnet_buffer(b0)->sw_if_index[VLIB_TX] :
1154               vec_elt (im->fib_index_by_sw_if_index,
1155                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
1156
1157           clib_memcpy (t0->packet_data,
1158                   vlib_buffer_get_current (b0),
1159                   sizeof (t0->packet_data));
1160         }
1161       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1162         {
1163           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1164           t1->dpo_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1165           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1166           t1->fib_index = (vnet_buffer(b1)->sw_if_index[VLIB_TX] != (u32)~0) ?
1167               vnet_buffer(b1)->sw_if_index[VLIB_TX] :
1168               vec_elt (im->fib_index_by_sw_if_index,
1169                        vnet_buffer(b1)->sw_if_index[VLIB_RX]);
1170           clib_memcpy (t1->packet_data,
1171                   vlib_buffer_get_current (b1),
1172                   sizeof (t1->packet_data));
1173         }
1174       from += 2;
1175       n_left -= 2;
1176     }
1177
1178   while (n_left >= 1)
1179     {
1180       u32 bi0;
1181       vlib_buffer_t * b0;
1182       ip4_forward_next_trace_t * t0;
1183
1184       bi0 = from[0];
1185
1186       b0 = vlib_get_buffer (vm, bi0);
1187
1188       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1189         {
1190           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1191           t0->dpo_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1192           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1193           t0->fib_index = (vnet_buffer(b0)->sw_if_index[VLIB_TX] != (u32)~0) ?
1194               vnet_buffer(b0)->sw_if_index[VLIB_TX] :
1195               vec_elt (im->fib_index_by_sw_if_index,
1196                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
1197           clib_memcpy (t0->packet_data,
1198                   vlib_buffer_get_current (b0),
1199                   sizeof (t0->packet_data));
1200         }
1201       from += 1;
1202       n_left -= 1;
1203     }
1204 }
1205
1206 static uword
1207 ip4_drop_or_punt (vlib_main_t * vm,
1208                   vlib_node_runtime_t * node,
1209                   vlib_frame_t * frame,
1210                   ip4_error_t error_code)
1211 {
1212   u32 * buffers = vlib_frame_vector_args (frame);
1213   uword n_packets = frame->n_vectors;
1214
1215   vlib_error_drop_buffers (vm, node,
1216                            buffers,
1217                            /* stride */ 1,
1218                            n_packets,
1219                            /* next */ 0,
1220                            ip4_input_node.index,
1221                            error_code);
1222
1223   if (node->flags & VLIB_NODE_FLAG_TRACE)
1224     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1225
1226   return n_packets;
1227 }
1228
1229 static uword
1230 ip4_drop (vlib_main_t * vm,
1231           vlib_node_runtime_t * node,
1232           vlib_frame_t * frame)
1233 { return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_DROP); }
1234
1235 static uword
1236 ip4_punt (vlib_main_t * vm,
1237           vlib_node_runtime_t * node,
1238           vlib_frame_t * frame)
1239 { return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_PUNT); }
1240
1241 VLIB_REGISTER_NODE (ip4_drop_node,static) = {
1242   .function = ip4_drop,
1243   .name = "ip4-drop",
1244   .vector_size = sizeof (u32),
1245
1246   .format_trace = format_ip4_forward_next_trace,
1247
1248   .n_next_nodes = 1,
1249   .next_nodes = {
1250     [0] = "error-drop",
1251   },
1252 };
1253
1254 VLIB_NODE_FUNCTION_MULTIARCH (ip4_drop_node, ip4_drop)
1255
1256 VLIB_REGISTER_NODE (ip4_punt_node,static) = {
1257   .function = ip4_punt,
1258   .name = "ip4-punt",
1259   .vector_size = sizeof (u32),
1260
1261   .format_trace = format_ip4_forward_next_trace,
1262
1263   .n_next_nodes = 1,
1264   .next_nodes = {
1265     [0] = "error-punt",
1266   },
1267 };
1268
1269 VLIB_NODE_FUNCTION_MULTIARCH (ip4_punt_node, ip4_punt)
1270
1271 /* Compute TCP/UDP/ICMP4 checksum in software. */
1272 u16
1273 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
1274                               ip4_header_t * ip0)
1275 {
1276   ip_csum_t sum0;
1277   u32 ip_header_length, payload_length_host_byte_order;
1278   u32 n_this_buffer, n_bytes_left;
1279   u16 sum16;
1280   void * data_this_buffer;
1281   
1282   /* Initialize checksum with ip header. */
1283   ip_header_length = ip4_header_bytes (ip0);
1284   payload_length_host_byte_order = clib_net_to_host_u16 (ip0->length) - ip_header_length;
1285   sum0 = clib_host_to_net_u32 (payload_length_host_byte_order + (ip0->protocol << 16));
1286
1287   if (BITS (uword) == 32)
1288     {
1289       sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u32));
1290       sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32));
1291     }
1292   else
1293     sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
1294
1295   n_bytes_left = n_this_buffer = payload_length_host_byte_order;
1296   data_this_buffer = (void *) ip0 + ip_header_length;
1297   if (n_this_buffer + ip_header_length > p0->current_length)
1298     n_this_buffer = p0->current_length > ip_header_length ? p0->current_length - ip_header_length : 0;
1299   while (1)
1300     {
1301       sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
1302       n_bytes_left -= n_this_buffer;
1303       if (n_bytes_left == 0)
1304         break;
1305
1306       ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT);
1307       p0 = vlib_get_buffer (vm, p0->next_buffer);
1308       data_this_buffer = vlib_buffer_get_current (p0);
1309       n_this_buffer = p0->current_length;
1310     }
1311
1312   sum16 = ~ ip_csum_fold (sum0);
1313
1314   return sum16;
1315 }
1316
1317 static u32
1318 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1319 {
1320   ip4_header_t * ip0 = vlib_buffer_get_current (p0);
1321   udp_header_t * udp0;
1322   u16 sum16;
1323
1324   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1325           || ip0->protocol == IP_PROTOCOL_UDP);
1326
1327   udp0 = (void *) (ip0 + 1);
1328   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1329     {
1330       p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
1331                     | IP_BUFFER_L4_CHECKSUM_CORRECT);
1332       return p0->flags;
1333     }
1334
1335   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1336
1337   p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
1338                 | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT));
1339
1340   return p0->flags;
1341 }
1342
1343 static uword
1344 ip4_local (vlib_main_t * vm,
1345            vlib_node_runtime_t * node,
1346            vlib_frame_t * frame)
1347 {
1348   ip4_main_t * im = &ip4_main;
1349   ip_lookup_main_t * lm = &im->lookup_main;
1350   ip_local_next_t next_index;
1351   u32 * from, * to_next, n_left_from, n_left_to_next;
1352   vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
1353
1354   from = vlib_frame_vector_args (frame);
1355   n_left_from = frame->n_vectors;
1356   next_index = node->cached_next_index;
1357   
1358   if (node->flags & VLIB_NODE_FLAG_TRACE)
1359     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1360
1361   while (n_left_from > 0)
1362     {
1363       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1364
1365       while (n_left_from >= 4 && n_left_to_next >= 2)
1366         {
1367           vlib_buffer_t * p0, * p1;
1368           ip4_header_t * ip0, * ip1;
1369           udp_header_t * udp0, * udp1;
1370           ip4_fib_mtrie_t * mtrie0, * mtrie1;
1371           ip4_fib_mtrie_leaf_t leaf0, leaf1;
1372           const dpo_id_t *dpo0, *dpo1;
1373           const load_balance_t *lb0, *lb1;
1374           u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, lbi0;
1375           u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, lbi1;
1376           i32 len_diff0, len_diff1;
1377           u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
1378           u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1;
1379           u8 enqueue_code;
1380       
1381           pi0 = to_next[0] = from[0];
1382           pi1 = to_next[1] = from[1];
1383           from += 2;
1384           n_left_from -= 2;
1385           to_next += 2;
1386           n_left_to_next -= 2;
1387       
1388           p0 = vlib_get_buffer (vm, pi0);
1389           p1 = vlib_get_buffer (vm, pi1);
1390
1391           ip0 = vlib_buffer_get_current (p0);
1392           ip1 = vlib_buffer_get_current (p1);
1393
1394           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, 
1395                                 vnet_buffer(p0)->sw_if_index[VLIB_RX]);
1396           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, 
1397                                 vnet_buffer(p1)->sw_if_index[VLIB_RX]);
1398
1399           mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
1400           mtrie1 = &ip4_fib_get (fib_index1)->mtrie;
1401
1402           leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
1403
1404           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
1405           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 0);
1406
1407           /* Treat IP frag packets as "experimental" protocol for now
1408              until support of IP frag reassembly is implemented */
1409           proto0 = ip4_is_fragment(ip0) ? 0xfe : ip0->protocol;
1410           proto1 = ip4_is_fragment(ip1) ? 0xfe : ip1->protocol;
1411           is_udp0 = proto0 == IP_PROTOCOL_UDP;
1412           is_udp1 = proto1 == IP_PROTOCOL_UDP;
1413           is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
1414           is_tcp_udp1 = is_udp1 || proto1 == IP_PROTOCOL_TCP;
1415
1416           flags0 = p0->flags;
1417           flags1 = p1->flags;
1418
1419           good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1420           good_tcp_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1421
1422           udp0 = ip4_next_header (ip0);
1423           udp1 = ip4_next_header (ip1);
1424
1425           /* Don't verify UDP checksum for packets with explicit zero checksum. */
1426           good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1427           good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
1428
1429           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
1430           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1);
1431
1432           /* Verify UDP length. */
1433           ip_len0 = clib_net_to_host_u16 (ip0->length);
1434           ip_len1 = clib_net_to_host_u16 (ip1->length);
1435           udp_len0 = clib_net_to_host_u16 (udp0->length);
1436           udp_len1 = clib_net_to_host_u16 (udp1->length);
1437
1438           len_diff0 = ip_len0 - udp_len0;
1439           len_diff1 = ip_len1 - udp_len1;
1440
1441           len_diff0 = is_udp0 ? len_diff0 : 0;
1442           len_diff1 = is_udp1 ? len_diff1 : 0;
1443
1444           if (PREDICT_FALSE (! (is_tcp_udp0 & is_tcp_udp1
1445                                 & good_tcp_udp0 & good_tcp_udp1)))
1446             {
1447               if (is_tcp_udp0)
1448                 {
1449                   if (is_tcp_udp0
1450                       && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1451                     flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
1452                   good_tcp_udp0 =
1453                     (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1454                   good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1455                 }
1456               if (is_tcp_udp1)
1457                 {
1458                   if (is_tcp_udp1
1459                       && ! (flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1460                     flags1 = ip4_tcp_udp_validate_checksum (vm, p1);
1461                   good_tcp_udp1 =
1462                     (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1463                   good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
1464                 }
1465             }
1466
1467           good_tcp_udp0 &= len_diff0 >= 0;
1468           good_tcp_udp1 &= len_diff1 >= 0;
1469
1470           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1471           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2);
1472
1473           error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL;
1474
1475           error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
1476           error1 = len_diff1 < 0 ? IP4_ERROR_UDP_LENGTH : error1;
1477
1478           ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1479           error0 = (is_tcp_udp0 && ! good_tcp_udp0
1480                     ? IP4_ERROR_TCP_CHECKSUM + is_udp0
1481                     : error0);
1482           error1 = (is_tcp_udp1 && ! good_tcp_udp1
1483                     ? IP4_ERROR_TCP_CHECKSUM + is_udp1
1484                     : error1);
1485
1486           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1487           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
1488           leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
1489           leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
1490
1491           vnet_buffer (p0)->ip.adj_index[VLIB_RX] = lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1492           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;
1493
1494           vnet_buffer (p1)->ip.adj_index[VLIB_RX] = lbi1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
1495           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = lbi1;
1496
1497           lb0 = load_balance_get(lbi0);
1498           lb1 = load_balance_get(lbi1);
1499           dpo0 = load_balance_get_bucket_i(lb0, 0);
1500           dpo1 = load_balance_get_bucket_i(lb1, 0);
1501
1502           /* 
1503            * Must have a route to source otherwise we drop the packet.
1504            * ip4 broadcasts are accepted, e.g. to make dhcp client work
1505            *
1506            * The checks are:
1507            *  - the source is a recieve => it's from us => bogus, do this
1508            *    first since it sets a different error code.
1509            *  - uRPF check for any route to source - accept if passes.
1510            *  - allow packets destined to the broadcast address from unknown sources
1511            */
1512           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1513                      dpo0->dpoi_type == DPO_RECEIVE) ?
1514                     IP4_ERROR_SPOOFED_LOCAL_PACKETS : 
1515                     error0);
1516           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1517                      !fib_urpf_check_size(lb0->lb_urpf) &&
1518                      ip0->dst_address.as_u32 != 0xFFFFFFFF)
1519                     ? IP4_ERROR_SRC_LOOKUP_MISS
1520                     : error0);
1521           error1 = ((error1 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1522                      dpo1->dpoi_type == DPO_RECEIVE) ?
1523                     IP4_ERROR_SPOOFED_LOCAL_PACKETS : 
1524                     error1);
1525           error1 = ((error1 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1526                      !fib_urpf_check_size(lb1->lb_urpf) &&
1527                      ip1->dst_address.as_u32 != 0xFFFFFFFF)
1528                     ? IP4_ERROR_SRC_LOOKUP_MISS
1529                     : error1);
1530
1531           next0 = lm->local_next_by_ip_protocol[proto0];
1532           next1 = lm->local_next_by_ip_protocol[proto1];
1533
1534           next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
1535           next1 = error1 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next1;
1536
1537           p0->error = error0 ? error_node->errors[error0] : 0;
1538           p1->error = error1 ? error_node->errors[error1] : 0;
1539
1540           enqueue_code = (next0 != next_index) + 2*(next1 != next_index);
1541
1542           if (PREDICT_FALSE (enqueue_code != 0))
1543             {
1544               switch (enqueue_code)
1545                 {
1546                 case 1:
1547                   /* A B A */
1548                   to_next[-2] = pi1;
1549                   to_next -= 1;
1550                   n_left_to_next += 1;
1551                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
1552                   break;
1553
1554                 case 2:
1555                   /* A A B */
1556                   to_next -= 1;
1557                   n_left_to_next += 1;
1558                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
1559                   break;
1560
1561                 case 3:
1562                   /* A B B or A B C */
1563                   to_next -= 2;
1564                   n_left_to_next += 2;
1565                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
1566                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
1567                   if (next0 == next1)
1568                     {
1569                       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1570                       next_index = next1;
1571                       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1572                     }
1573                   break;
1574                 }
1575             }
1576         }
1577
1578       while (n_left_from > 0 && n_left_to_next > 0)
1579         {
1580           vlib_buffer_t * p0;
1581           ip4_header_t * ip0;
1582           udp_header_t * udp0;
1583           ip4_fib_mtrie_t * mtrie0;
1584           ip4_fib_mtrie_leaf_t leaf0;
1585           u32 pi0, next0, ip_len0, udp_len0, flags0, fib_index0, lbi0;
1586           i32 len_diff0;
1587           u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
1588           load_balance_t *lb0;
1589           const dpo_id_t *dpo0;
1590
1591           pi0 = to_next[0] = from[0];
1592           from += 1;
1593           n_left_from -= 1;
1594           to_next += 1;
1595           n_left_to_next -= 1;
1596       
1597           p0 = vlib_get_buffer (vm, pi0);
1598
1599           ip0 = vlib_buffer_get_current (p0);
1600
1601           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, 
1602                                 vnet_buffer(p0)->sw_if_index[VLIB_RX]);
1603
1604           mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
1605
1606           leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
1607
1608           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
1609
1610           /* Treat IP frag packets as "experimental" protocol for now
1611              until support of IP frag reassembly is implemented */
1612           proto0 = ip4_is_fragment(ip0) ? 0xfe : ip0->protocol;
1613           is_udp0 = proto0 == IP_PROTOCOL_UDP;
1614           is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
1615
1616           flags0 = p0->flags;
1617
1618           good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1619
1620           udp0 = ip4_next_header (ip0);
1621
1622           /* Don't verify UDP checksum for packets with explicit zero checksum. */
1623           good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1624
1625           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
1626
1627           /* Verify UDP length. */
1628           ip_len0 = clib_net_to_host_u16 (ip0->length);
1629           udp_len0 = clib_net_to_host_u16 (udp0->length);
1630
1631           len_diff0 = ip_len0 - udp_len0;
1632
1633           len_diff0 = is_udp0 ? len_diff0 : 0;
1634
1635           if (PREDICT_FALSE (! (is_tcp_udp0 & good_tcp_udp0)))
1636             {
1637               if (is_tcp_udp0)
1638                 {
1639                   if (is_tcp_udp0
1640                       && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1641                     flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
1642                   good_tcp_udp0 =
1643                     (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1644                   good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1645                 }
1646             }
1647
1648           good_tcp_udp0 &= len_diff0 >= 0;
1649
1650           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1651
1652           error0 = IP4_ERROR_UNKNOWN_PROTOCOL;
1653
1654           error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
1655
1656           ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1657           error0 = (is_tcp_udp0 && ! good_tcp_udp0
1658                     ? IP4_ERROR_TCP_CHECKSUM + is_udp0
1659                     : error0);
1660
1661           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1662           leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
1663
1664           lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1665           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbi0;
1666
1667           lb0 = load_balance_get(lbi0);
1668           dpo0 = load_balance_get_bucket_i(lb0, 0);
1669
1670           vnet_buffer (p0)->ip.adj_index[VLIB_TX] =
1671               vnet_buffer (p0)->ip.adj_index[VLIB_RX] =
1672                   dpo0->dpoi_index;
1673
1674           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1675                      dpo0->dpoi_type == DPO_RECEIVE) ?
1676                     IP4_ERROR_SPOOFED_LOCAL_PACKETS : 
1677                     error0);
1678           error0 = ((error0 == IP4_ERROR_UNKNOWN_PROTOCOL &&
1679                      !fib_urpf_check_size(lb0->lb_urpf) &&
1680                      ip0->dst_address.as_u32 != 0xFFFFFFFF)
1681                     ? IP4_ERROR_SRC_LOOKUP_MISS
1682                     : error0);
1683
1684           next0 = lm->local_next_by_ip_protocol[proto0];
1685
1686           next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
1687
1688           p0->error = error0? error_node->errors[error0] : 0;
1689
1690           if (PREDICT_FALSE (next0 != next_index))
1691             {
1692               n_left_to_next += 1;
1693               vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1694
1695               next_index = next0;
1696               vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1697               to_next[0] = pi0;
1698               to_next += 1;
1699               n_left_to_next -= 1;
1700             }
1701         }
1702   
1703       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
1704     }
1705
1706   return frame->n_vectors;
1707 }
1708
1709 VLIB_REGISTER_NODE (ip4_local_node,static) = {
1710   .function = ip4_local,
1711   .name = "ip4-local",
1712   .vector_size = sizeof (u32),
1713
1714   .format_trace = format_ip4_forward_next_trace,
1715
1716   .n_next_nodes = IP_LOCAL_N_NEXT,
1717   .next_nodes = {
1718     [IP_LOCAL_NEXT_DROP] = "error-drop",
1719     [IP_LOCAL_NEXT_PUNT] = "error-punt",
1720     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
1721     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
1722   },
1723 };
1724
1725 VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local)
1726
1727 void ip4_register_protocol (u32 protocol, u32 node_index)
1728 {
1729   vlib_main_t * vm = vlib_get_main();
1730   ip4_main_t * im = &ip4_main;
1731   ip_lookup_main_t * lm = &im->lookup_main;
1732
1733   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
1734   lm->local_next_by_ip_protocol[protocol] = vlib_node_add_next (vm, ip4_local_node.index, node_index);
1735 }
1736
1737 static clib_error_t *
1738 show_ip_local_command_fn (vlib_main_t * vm,
1739                           unformat_input_t * input,
1740                          vlib_cli_command_t * cmd)
1741 {
1742   ip4_main_t * im = &ip4_main;
1743   ip_lookup_main_t * lm = &im->lookup_main;
1744   int i;
1745
1746   vlib_cli_output (vm, "Protocols handled by ip4_local");
1747   for (i = 0; i < ARRAY_LEN(lm->local_next_by_ip_protocol); i++)
1748     {
1749       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
1750         vlib_cli_output (vm, "%d", i);
1751     }
1752   return 0;
1753 }
1754
1755
1756
1757 VLIB_CLI_COMMAND (show_ip_local, static) = {
1758   .path = "show ip local",
1759   .function = show_ip_local_command_fn,
1760   .short_help = "Show ip local protocol table",
1761 };
1762
1763 always_inline uword
1764 ip4_arp_inline (vlib_main_t * vm,
1765                 vlib_node_runtime_t * node,
1766                 vlib_frame_t * frame,
1767                 int is_glean)
1768 {
1769   vnet_main_t * vnm = vnet_get_main();
1770   ip4_main_t * im = &ip4_main;
1771   ip_lookup_main_t * lm = &im->lookup_main;
1772   u32 * from, * to_next_drop;
1773   uword n_left_from, n_left_to_next_drop, next_index;
1774   static f64 time_last_seed_change = -1e100;
1775   static u32 hash_seeds[3];
1776   static uword hash_bitmap[256 / BITS (uword)]; 
1777   f64 time_now;
1778
1779   if (node->flags & VLIB_NODE_FLAG_TRACE)
1780     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1781
1782   time_now = vlib_time_now (vm);
1783   if (time_now - time_last_seed_change > 1e-3)
1784     {
1785       uword i;
1786       u32 * r = clib_random_buffer_get_data (&vm->random_buffer,
1787                                              sizeof (hash_seeds));
1788       for (i = 0; i < ARRAY_LEN (hash_seeds); i++)
1789         hash_seeds[i] = r[i];
1790
1791       /* Mark all hash keys as been no-seen before. */
1792       for (i = 0; i < ARRAY_LEN (hash_bitmap); i++)
1793         hash_bitmap[i] = 0;
1794
1795       time_last_seed_change = time_now;
1796     }
1797
1798   from = vlib_frame_vector_args (frame);
1799   n_left_from = frame->n_vectors;
1800   next_index = node->cached_next_index;
1801   if (next_index == IP4_ARP_NEXT_DROP)
1802     next_index = IP4_ARP_N_NEXT; /* point to first interface */
1803
1804   while (n_left_from > 0)
1805     {
1806       vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
1807                            to_next_drop, n_left_to_next_drop);
1808
1809       while (n_left_from > 0 && n_left_to_next_drop > 0)
1810         {
1811           u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
1812           ip_adjacency_t * adj0;
1813           vlib_buffer_t * p0;
1814           ip4_header_t * ip0;
1815           uword bm0;
1816
1817           pi0 = from[0];
1818
1819           p0 = vlib_get_buffer (vm, pi0);
1820
1821           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
1822           adj0 = ip_get_adjacency (lm, adj_index0);
1823           ip0 = vlib_buffer_get_current (p0);
1824
1825           /*
1826            * this is the Glean case, so we are ARPing for the
1827            * packet's destination 
1828            */
1829           a0 = hash_seeds[0];
1830           b0 = hash_seeds[1];
1831           c0 = hash_seeds[2];
1832
1833           sw_if_index0 = adj0->rewrite_header.sw_if_index;
1834           vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
1835
1836           if (is_glean)
1837           {
1838               a0 ^= ip0->dst_address.data_u32;
1839           }
1840           else
1841           {
1842               a0 ^= adj0->sub_type.nbr.next_hop.ip4.data_u32;
1843           }
1844           b0 ^= sw_if_index0;
1845
1846           hash_v3_finalize32 (a0, b0, c0);
1847
1848           c0 &= BITS (hash_bitmap) - 1;
1849           c0 = c0 / BITS (uword);
1850           m0 = (uword) 1 << (c0 % BITS (uword));
1851
1852           bm0 = hash_bitmap[c0];
1853           drop0 = (bm0 & m0) != 0;
1854
1855           /* Mark it as seen. */
1856           hash_bitmap[c0] = bm0 | m0;
1857
1858           from += 1;
1859           n_left_from -= 1;
1860           to_next_drop[0] = pi0;
1861           to_next_drop += 1;
1862           n_left_to_next_drop -= 1;
1863
1864           p0->error = node->errors[drop0 ? IP4_ARP_ERROR_DROP : IP4_ARP_ERROR_REQUEST_SENT];
1865
1866           if (drop0)
1867             continue;
1868
1869           /* 
1870            * Can happen if the control-plane is programming tables
1871            * with traffic flowing; at least that's today's lame excuse.
1872            */
1873           if ((is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_GLEAN) ||
1874               (!is_glean && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP))
1875           {
1876             p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
1877           }
1878           else
1879           /* Send ARP request. */
1880           {
1881             u32 bi0 = 0;
1882             vlib_buffer_t * b0;
1883             ethernet_arp_header_t * h0;
1884             vnet_hw_interface_t * hw_if0;
1885
1886             h0 = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi0);
1887
1888             /* Add rewrite/encap string for ARP packet. */
1889             vnet_rewrite_one_header (adj0[0], h0, sizeof (ethernet_header_t));
1890
1891             hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
1892
1893             /* Src ethernet address in ARP header. */
1894             clib_memcpy (h0->ip4_over_ethernet[0].ethernet, hw_if0->hw_address,
1895                     sizeof (h0->ip4_over_ethernet[0].ethernet));
1896
1897             if (is_glean)
1898             {
1899                 /* The interface's source address is stashed in the Glean Adj */
1900                 h0->ip4_over_ethernet[0].ip4 = adj0->sub_type.glean.receive_addr.ip4;
1901
1902                 /* Copy in destination address we are requesting. This is the
1903                 * glean case, so it's the packet's destination.*/
1904                 h0->ip4_over_ethernet[1].ip4.data_u32 = ip0->dst_address.data_u32;
1905             }
1906             else
1907             {
1908                 /* Src IP address in ARP header. */
1909                 if (ip4_src_address_for_packet(lm, sw_if_index0,
1910                                                &h0->ip4_over_ethernet[0].ip4))
1911                 {
1912                     /* No source address available */
1913                     p0->error = node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
1914                     vlib_buffer_free(vm, &bi0, 1);
1915                     continue;
1916                 }
1917
1918                 /* Copy in destination address we are requesting from the
1919                    incomplete adj */
1920                 h0->ip4_over_ethernet[1].ip4.data_u32 =
1921                     adj0->sub_type.nbr.next_hop.ip4.as_u32;
1922             }
1923
1924             vlib_buffer_copy_trace_flag (vm, p0, bi0);
1925             b0 = vlib_get_buffer (vm, bi0);
1926             vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
1927
1928             vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
1929
1930             vlib_set_next_frame_buffer (vm, node, adj0->rewrite_header.next_index, bi0);
1931           }
1932         }
1933
1934       vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
1935     }
1936
1937   return frame->n_vectors;
1938 }
1939
1940 static uword
1941 ip4_arp (vlib_main_t * vm,
1942          vlib_node_runtime_t * node,
1943          vlib_frame_t * frame)
1944 {
1945     return (ip4_arp_inline(vm, node, frame, 0));
1946 }
1947
1948 static uword
1949 ip4_glean (vlib_main_t * vm,
1950            vlib_node_runtime_t * node,
1951            vlib_frame_t * frame)
1952 {
1953     return (ip4_arp_inline(vm, node, frame, 1));
1954 }
1955
1956 static char * ip4_arp_error_strings[] = {
1957   [IP4_ARP_ERROR_DROP] = "address overflow drops",
1958   [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
1959   [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
1960   [IP4_ARP_ERROR_REPLICATE_DROP] = "ARP replication completed",
1961   [IP4_ARP_ERROR_REPLICATE_FAIL] = "ARP replication failed",
1962   [IP4_ARP_ERROR_NO_SOURCE_ADDRESS] = "no source address for ARP request",
1963 };
1964
1965 VLIB_REGISTER_NODE (ip4_arp_node) = {
1966   .function = ip4_arp,
1967   .name = "ip4-arp",
1968   .vector_size = sizeof (u32),
1969
1970   .format_trace = format_ip4_forward_next_trace,
1971
1972   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
1973   .error_strings = ip4_arp_error_strings,
1974
1975   .n_next_nodes = IP4_ARP_N_NEXT,
1976   .next_nodes = {
1977     [IP4_ARP_NEXT_DROP] = "error-drop",
1978   },
1979 };
1980
1981 VLIB_REGISTER_NODE (ip4_glean_node) = {
1982   .function = ip4_glean,
1983   .name = "ip4-glean",
1984   .vector_size = sizeof (u32),
1985
1986   .format_trace = format_ip4_forward_next_trace,
1987
1988   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
1989   .error_strings = ip4_arp_error_strings,
1990
1991   .n_next_nodes = IP4_ARP_N_NEXT,
1992   .next_nodes = {
1993     [IP4_ARP_NEXT_DROP] = "error-drop",
1994   },
1995 };
1996
1997 #define foreach_notrace_ip4_arp_error           \
1998 _(DROP)                                         \
1999 _(REQUEST_SENT)                                 \
2000 _(REPLICATE_DROP)                               \
2001 _(REPLICATE_FAIL)
2002
2003 clib_error_t * arp_notrace_init (vlib_main_t * vm)
2004 {
2005   vlib_node_runtime_t *rt = 
2006     vlib_node_get_runtime (vm, ip4_arp_node.index);
2007
2008   /* don't trace ARP request packets */
2009 #define _(a)                                    \
2010     vnet_pcap_drop_trace_filter_add_del         \
2011         (rt->errors[IP4_ARP_ERROR_##a],         \
2012          1 /* is_add */);
2013     foreach_notrace_ip4_arp_error;
2014 #undef _
2015   return 0;
2016 }
2017
2018 VLIB_INIT_FUNCTION(arp_notrace_init);
2019
2020
2021 /* Send an ARP request to see if given destination is reachable on given interface. */
2022 clib_error_t *
2023 ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index)
2024 {
2025   vnet_main_t * vnm = vnet_get_main();
2026   ip4_main_t * im = &ip4_main;
2027   ethernet_arp_header_t * h;
2028   ip4_address_t * src;
2029   ip_interface_address_t * ia;
2030   ip_adjacency_t * adj;
2031   vnet_hw_interface_t * hi;
2032   vnet_sw_interface_t * si;
2033   vlib_buffer_t * b;
2034   u32 bi = 0;
2035
2036   si = vnet_get_sw_interface (vnm, sw_if_index);
2037
2038   if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
2039     {
2040       return clib_error_return (0, "%U: interface %U down",
2041                                 format_ip4_address, dst, 
2042                                 format_vnet_sw_if_index_name, vnm, 
2043                                 sw_if_index);
2044     }
2045
2046   src = ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
2047   if (! src)
2048     {
2049       vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
2050       return clib_error_return 
2051         (0, "no matching interface address for destination %U (interface %U)",
2052          format_ip4_address, dst,
2053          format_vnet_sw_if_index_name, vnm, sw_if_index);
2054     }
2055
2056   adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index);
2057
2058   h = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi);
2059
2060   hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
2061
2062   clib_memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address, sizeof (h->ip4_over_ethernet[0].ethernet));
2063
2064   h->ip4_over_ethernet[0].ip4 = src[0];
2065   h->ip4_over_ethernet[1].ip4 = dst[0];
2066
2067   b = vlib_get_buffer (vm, bi);
2068   vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
2069
2070   /* Add encapsulation string for software interface (e.g. ethernet header). */
2071   vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
2072   vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
2073
2074   {
2075     vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index);
2076     u32 * to_next = vlib_frame_vector_args (f);
2077     to_next[0] = bi;
2078     f->n_vectors = 1;
2079     vlib_put_frame_to_node (vm, hi->output_node_index, f);
2080   }
2081
2082   return /* no error */ 0;
2083 }
2084
/* Fixed next-node indices used by the IPv4 rewrite worker. */
typedef enum {
  /* Default disposition when a packet fails a rewrite check. */
  IP4_REWRITE_NEXT_DROP = 0,
  /* Override for locally received packets hitting an ARP adjacency. */
  IP4_REWRITE_NEXT_ARP = 1,
  /* Chosen when the TTL expires while forwarding. */
  IP4_REWRITE_NEXT_ICMP_ERROR = 2,
} ip4_rewrite_next_t;
2090
2091 always_inline uword
2092 ip4_rewrite_inline (vlib_main_t * vm,
2093                     vlib_node_runtime_t * node,
2094                     vlib_frame_t * frame,
2095                     int rewrite_for_locally_received_packets,
2096                     int is_midchain)
2097 {
2098   ip_lookup_main_t * lm = &ip4_main.lookup_main;
2099   u32 * from = vlib_frame_vector_args (frame);
2100   u32 n_left_from, n_left_to_next, * to_next, next_index;
2101   vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
2102   vlib_rx_or_tx_t adj_rx_tx = rewrite_for_locally_received_packets ? VLIB_RX : VLIB_TX;
2103   ip_config_main_t * cm = &lm->feature_config_mains[VNET_IP_TX_FEAT];
2104
2105   n_left_from = frame->n_vectors;
2106   next_index = node->cached_next_index;
2107   u32 cpu_index = os_get_cpu_number();
2108   
2109   while (n_left_from > 0)
2110     {
2111       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2112
2113       while (n_left_from >= 4 && n_left_to_next >= 2)
2114         {
2115           ip_adjacency_t * adj0, * adj1;
2116           vlib_buffer_t * p0, * p1;
2117           ip4_header_t * ip0, * ip1;
2118           u32 pi0, rw_len0, next0, error0, checksum0, adj_index0;
2119           u32 pi1, rw_len1, next1, error1, checksum1, adj_index1;
2120           u32 next0_override, next1_override;
2121           u32 tx_sw_if_index0, tx_sw_if_index1;
2122
2123           if (rewrite_for_locally_received_packets)
2124               next0_override = next1_override = 0;
2125
2126           /* Prefetch next iteration. */
2127           {
2128             vlib_buffer_t * p2, * p3;
2129
2130             p2 = vlib_get_buffer (vm, from[2]);
2131             p3 = vlib_get_buffer (vm, from[3]);
2132
2133             vlib_prefetch_buffer_header (p2, STORE);
2134             vlib_prefetch_buffer_header (p3, STORE);
2135
2136             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
2137             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
2138           }
2139
2140           pi0 = to_next[0] = from[0];
2141           pi1 = to_next[1] = from[1];
2142
2143           from += 2;
2144           n_left_from -= 2;
2145           to_next += 2;
2146           n_left_to_next -= 2;
2147       
2148           p0 = vlib_get_buffer (vm, pi0);
2149           p1 = vlib_get_buffer (vm, pi1);
2150
2151           adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx];
2152           adj_index1 = vnet_buffer (p1)->ip.adj_index[adj_rx_tx];
2153
2154           /* We should never rewrite a pkt using the MISS adjacency */
2155           ASSERT(adj_index0 && adj_index1);
2156
2157           ip0 = vlib_buffer_get_current (p0);
2158           ip1 = vlib_buffer_get_current (p1);
2159
2160           error0 = error1 = IP4_ERROR_NONE;
2161           next0 = next1 = IP4_REWRITE_NEXT_DROP;
2162
2163           /* Decrement TTL & update checksum.
2164              Works either endian, so no need for byte swap. */
2165           if (! rewrite_for_locally_received_packets)
2166             {
2167               i32 ttl0 = ip0->ttl, ttl1 = ip1->ttl;
2168
2169               /* Input node should have reject packets with ttl 0. */
2170               ASSERT (ip0->ttl > 0);
2171               ASSERT (ip1->ttl > 0);
2172
2173               checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
2174               checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100);
2175
2176               checksum0 += checksum0 >= 0xffff;
2177               checksum1 += checksum1 >= 0xffff;
2178
2179               ip0->checksum = checksum0;
2180               ip1->checksum = checksum1;
2181
2182               ttl0 -= 1;
2183               ttl1 -= 1;
2184
2185               ip0->ttl = ttl0;
2186               ip1->ttl = ttl1;
2187
2188               /*
2189                * If the ttl drops below 1 when forwarding, generate
2190                * an ICMP response.
2191                */
2192               if (PREDICT_FALSE(ttl0 <= 0))
2193                 {
2194                   error0 = IP4_ERROR_TIME_EXPIRED;
2195                   vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32)~0;
2196                   icmp4_error_set_vnet_buffer(p0, ICMP4_time_exceeded,
2197                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2198                   next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
2199                 }
2200               if (PREDICT_FALSE(ttl1 <= 0))
2201                 {
2202                   error1 = IP4_ERROR_TIME_EXPIRED;
2203                   vnet_buffer (p1)->sw_if_index[VLIB_TX] = (u32)~0;
2204                   icmp4_error_set_vnet_buffer(p1, ICMP4_time_exceeded,
2205                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2206                   next1 = IP4_REWRITE_NEXT_ICMP_ERROR;
2207                 }
2208
2209               /* Verify checksum. */
2210               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2211               ASSERT (ip1->checksum == ip4_header_checksum (ip1));
2212             }
2213
2214           /* Rewrite packet header and updates lengths. */
2215           adj0 = ip_get_adjacency (lm, adj_index0);
2216           adj1 = ip_get_adjacency (lm, adj_index1);
2217       
2218           if (rewrite_for_locally_received_packets)
2219             {
2220               if (PREDICT_FALSE(adj0->lookup_next_index
2221                                 == IP_LOOKUP_NEXT_ARP))
2222                 next0_override = IP4_REWRITE_NEXT_ARP;
2223               if (PREDICT_FALSE(adj1->lookup_next_index
2224                                 == IP_LOOKUP_NEXT_ARP))
2225                 next1_override = IP4_REWRITE_NEXT_ARP;
2226             }
2227
2228           /* Worth pipelining. No guarantee that adj0,1 are hot... */
2229           rw_len0 = adj0[0].rewrite_header.data_bytes;
2230           rw_len1 = adj1[0].rewrite_header.data_bytes;
2231           vnet_buffer(p0)->ip.save_rewrite_length = rw_len0;
2232           vnet_buffer(p1)->ip.save_rewrite_length = rw_len1;
2233
2234           /* Check MTU of outgoing interface. */
2235           error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes
2236                     ? IP4_ERROR_MTU_EXCEEDED
2237                     : error0);
2238           error1 = (vlib_buffer_length_in_chain (vm, p1) > adj1[0].rewrite_header.max_l3_packet_bytes
2239                     ? IP4_ERROR_MTU_EXCEEDED
2240                     : error1);
2241
2242           next0 = (error0 == IP4_ERROR_NONE)
2243             ? adj0[0].rewrite_header.next_index : next0;
2244
2245           if (rewrite_for_locally_received_packets)
2246               next0 = next0 && next0_override ? next0_override : next0;
2247
2248           next1 = (error1 == IP4_ERROR_NONE)
2249             ? adj1[0].rewrite_header.next_index : next1;
2250
2251           if (rewrite_for_locally_received_packets)
2252               next1 = next1 && next1_override ? next1_override : next1;
2253
2254           /* 
2255            * We've already accounted for an ethernet_header_t elsewhere
2256            */
2257           if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
2258               vlib_increment_combined_counter 
2259                   (&adjacency_counters,
2260                    cpu_index, adj_index0, 
2261                    /* packet increment */ 0,
2262                    /* byte increment */ rw_len0-sizeof(ethernet_header_t));
2263
2264           if (PREDICT_FALSE (rw_len1 > sizeof(ethernet_header_t)))
2265               vlib_increment_combined_counter 
2266                   (&adjacency_counters,
2267                    cpu_index, adj_index1, 
2268                    /* packet increment */ 0,
2269                    /* byte increment */ rw_len1-sizeof(ethernet_header_t));
2270
2271           /* Don't adjust the buffer for ttl issue; icmp-error node wants
2272            * to see the IP headerr */
2273           if (PREDICT_TRUE(error0 == IP4_ERROR_NONE))
2274             {
2275               p0->current_data -= rw_len0;
2276               p0->current_length += rw_len0;
2277               tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2278               vnet_buffer (p0)->sw_if_index[VLIB_TX] =
2279                   tx_sw_if_index0;
2280
2281               if (PREDICT_FALSE 
2282                   (clib_bitmap_get (lm->tx_sw_if_has_ip_output_features, 
2283                                     tx_sw_if_index0)))
2284                 {
2285                   p0->current_config_index = 
2286                     vec_elt (cm->config_index_by_sw_if_index, 
2287                              tx_sw_if_index0);
2288                   vnet_get_config_data (&cm->config_main,
2289                                         &p0->current_config_index,
2290                                         &next0,
2291                                         /* # bytes of config data */ 0);
2292                 }
2293             }
2294           if (PREDICT_TRUE(error1 == IP4_ERROR_NONE))
2295             {
2296               p1->current_data -= rw_len1;
2297               p1->current_length += rw_len1;
2298
2299               tx_sw_if_index1 = adj1[0].rewrite_header.sw_if_index;
2300               vnet_buffer (p1)->sw_if_index[VLIB_TX] =
2301                   tx_sw_if_index1;
2302
2303               if (PREDICT_FALSE 
2304                   (clib_bitmap_get (lm->tx_sw_if_has_ip_output_features, 
2305                                     tx_sw_if_index1)))
2306                 {
2307                   p1->current_config_index = 
2308                     vec_elt (cm->config_index_by_sw_if_index, 
2309                              tx_sw_if_index1);
2310                   vnet_get_config_data (&cm->config_main,
2311                                         &p1->current_config_index,
2312                                         &next1,
2313                                         /* # bytes of config data */ 0);
2314                 }
2315             }
2316
2317           /* Guess we are only writing on simple Ethernet header. */
2318           vnet_rewrite_two_headers (adj0[0], adj1[0],
2319                                     ip0, ip1,
2320                                     sizeof (ethernet_header_t));
2321
2322           if (is_midchain)
2323           {
2324               adj0->sub_type.midchain.fixup_func(vm, adj0, p0);
2325               adj1->sub_type.midchain.fixup_func(vm, adj1, p1);
2326           }
2327       
2328           vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
2329                                            to_next, n_left_to_next,
2330                                            pi0, pi1, next0, next1);
2331         }
2332
2333       while (n_left_from > 0 && n_left_to_next > 0)
2334         {
2335           ip_adjacency_t * adj0;
2336           vlib_buffer_t * p0;
2337           ip4_header_t * ip0;
2338           u32 pi0, rw_len0, adj_index0, next0, error0, checksum0;
2339           u32 next0_override;
2340           u32 tx_sw_if_index0;
2341
2342           if (rewrite_for_locally_received_packets)
2343               next0_override = 0;
2344
2345           pi0 = to_next[0] = from[0];
2346
2347           p0 = vlib_get_buffer (vm, pi0);
2348
2349           adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx];
2350
2351           /* We should never rewrite a pkt using the MISS adjacency */
2352           ASSERT(adj_index0);
2353
2354           adj0 = ip_get_adjacency (lm, adj_index0);
2355       
2356           ip0 = vlib_buffer_get_current (p0);
2357
2358           error0 = IP4_ERROR_NONE;
2359           next0 = IP4_REWRITE_NEXT_DROP;            /* drop on error */
2360
2361           /* Decrement TTL & update checksum. */
2362           if (! rewrite_for_locally_received_packets)
2363             {
2364               i32 ttl0 = ip0->ttl;
2365
2366               checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
2367
2368               checksum0 += checksum0 >= 0xffff;
2369
2370               ip0->checksum = checksum0;
2371
2372               ASSERT (ip0->ttl > 0);
2373
2374               ttl0 -= 1;
2375
2376               ip0->ttl = ttl0;
2377
2378               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2379
2380               if (PREDICT_FALSE(ttl0 <= 0))
2381                 {
2382                   /*
2383                    * If the ttl drops below 1 when forwarding, generate
2384                    * an ICMP response.
2385                    */
2386                   error0 = IP4_ERROR_TIME_EXPIRED;
2387                   next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
2388                   vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32)~0;
2389                   icmp4_error_set_vnet_buffer(p0, ICMP4_time_exceeded,
2390                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2391                 }
2392             }
2393
2394           if (rewrite_for_locally_received_packets)
2395             {
2396               /* 
2397                * We have to override the next_index in ARP adjacencies,
2398                * because they're set up for ip4-arp, not this node...
2399                */
2400               if (PREDICT_FALSE(adj0->lookup_next_index
2401                                 == IP_LOOKUP_NEXT_ARP))
2402                 next0_override = IP4_REWRITE_NEXT_ARP;
2403             }
2404
2405           /* Guess we are only writing on simple Ethernet header. */
2406           vnet_rewrite_one_header (adj0[0], ip0, 
2407                                    sizeof (ethernet_header_t));
2408           
2409           /* Update packet buffer attributes/set output interface. */
2410           rw_len0 = adj0[0].rewrite_header.data_bytes;
2411           vnet_buffer(p0)->ip.save_rewrite_length = rw_len0;
2412           
2413           if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
2414               vlib_increment_combined_counter 
2415                   (&adjacency_counters,
2416                    cpu_index, adj_index0, 
2417                    /* packet increment */ 0,
2418                    /* byte increment */ rw_len0-sizeof(ethernet_header_t));
2419           
2420           /* Check MTU of outgoing interface. */
2421           error0 = (vlib_buffer_length_in_chain (vm, p0) 
2422                     > adj0[0].rewrite_header.max_l3_packet_bytes
2423                     ? IP4_ERROR_MTU_EXCEEDED
2424                     : error0);
2425
2426           p0->error = error_node->errors[error0];
2427
2428           /* Don't adjust the buffer for ttl issue; icmp-error node wants
2429            * to see the IP headerr */
2430           if (PREDICT_TRUE(error0 == IP4_ERROR_NONE))
2431             {
2432               p0->current_data -= rw_len0;
2433               p0->current_length += rw_len0;
2434               tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
2435
2436               vnet_buffer (p0)->sw_if_index[VLIB_TX] = tx_sw_if_index0;
2437               next0 = adj0[0].rewrite_header.next_index;
2438
2439               if (is_midchain)
2440                 {
2441                   adj0->sub_type.midchain.fixup_func(vm, adj0, p0);
2442                 }
2443
2444               if (PREDICT_FALSE 
2445                   (clib_bitmap_get (lm->tx_sw_if_has_ip_output_features, 
2446                                     tx_sw_if_index0)))
2447                   {
2448                     p0->current_config_index = 
2449                       vec_elt (cm->config_index_by_sw_if_index, 
2450                                tx_sw_if_index0);
2451                     vnet_get_config_data (&cm->config_main,
2452                                           &p0->current_config_index,
2453                                           &next0,
2454                                           /* # bytes of config data */ 0);
2455                   }
2456             }
2457
2458           if (rewrite_for_locally_received_packets)
2459               next0 = next0 && next0_override ? next0_override : next0;
2460
2461           from += 1;
2462           n_left_from -= 1;
2463           to_next += 1;
2464           n_left_to_next -= 1;
2465       
2466           vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
2467                                            to_next, n_left_to_next,
2468                                            pi0, next0);
2469         }
2470   
2471       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2472     }
2473
2474   /* Need to do trace after rewrites to pick up new packet data. */
2475   if (node->flags & VLIB_NODE_FLAG_TRACE)
2476     ip4_forward_next_trace (vm, node, frame, adj_rx_tx);
2477
2478   return frame->n_vectors;
2479 }
2480
2481
2482 /** @brief IPv4 transit rewrite node.
2483     @node ip4-rewrite-transit
2484
2485     This is the IPv4 transit-rewrite node: decrement TTL, fix the ipv4
2486     header checksum, fetch the ip adjacency, check the outbound mtu,
2487     apply the adjacency rewrite, and send pkts to the adjacency
2488     rewrite header's rewrite_next_index.
2489
2490     @param vm vlib_main_t corresponding to the current thread
2491     @param node vlib_node_runtime_t
2492     @param frame vlib_frame_t whose contents should be dispatched
2493
2494     @par Graph mechanics: buffer metadata, next index usage
2495
2496     @em Uses:
2497     - <code>vnet_buffer(b)->ip.adj_index[VLIB_TX]</code>
2498         - the rewrite adjacency index
2499     - <code>adj->lookup_next_index</code>
2500         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2501           the packet will be dropped. 
2502     - <code>adj->rewrite_header</code>
2503         - Rewrite string length, rewrite string, next_index
2504
2505     @em Sets:
2506     - <code>b->current_data, b->current_length</code>
2507         - Updated net of applying the rewrite string
2508
2509     <em>Next Indices:</em>
2510     - <code> adj->rewrite_header.next_index </code>
2511       or @c error-drop 
2512 */
2513 static uword
2514 ip4_rewrite_transit (vlib_main_t * vm,
2515                      vlib_node_runtime_t * node,
2516                      vlib_frame_t * frame)
2517 {
2518   return ip4_rewrite_inline (vm, node, frame,
2519                              /* rewrite_for_locally_received_packets */ 0, 0);
2520 }
2521
2522 /** @brief IPv4 local rewrite node.
2523     @node ip4-rewrite-local
2524
    This is the IPv4 local rewrite node. Fetch the ip adjacency, check
    the outbound interface mtu, apply the adjacency rewrite, and send
    pkts to the adjacency rewrite header's rewrite_next_index. Also
    deals with the corner case of a packet (e.g. an icmp4) sent with
    src = dst = interface addr.
2530
2531     @param vm vlib_main_t corresponding to the current thread
2532     @param node vlib_node_runtime_t
2533     @param frame vlib_frame_t whose contents should be dispatched
2534
2535     @par Graph mechanics: buffer metadata, next index usage
2536
2537     @em Uses:
2538     - <code>vnet_buffer(b)->ip.adj_index[VLIB_RX]</code>
2539         - the rewrite adjacency index
2540     - <code>adj->lookup_next_index</code>
2541         - Must be IP_LOOKUP_NEXT_REWRITE or IP_LOOKUP_NEXT_ARP, otherwise
2542           the packet will be dropped. 
2543     - <code>adj->rewrite_header</code>
2544         - Rewrite string length, rewrite string, next_index
2545
2546     @em Sets:
2547     - <code>b->current_data, b->current_length</code>
2548         - Updated net of applying the rewrite string
2549
2550     <em>Next Indices:</em>
2551     - <code> adj->rewrite_header.next_index </code>
2552       or @c error-drop 
2553 */
2554
2555 static uword
2556 ip4_rewrite_local (vlib_main_t * vm,
2557                    vlib_node_runtime_t * node,
2558                    vlib_frame_t * frame)
2559 {
2560   return ip4_rewrite_inline (vm, node, frame,
2561                              /* rewrite_for_locally_received_packets */ 1, 0);
2562 }
2563
2564 static uword
2565 ip4_midchain (vlib_main_t * vm,
2566               vlib_node_runtime_t * node,
2567               vlib_frame_t * frame)
2568 {
2569   return ip4_rewrite_inline (vm, node, frame,
2570                              /* rewrite_for_locally_received_packets */ 0, 1);
2571 }
2572
2573 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2574   .function = ip4_rewrite_transit,
2575   .name = "ip4-rewrite-transit",
2576   .vector_size = sizeof (u32),
2577
2578   .format_trace = format_ip4_rewrite_trace,
2579
2580   .n_next_nodes = 3,
2581   .next_nodes = {
2582     [IP4_REWRITE_NEXT_DROP] = "error-drop",
2583     [IP4_REWRITE_NEXT_ARP] = "ip4-arp",
2584     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2585   },
2586 };
2587
2588 VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_node, ip4_rewrite_transit)
2589
2590 VLIB_REGISTER_NODE (ip4_midchain_node) = {
2591   .function = ip4_midchain,
2592   .name = "ip4-midchain",
2593   .vector_size = sizeof (u32),
2594
2595   .format_trace = format_ip4_forward_next_trace,
2596
2597   .sibling_of = "ip4-rewrite-transit",
2598 };
2599
2600 VLIB_NODE_FUNCTION_MULTIARCH (ip4_midchain_node, ip4_midchain)
2601
2602 VLIB_REGISTER_NODE (ip4_rewrite_local_node) = {
2603   .function = ip4_rewrite_local,
2604   .name = "ip4-rewrite-local",
2605   .vector_size = sizeof (u32),
2606
2607   .sibling_of = "ip4-rewrite-transit",
2608
2609   .format_trace = format_ip4_rewrite_trace,
2610
2611   .n_next_nodes = 0,
2612 };
2613
2614 VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_local_node, ip4_rewrite_local)
2615
2616 static clib_error_t *
2617 add_del_interface_table (vlib_main_t * vm,
2618                          unformat_input_t * input,
2619                          vlib_cli_command_t * cmd)
2620 {
2621   vnet_main_t * vnm = vnet_get_main();
2622   clib_error_t * error = 0;
2623   u32 sw_if_index, table_id;
2624
2625   sw_if_index = ~0;
2626
2627   if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
2628     {
2629       error = clib_error_return (0, "unknown interface `%U'",
2630                                  format_unformat_error, input);
2631       goto done;
2632     }
2633
2634   if (unformat (input, "%d", &table_id))
2635     ;
2636   else
2637     {
2638       error = clib_error_return (0, "expected table id `%U'",
2639                                  format_unformat_error, input);
2640       goto done;
2641     }
2642
2643   {
2644     ip4_main_t * im = &ip4_main;
2645     u32 fib_index;
2646
2647     fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4,
2648                                                    table_id);
2649
2650     //
2651     // FIXME-LATER
2652     //  changing an interface's table has consequences for any connecteds
2653     //  and adj-fibs already installed.
2654     //
2655     vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
2656     im->fib_index_by_sw_if_index[sw_if_index] = fib_index;
2657   }
2658
2659  done:
2660   return error;
2661 }
2662
2663 /*?
2664  * Place the indicated interface into the supplied VRF
2665  *
2666  * @cliexpar
2667  * @cliexstart{set interface ip table}
2668  *
2669  *  vpp# set interface ip table GigabitEthernet2/0/0 2
2670  *
2671  * Interface addresses added after setting the interface IP table end up in the indicated VRF table.
2672  * Predictable but potentially counter-intuitive results occur if you provision interface addresses in multiple FIBs.
2673  * Upon RX, packets will be processed in the last IP table ID provisioned.
2674  * It might be marginally useful to evade source RPF drops to put an interface address into multiple FIBs.
2675  * @cliexend
2676  ?*/
2677 VLIB_CLI_COMMAND (set_interface_ip_table_command, static) = {
2678   .path = "set interface ip table",
2679   .function = add_del_interface_table,
2680   .short_help = "Add/delete FIB table id for interface",
2681 };
2682
2683
2684 static uword
2685 ip4_lookup_multicast (vlib_main_t * vm,
2686                       vlib_node_runtime_t * node,
2687                       vlib_frame_t * frame)
2688 {
2689   ip4_main_t * im = &ip4_main;
2690   vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
2691   u32 n_left_from, n_left_to_next, * from, * to_next;
2692   ip_lookup_next_t next;
2693   u32 cpu_index = os_get_cpu_number();
2694
2695   from = vlib_frame_vector_args (frame);
2696   n_left_from = frame->n_vectors;
2697   next = node->cached_next_index;
2698
2699   while (n_left_from > 0)
2700     {
2701       vlib_get_next_frame (vm, node, next,
2702                            to_next, n_left_to_next);
2703
2704       while (n_left_from >= 4 && n_left_to_next >= 2)
2705         {
2706           vlib_buffer_t * p0, * p1;
2707           u32 pi0, pi1, lb_index0, lb_index1, wrong_next;
2708           ip_lookup_next_t next0, next1;
2709           ip4_header_t * ip0, * ip1;
2710           u32 fib_index0, fib_index1;
2711           const dpo_id_t *dpo0, *dpo1;
2712           const load_balance_t * lb0, * lb1;
2713
2714           /* Prefetch next iteration. */
2715           {
2716             vlib_buffer_t * p2, * p3;
2717
2718             p2 = vlib_get_buffer (vm, from[2]);
2719             p3 = vlib_get_buffer (vm, from[3]);
2720
2721             vlib_prefetch_buffer_header (p2, LOAD);
2722             vlib_prefetch_buffer_header (p3, LOAD);
2723
2724             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
2725             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
2726           }
2727
2728           pi0 = to_next[0] = from[0];
2729           pi1 = to_next[1] = from[1];
2730
2731           p0 = vlib_get_buffer (vm, pi0);
2732           p1 = vlib_get_buffer (vm, pi1);
2733
2734           ip0 = vlib_buffer_get_current (p0);
2735           ip1 = vlib_buffer_get_current (p1);
2736
2737           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
2738           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
2739           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
2740             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
2741           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
2742             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
2743
2744           lb_index0 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0),
2745                                                &ip0->dst_address);
2746           lb_index1 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index1),
2747                                                &ip1->dst_address);
2748
2749           lb0 = load_balance_get (lb_index0);
2750           lb1 = load_balance_get (lb_index1);
2751
2752           ASSERT (lb0->lb_n_buckets > 0);
2753           ASSERT (is_pow2 (lb0->lb_n_buckets));
2754           ASSERT (lb1->lb_n_buckets > 0);
2755           ASSERT (is_pow2 (lb1->lb_n_buckets));
2756
2757           vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash 
2758               (ip0, lb0->lb_hash_config);
2759                                                                   
2760           vnet_buffer (p1)->ip.flow_hash = ip4_compute_flow_hash 
2761               (ip1, lb1->lb_hash_config);
2762
2763           dpo0 = load_balance_get_bucket_i(lb0,
2764                                            (vnet_buffer (p0)->ip.flow_hash &
2765                                             (lb0->lb_n_buckets_minus_1)));
2766           dpo1 = load_balance_get_bucket_i(lb1,
2767                                            (vnet_buffer (p1)->ip.flow_hash &
2768                                             (lb0->lb_n_buckets_minus_1)));
2769
2770           next0 = dpo0->dpoi_next_node;
2771           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
2772           next1 = dpo1->dpoi_next_node;
2773           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
2774
2775           if (1) /* $$$$$$ HACK FIXME */
2776           vlib_increment_combined_counter 
2777               (cm, cpu_index, lb_index0, 1,
2778                vlib_buffer_length_in_chain (vm, p0));
2779           if (1) /* $$$$$$ HACK FIXME */
2780           vlib_increment_combined_counter 
2781               (cm, cpu_index, lb_index1, 1,
2782                vlib_buffer_length_in_chain (vm, p1));
2783
2784           from += 2;
2785           to_next += 2;
2786           n_left_to_next -= 2;
2787           n_left_from -= 2;
2788
2789           wrong_next = (next0 != next) + 2*(next1 != next);
2790           if (PREDICT_FALSE (wrong_next != 0))
2791             {
2792               switch (wrong_next)
2793                 {
2794                 case 1:
2795                   /* A B A */
2796                   to_next[-2] = pi1;
2797                   to_next -= 1;
2798                   n_left_to_next += 1;
2799                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
2800                   break;
2801
2802                 case 2:
2803                   /* A A B */
2804                   to_next -= 1;
2805                   n_left_to_next += 1;
2806                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
2807                   break;
2808
2809                 case 3:
2810                   /* A B C */
2811                   to_next -= 2;
2812                   n_left_to_next += 2;
2813                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
2814                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
2815                   if (next0 == next1)
2816                     {
2817                       /* A B B */
2818                       vlib_put_next_frame (vm, node, next, n_left_to_next);
2819                       next = next1;
2820                       vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
2821                     }
2822                 }
2823             }
2824         }
2825     
2826       while (n_left_from > 0 && n_left_to_next > 0)
2827         {
2828           vlib_buffer_t * p0;
2829           ip4_header_t * ip0;
2830           u32 pi0, lb_index0;
2831           ip_lookup_next_t next0;
2832           u32 fib_index0;
2833           const dpo_id_t *dpo0;
2834           const load_balance_t * lb0;
2835
2836           pi0 = from[0];
2837           to_next[0] = pi0;
2838
2839           p0 = vlib_get_buffer (vm, pi0);
2840
2841           ip0 = vlib_buffer_get_current (p0);
2842
2843           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, 
2844                                 vnet_buffer (p0)->sw_if_index[VLIB_RX]);
2845           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
2846               fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
2847           
2848           lb_index0 = ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0),
2849                                                &ip0->dst_address);
2850
2851           lb0 = load_balance_get (lb_index0);
2852
2853           ASSERT (lb0->lb_n_buckets > 0);
2854           ASSERT (is_pow2 (lb0->lb_n_buckets));
2855
2856           vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash 
2857               (ip0, lb0->lb_hash_config);
2858
2859           dpo0 = load_balance_get_bucket_i(lb0,
2860                                            (vnet_buffer (p0)->ip.flow_hash &
2861                                             (lb0->lb_n_buckets_minus_1)));
2862
2863           next0 = dpo0->dpoi_next_node;
2864           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
2865
2866           if (1) /* $$$$$$ HACK FIXME */
2867               vlib_increment_combined_counter 
2868                   (cm, cpu_index, lb_index0, 1,
2869                    vlib_buffer_length_in_chain (vm, p0));
2870
2871           from += 1;
2872           to_next += 1;
2873           n_left_to_next -= 1;
2874           n_left_from -= 1;
2875
2876           if (PREDICT_FALSE (next0 != next))
2877             {
2878               n_left_to_next += 1;
2879               vlib_put_next_frame (vm, node, next, n_left_to_next);
2880               next = next0;
2881               vlib_get_next_frame (vm, node, next,
2882                                    to_next, n_left_to_next);
2883               to_next[0] = pi0;
2884               to_next += 1;
2885               n_left_to_next -= 1;
2886             }
2887         }
2888
2889       vlib_put_next_frame (vm, node, next, n_left_to_next);
2890     }
2891
2892   if (node->flags & VLIB_NODE_FLAG_TRACE)
2893       ip4_forward_next_trace(vm, node, frame, VLIB_TX);
2894
2895   return frame->n_vectors;
2896 }
2897
2898 VLIB_REGISTER_NODE (ip4_lookup_multicast_node,static) = {
2899   .function = ip4_lookup_multicast,
2900   .name = "ip4-lookup-multicast",
2901   .vector_size = sizeof (u32),
2902   .sibling_of = "ip4-lookup",
2903   .format_trace = format_ip4_lookup_trace,
2904
2905   .n_next_nodes = 0,
2906 };
2907
2908 VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_multicast_node, ip4_lookup_multicast)
2909
2910 VLIB_REGISTER_NODE (ip4_multicast_node,static) = {
2911   .function = ip4_drop,
2912   .name = "ip4-multicast",
2913   .vector_size = sizeof (u32),
2914
2915   .format_trace = format_ip4_forward_next_trace,
2916
2917   .n_next_nodes = 1,
2918   .next_nodes = {
2919     [0] = "error-drop",
2920   },
2921 };
2922
2923 int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0)
2924 {
2925   ip4_fib_mtrie_t * mtrie0;
2926   ip4_fib_mtrie_leaf_t leaf0;
2927   u32 lbi0;
2928     
2929   mtrie0 = &ip4_fib_get (fib_index0)->mtrie;
2930
2931   leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
2932   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 0);
2933   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 1);
2934   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
2935   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);
2936   
2937   /* Handle default route. */
2938   leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
2939   
2940   lbi0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
2941   
2942   return lbi0 == ip4_fib_table_lookup_lb (ip4_fib_get(fib_index0), a);
2943 }
2944  
2945 static clib_error_t *
2946 test_lookup_command_fn (vlib_main_t * vm,
2947                         unformat_input_t * input,
2948                         vlib_cli_command_t * cmd)
2949 {
2950   u32 table_id = 0;
2951   f64 count = 1;
2952   u32 n;
2953   int i;
2954   ip4_address_t ip4_base_address;
2955   u64 errors = 0;
2956
2957   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
2958       if (unformat (input, "table %d", &table_id))
2959         ;
2960       else if (unformat (input, "count %f", &count))
2961         ;
2962
2963       else if (unformat (input, "%U",
2964                          unformat_ip4_address, &ip4_base_address))
2965         ;
2966       else
2967         return clib_error_return (0, "unknown input `%U'",
2968                                   format_unformat_error, input);
2969   }
2970
2971   n = count;
2972
2973   for (i = 0; i < n; i++)
2974     {
2975       if (!ip4_lookup_validate (&ip4_base_address, table_id))
2976         errors++;
2977
2978       ip4_base_address.as_u32 = 
2979         clib_host_to_net_u32 (1 + 
2980                               clib_net_to_host_u32 (ip4_base_address.as_u32));
2981     }
2982
2983   if (errors) 
2984     vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
2985   else
2986     vlib_cli_output (vm, "No errors in %d lookups\n", n);
2987
2988   return 0;
2989 }
2990
2991 VLIB_CLI_COMMAND (lookup_test_command, static) = {
2992     .path = "test lookup",
2993     .short_help = "test lookup",
2994     .function = test_lookup_command_fn,
2995 };
2996
2997 int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
2998 {
2999   ip4_main_t * im4 = &ip4_main;
3000   ip4_fib_t * fib;
3001   uword * p = hash_get (im4->fib_index_by_table_id, table_id);
3002
3003   if (p == 0)
3004     return VNET_API_ERROR_NO_SUCH_FIB;
3005
3006   fib = ip4_fib_get (p[0]);
3007
3008   fib->flow_hash_config = flow_hash_config;
3009   return 0;
3010 }
3011  
3012 static clib_error_t *
3013 set_ip_flow_hash_command_fn (vlib_main_t * vm,
3014                              unformat_input_t * input,
3015                              vlib_cli_command_t * cmd)
3016 {
3017   int matched = 0;
3018   u32 table_id = 0;
3019   u32 flow_hash_config = 0;
3020   int rv;
3021
3022   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
3023     if (unformat (input, "table %d", &table_id))
3024       matched = 1;
3025 #define _(a,v) \
3026     else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
3027     foreach_flow_hash_bit
3028 #undef _
3029     else break;
3030   }
3031   
3032   if (matched == 0)
3033     return clib_error_return (0, "unknown input `%U'",
3034                               format_unformat_error, input);
3035   
3036   rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
3037   switch (rv)
3038     {
3039     case 0:
3040       break;
3041       
3042     case VNET_API_ERROR_NO_SUCH_FIB:
3043       return clib_error_return (0, "no such FIB table %d", table_id);
3044       
3045     default:
3046       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
3047       break;
3048     }
3049   
3050   return 0;
3051 }
3052  
3053 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) = {
3054   .path = "set ip flow-hash",
3055   .short_help = 
3056   "set ip table flow-hash table <fib-id> src dst sport dport proto reverse",
3057   .function = set_ip_flow_hash_command_fn,
3058 };
3059  
3060 int vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index, 
3061                                  u32 table_index)
3062 {
3063   vnet_main_t * vnm = vnet_get_main();
3064   vnet_interface_main_t * im = &vnm->interface_main;
3065   ip4_main_t * ipm = &ip4_main;
3066   ip_lookup_main_t * lm = &ipm->lookup_main;
3067   vnet_classify_main_t * cm = &vnet_classify_main;
3068   ip4_address_t *if_addr;
3069
3070   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
3071     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
3072
3073   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
3074     return VNET_API_ERROR_NO_SUCH_ENTRY;
3075
3076   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
3077   lm->classify_table_index_by_sw_if_index [sw_if_index] = table_index;
3078
3079   if_addr = ip4_interface_first_address (ipm, sw_if_index, NULL);
3080
3081   if (NULL != if_addr)
3082   {
3083       fib_prefix_t pfx = {
3084           .fp_len = 32,
3085           .fp_proto = FIB_PROTOCOL_IP4,
3086           .fp_addr.ip4 = *if_addr,
3087       };
3088       u32 fib_index;
3089
3090       fib_index = fib_table_get_index_for_sw_if_index(FIB_PROTOCOL_IP4,
3091                                                       sw_if_index);
3092
3093
3094       if (table_index != (u32) ~0)
3095       {
3096           dpo_id_t dpo = DPO_NULL;
3097
3098           dpo_set(&dpo,
3099                   DPO_CLASSIFY,
3100                   DPO_PROTO_IP4,
3101                   classify_dpo_create(FIB_PROTOCOL_IP4,
3102                                       table_index));
3103
3104           fib_table_entry_special_dpo_add(fib_index,
3105                                           &pfx,
3106                                           FIB_SOURCE_CLASSIFY,
3107                                           FIB_ENTRY_FLAG_NONE,
3108                                           &dpo);
3109           dpo_reset(&dpo);
3110       }
3111       else
3112       {
3113           fib_table_entry_special_remove(fib_index,
3114                                          &pfx,
3115                                          FIB_SOURCE_CLASSIFY);
3116       }
3117   }
3118
3119   return 0;
3120 }
3121
3122 static clib_error_t *
3123 set_ip_classify_command_fn (vlib_main_t * vm,
3124                             unformat_input_t * input,
3125                             vlib_cli_command_t * cmd)
3126 {
3127   u32 table_index = ~0;
3128   int table_index_set = 0;
3129   u32 sw_if_index = ~0;
3130   int rv;
3131   
3132   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
3133     if (unformat (input, "table-index %d", &table_index))
3134       table_index_set = 1;
3135     else if (unformat (input, "intfc %U", unformat_vnet_sw_interface, 
3136                        vnet_get_main(), &sw_if_index))
3137       ;
3138     else
3139       break;
3140   }
3141       
3142   if (table_index_set == 0)
3143     return clib_error_return (0, "classify table-index must be specified");
3144
3145   if (sw_if_index == ~0)
3146     return clib_error_return (0, "interface / subif must be specified");
3147
3148   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
3149
3150   switch (rv)
3151     {
3152     case 0:
3153       break;
3154
3155     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
3156       return clib_error_return (0, "No such interface");
3157
3158     case VNET_API_ERROR_NO_SUCH_ENTRY:
3159       return clib_error_return (0, "No such classifier table");
3160     }
3161   return 0;
3162 }
3163
3164 VLIB_CLI_COMMAND (set_ip_classify_command, static) = {
3165     .path = "set ip classify",
3166     .short_help = 
3167     "set ip classify intfc <int> table-index <index>",
3168     .function = set_ip_classify_command_fn,
3169 };
3170