VPP-179 Fix adjacency reference-count botches
[vpp.git] / vnet / vnet / ip / ip4_forward.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  * ip/ip4_forward.c: IP v4 forwarding
17  *
18  * Copyright (c) 2008 Eliot Dresselhaus
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining
21  * a copy of this software and associated documentation files (the
22  * "Software"), to deal in the Software without restriction, including
23  * without limitation the rights to use, copy, modify, merge, publish,
24  * distribute, sublicense, and/or sell copies of the Software, and to
25  * permit persons to whom the Software is furnished to do so, subject to
26  * the following conditions:
27  *
28  * The above copyright notice and this permission notice shall be
29  * included in all copies or substantial portions of the Software.
30  *
31  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
35  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
36  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38  */
39
40 #include <vnet/vnet.h>
41 #include <vnet/ip/ip.h>
42 #include <vnet/ethernet/ethernet.h>     /* for ethernet_header_t */
43 #include <vnet/ethernet/arp_packet.h>   /* for ethernet_arp_header_t */
44 #include <vnet/ppp/ppp.h>
45 #include <vnet/srp/srp.h>       /* for srp_hw_interface_class */
46 #include <vnet/api_errno.h>     /* for API error numbers */
47
48 /* This is really, really simple but stupid fib. */
/*
 * Hash-table based longest-prefix-match lookup of DST in the FIB selected
 * by FIB_INDEX.
 *
 * The per-prefix-length hash tables are walked from the last array slot
 * (the longest prefix) downwards, and the adjacency index stored for the
 * first matching prefix is returned.  When DISABLE_DEFAULT_ROUTE is
 * non-zero the length-0 table is not consulted.  If nothing matches, the
 * lookup main's miss_adj_index is returned.
 */
49 u32
50 ip4_fib_lookup_with_table (ip4_main_t * im, u32 fib_index,
51                            ip4_address_t * dst,
52                            u32 disable_default_route)
53 {
54   ip_lookup_main_t * lm = &im->lookup_main;
55   ip4_fib_t * fib = vec_elt_at_index (im->fibs, fib_index);
56   uword * p, * hash, key;
57   i32 i, i_min, dst_address, ai;
58
  /* Shortest prefix length that will still be examined. */
59   i_min = disable_default_route ? 1 : 0;
  /* DST is read through clib_mem_unaligned, so it need not be aligned. */
60   dst_address = clib_mem_unaligned (&dst->data_u32, u32);
61   for (i = ARRAY_LEN (fib->adj_index_by_dst_address) - 1; i >= i_min; i--)
62     {
63       hash = fib->adj_index_by_dst_address[i];
      /* No table has been created for this prefix length. */
64       if (! hash)
65         continue;
66
      /* Keys are stored pre-masked, so mask the address the same way. */
67       key = dst_address & im->fib_masks[i];
68       if ((p = hash_get (hash, key)) != 0)
69         {
          /* First word of the hash value is the adjacency index. */
70           ai = p[0];
71           goto done;
72         }
73     }
74     
75   /* Nothing matches in table. */
76   ai = lm->miss_adj_index;
77
78  done:
79   return ai;
80 }
81
/*
 * Append a new FIB to im->fibs and register it under TABLE_ID.
 *
 * The table-id -> fib-index mapping is recorded first, using the current
 * vector length as the index the new element is about to receive.  The new
 * FIB gets the default flow hash configuration, no classify tables (~0)
 * and an initialized mtrie.  Returns a pointer to the new element.
 */
82 static ip4_fib_t *
83 create_fib_with_table_id (ip4_main_t * im, u32 table_id)
84 {
85   ip4_fib_t * fib;
  /* Must run before vec_add2: vec_len is the index of the element to be added. */
86   hash_set (im->fib_index_by_table_id, table_id, vec_len (im->fibs));
87   vec_add2 (im->fibs, fib, 1);
88   fib->table_id = table_id;
89   fib->index = fib - im->fibs;
90   fib->flow_hash_config = IP_FLOW_HASH_DEFAULT;
91   fib->fwd_classify_table_index = ~0;
92   fib->rev_classify_table_index = ~0;
93   ip4_mtrie_init (&fib->mtrie);
94   return fib;
95 }
96
/*
 * Return the FIB designated by TABLE_INDEX_OR_ID, creating it if needed.
 *
 * If FLAGS contains IP4_ROUTE_FLAG_FIB_INDEX the argument is used directly
 * as an index into im->fibs.  Otherwise it is a table id:
 *   - ~0 means "allocate one": the lowest table id, counting up from 0,
 *     that is not yet present in fib_index_by_table_id is chosen and a new
 *     FIB is created for it;
 *   - an id that is not yet known gets a new FIB;
 *   - a known id is translated to its FIB index.
 */
97 ip4_fib_t *
98 find_ip4_fib_by_table_index_or_id (ip4_main_t * im, 
99                                    u32 table_index_or_id, u32 flags)
100 {
101   uword * p, fib_index;
102
103   fib_index = table_index_or_id;
104   if (! (flags & IP4_ROUTE_FLAG_FIB_INDEX))
105     {
106       if (table_index_or_id == ~0) {
        /* Probe ids 0, 1, 2, ... until one is unused. */
107         table_index_or_id = 0;
108         while ((p = hash_get (im->fib_index_by_table_id, table_index_or_id))) {
109           table_index_or_id++;
110         }
111         return create_fib_with_table_id (im, table_index_or_id);
112       }
113
114       p = hash_get (im->fib_index_by_table_id, table_index_or_id);
115       if (! p)
116         return create_fib_with_table_id (im, table_index_or_id);
117       fib_index = p[0];
118     }
119   return vec_elt_at_index (im->fibs, fib_index);
120 }
121
/*
 * Create the hash table that holds FIB's routes of prefix length
 * ADDRESS_LENGTH.
 *
 * Also recomputes lm->fib_result_n_words from lm->fib_result_n_bytes
 * (rounded up to whole uwords) and makes sure the FIB's scratch vectors
 * new_hash_values / old_hash_values are large enough to hold one complete
 * hash value; any newly added elements are filled with ~0.
 */
122 static void
123 ip4_fib_init_adj_index_by_dst_address (ip_lookup_main_t * lm,
124                                        ip4_fib_t * fib,
125                                        u32 address_length)
126 {
127   hash_t * h;
128   uword max_index;
129
130   ASSERT (lm->fib_result_n_bytes >= sizeof (uword));
131   lm->fib_result_n_words = round_pow2 (lm->fib_result_n_bytes, sizeof (uword)) / sizeof (uword);
132
133   fib->adj_index_by_dst_address[address_length] =
134     hash_create (32 /* elts */, lm->fib_result_n_words * sizeof (uword));
135
  /* Ask the hash not to shrink itself automatically. */
136   hash_set_flags (fib->adj_index_by_dst_address[address_length],
137                   HASH_FLAG_NO_AUTO_SHRINK);
138
  /* Highest scratch-vector index needed to cover one hash value. */
139   h = hash_header (fib->adj_index_by_dst_address[address_length]);
140   max_index = (hash_value_bytes (h) / sizeof (fib->new_hash_values[0])) - 1;
141
142   /* Initialize new/old hash value vectors. */
143   vec_validate_init_empty (fib->new_hash_values, max_index, ~0);
144   vec_validate_init_empty (fib->old_hash_values, max_index, ~0);
145 }
146
/*
 * Store ADJ_INDEX as the result for DST_ADDRESS_U32/DST_ADDRESS_LENGTH in
 * FIB's hash table for that prefix length, then notify route callbacks.
 *
 * The FIB's scratch vectors are reset to all-ones first; new_hash_values[0]
 * carries the new adjacency index into _hash_set3, which is also handed
 * old_hash_values (callers read old_hash_values[0] afterwards as the
 * previous adjacency index, ~0 meaning none).  Every registered callback
 * whose required_flags are all present in FLAGS is invoked with both
 * vectors; afterwards new_hash_values is copied back into the hash entry,
 * so changes a callback made to new_hash_values are stored.
 *
 * DST_ADDRESS_U32 is used as the hash key as-is; the caller is expected to
 * have masked it already.
 */
147 static void
148 ip4_fib_set_adj_index (ip4_main_t * im,
149                        ip4_fib_t * fib,
150                        u32 flags,
151                        u32 dst_address_u32,
152                        u32 dst_address_length,
153                        u32 adj_index)
154 {
155   ip_lookup_main_t * lm = &im->lookup_main;
156   uword * hash;
157
  /* Reset both scratch vectors to all-ones (~0 == "no value"). */
158   if (vec_bytes(fib->old_hash_values))
159     memset (fib->old_hash_values, ~0, vec_bytes (fib->old_hash_values));
160   if (vec_bytes(fib->new_hash_values))
161     memset (fib->new_hash_values, ~0, vec_bytes (fib->new_hash_values));
162   fib->new_hash_values[0] = adj_index;
163
164   /* Make sure adj index is valid. */
165   if (CLIB_DEBUG > 0)
166     (void) ip_get_adjacency (lm, adj_index);
167
168   hash = fib->adj_index_by_dst_address[dst_address_length];
169
170   hash = _hash_set3 (hash, dst_address_u32,
171                      fib->new_hash_values,
172                      fib->old_hash_values);
173
  /* The returned pointer is stored back as the table for this length. */
174   fib->adj_index_by_dst_address[dst_address_length] = hash;
175
176   if (vec_len (im->add_del_route_callbacks) > 0)
177     {
178       ip4_add_del_route_callback_t * cb;
179       ip4_address_t d;
180       uword * p;
181
182       d.data_u32 = dst_address_u32;
183       vec_foreach (cb, im->add_del_route_callbacks)
184         if ((flags & cb->required_flags) == cb->required_flags)
185           cb->function (im, cb->function_opaque,
186                         fib, flags,
187                         &d, dst_address_length,
188                         fib->old_hash_values,
189                         fib->new_hash_values);
190
      /* Write back new_hash_values, including anything callbacks changed. */
191       p = hash_get (hash, dst_address_u32);
192       clib_memcpy (p, fib->new_hash_values, vec_bytes (fib->new_hash_values));
193     }
194 }
195
/*
 * Add or delete the route a->dst_address/a->dst_address_length in the FIB
 * selected by a->table_index_or_table_id (interpreted according to
 * a->flags; the FIB is created if it does not exist).
 *
 * Adjacency used for an add:
 *   - a->n_add_adj > 0: a new adjacency block is created from a->add_adj
 *     and the adjacency callbacks are run for it;
 *   - otherwise a->adj_index is used as given.
 *
 * Steps: mask the destination, make sure the per-length hash exists,
 * update the hash (unset on IP4_ROUTE_FLAG_DEL, set otherwise), run the
 * route callbacks, update the mtrie, and finally release the adjacency the
 * route pointed at before if it changed.  IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY
 * suppresses both the share-count correction and the release.
 */
196 void ip4_add_del_route (ip4_main_t * im, ip4_add_del_route_args_t * a)
197 {
198   ip_lookup_main_t * lm = &im->lookup_main;
199   ip4_fib_t * fib;
200   u32 dst_address, dst_address_length, adj_index, old_adj_index;
201   uword * hash, is_del;
202   ip4_add_del_route_callback_t * cb;
203
204   /* Either create new adjacency or use given one depending on arguments. */
205   if (a->n_add_adj > 0)
206     {
207       ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index);
208       ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0);
209     }
210   else
211     adj_index = a->adj_index;
212
213   dst_address = a->dst_address.data_u32;
214   dst_address_length = a->dst_address_length;
215   fib = find_ip4_fib_by_table_index_or_id (im, a->table_index_or_table_id, a->flags);
216
  /* Hash keys are the destination masked to its prefix length. */
217   ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
218   dst_address &= im->fib_masks[dst_address_length];
219
  /* Create the hash (and the scratch value vectors) on first use. */
220   if (! fib->adj_index_by_dst_address[dst_address_length])
221     ip4_fib_init_adj_index_by_dst_address (lm, fib, dst_address_length);
222
223   hash = fib->adj_index_by_dst_address[dst_address_length];
224
225   is_del = (a->flags & IP4_ROUTE_FLAG_DEL) != 0;
226
227   if (is_del)
228     {
      /* Pre-set to ~0 so that "still ~0" afterwards means "key not found". */
229       fib->old_hash_values[0] = ~0;
230       hash = _hash_unset (hash, dst_address, fib->old_hash_values);
231       fib->adj_index_by_dst_address[dst_address_length] = hash;
232
233       if (vec_len (im->add_del_route_callbacks) > 0
234           && fib->old_hash_values[0] != ~0) /* make sure destination was found in hash */
235         {
236           fib->new_hash_values[0] = ~0;
237           vec_foreach (cb, im->add_del_route_callbacks)
238             if ((a->flags & cb->required_flags) == cb->required_flags)
239               cb->function (im, cb->function_opaque,
240                             fib, a->flags,
241                             &a->dst_address, dst_address_length,
242                             fib->old_hash_values,
243                             fib->new_hash_values);
244         }
245     }
246   else
247     ip4_fib_set_adj_index (im, fib, a->flags, dst_address, dst_address_length,
248                            adj_index);
249
  /* Adjacency the prefix pointed at before this call; ~0 if there was none. */
250   old_adj_index = fib->old_hash_values[0];
251
252   /* Avoid spurious reference count increments */
  /* The prefix already used this very adjacency: take one share back
     (never below zero) so the count does not grow on a re-add. */
253   if (old_adj_index == adj_index
254       && adj_index != ~0
255       && !(a->flags & IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY))
256     {
257       ip_adjacency_t * adj = ip_get_adjacency (lm, adj_index);
258       if (adj->share_count > 0)
259         adj->share_count --;
260     }
261
  /* Mirror the change into the mtrie.  Note that the caller's dst_address
     is passed here, not the masked local copy. */
262   ip4_fib_mtrie_add_del_route (fib, a->dst_address, dst_address_length,
263                                is_del ? old_adj_index : adj_index,
264                                is_del);
265
266   /* Delete old adjacency index if present and changed. */
267   if (! (a->flags & IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY)
268       && old_adj_index != ~0
269       && old_adj_index != adj_index)
270     ip_del_adjacency (lm, old_adj_index);
271 }
272
/*
 * Add or delete (IP4_ROUTE_FLAG_DEL in FLAGS) one next hop of the route
 * DST_ADDRESS/DST_ADDRESS_LENGTH.
 *
 * The FIB is EXPLICIT_FIB_INDEX, or, when that is ~0, the FIB bound to
 * NEXT_HOP_SW_IF_INDEX.
 *
 * Next-hop adjacency selection:
 *   - ADJ_INDEX != ~0: that adjacency is used directly;
 *   - NEXT_HOP is 0.0.0.0: the per-interface route adjacency for
 *     NEXT_HOP_SW_IF_INDEX, created and cached on first use;
 *   - otherwise the /32 entry for NEXT_HOP in the FIB; if there is none,
 *     either an ARP adjacency is installed via vnet_arp_glean_add (when the
 *     covering route is an ARP adjacency with no specific next hop) or an
 *     indirect adjacency pointing at NEXT_HOP is created.
 *
 * The route is then either installed/removed as a plain mapping onto that
 * adjacency, or folded into the destination's multipath adjacency with
 * NEXT_HOP_WEIGHT.
 *
 * Failures set vnm->api_errno and are reported through clib_error_report;
 * the function itself returns nothing.
 */
273 void
274 ip4_add_del_route_next_hop (ip4_main_t * im,
275                             u32 flags,
276                             ip4_address_t * dst_address,
277                             u32 dst_address_length,
278                             ip4_address_t * next_hop,
279                             u32 next_hop_sw_if_index,
280                             u32 next_hop_weight, u32 adj_index, 
281                             u32 explicit_fib_index)
282 {
283   vnet_main_t * vnm = vnet_get_main();
284   ip_lookup_main_t * lm = &im->lookup_main;
285   u32 fib_index;
286   ip4_fib_t * fib;
287   u32 dst_address_u32, old_mp_adj_index, new_mp_adj_index;
288   u32 dst_adj_index, nh_adj_index;
289   uword * dst_hash, * dst_result;
290   uword * nh_hash, * nh_result;
291   ip_adjacency_t * dst_adj;
292   ip_multipath_adjacency_t * old_mp, * new_mp;
293   int is_del = (flags & IP4_ROUTE_FLAG_DEL) != 0;
294   int is_interface_next_hop;
295   clib_error_t * error = 0;
296
  /* NOTE(review): when explicit_fib_index is ~0 the interface index is used
     to index fib_index_by_sw_if_index, so it must be a valid interface in
     that case -- confirm against callers. */
297   if (explicit_fib_index == (u32)~0)
298       fib_index = vec_elt (im->fib_index_by_sw_if_index, next_hop_sw_if_index);
299   else
300       fib_index = explicit_fib_index;
301
302   fib = vec_elt_at_index (im->fibs, fib_index);
303   
304   /* Lookup next hop to be added or deleted. */
  /* An all-zero next hop address means "directly out of the interface". */
305   is_interface_next_hop = next_hop->data_u32 == 0;
306   if (adj_index == (u32)~0)
307     {
308       if (is_interface_next_hop)
309         {
          /* Reuse the cached interface-route adjacency, or create and cache one. */
310           nh_result = hash_get (im->interface_route_adj_index_by_sw_if_index, next_hop_sw_if_index);
311           if (nh_result)
312             nh_adj_index = *nh_result;
313           else
314             {
315               ip_adjacency_t * adj;
316               adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
317                                       &nh_adj_index);
318               ip4_adjacency_set_interface_route (vnm, adj, next_hop_sw_if_index, /* if_address_index */ ~0);
319               ip_call_add_del_adjacency_callbacks (lm, nh_adj_index, /* is_del */ 0);
320               hash_set (im->interface_route_adj_index_by_sw_if_index, next_hop_sw_if_index, nh_adj_index);
321             }
322         }
323       else
324         {
          /* Look for an exact /32 entry for the next hop address. */
325           nh_hash = fib->adj_index_by_dst_address[32];
326           nh_result = hash_get (nh_hash, next_hop->data_u32);
327           
328           /* No /32 entry for the next hop: derive an adjacency for it. */
329           if (! nh_result)
330             {
331               ip_adjacency_t * adj;
332
              /* Find whatever route currently covers the next hop. */
333               nh_adj_index = ip4_fib_lookup_with_table (im, fib_index,
334                                                         next_hop, 0);
335               adj = ip_get_adjacency (lm, nh_adj_index);
336               /* if ARP interface adjacency is present, we need to
337                  install ARP adjacency for specific next hop */
338               if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP &&
339                   adj->arp.next_hop.ip4.as_u32 == 0)
340                 {
341                   nh_adj_index = vnet_arp_glean_add(fib_index, next_hop);
342                 }
343               else
344                 {
345                   /* Next hop is not known, so create indirect adj */
346                   ip_adjacency_t add_adj;
347                   memset (&add_adj, 0, sizeof(add_adj));
348                   add_adj.n_adj = 1;
349                   add_adj.lookup_next_index = IP_LOOKUP_NEXT_INDIRECT;
350                   add_adj.indirect.next_hop.ip4.as_u32 = next_hop->as_u32;
351                   add_adj.explicit_fib_index = explicit_fib_index;
352                   ip_add_adjacency (lm, &add_adj, 1, &nh_adj_index);
353                 }
354             }
355           else
356             nh_adj_index = *nh_result;
357         }
358     }
359   else
360     {
      /* Caller supplied the next-hop adjacency explicitly. */
361       nh_adj_index = adj_index;
362     }
363   ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
364   dst_address_u32 = dst_address->data_u32 & im->fib_masks[dst_address_length];
365
  /* Find the adjacency the destination prefix currently resolves to, if any. */
366   dst_hash = fib->adj_index_by_dst_address[dst_address_length];
367   dst_result = hash_get (dst_hash, dst_address_u32);
368   if (dst_result)
369     {
370       dst_adj_index = dst_result[0];
371       dst_adj = ip_get_adjacency (lm, dst_adj_index);
372     }
373   else
374     {
375       /* For deletes destination must be known. */
376       if (is_del)
377         {
378           vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION;
379           error = clib_error_return (0, "unknown destination %U/%d",
380                                      format_ip4_address, dst_address,
381                                      dst_address_length);
382           goto done;
383         }
384
385       dst_adj_index = ~0;
386       dst_adj = 0;
387     }
388
389   /* Ignore adds of X/32 with next hop of X. */
  /* Reported as an error, and only when an explicit adj_index was given. */
390   if (! is_del
391       && dst_address_length == 32
392       && dst_address->data_u32 == next_hop->data_u32 
393       && adj_index != (u32)~0)
394     {
395       vnm->api_errno = VNET_API_ERROR_PREFIX_MATCHES_NEXT_HOP;
396       error = clib_error_return (0, "prefix matches next hop %U/%d",
397                                  format_ip4_address, dst_address,
398                                  dst_address_length);
399       goto done;
400     }
401
402   /* Destination is not known and default weight is set so add route
403      to existing non-multipath adjacency */
404   if (dst_adj_index == ~0 && next_hop_weight == 1 && next_hop_sw_if_index == ~0)
405     {
406       /* create / delete additional mapping of existing adjacency */
407       ip4_add_del_route_args_t a;
408       ip_adjacency_t * nh_adj = ip_get_adjacency (lm, nh_adj_index);
409
410       a.table_index_or_table_id = fib_index;
411       a.flags = ((is_del ? IP4_ROUTE_FLAG_DEL : IP4_ROUTE_FLAG_ADD)
412                  | IP4_ROUTE_FLAG_FIB_INDEX
413                  | IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY
414                  | (flags & (IP4_ROUTE_FLAG_NO_REDISTRIBUTE
415                              | IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP)));
416       a.dst_address = dst_address[0];
417       a.dst_address_length = dst_address_length;
418       a.adj_index = nh_adj_index;
419       a.add_adj = 0;
420       a.n_add_adj = 0;
421
422       ip4_add_del_route (im, &a);
423
424       /* adjust share count. This cannot be the only use of the adjacency */
425       nh_adj->share_count += is_del ? -1 : 1;
426         
427       goto done;
428     }
429
  /* Multipath case: fold the next hop into (or out of) the destination's
     multipath adjacency. */
430   old_mp_adj_index = dst_adj ? dst_adj->heap_handle : ~0;
431
432   if (! ip_multipath_adjacency_add_del_next_hop
433       (lm, is_del,
434        old_mp_adj_index,
435        nh_adj_index,
436        next_hop_weight,
437        &new_mp_adj_index))
438     {
439       vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_FOUND_MP;
440       error = clib_error_return (0, "requested deleting next-hop %U not found in multi-path",
441                                  format_ip4_address, next_hop);
442       goto done;
443     }
444   
445   old_mp = new_mp = 0;
446   if (old_mp_adj_index != ~0)
447     old_mp = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
448   if (new_mp_adj_index != ~0)
449     new_mp = vec_elt_at_index (lm->multipath_adjacencies, new_mp_adj_index);
450
  /* Only touch the FIB if the multipath adjacency actually changed. */
451   if (old_mp != new_mp)
452     {
453       ip4_add_del_route_args_t a;
454       ip_adjacency_t * adj;
455
456       a.table_index_or_table_id = fib_index;
      /* The route itself is deleted only when the last next hop went away;
         otherwise it is re-pointed at the new multipath adjacency. */
457       a.flags = ((is_del && ! new_mp ? IP4_ROUTE_FLAG_DEL : IP4_ROUTE_FLAG_ADD)
458                  | IP4_ROUTE_FLAG_FIB_INDEX
459                  | IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY
460                  | (flags & (IP4_ROUTE_FLAG_NO_REDISTRIBUTE | IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP)));
461       a.dst_address = dst_address[0];
462       a.dst_address_length = dst_address_length;
463       a.adj_index = new_mp ? new_mp->adj_index : dst_adj_index;
464       a.add_adj = 0;
465       a.n_add_adj = 0;
466
467       ip4_add_del_route (im, &a);
468
      /* Share counts are only adjusted for single-path (n_adj == 1) adjacencies. */
469       adj = ip_get_adjacency (lm, new_mp ? new_mp->adj_index : dst_adj_index);
470       if (adj->n_adj == 1)
471         adj->share_count += is_del ? -1 : 1;
472     }
473
474  done:
475   if (error)
476     clib_error_report (error);
477 }
478
/*
 * Exact-match lookup of ADDRESS/ADDRESS_LENGTH in the FIB selected by
 * TABLE_INDEX_OR_TABLE_ID and FLAGS.
 *
 * ADDRESS is read as one u32 and masked to ADDRESS_LENGTH bits before the
 * lookup.  Returns the pointer hash_get yields for that key in the
 * per-length hash, or a null pointer if the prefix is not present.
 *
 * NOTE(review): ADDRESS is dereferenced through a u32 pointer cast, which
 * presumably requires it to be suitably aligned -- confirm against callers.
 * The FIB lookup helper creates the FIB when a table id is not yet known.
 */
479 void *
480 ip4_get_route (ip4_main_t * im,
481                u32 table_index_or_table_id,
482                u32 flags,
483                u8 * address,
484                u32 address_length)
485 {
486   ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
487   u32 dst_address = * (u32 *) address;
488   uword * hash, * p;
489
490   ASSERT (address_length < ARRAY_LEN (im->fib_masks));
491   dst_address &= im->fib_masks[address_length];
492
493   hash = fib->adj_index_by_dst_address[address_length];
494   p = hash_get (hash, dst_address);
495   return (void *) p;
496 }
497
/*
 * Collect the least specific routes that lie inside ADDRESS/ADDRESS_LENGTH.
 *
 * Starting with the hash table for prefix length ADDRESS_LENGTH and moving
 * towards /32, every key whose leading ADDRESS_LENGTH bits equal those of
 * ADDRESS is appended to *RESULTS, with its prefix length appended to
 * *RESULT_LENGTHS.  The scan stops after the first prefix length that
 * produced at least one match, so all returned routes share one length.
 *
 * *RESULTS and *RESULT_LENGTHS are reset to length zero on entry (existing
 * storage is reused) and may be reallocated by the appends, which is why
 * they are passed by address.
 *
 * Fix: the loop condition used to call vec_len on RESULTS itself (the
 * address of the caller's vector pointer) instead of on the vector
 * *RESULTS, so the "stop once something matched" test examined memory that
 * is not a vector header.
 */
498 void
499 ip4_foreach_matching_route (ip4_main_t * im,
500                             u32 table_index_or_table_id,
501                             u32 flags,
502                             ip4_address_t * address,
503                             u32 address_length,
504                             ip4_address_t ** results,
505                             u8 ** result_lengths)
506 {
507   ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
508   u32 dst_address = address->data_u32;
509   u32 this_length = address_length;
510   
  /* Empty the output vectors but keep their storage. */
511   if (*results)
512     _vec_len (*results) = 0;
513   if (*result_lengths)
514     _vec_len (*result_lengths) = 0;
515
516   while (this_length <= 32 && vec_len (*results) == 0)
517     {
518       uword k, v;
519       hash_foreach (k, v, fib->adj_index_by_dst_address[this_length], ({
        /* Compare only the bits covered by the requested prefix. */
520         if (0 == ((k ^ dst_address) & im->fib_masks[address_length]))
521           {
522             ip4_address_t a;
523             a.data_u32 = k;
524             vec_add1 (*results, a);
525             vec_add1 (*result_lengths, this_length);
526           }
527       }));
528
529       this_length++;
530     }
531 }
532
/*
 * Apply pending adjacency remaps to the FIB selected by
 * TABLE_INDEX_OR_TABLE_ID and FLAGS.
 *
 * Does nothing when lm->n_adjacency_remaps is zero.  Otherwise every route
 * in every per-length hash is checked against lm->adjacency_remap_table,
 * indexed by the route's current adjacency index:
 *   - 0   : adjacency is not remapped, route left alone;
 *   - ~0  : adjacency is gone, route is queued and then removed from the
 *           hash, with callbacks invoked with IP4_ROUTE_FLAG_DEL;
 *   - else: route is re-pointed at adjacency (entry - 1), with callbacks
 *           invoked with IP4_ROUTE_FLAG_ADD.
 * Deletions are deferred to a second pass so the hash is not modified while
 * it is being iterated.  Finally the mtrie is remapped as well, the remap
 * table is zeroed and the pending-remap counter is cleared.
 *
 * Fix: the delete callbacks used to be passed &a, i.e. whichever key the
 * iteration pass happened to store last, instead of the prefix actually
 * being deleted.  They now receive &to_delete[i].
 *
 * NOTE(review): the remap table lives in the lookup main but is cleared
 * after processing this one FIB -- confirm how other FIBs are handled.
 */
533 void ip4_maybe_remap_adjacencies (ip4_main_t * im,
534                                   u32 table_index_or_table_id,
535                                   u32 flags)
536 {
537   ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
538   ip_lookup_main_t * lm = &im->lookup_main;
539   u32 i, l;
540   ip4_address_t a;
541   ip4_add_del_route_callback_t * cb;
  /* Scratch vector kept across calls so its storage is reused. */
542   static ip4_address_t * to_delete;
543
544   if (lm->n_adjacency_remaps == 0)
545     return;
546
547   for (l = 0; l <= 32; l++)
548     {
549       hash_pair_t * p;
550       uword * hash = fib->adj_index_by_dst_address[l];
551
552       if (hash_elts (hash) == 0)
553         continue;
554
555       if (to_delete)
556         _vec_len (to_delete) = 0;
557
558       hash_foreach_pair (p, hash, ({
559         u32 adj_index = p->value[0];
560         u32 m = vec_elt (lm->adjacency_remap_table, adj_index);
561
562         if (m)
563           {
564             /* Record destination address from hash key. */
565             a.data_u32 = p->key;
566
567             /* New adjacency points to nothing: so delete prefix. */
568             if (m == ~0)
569               vec_add1 (to_delete, a);
570             else
571               {
572                 /* Remap to new adjacency. */
573                 clib_memcpy (fib->old_hash_values, p->value, vec_bytes (fib->old_hash_values));
574
575                 /* Set new adjacency value. */
576                 fib->new_hash_values[0] = p->value[0] = m - 1;
577
578                 vec_foreach (cb, im->add_del_route_callbacks)
579                   if ((flags & cb->required_flags) == cb->required_flags)
580                     cb->function (im, cb->function_opaque,
581                                   fib, flags | IP4_ROUTE_FLAG_ADD,
582                                   &a, l,
583                                   fib->old_hash_values,
584                                   fib->new_hash_values);
585               }
586           }
587       }));
588
      /* Second pass: remove the queued prefixes and announce each removal. */
589       fib->new_hash_values[0] = ~0;
590       for (i = 0; i < vec_len (to_delete); i++)
591         {
592           hash = _hash_unset (hash, to_delete[i].data_u32, fib->old_hash_values);
593           vec_foreach (cb, im->add_del_route_callbacks)
594             if ((flags & cb->required_flags) == cb->required_flags)
595               cb->function (im, cb->function_opaque,
596                             fib, flags | IP4_ROUTE_FLAG_DEL,
597                             &to_delete[i], l,
598                             fib->old_hash_values,
599                             fib->new_hash_values);
600         }
601     }
602
603   /* Also remap adjacencies in mtrie. */
604   ip4_mtrie_maybe_remap_adjacencies (lm, &fib->mtrie);
605
606   /* Reset mapping table. */
607   vec_zero (lm->adjacency_remap_table);
608
609   /* All remaps have been performed. */
610   lm->n_adjacency_remaps = 0;
611 }
612
/*
 * Delete routes that are more specific than ADDRESS/ADDRESS_LENGTH.
 *
 * For each prefix length from ADDRESS_LENGTH + 1 up to 32 the matching
 * routes are collected with ip4_foreach_matching_route and each one is
 * removed through ip4_add_del_route with IP4_ROUTE_FLAG_DEL and
 * IP4_ROUTE_FLAG_NO_REDISTRIBUTE OR-ed into FLAGS.  The route at exactly
 * ADDRESS_LENGTH is not requested.  Pending adjacency remaps are applied
 * at the end.
 *
 * The two result vectors are static so their storage is reused between
 * calls.
 */
613 void ip4_delete_matching_routes (ip4_main_t * im,
614                                  u32 table_index_or_table_id,
615                                  u32 flags,
616                                  ip4_address_t * address,
617                                  u32 address_length)
618 {
619   static ip4_address_t * matching_addresses;
620   static u8 * matching_address_lengths;
621   u32 l, i;
622   ip4_add_del_route_args_t a;
623
  /* Fields shared by every delete request issued below. */
624   a.flags = IP4_ROUTE_FLAG_DEL | IP4_ROUTE_FLAG_NO_REDISTRIBUTE | flags;
625   a.table_index_or_table_id = table_index_or_table_id;
626   a.adj_index = ~0;
627   a.add_adj = 0;
628   a.n_add_adj = 0;
629
630   for (l = address_length + 1; l <= 32; l++)
631     {
632       ip4_foreach_matching_route (im, table_index_or_table_id, flags,
633                                   address,
634                                   l,
635                                   &matching_addresses,
636                                   &matching_address_lengths);
637       for (i = 0; i < vec_len (matching_addresses); i++)
638         {
639           a.dst_address = matching_addresses[i];
640           a.dst_address_length = matching_address_lengths[i];
641           ip4_add_del_route (im, &a);
642         }
643     }
644
645   ip4_maybe_remap_adjacencies (im, table_index_or_table_id, flags);
646 }
647
/*
 * Forward declaration only; the definition is not in this part of the
 * file.  Judging by the name and signature it records packet trace data
 * for the buffers in FRAME, with WHICH_ADJ_INDEX presumably selecting the
 * RX or TX adjacency index -- confirm at the definition.
 */
648 void
649 ip4_forward_next_trace (vlib_main_t * vm,
650                         vlib_node_runtime_t * node,
651                         vlib_frame_t * frame,
652                         vlib_rx_or_tx_t which_adj_index);
653
654 always_inline uword
655 ip4_lookup_inline (vlib_main_t * vm,
656                    vlib_node_runtime_t * node,
657                    vlib_frame_t * frame,
658                    int lookup_for_responses_to_locally_received_packets,
659                    int is_indirect)
660 {
661   ip4_main_t * im = &ip4_main;
662   ip_lookup_main_t * lm = &im->lookup_main;
663   vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
664   u32 n_left_from, n_left_to_next, * from, * to_next;
665   ip_lookup_next_t next;
666   u32 cpu_index = os_get_cpu_number();
667
668   from = vlib_frame_vector_args (frame);
669   n_left_from = frame->n_vectors;
670   next = node->cached_next_index;
671
672   while (n_left_from > 0)
673     {
674       vlib_get_next_frame (vm, node, next,
675                            to_next, n_left_to_next);
676
677       while (n_left_from >= 4 && n_left_to_next >= 2)
678         {
679           vlib_buffer_t * p0, * p1;
680           ip4_header_t * ip0, * ip1;
681           __attribute__((unused)) tcp_header_t * tcp0, * tcp1;
682           ip_lookup_next_t next0, next1;
683           ip_adjacency_t * adj0, * adj1;
684           ip4_fib_mtrie_t * mtrie0, * mtrie1;
685           ip4_fib_mtrie_leaf_t leaf0, leaf1;
686           ip4_address_t * dst_addr0, *dst_addr1;
687           __attribute__((unused)) u32 pi0, fib_index0, adj_index0, is_tcp_udp0;
688           __attribute__((unused)) u32 pi1, fib_index1, adj_index1, is_tcp_udp1;
689           u32 flow_hash_config0, flow_hash_config1;
690           u32 hash_c0, hash_c1;
691           u32 wrong_next;
692
693           /* Prefetch next iteration. */
694           {
695             vlib_buffer_t * p2, * p3;
696
697             p2 = vlib_get_buffer (vm, from[2]);
698             p3 = vlib_get_buffer (vm, from[3]);
699
700             vlib_prefetch_buffer_header (p2, LOAD);
701             vlib_prefetch_buffer_header (p3, LOAD);
702
703             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
704             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
705           }
706
707           pi0 = to_next[0] = from[0];
708           pi1 = to_next[1] = from[1];
709
710           p0 = vlib_get_buffer (vm, pi0);
711           p1 = vlib_get_buffer (vm, pi1);
712
713           ip0 = vlib_buffer_get_current (p0);
714           ip1 = vlib_buffer_get_current (p1);
715
716           if (is_indirect)
717             {
718               ip_adjacency_t * iadj0, * iadj1;
719               iadj0 = ip_get_adjacency (lm, vnet_buffer(p0)->ip.adj_index[VLIB_TX]);
720               iadj1 = ip_get_adjacency (lm, vnet_buffer(p1)->ip.adj_index[VLIB_TX]);
721               dst_addr0 = &iadj0->indirect.next_hop.ip4;
722               dst_addr1 = &iadj1->indirect.next_hop.ip4;
723             }
724           else
725             {
726               dst_addr0 = &ip0->dst_address;
727               dst_addr1 = &ip1->dst_address;
728             }
729
730           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
731           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
732           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
733             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
734           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
735             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
736
737
738           if (! lookup_for_responses_to_locally_received_packets)
739             {
740               mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
741               mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie;
742
743               leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
744
745               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
746               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 0);
747             }
748
749           tcp0 = (void *) (ip0 + 1);
750           tcp1 = (void *) (ip1 + 1);
751
752           is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
753                          || ip0->protocol == IP_PROTOCOL_UDP);
754           is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP
755                          || ip1->protocol == IP_PROTOCOL_UDP);
756
757           if (! lookup_for_responses_to_locally_received_packets)
758             {
759               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
760               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 1);
761             }
762
763           if (! lookup_for_responses_to_locally_received_packets)
764             {
765               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
766               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 2);
767             }
768
769           if (! lookup_for_responses_to_locally_received_packets)
770             {
771               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
772               leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, dst_addr1, 3);
773             }
774
775           if (lookup_for_responses_to_locally_received_packets)
776             {
777               adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
778               adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
779             }
780           else
781             {
782               /* Handle default route. */
783               leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
784               leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
785
786               adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
787               adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
788             }
789
790           ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
791                                                            dst_addr0,
792                                                            /* no_default_route */ 0));
793           ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1,
794                                                            dst_addr1,
795                                                            /* no_default_route */ 0));
796           adj0 = ip_get_adjacency (lm, adj_index0);
797           adj1 = ip_get_adjacency (lm, adj_index1);
798
799           next0 = adj0->lookup_next_index;
800           next1 = adj1->lookup_next_index;
801
802           /* Use flow hash to compute multipath adjacency. */
803           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
804           hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
805           if (PREDICT_FALSE (adj0->n_adj > 1))
806             {
807               flow_hash_config0 = 
808                 vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
809               hash_c0 = vnet_buffer (p0)->ip.flow_hash = 
810                 ip4_compute_flow_hash (ip0, flow_hash_config0);
811             }
812           if (PREDICT_FALSE(adj1->n_adj > 1))
813             {
814               flow_hash_config1 = 
815                 vec_elt_at_index (im->fibs, fib_index1)->flow_hash_config;
816               hash_c1 = vnet_buffer (p1)->ip.flow_hash = 
817                 ip4_compute_flow_hash (ip1, flow_hash_config1);
818             }
819
820           ASSERT (adj0->n_adj > 0);
821           ASSERT (adj1->n_adj > 0);
822           ASSERT (is_pow2 (adj0->n_adj));
823           ASSERT (is_pow2 (adj1->n_adj));
824           adj_index0 += (hash_c0 & (adj0->n_adj - 1));
825           adj_index1 += (hash_c1 & (adj1->n_adj - 1));
826
827           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
828           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
829
830           vlib_increment_combined_counter 
831               (cm, cpu_index, adj_index0, 1,
832                vlib_buffer_length_in_chain (vm, p0) 
833                + sizeof(ethernet_header_t));
834           vlib_increment_combined_counter 
835               (cm, cpu_index, adj_index1, 1,
836                vlib_buffer_length_in_chain (vm, p1)
837                + sizeof(ethernet_header_t));
838
839           from += 2;
840           to_next += 2;
841           n_left_to_next -= 2;
842           n_left_from -= 2;
843
844           wrong_next = (next0 != next) + 2*(next1 != next);
845           if (PREDICT_FALSE (wrong_next != 0))
846             {
847               switch (wrong_next)
848                 {
849                 case 1:
850                   /* A B A */
851                   to_next[-2] = pi1;
852                   to_next -= 1;
853                   n_left_to_next += 1;
854                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
855                   break;
856
857                 case 2:
858                   /* A A B */
859                   to_next -= 1;
860                   n_left_to_next += 1;
861                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
862                   break;
863
864                 case 3:
865                   /* A B C */
866                   to_next -= 2;
867                   n_left_to_next += 2;
868                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
869                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
870                   if (next0 == next1)
871                     {
872                       /* A B B */
873                       vlib_put_next_frame (vm, node, next, n_left_to_next);
874                       next = next1;
875                       vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
876                     }
877                 }
878             }
879         }
880     
881       while (n_left_from > 0 && n_left_to_next > 0)
882         {
883           vlib_buffer_t * p0;
884           ip4_header_t * ip0;
885           __attribute__((unused)) tcp_header_t * tcp0;
886           ip_lookup_next_t next0;
887           ip_adjacency_t * adj0;
888           ip4_fib_mtrie_t * mtrie0;
889           ip4_fib_mtrie_leaf_t leaf0;
890           ip4_address_t * dst_addr0;
891           __attribute__((unused)) u32 pi0, fib_index0, adj_index0, is_tcp_udp0;
892           u32 flow_hash_config0, hash_c0;
893
894           pi0 = from[0];
895           to_next[0] = pi0;
896
897           p0 = vlib_get_buffer (vm, pi0);
898
899           ip0 = vlib_buffer_get_current (p0);
900
901           if (is_indirect)
902             {
903               ip_adjacency_t * iadj0;
904               iadj0 = ip_get_adjacency (lm, vnet_buffer(p0)->ip.adj_index[VLIB_TX]);
905               dst_addr0 = &iadj0->indirect.next_hop.ip4;
906             }
907           else
908             {
909               dst_addr0 = &ip0->dst_address;
910             }
911
912           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
913           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
914             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
915
916           if (! lookup_for_responses_to_locally_received_packets)
917             {
918               mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
919
920               leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
921
922               leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 0);
923             }
924
925           tcp0 = (void *) (ip0 + 1);
926
927           is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
928                          || ip0->protocol == IP_PROTOCOL_UDP);
929
930           if (! lookup_for_responses_to_locally_received_packets)
931             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 1);
932
933           if (! lookup_for_responses_to_locally_received_packets)
934             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 2);
935
936           if (! lookup_for_responses_to_locally_received_packets)
937             leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, dst_addr0, 3);
938
939           if (lookup_for_responses_to_locally_received_packets)
940             adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
941           else
942             {
943               /* Handle default route. */
944               leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
945               adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
946             }
947
948           ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
949                                                            dst_addr0,
950                                                            /* no_default_route */ 0));
951
952           adj0 = ip_get_adjacency (lm, adj_index0);
953
954           next0 = adj0->lookup_next_index;
955
956           /* Use flow hash to compute multipath adjacency. */
957           hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
958           if (PREDICT_FALSE(adj0->n_adj > 1))
959             {
960               flow_hash_config0 = 
961                 vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
962
963               hash_c0 = vnet_buffer (p0)->ip.flow_hash = 
964                 ip4_compute_flow_hash (ip0, flow_hash_config0);
965             }
966
967           ASSERT (adj0->n_adj > 0);
968           ASSERT (is_pow2 (adj0->n_adj));
969           adj_index0 += (hash_c0 & (adj0->n_adj - 1));
970
971           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
972
973           vlib_increment_combined_counter 
974               (cm, cpu_index, adj_index0, 1,
975                vlib_buffer_length_in_chain (vm, p0)
976                + sizeof(ethernet_header_t));
977
978           from += 1;
979           to_next += 1;
980           n_left_to_next -= 1;
981           n_left_from -= 1;
982
983           if (PREDICT_FALSE (next0 != next))
984             {
985               n_left_to_next += 1;
986               vlib_put_next_frame (vm, node, next, n_left_to_next);
987               next = next0;
988               vlib_get_next_frame (vm, node, next,
989                                    to_next, n_left_to_next);
990               to_next[0] = pi0;
991               to_next += 1;
992               n_left_to_next -= 1;
993             }
994         }
995
996       vlib_put_next_frame (vm, node, next, n_left_to_next);
997     }
998
999   if (node->flags & VLIB_NODE_FLAG_TRACE)
1000     ip4_forward_next_trace(vm, node, frame, VLIB_TX);
1001
1002   return frame->n_vectors;
1003 }
1004
1005 static uword
1006 ip4_lookup (vlib_main_t * vm,
1007             vlib_node_runtime_t * node,
1008             vlib_frame_t * frame)
1009 {
1010   return ip4_lookup_inline (vm, node, frame,
1011                             /* lookup_for_responses_to_locally_received_packets */ 0,
1012                             /* is_indirect */ 0);
1013
1014 }
1015
1016 void ip4_adjacency_set_interface_route (vnet_main_t * vnm,
1017                                         ip_adjacency_t * adj,
1018                                         u32 sw_if_index,
1019                                         u32 if_address_index)
1020 {
1021   vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
1022   ip_lookup_next_t n;
1023   vnet_l3_packet_type_t packet_type;
1024   u32 node_index;
1025
1026   if (hw->hw_class_index == ethernet_hw_interface_class.index
1027       || hw->hw_class_index == srp_hw_interface_class.index)
1028     {
1029       /* 
1030        * We have a bit of a problem in this case. ip4-arp uses
1031        * the rewrite_header.next_index to hand pkts to the
1032        * indicated inteface output node. We can end up in
1033        * ip4_rewrite_local, too, which also pays attention to 
1034        * rewrite_header.next index. Net result: a hack in
1035        * ip4_rewrite_local...
1036        */
1037       n = IP_LOOKUP_NEXT_ARP;
1038       node_index = ip4_arp_node.index;
1039       adj->if_address_index = if_address_index;
1040       adj->arp.next_hop.ip4.as_u32 = 0;
1041       ip46_address_reset(&adj->arp.next_hop);
1042       packet_type = VNET_L3_PACKET_TYPE_ARP;
1043     }
1044   else
1045     {
1046       n = IP_LOOKUP_NEXT_REWRITE;
1047       node_index = ip4_rewrite_node.index;
1048       packet_type = VNET_L3_PACKET_TYPE_IP4;
1049     }
1050
1051   adj->lookup_next_index = n;
1052   vnet_rewrite_for_sw_interface
1053     (vnm,
1054      packet_type,
1055      sw_if_index,
1056      node_index,
1057      VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
1058      &adj->rewrite_header,
1059      sizeof (adj->rewrite_data));
1060 }
1061
1062 static void
1063 ip4_add_interface_routes (u32 sw_if_index,
1064                           ip4_main_t * im, u32 fib_index,
1065                           ip_interface_address_t * a)
1066 {
1067   vnet_main_t * vnm = vnet_get_main();
1068   ip_lookup_main_t * lm = &im->lookup_main;
1069   ip_adjacency_t * adj;
1070   ip4_address_t * address = ip_interface_address_get_address (lm, a);
1071   ip4_add_del_route_args_t x;
1072   vnet_hw_interface_t * hw_if = vnet_get_sup_hw_interface (vnm, sw_if_index);
1073   u32 classify_table_index;
1074
1075   /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */
1076   x.table_index_or_table_id = fib_index;
1077   x.flags = (IP4_ROUTE_FLAG_ADD
1078              | IP4_ROUTE_FLAG_FIB_INDEX
1079              | IP4_ROUTE_FLAG_NO_REDISTRIBUTE);
1080   x.dst_address = address[0];
1081   x.dst_address_length = a->address_length;
1082   x.n_add_adj = 0;
1083   x.add_adj = 0;
1084
1085   a->neighbor_probe_adj_index = ~0;
1086   if (a->address_length < 32)
1087     {
1088       adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
1089                               &x.adj_index);
1090       ip4_adjacency_set_interface_route (vnm, adj, sw_if_index, a - lm->if_address_pool);
1091       ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
1092       ip4_add_del_route (im, &x);
1093       a->neighbor_probe_adj_index = x.adj_index;
1094     }
1095   
1096   /* Add e.g. 1.1.1.1/32 as local to this host. */
1097   adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
1098                           &x.adj_index);
1099   
1100   classify_table_index = ~0;
1101   if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
1102     classify_table_index = lm->classify_table_index_by_sw_if_index [sw_if_index];
1103   if (classify_table_index != (u32) ~0)
1104     {
1105       adj->lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY;
1106       adj->classify.table_index = classify_table_index;
1107     }
1108   else
1109     adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
1110   
1111   adj->if_address_index = a - lm->if_address_pool;
1112   adj->rewrite_header.sw_if_index = sw_if_index;
1113   adj->rewrite_header.max_l3_packet_bytes = hw_if->max_l3_packet_bytes[VLIB_RX];
1114   /* 
1115    * Local adjs are never to be rewritten. Spoofed pkts w/ src = dst = local
1116    * fail an RPF-ish check, but still go thru the rewrite code...
1117    */
1118   adj->rewrite_header.data_bytes = 0;
1119
1120   ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
1121   x.dst_address_length = 32;
1122   ip4_add_del_route (im, &x);
1123 }
1124
1125 static void
1126 ip4_del_interface_routes (ip4_main_t * im, u32 fib_index, ip4_address_t * address, u32 address_length)
1127 {
1128   ip4_add_del_route_args_t x;
1129
1130   /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */
1131   x.table_index_or_table_id = fib_index;
1132   x.flags = (IP4_ROUTE_FLAG_DEL
1133              | IP4_ROUTE_FLAG_FIB_INDEX
1134              | IP4_ROUTE_FLAG_NO_REDISTRIBUTE);
1135   x.dst_address = address[0];
1136   x.dst_address_length = address_length;
1137   x.adj_index = ~0;
1138   x.n_add_adj = 0;
1139   x.add_adj = 0;
1140
1141   if (address_length < 32)
1142     ip4_add_del_route (im, &x);
1143
1144   x.dst_address_length = 32;
1145   ip4_add_del_route (im, &x);
1146
1147   ip4_delete_matching_routes (im,
1148                               fib_index,
1149                               IP4_ROUTE_FLAG_FIB_INDEX,
1150                               address,
1151                               address_length);
1152 }
1153
/* An IPv4 address together with a length and the software interface
   index it is associated with.
   NOTE(review): "length" is presumably the prefix length in bits, and the
   type is not referenced in the nearby code -- confirm both. */
1154 typedef struct {
1155     u32 sw_if_index;
1156     ip4_address_t address;
1157     u32 length;
1158 } ip4_interface_address_t;
1159
1160 static clib_error_t *
1161 ip4_add_del_interface_address_internal (vlib_main_t * vm,
1162                                         u32 sw_if_index,
1163                                         ip4_address_t * new_address,
1164                                         u32 new_length,
1165                                         u32 redistribute,
1166                                         u32 insert_routes,
1167                                         u32 is_del);
1168
1169 static clib_error_t *
1170 ip4_add_del_interface_address_internal (vlib_main_t * vm,
1171                                         u32 sw_if_index,
1172                                         ip4_address_t * address,
1173                                         u32 address_length,
1174                                         u32 redistribute,
1175                                         u32 insert_routes,
1176                                         u32 is_del)
1177 {
1178   vnet_main_t * vnm = vnet_get_main();
1179   ip4_main_t * im = &ip4_main;
1180   ip_lookup_main_t * lm = &im->lookup_main;
1181   clib_error_t * error = 0;
1182   u32 if_address_index, elts_before;
1183   ip4_address_fib_t ip4_af, * addr_fib = 0;
1184
1185   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
1186   ip4_addr_fib_init (&ip4_af, address,
1187                      vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
1188   vec_add1 (addr_fib, ip4_af);
1189
1190   /* When adding an address check that it does not conflict with an existing address. */
1191   if (! is_del)
1192     {
1193       ip_interface_address_t * ia;
1194       foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index, 
1195                                     0 /* honor unnumbered */,
1196       ({
1197         ip4_address_t * x = ip_interface_address_get_address (&im->lookup_main, ia);
1198
1199         if (ip4_destination_matches_route (im, address, x, ia->address_length)
1200             || ip4_destination_matches_route (im, x, address, address_length))
1201           return clib_error_create ("failed to add %U which conflicts with %U for interface %U",
1202                                     format_ip4_address_and_length, address, address_length,
1203                                     format_ip4_address_and_length, x, ia->address_length,
1204                                     format_vnet_sw_if_index_name, vnm, sw_if_index);
1205       }));
1206     }
1207
1208   elts_before = pool_elts (lm->if_address_pool);
1209
1210   error = ip_interface_address_add_del
1211     (lm,
1212      sw_if_index,
1213      addr_fib,
1214      address_length,
1215      is_del,
1216      &if_address_index);
1217   if (error)
1218     goto done;
1219   
1220   if (vnet_sw_interface_is_admin_up (vnm, sw_if_index) && insert_routes)
1221     {
1222       if (is_del)
1223         ip4_del_interface_routes (im, ip4_af.fib_index, address,
1224                                   address_length);
1225       
1226       else
1227           ip4_add_interface_routes (sw_if_index,
1228                                     im, ip4_af.fib_index,
1229                                     pool_elt_at_index 
1230                                     (lm->if_address_pool, if_address_index));
1231     }
1232
1233   /* If pool did not grow/shrink: add duplicate address. */
1234   if (elts_before != pool_elts (lm->if_address_pool))
1235     {
1236       ip4_add_del_interface_address_callback_t * cb;
1237       vec_foreach (cb, im->add_del_interface_address_callbacks)
1238         cb->function (im, cb->function_opaque, sw_if_index,
1239                       address, address_length,
1240                       if_address_index,
1241                       is_del);
1242     }
1243
1244  done:
1245   vec_free (addr_fib);
1246   return error;
1247 }
1248
1249 clib_error_t *
1250 ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
1251                                ip4_address_t * address, u32 address_length,
1252                                u32 is_del)
1253 {
1254   return ip4_add_del_interface_address_internal
1255     (vm, sw_if_index, address, address_length,
1256      /* redistribute */ 1,
1257      /* insert_routes */ 1,
1258      is_del);
1259 }
1260
1261 static clib_error_t *
1262 ip4_sw_interface_admin_up_down (vnet_main_t * vnm,
1263                                 u32 sw_if_index,
1264                                 u32 flags)
1265 {
1266   ip4_main_t * im = &ip4_main;
1267   ip_interface_address_t * ia;
1268   ip4_address_t * a;
1269   u32 is_admin_up, fib_index;
1270   
1271   /* Fill in lookup tables with default table (0). */
1272   vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
1273   
1274   vec_validate_init_empty (im->lookup_main.if_address_pool_index_by_sw_if_index, sw_if_index, ~0);
1275   
1276   is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
1277   
1278   fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
1279
1280   foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index, 
1281                                 0 /* honor unnumbered */,
1282   ({
1283     a = ip_interface_address_get_address (&im->lookup_main, ia);
1284     if (is_admin_up)
1285       ip4_add_interface_routes (sw_if_index,
1286                                 im, fib_index,
1287                                 ia);
1288     else
1289       ip4_del_interface_routes (im, fib_index,
1290                                 a, ia->address_length);
1291   }));
1292
1293   return 0;
1294 }
1295  
1296 VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down);
1297
1298 /* Built-in ip4 unicast rx feature path definition.
        Each entry names a graph node (.node_name), lists the feature node(s)
        it must run before (.runs_before, 0-terminated) and gives the
        ip4_main field the entry is tied to (.feature_index).
        The .runs_before entries below chain the features in this order:
          ip4-inacl -> ip4-source-check-via-rx -> ip4-source-check-via-any
          -> ipsec-input-ip4 -> vpath-input-ip4 -> ip4-lookup */
1299 VNET_IP4_UNICAST_FEATURE_INIT (ip4_inacl, static) = {
1300   .node_name = "ip4-inacl", 
1301   .runs_before = {"ip4-source-check-via-rx", 0}, 
1302   .feature_index = &ip4_main.ip4_unicast_rx_feature_check_access,
1303 };
1304
1305 VNET_IP4_UNICAST_FEATURE_INIT (ip4_source_check_1, static) = {
1306   .node_name = "ip4-source-check-via-rx",
1307   .runs_before = {"ip4-source-check-via-any", 0},
1308   .feature_index = 
1309   &ip4_main.ip4_unicast_rx_feature_source_reachable_via_rx,
1310 };
1311
1312 VNET_IP4_UNICAST_FEATURE_INIT (ip4_source_check_2, static) = {
1313   .node_name = "ip4-source-check-via-any",
1314   .runs_before = {"ipsec-input-ip4", 0},
1315   .feature_index = 
1316   &ip4_main.ip4_unicast_rx_feature_source_reachable_via_any,
1317 };
1318
1319 VNET_IP4_UNICAST_FEATURE_INIT (ip4_ipsec, static) = {
1320   .node_name = "ipsec-input-ip4",
1321   .runs_before = {"vpath-input-ip4", 0},
1322   .feature_index = &ip4_main.ip4_unicast_rx_feature_ipsec,
1323 };
1324
1325 VNET_IP4_UNICAST_FEATURE_INIT (ip4_vpath, static) = {
1326   .node_name = "vpath-input-ip4",
1327   .runs_before = {"ip4-lookup", 0},
1328   .feature_index = &ip4_main.ip4_unicast_rx_feature_vpath,
1329 };
1330
/* Last feature on the unicast path: it is listed before nothing. */
1331 VNET_IP4_UNICAST_FEATURE_INIT (ip4_lookup, static) = {
1332   .node_name = "ip4-lookup",
1333   .runs_before = {0}, /* not before any other features */
1334   .feature_index = &ip4_main.ip4_unicast_rx_feature_lookup,
1335 };
1336
1337 /* Built-in ip4 multicast rx feature path definition.
        Two features only; the .runs_before entries order them as
          vpath-input-ip4 -> ip4-lookup-multicast */
1338 VNET_IP4_MULTICAST_FEATURE_INIT (ip4_vpath_mc, static) = {
1339   .node_name = "vpath-input-ip4",
1340   .runs_before = {"ip4-lookup-multicast", 0},
1341   .feature_index = &ip4_main.ip4_multicast_rx_feature_vpath,
1342 };
1343
/* Last feature on the multicast path: it is listed before nothing. */
1344 VNET_IP4_MULTICAST_FEATURE_INIT (ip4_lookup_mc, static) = {
1345   .node_name = "ip4-lookup-multicast",
1346   .runs_before = {0}, /* not before any other features */
1347   .feature_index = &ip4_main.ip4_multicast_rx_feature_lookup,
1348 };
1349
1350 static char * feature_start_nodes[] = 
1351   { "ip4-input", "ip4-input-no-checksum"};
1352
1353 static clib_error_t *
1354 ip4_feature_init (vlib_main_t * vm, ip4_main_t * im)
1355 {
1356   ip_lookup_main_t * lm = &im->lookup_main;
1357   clib_error_t * error;
1358   vnet_cast_t cast;
1359
1360   for (cast = 0; cast < VNET_N_CAST; cast++)
1361     {
1362       ip_config_main_t * cm = &lm->rx_config_mains[cast];
1363       vnet_config_main_t * vcm = &cm->config_main;
1364
1365       if ((error = ip_feature_init_cast (vm, cm, vcm, 
1366                                          feature_start_nodes,
1367                                          ARRAY_LEN(feature_start_nodes),
1368                                          cast,
1369                                          1 /* is_ip4 */)))
1370         return error;
1371     }
1372   return 0;
1373 }
1374
1375 static clib_error_t *
1376 ip4_sw_interface_add_del (vnet_main_t * vnm,
1377                           u32 sw_if_index,
1378                           u32 is_add)
1379 {
1380   vlib_main_t * vm = vnm->vlib_main;
1381   ip4_main_t * im = &ip4_main;
1382   ip_lookup_main_t * lm = &im->lookup_main;
1383   u32 ci, cast;
1384   u32 feature_index;
1385
1386   for (cast = 0; cast < VNET_N_CAST; cast++)
1387     {
1388       ip_config_main_t * cm = &lm->rx_config_mains[cast];
1389       vnet_config_main_t * vcm = &cm->config_main;
1390
1391       vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
1392       ci = cm->config_index_by_sw_if_index[sw_if_index];
1393
1394       if (cast == VNET_UNICAST)
1395         feature_index = im->ip4_unicast_rx_feature_lookup;
1396       else
1397         feature_index = im->ip4_multicast_rx_feature_lookup;
1398
1399       if (is_add)
1400         ci = vnet_config_add_feature (vm, vcm,
1401                                       ci,
1402                                       feature_index,
1403                                       /* config data */ 0,
1404                                       /* # bytes of config data */ 0);
1405       else
1406         ci = vnet_config_del_feature (vm, vcm,
1407                                       ci,
1408                                       feature_index,
1409                                       /* config data */ 0,
1410                                       /* # bytes of config data */ 0);
1411
1412       cm->config_index_by_sw_if_index[sw_if_index] = ci;
1413     }
1414
1415   return /* no error */ 0;
1416 }
1417
1418 VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
1419
/* Forward declaration: the trace formatter is defined further down but
   is referenced by the node registrations that follow. */
1420 static u8 * format_ip4_lookup_trace (u8 * s, va_list * args);
1421
/* Registration of the "ip4-lookup" graph node.  Its node function is
   ip4_lookup, its frame elements are u32-sized, traces are printed with
   format_ip4_lookup_trace, and its next nodes come from the
   IP4_LOOKUP_N_NEXT / IP4_LOOKUP_NEXT_NODES definitions. */
1422 VLIB_REGISTER_NODE (ip4_lookup_node) = {
1423   .function = ip4_lookup,
1424   .name = "ip4-lookup",
1425   .vector_size = sizeof (u32),
1426
1427   .format_trace = format_ip4_lookup_trace,
1428
1429   .n_next_nodes = IP4_LOOKUP_N_NEXT,
1430   .next_nodes = IP4_LOOKUP_NEXT_NODES,
1431 };
1432
/* NOTE(review): presumably emits per-architecture variants of the node
   function -- confirm against the macro definition. */
1433 VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_node, ip4_lookup)
1434
1435 static uword
1436 ip4_indirect (vlib_main_t * vm,
1437                vlib_node_runtime_t * node,
1438                vlib_frame_t * frame)
1439 {
1440   return ip4_lookup_inline (vm, node, frame,
1441                             /* lookup_for_responses_to_locally_received_packets */ 0,
1442                             /* is_indirect */ 1);
1443 }
1444
/* Registration of the "ip4-indirect" graph node.  Its node function is
   ip4_indirect and it shares the ip4-lookup trace formatter.  It declares
   no next nodes of its own and names "ip4-lookup" as its sibling.
   NOTE(review): presumably .sibling_of makes the node reuse ip4-lookup's
   next-node list -- confirm against the vlib node documentation. */
1445 VLIB_REGISTER_NODE (ip4_indirect_node) = {
1446   .function = ip4_indirect,
1447   .name = "ip4-indirect",
1448   .vector_size = sizeof (u32),
1449   .sibling_of = "ip4-lookup",
1450   .format_trace = format_ip4_lookup_trace,
1451
1452   .n_next_nodes = 0,
1453 };
1454
/* NOTE(review): presumably emits per-architecture variants of the node
   function -- confirm against the macro definition. */
1455 VLIB_NODE_FUNCTION_MULTIARCH (ip4_indirect_node, ip4_indirect)
1456
1457
1458 /* Global IP4 main: the single ip4_main_t instance that the functions
        in this file reach through &ip4_main. */
1459 ip4_main_t ip4_main;
1460
1461 clib_error_t *
1462 ip4_lookup_init (vlib_main_t * vm)
1463 {
1464   ip4_main_t * im = &ip4_main;
1465   clib_error_t * error;
1466   uword i;
1467
1468   for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
1469     {
1470       u32 m;
1471
1472       if (i < 32)
1473         m = pow2_mask (i) << (32 - i);
1474       else 
1475         m = ~0;
1476       im->fib_masks[i] = clib_host_to_net_u32 (m);
1477     }
1478
1479   /* Create FIB with index 0 and table id of 0. */
1480   find_ip4_fib_by_table_index_or_id (im, /* table id */ 0, IP4_ROUTE_FLAG_TABLE_ID);
1481
1482   ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
1483
1484   {
1485     pg_node_t * pn;
1486     pn = pg_get_node (ip4_lookup_node.index);
1487     pn->unformat_edit = unformat_pg_ip4_header;
1488   }
1489
1490   {
1491     ethernet_arp_header_t h;
1492
1493     memset (&h, 0, sizeof (h));
1494
1495     /* Set target ethernet address to all zeros. */
1496     memset (h.ip4_over_ethernet[1].ethernet, 0, sizeof (h.ip4_over_ethernet[1].ethernet));
1497
1498 #define _16(f,v) h.f = clib_host_to_net_u16 (v);
1499 #define _8(f,v) h.f = v;
1500     _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
1501     _16 (l3_type, ETHERNET_TYPE_IP4);
1502     _8 (n_l2_address_bytes, 6);
1503     _8 (n_l3_address_bytes, 4);
1504     _16 (opcode, ETHERNET_ARP_OPCODE_request);
1505 #undef _16
1506 #undef _8
1507
1508     vlib_packet_template_init (vm,
1509                                &im->ip4_arp_request_packet_template,
1510                                /* data */ &h,
1511                                sizeof (h),
1512                                /* alloc chunk size */ 8,
1513                                "ip4 arp");
1514   }
1515
1516   error = ip4_feature_init (vm, im);
1517
1518   return error;
1519 }
1520
1521 VLIB_INIT_FUNCTION (ip4_lookup_init);
1522
/* Per-packet trace record written by ip4_forward_next_trace and read by
   the format_ip4_*_trace functions in this file. */
1523 typedef struct {
1524   /* Adjacency taken. */
1525   u32 adj_index;
  /* Copy of the buffer's ip.flow_hash at trace time. */
1526   u32 flow_hash;
  /* The buffer's sw_if_index[VLIB_TX] when that is not ~0, otherwise the
     FIB index bound to the buffer's RX interface. */
1527   u32 fib_index;
1528
1529   /* Packet data, possibly *after* rewrite. */
1530   u8 packet_data[64 - 1*sizeof(u32)];
1531 } ip4_forward_next_trace_t;
1532
1533 static u8 * format_ip4_forward_next_trace (u8 * s, va_list * args)
1534 {
1535   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1536   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1537   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1538   uword indent = format_get_indent (s);
1539   s = format (s, "%U%U",
1540                 format_white_space, indent,
1541                 format_ip4_header, t->packet_data);
1542   return s;
1543 }
1544
1545 static u8 * format_ip4_lookup_trace (u8 * s, va_list * args)
1546 {
1547   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1548   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1549   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1550   vnet_main_t * vnm = vnet_get_main();
1551   ip4_main_t * im = &ip4_main;
1552   uword indent = format_get_indent (s);
1553
1554   s = format (s, "fib %d adj-idx %d : %U flow hash: 0x%08x",
1555               t->fib_index, t->adj_index, format_ip_adjacency,
1556               vnm, &im->lookup_main, t->adj_index, t->flow_hash);
1557   s = format (s, "\n%U%U",
1558               format_white_space, indent,
1559               format_ip4_header, t->packet_data);
1560   return s;
1561 }
1562
1563 static u8 * format_ip4_rewrite_trace (u8 * s, va_list * args)
1564 {
1565   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1566   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1567   ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
1568   vnet_main_t * vnm = vnet_get_main();
1569   ip4_main_t * im = &ip4_main;
1570   uword indent = format_get_indent (s);
1571
1572   s = format (s, "tx_sw_if_index %d adj-idx %d : %U flow hash: 0x%08x",
1573               t->fib_index, t->adj_index, format_ip_adjacency,
1574               vnm, &im->lookup_main, t->adj_index, t->flow_hash);
1575   s = format (s, "\n%U%U",
1576               format_white_space, indent,
1577               format_ip_adjacency_packet_data,
1578               vnm, &im->lookup_main, t->adj_index,
1579               t->packet_data, sizeof (t->packet_data));
1580   return s;
1581 }
1582
1583 /* Common trace function for all ip4-forward next nodes. */
1584 void
1585 ip4_forward_next_trace (vlib_main_t * vm,
1586                         vlib_node_runtime_t * node,
1587                         vlib_frame_t * frame,
1588                         vlib_rx_or_tx_t which_adj_index)
1589 {
1590   u32 * from, n_left;
1591   ip4_main_t * im = &ip4_main;
1592
1593   n_left = frame->n_vectors;
1594   from = vlib_frame_vector_args (frame);
1595   
1596   while (n_left >= 4)
1597     {
1598       u32 bi0, bi1;
1599       vlib_buffer_t * b0, * b1;
1600       ip4_forward_next_trace_t * t0, * t1;
1601
1602       /* Prefetch next iteration. */
1603       vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
1604       vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
1605
1606       bi0 = from[0];
1607       bi1 = from[1];
1608
1609       b0 = vlib_get_buffer (vm, bi0);
1610       b1 = vlib_get_buffer (vm, bi1);
1611
1612       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1613         {
1614           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1615           t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1616           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1617           t0->fib_index = (vnet_buffer(b0)->sw_if_index[VLIB_TX] != (u32)~0) ?
1618               vnet_buffer(b0)->sw_if_index[VLIB_TX] :
1619               vec_elt (im->fib_index_by_sw_if_index,
1620                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
1621
1622           clib_memcpy (t0->packet_data,
1623                   vlib_buffer_get_current (b0),
1624                   sizeof (t0->packet_data));
1625         }
1626       if (b1->flags & VLIB_BUFFER_IS_TRACED)
1627         {
1628           t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
1629           t1->adj_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
1630           t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
1631           t1->fib_index = (vnet_buffer(b1)->sw_if_index[VLIB_TX] != (u32)~0) ?
1632               vnet_buffer(b1)->sw_if_index[VLIB_TX] :
1633               vec_elt (im->fib_index_by_sw_if_index,
1634                        vnet_buffer(b1)->sw_if_index[VLIB_RX]);
1635           clib_memcpy (t1->packet_data,
1636                   vlib_buffer_get_current (b1),
1637                   sizeof (t1->packet_data));
1638         }
1639       from += 2;
1640       n_left -= 2;
1641     }
1642
1643   while (n_left >= 1)
1644     {
1645       u32 bi0;
1646       vlib_buffer_t * b0;
1647       ip4_forward_next_trace_t * t0;
1648
1649       bi0 = from[0];
1650
1651       b0 = vlib_get_buffer (vm, bi0);
1652
1653       if (b0->flags & VLIB_BUFFER_IS_TRACED)
1654         {
1655           t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
1656           t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
1657           t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
1658           t0->fib_index = (vnet_buffer(b0)->sw_if_index[VLIB_TX] != (u32)~0) ?
1659               vnet_buffer(b0)->sw_if_index[VLIB_TX] :
1660               vec_elt (im->fib_index_by_sw_if_index,
1661                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
1662           clib_memcpy (t0->packet_data,
1663                   vlib_buffer_get_current (b0),
1664                   sizeof (t0->packet_data));
1665         }
1666       from += 1;
1667       n_left -= 1;
1668     }
1669 }
1670
1671 static uword
1672 ip4_drop_or_punt (vlib_main_t * vm,
1673                   vlib_node_runtime_t * node,
1674                   vlib_frame_t * frame,
1675                   ip4_error_t error_code)
1676 {
1677   u32 * buffers = vlib_frame_vector_args (frame);
1678   uword n_packets = frame->n_vectors;
1679
1680   vlib_error_drop_buffers (vm, node,
1681                            buffers,
1682                            /* stride */ 1,
1683                            n_packets,
1684                            /* next */ 0,
1685                            ip4_input_node.index,
1686                            error_code);
1687
1688   if (node->flags & VLIB_NODE_FLAG_TRACE)
1689     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1690
1691   return n_packets;
1692 }
1693
1694 static uword
1695 ip4_drop (vlib_main_t * vm,
1696           vlib_node_runtime_t * node,
1697           vlib_frame_t * frame)
1698 { return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_DROP); }
1699
1700 static uword
1701 ip4_punt (vlib_main_t * vm,
1702           vlib_node_runtime_t * node,
1703           vlib_frame_t * frame)
1704 { return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_PUNT); }
1705
1706 static uword
1707 ip4_miss (vlib_main_t * vm,
1708           vlib_node_runtime_t * node,
1709           vlib_frame_t * frame)
1710 { return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_DST_LOOKUP_MISS); }
1711
1712 VLIB_REGISTER_NODE (ip4_drop_node,static) = {
1713   .function = ip4_drop,
1714   .name = "ip4-drop",
1715   .vector_size = sizeof (u32),
1716
1717   .format_trace = format_ip4_forward_next_trace,
1718
1719   .n_next_nodes = 1,
1720   .next_nodes = {
1721     [0] = "error-drop",
1722   },
1723 };
1724
1725 VLIB_NODE_FUNCTION_MULTIARCH (ip4_drop_node, ip4_drop)
1726
1727 VLIB_REGISTER_NODE (ip4_punt_node,static) = {
1728   .function = ip4_punt,
1729   .name = "ip4-punt",
1730   .vector_size = sizeof (u32),
1731
1732   .format_trace = format_ip4_forward_next_trace,
1733
1734   .n_next_nodes = 1,
1735   .next_nodes = {
1736     [0] = "error-punt",
1737   },
1738 };
1739
1740 VLIB_NODE_FUNCTION_MULTIARCH (ip4_punt_node, ip4_punt)
1741
1742 VLIB_REGISTER_NODE (ip4_miss_node,static) = {
1743   .function = ip4_miss,
1744   .name = "ip4-miss",
1745   .vector_size = sizeof (u32),
1746
1747   .format_trace = format_ip4_forward_next_trace,
1748
1749   .n_next_nodes = 1,
1750   .next_nodes = {
1751     [0] = "error-drop",
1752   },
1753 };
1754
1755 VLIB_NODE_FUNCTION_MULTIARCH (ip4_miss_node, ip4_miss)
1756
1757 /* Compute TCP/UDP/ICMP4 checksum in software. */
1758 u16
1759 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
1760                               ip4_header_t * ip0)
1761 {
1762   ip_csum_t sum0;
1763   u32 ip_header_length, payload_length_host_byte_order;
1764   u32 n_this_buffer, n_bytes_left;
1765   u16 sum16;
1766   void * data_this_buffer;
1767   
1768   /* Initialize checksum with ip header. */
1769   ip_header_length = ip4_header_bytes (ip0);
1770   payload_length_host_byte_order = clib_net_to_host_u16 (ip0->length) - ip_header_length;
1771   sum0 = clib_host_to_net_u32 (payload_length_host_byte_order + (ip0->protocol << 16));
1772
1773   if (BITS (uword) == 32)
1774     {
1775       sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u32));
1776       sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32));
1777     }
1778   else
1779     sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
1780
1781   n_bytes_left = n_this_buffer = payload_length_host_byte_order;
1782   data_this_buffer = (void *) ip0 + ip_header_length;
1783   if (n_this_buffer + ip_header_length > p0->current_length)
1784     n_this_buffer = p0->current_length > ip_header_length ? p0->current_length - ip_header_length : 0;
1785   while (1)
1786     {
1787       sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
1788       n_bytes_left -= n_this_buffer;
1789       if (n_bytes_left == 0)
1790         break;
1791
1792       ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT);
1793       p0 = vlib_get_buffer (vm, p0->next_buffer);
1794       data_this_buffer = vlib_buffer_get_current (p0);
1795       n_this_buffer = p0->current_length;
1796     }
1797
1798   sum16 = ~ ip_csum_fold (sum0);
1799
1800   return sum16;
1801 }
1802
1803 static u32
1804 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
1805 {
1806   ip4_header_t * ip0 = vlib_buffer_get_current (p0);
1807   udp_header_t * udp0;
1808   u16 sum16;
1809
1810   ASSERT (ip0->protocol == IP_PROTOCOL_TCP
1811           || ip0->protocol == IP_PROTOCOL_UDP);
1812
1813   udp0 = (void *) (ip0 + 1);
1814   if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
1815     {
1816       p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
1817                     | IP_BUFFER_L4_CHECKSUM_CORRECT);
1818       return p0->flags;
1819     }
1820
1821   sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
1822
1823   p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
1824                 | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT));
1825
1826   return p0->flags;
1827 }
1828
1829 static uword
1830 ip4_local (vlib_main_t * vm,
1831            vlib_node_runtime_t * node,
1832            vlib_frame_t * frame)
1833 {
1834   ip4_main_t * im = &ip4_main;
1835   ip_lookup_main_t * lm = &im->lookup_main;
1836   ip_local_next_t next_index;
1837   u32 * from, * to_next, n_left_from, n_left_to_next;
1838   vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
1839
1840   from = vlib_frame_vector_args (frame);
1841   n_left_from = frame->n_vectors;
1842   next_index = node->cached_next_index;
1843   
1844   if (node->flags & VLIB_NODE_FLAG_TRACE)
1845     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
1846
1847   while (n_left_from > 0)
1848     {
1849       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
1850
1851       while (n_left_from >= 4 && n_left_to_next >= 2)
1852         {
1853           vlib_buffer_t * p0, * p1;
1854           ip4_header_t * ip0, * ip1;
1855           udp_header_t * udp0, * udp1;
1856           ip4_fib_mtrie_t * mtrie0, * mtrie1;
1857           ip4_fib_mtrie_leaf_t leaf0, leaf1;
1858           ip_adjacency_t * adj0, * adj1;
1859           u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, adj_index0;
1860           u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, adj_index1;
1861           i32 len_diff0, len_diff1;
1862           u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
1863           u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1;
1864           u8 enqueue_code;
1865       
1866           pi0 = to_next[0] = from[0];
1867           pi1 = to_next[1] = from[1];
1868           from += 2;
1869           n_left_from -= 2;
1870           to_next += 2;
1871           n_left_to_next -= 2;
1872       
1873           p0 = vlib_get_buffer (vm, pi0);
1874           p1 = vlib_get_buffer (vm, pi1);
1875
1876           ip0 = vlib_buffer_get_current (p0);
1877           ip1 = vlib_buffer_get_current (p1);
1878
1879           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, 
1880                                 vnet_buffer(p0)->sw_if_index[VLIB_RX]);
1881           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, 
1882                                 vnet_buffer(p1)->sw_if_index[VLIB_RX]);
1883
1884           mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
1885           mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie;
1886
1887           leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
1888
1889           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
1890           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 0);
1891
1892           /* Treat IP frag packets as "experimental" protocol for now
1893              until support of IP frag reassembly is implemented */
1894           proto0 = ip4_is_fragment(ip0) ? 0xfe : ip0->protocol;
1895           proto1 = ip4_is_fragment(ip1) ? 0xfe : ip1->protocol;
1896           is_udp0 = proto0 == IP_PROTOCOL_UDP;
1897           is_udp1 = proto1 == IP_PROTOCOL_UDP;
1898           is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
1899           is_tcp_udp1 = is_udp1 || proto1 == IP_PROTOCOL_TCP;
1900
1901           flags0 = p0->flags;
1902           flags1 = p1->flags;
1903
1904           good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1905           good_tcp_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1906
1907           udp0 = ip4_next_header (ip0);
1908           udp1 = ip4_next_header (ip1);
1909
1910           /* Don't verify UDP checksum for packets with explicit zero checksum. */
1911           good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1912           good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
1913
1914           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
1915           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1);
1916
1917           /* Verify UDP length. */
1918           ip_len0 = clib_net_to_host_u16 (ip0->length);
1919           ip_len1 = clib_net_to_host_u16 (ip1->length);
1920           udp_len0 = clib_net_to_host_u16 (udp0->length);
1921           udp_len1 = clib_net_to_host_u16 (udp1->length);
1922
1923           len_diff0 = ip_len0 - udp_len0;
1924           len_diff1 = ip_len1 - udp_len1;
1925
1926           len_diff0 = is_udp0 ? len_diff0 : 0;
1927           len_diff1 = is_udp1 ? len_diff1 : 0;
1928
1929           if (PREDICT_FALSE (! (is_tcp_udp0 & is_tcp_udp1
1930                                 & good_tcp_udp0 & good_tcp_udp1)))
1931             {
1932               if (is_tcp_udp0)
1933                 {
1934                   if (is_tcp_udp0
1935                       && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1936                     flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
1937                   good_tcp_udp0 =
1938                     (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1939                   good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
1940                 }
1941               if (is_tcp_udp1)
1942                 {
1943                   if (is_tcp_udp1
1944                       && ! (flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
1945                     flags1 = ip4_tcp_udp_validate_checksum (vm, p1);
1946                   good_tcp_udp1 =
1947                     (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
1948                   good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
1949                 }
1950             }
1951
1952           good_tcp_udp0 &= len_diff0 >= 0;
1953           good_tcp_udp1 &= len_diff1 >= 0;
1954
1955           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
1956           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2);
1957
1958           error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL;
1959
1960           error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
1961           error1 = len_diff1 < 0 ? IP4_ERROR_UDP_LENGTH : error1;
1962
1963           ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
1964           error0 = (is_tcp_udp0 && ! good_tcp_udp0
1965                     ? IP4_ERROR_TCP_CHECKSUM + is_udp0
1966                     : error0);
1967           error1 = (is_tcp_udp1 && ! good_tcp_udp1
1968                     ? IP4_ERROR_TCP_CHECKSUM + is_udp1
1969                     : error1);
1970
1971           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
1972           leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
1973
1974           vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
1975           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
1976
1977           vnet_buffer (p1)->ip.adj_index[VLIB_RX] = adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
1978           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
1979
1980           ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
1981                                                            &ip0->src_address,
1982                                                            /* no_default_route */ 1));
1983           ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1,
1984                                                            &ip1->src_address,
1985                                                            /* no_default_route */ 1));
1986
1987           adj0 = ip_get_adjacency (lm, adj_index0);
1988           adj1 = ip_get_adjacency (lm, adj_index1);
1989
1990           /* 
1991            * Must have a route to source otherwise we drop the packet.
1992            * ip4 broadcasts are accepted, e.g. to make dhcp client work
1993            */
1994           error0 = (error0 == IP4_ERROR_UNKNOWN_PROTOCOL
1995                     && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
1996                     && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP
1997                     && adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
1998                     && ip0->dst_address.as_u32 != 0xFFFFFFFF
1999                     ? IP4_ERROR_SRC_LOOKUP_MISS
2000                     : error0);
2001           error1 = (error1 == IP4_ERROR_UNKNOWN_PROTOCOL
2002                     && adj1->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
2003                     && adj1->lookup_next_index != IP_LOOKUP_NEXT_ARP
2004                     && adj1->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
2005                     && ip0->dst_address.as_u32 != 0xFFFFFFFF
2006                     ? IP4_ERROR_SRC_LOOKUP_MISS
2007                     : error1);
2008
2009           next0 = lm->local_next_by_ip_protocol[proto0];
2010           next1 = lm->local_next_by_ip_protocol[proto1];
2011
2012           next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
2013           next1 = error1 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next1;
2014
2015           p0->error = error0 ? error_node->errors[error0] : 0;
2016           p1->error = error1 ? error_node->errors[error1] : 0;
2017
2018           enqueue_code = (next0 != next_index) + 2*(next1 != next_index);
2019
2020           if (PREDICT_FALSE (enqueue_code != 0))
2021             {
2022               switch (enqueue_code)
2023                 {
2024                 case 1:
2025                   /* A B A */
2026                   to_next[-2] = pi1;
2027                   to_next -= 1;
2028                   n_left_to_next += 1;
2029                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
2030                   break;
2031
2032                 case 2:
2033                   /* A A B */
2034                   to_next -= 1;
2035                   n_left_to_next += 1;
2036                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
2037                   break;
2038
2039                 case 3:
2040                   /* A B B or A B C */
2041                   to_next -= 2;
2042                   n_left_to_next += 2;
2043                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
2044                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
2045                   if (next0 == next1)
2046                     {
2047                       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2048                       next_index = next1;
2049                       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2050                     }
2051                   break;
2052                 }
2053             }
2054         }
2055
2056       while (n_left_from > 0 && n_left_to_next > 0)
2057         {
2058           vlib_buffer_t * p0;
2059           ip4_header_t * ip0;
2060           udp_header_t * udp0;
2061           ip4_fib_mtrie_t * mtrie0;
2062           ip4_fib_mtrie_leaf_t leaf0;
2063           ip_adjacency_t * adj0;
2064           u32 pi0, next0, ip_len0, udp_len0, flags0, fib_index0, adj_index0;
2065           i32 len_diff0;
2066           u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
2067       
2068           pi0 = to_next[0] = from[0];
2069           from += 1;
2070           n_left_from -= 1;
2071           to_next += 1;
2072           n_left_to_next -= 1;
2073       
2074           p0 = vlib_get_buffer (vm, pi0);
2075
2076           ip0 = vlib_buffer_get_current (p0);
2077
2078           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, 
2079                                 vnet_buffer(p0)->sw_if_index[VLIB_RX]);
2080
2081           mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
2082
2083           leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
2084
2085           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
2086
2087           /* Treat IP frag packets as "experimental" protocol for now
2088              until support of IP frag reassembly is implemented */
2089           proto0 = ip4_is_fragment(ip0) ? 0xfe : ip0->protocol;
2090           is_udp0 = proto0 == IP_PROTOCOL_UDP;
2091           is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
2092
2093           flags0 = p0->flags;
2094
2095           good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
2096
2097           udp0 = ip4_next_header (ip0);
2098
2099           /* Don't verify UDP checksum for packets with explicit zero checksum. */
2100           good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
2101
2102           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
2103
2104           /* Verify UDP length. */
2105           ip_len0 = clib_net_to_host_u16 (ip0->length);
2106           udp_len0 = clib_net_to_host_u16 (udp0->length);
2107
2108           len_diff0 = ip_len0 - udp_len0;
2109
2110           len_diff0 = is_udp0 ? len_diff0 : 0;
2111
2112           if (PREDICT_FALSE (! (is_tcp_udp0 & good_tcp_udp0)))
2113             {
2114               if (is_tcp_udp0)
2115                 {
2116                   if (is_tcp_udp0
2117                       && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
2118                     flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
2119                   good_tcp_udp0 =
2120                     (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
2121                   good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
2122                 }
2123             }
2124
2125           good_tcp_udp0 &= len_diff0 >= 0;
2126
2127           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
2128
2129           error0 = IP4_ERROR_UNKNOWN_PROTOCOL;
2130
2131           error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
2132
2133           ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
2134           error0 = (is_tcp_udp0 && ! good_tcp_udp0
2135                     ? IP4_ERROR_TCP_CHECKSUM + is_udp0
2136                     : error0);
2137
2138           leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
2139
2140           vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
2141           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
2142
2143           ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
2144                                                            &ip0->src_address,
2145                                                            /* no_default_route */ 1));
2146
2147           adj0 = ip_get_adjacency (lm, adj_index0);
2148
2149           /* Must have a route to source otherwise we drop the packet. */
2150           error0 = (error0 == IP4_ERROR_UNKNOWN_PROTOCOL
2151                     && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
2152                     && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP
2153                     && adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
2154                     && ip0->dst_address.as_u32 != 0xFFFFFFFF
2155                     ? IP4_ERROR_SRC_LOOKUP_MISS
2156                     : error0);
2157
2158           next0 = lm->local_next_by_ip_protocol[proto0];
2159
2160           next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
2161
2162           p0->error = error0? error_node->errors[error0] : 0;
2163
2164           if (PREDICT_FALSE (next0 != next_index))
2165             {
2166               n_left_to_next += 1;
2167               vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2168
2169               next_index = next0;
2170               vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2171               to_next[0] = pi0;
2172               to_next += 1;
2173               n_left_to_next -= 1;
2174             }
2175         }
2176   
2177       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2178     }
2179
2180   return frame->n_vectors;
2181 }
2182
2183 VLIB_REGISTER_NODE (ip4_local_node,static) = {
2184   .function = ip4_local,
2185   .name = "ip4-local",
2186   .vector_size = sizeof (u32),
2187
2188   .format_trace = format_ip4_forward_next_trace,
2189
2190   .n_next_nodes = IP_LOCAL_N_NEXT,
2191   .next_nodes = {
2192     [IP_LOCAL_NEXT_DROP] = "error-drop",
2193     [IP_LOCAL_NEXT_PUNT] = "error-punt",
2194     [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
2195     [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
2196   },
2197 };
2198
2199 VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local)
2200
2201 void ip4_register_protocol (u32 protocol, u32 node_index)
2202 {
2203   vlib_main_t * vm = vlib_get_main();
2204   ip4_main_t * im = &ip4_main;
2205   ip_lookup_main_t * lm = &im->lookup_main;
2206
2207   ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
2208   lm->local_next_by_ip_protocol[protocol] = vlib_node_add_next (vm, ip4_local_node.index, node_index);
2209 }
2210
2211 static clib_error_t *
2212 show_ip_local_command_fn (vlib_main_t * vm,
2213                           unformat_input_t * input,
2214                          vlib_cli_command_t * cmd)
2215 {
2216   ip4_main_t * im = &ip4_main;
2217   ip_lookup_main_t * lm = &im->lookup_main;
2218   int i;
2219
2220   vlib_cli_output (vm, "Protocols handled by ip4_local");
2221   for (i = 0; i < ARRAY_LEN(lm->local_next_by_ip_protocol); i++)
2222     {
2223       if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
2224         vlib_cli_output (vm, "%d", i);
2225     }
2226   return 0;
2227 }
2228
2229
2230
2231 VLIB_CLI_COMMAND (show_ip_local, static) = {
2232   .path = "show ip local",
2233   .function = show_ip_local_command_fn,
2234   .short_help = "Show ip local protocol table",
2235 };
2236
2237 static uword
2238 ip4_arp (vlib_main_t * vm,
2239          vlib_node_runtime_t * node,
2240          vlib_frame_t * frame)
2241 {
2242   vnet_main_t * vnm = vnet_get_main();
2243   ip4_main_t * im = &ip4_main;
2244   ip_lookup_main_t * lm = &im->lookup_main;
2245   u32 * from, * to_next_drop;
2246   uword n_left_from, n_left_to_next_drop, next_index;
2247   static f64 time_last_seed_change = -1e100;
2248   static u32 hash_seeds[3];
2249   static uword hash_bitmap[256 / BITS (uword)]; 
2250   f64 time_now;
2251
2252   if (node->flags & VLIB_NODE_FLAG_TRACE)
2253     ip4_forward_next_trace (vm, node, frame, VLIB_TX);
2254
2255   time_now = vlib_time_now (vm);
2256   if (time_now - time_last_seed_change > 1e-3)
2257     {
2258       uword i;
2259       u32 * r = clib_random_buffer_get_data (&vm->random_buffer,
2260                                              sizeof (hash_seeds));
2261       for (i = 0; i < ARRAY_LEN (hash_seeds); i++)
2262         hash_seeds[i] = r[i];
2263
2264       /* Mark all hash keys as been no-seen before. */
2265       for (i = 0; i < ARRAY_LEN (hash_bitmap); i++)
2266         hash_bitmap[i] = 0;
2267
2268       time_last_seed_change = time_now;
2269     }
2270
2271   from = vlib_frame_vector_args (frame);
2272   n_left_from = frame->n_vectors;
2273   next_index = node->cached_next_index;
2274   if (next_index == IP4_ARP_NEXT_DROP)
2275     next_index = IP4_ARP_N_NEXT; /* point to first interface */
2276
2277   while (n_left_from > 0)
2278     {
2279       vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
2280                            to_next_drop, n_left_to_next_drop);
2281
2282       while (n_left_from > 0 && n_left_to_next_drop > 0)
2283         {
2284           vlib_buffer_t * p0;
2285           ip4_header_t * ip0;
2286           ethernet_header_t * eh0;
2287           u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
2288           uword bm0;
2289           ip_adjacency_t * adj0;
2290
2291           pi0 = from[0];
2292
2293           p0 = vlib_get_buffer (vm, pi0);
2294
2295           adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
2296           adj0 = ip_get_adjacency (lm, adj_index0);
2297           ip0 = vlib_buffer_get_current (p0);
2298
2299           /* If packet destination is not local, send ARP to next hop */
2300           if (adj0->arp.next_hop.ip4.as_u32)
2301             ip0->dst_address.data_u32 = adj0->arp.next_hop.ip4.as_u32;
2302
2303           /* 
2304            * if ip4_rewrite_local applied the IP_LOOKUP_NEXT_ARP
2305            * rewrite to this packet, we need to skip it here.
2306            * Note, to distinguish from src IP addr *.8.6.*, we
2307            * check for a bcast eth dest instead of IPv4 version.
2308            */
2309           eh0 = (ethernet_header_t*)ip0;
2310           if ((ip0->ip_version_and_header_length & 0xF0) != 0x40)
2311             {
2312               u32 vlan_num = 0;
2313               u16 * etype = &eh0->type;
2314               while ((*etype == clib_host_to_net_u16 (0x8100)) //dot1q 
2315                   || (*etype == clib_host_to_net_u16 (0x88a8)))//dot1ad 
2316                 {
2317                   vlan_num += 1;
2318                   etype += 2; //vlan tag also 16 bits, same as etype
2319                 }
2320               if (*etype == clib_host_to_net_u16 (0x0806))     //arp
2321                 {
2322                   vlib_buffer_advance (
2323                       p0, sizeof(ethernet_header_t) + (4*vlan_num));
2324                   ip0 = vlib_buffer_get_current (p0);
2325                 }
2326             }
2327
2328           a0 = hash_seeds[0];
2329           b0 = hash_seeds[1];
2330           c0 = hash_seeds[2];
2331
2332           sw_if_index0 = adj0->rewrite_header.sw_if_index;
2333           vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
2334
2335           a0 ^= ip0->dst_address.data_u32;
2336           b0 ^= sw_if_index0;
2337
2338           hash_v3_finalize32 (a0, b0, c0);
2339
2340           c0 &= BITS (hash_bitmap) - 1;
2341           c0 = c0 / BITS (uword);
2342           m0 = (uword) 1 << (c0 % BITS (uword));
2343
2344           bm0 = hash_bitmap[c0];
2345           drop0 = (bm0 & m0) != 0;
2346
2347           /* Mark it as seen. */
2348           hash_bitmap[c0] = bm0 | m0;
2349
2350           from += 1;
2351           n_left_from -= 1;
2352           to_next_drop[0] = pi0;
2353           to_next_drop += 1;
2354           n_left_to_next_drop -= 1;
2355
2356           p0->error = node->errors[drop0 ? IP4_ARP_ERROR_DROP : IP4_ARP_ERROR_REQUEST_SENT];
2357
2358           if (drop0)
2359             continue;
2360
2361           /* 
2362            * Can happen if the control-plane is programming tables
2363            * with traffic flowing; at least that's today's lame excuse.
2364            */
2365           if (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP) 
2366             {
2367               p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
2368             }
2369           else
2370           /* Send ARP request. */
2371           {
2372             u32 bi0 = 0;
2373             vlib_buffer_t * b0;
2374             ethernet_arp_header_t * h0;
2375             vnet_hw_interface_t * hw_if0;
2376
2377             h0 = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi0);
2378
2379             /* Add rewrite/encap string for ARP packet. */
2380             vnet_rewrite_one_header (adj0[0], h0, sizeof (ethernet_header_t));
2381
2382             hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
2383
2384             /* Src ethernet address in ARP header. */
2385             clib_memcpy (h0->ip4_over_ethernet[0].ethernet, hw_if0->hw_address,
2386                     sizeof (h0->ip4_over_ethernet[0].ethernet));
2387
2388             if (ip4_src_address_for_packet (im, p0, &h0->ip4_over_ethernet[0].ip4, sw_if_index0)) {
2389                 //No source address available
2390                 p0->error = node->errors[IP4_ARP_ERROR_NO_SOURCE_ADDRESS];
2391                 vlib_buffer_free(vm, &bi0, 1);
2392                 continue;
2393             }
2394
2395             /* Copy in destination address we are requesting. */
2396             h0->ip4_over_ethernet[1].ip4.data_u32 = ip0->dst_address.data_u32;
2397
2398             vlib_buffer_copy_trace_flag (vm, p0, bi0);
2399             b0 = vlib_get_buffer (vm, bi0);
2400             vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
2401
2402             vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
2403
2404             vlib_set_next_frame_buffer (vm, node, adj0->rewrite_header.next_index, bi0);
2405           }
2406         }
2407
2408       vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
2409     }
2410
2411   return frame->n_vectors;
2412 }
2413
2414 static char * ip4_arp_error_strings[] = {
2415   [IP4_ARP_ERROR_DROP] = "address overflow drops",
2416   [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
2417   [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
2418   [IP4_ARP_ERROR_REPLICATE_DROP] = "ARP replication completed",
2419   [IP4_ARP_ERROR_REPLICATE_FAIL] = "ARP replication failed",
2420   [IP4_ARP_ERROR_NO_SOURCE_ADDRESS] = "no source address for ARP request",
2421 };
2422
2423 VLIB_REGISTER_NODE (ip4_arp_node) = {
2424   .function = ip4_arp,
2425   .name = "ip4-arp",
2426   .vector_size = sizeof (u32),
2427
2428   .format_trace = format_ip4_forward_next_trace,
2429
2430   .n_errors = ARRAY_LEN (ip4_arp_error_strings),
2431   .error_strings = ip4_arp_error_strings,
2432
2433   .n_next_nodes = IP4_ARP_N_NEXT,
2434   .next_nodes = {
2435     [IP4_ARP_NEXT_DROP] = "error-drop",
2436   },
2437 };
2438
2439 #define foreach_notrace_ip4_arp_error           \
2440 _(DROP)                                         \
2441 _(REQUEST_SENT)                                 \
2442 _(REPLICATE_DROP)                               \
2443 _(REPLICATE_FAIL)
2444
2445 clib_error_t * arp_notrace_init (vlib_main_t * vm)
2446 {
2447   vlib_node_runtime_t *rt = 
2448     vlib_node_get_runtime (vm, ip4_arp_node.index);
2449
2450   /* don't trace ARP request packets */
2451 #define _(a)                                    \
2452     vnet_pcap_drop_trace_filter_add_del         \
2453         (rt->errors[IP4_ARP_ERROR_##a],         \
2454          1 /* is_add */);
2455     foreach_notrace_ip4_arp_error;
2456 #undef _
2457   return 0;
2458 }
2459
2460 VLIB_INIT_FUNCTION(arp_notrace_init);
2461
2462
2463 /* Send an ARP request to see if given destination is reachable on given interface. */
2464 clib_error_t *
2465 ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index)
2466 {
2467   vnet_main_t * vnm = vnet_get_main();
2468   ip4_main_t * im = &ip4_main;
2469   ethernet_arp_header_t * h;
2470   ip4_address_t * src;
2471   ip_interface_address_t * ia;
2472   ip_adjacency_t * adj;
2473   vnet_hw_interface_t * hi;
2474   vnet_sw_interface_t * si;
2475   vlib_buffer_t * b;
2476   u32 bi = 0;
2477
2478   si = vnet_get_sw_interface (vnm, sw_if_index);
2479
2480   if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
2481     {
2482       return clib_error_return (0, "%U: interface %U down",
2483                                 format_ip4_address, dst, 
2484                                 format_vnet_sw_if_index_name, vnm, 
2485                                 sw_if_index);
2486     }
2487
2488   src = ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
2489   if (! src)
2490     {
2491       vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
2492       return clib_error_return 
2493         (0, "no matching interface address for destination %U (interface %U)",
2494          format_ip4_address, dst,
2495          format_vnet_sw_if_index_name, vnm, sw_if_index);
2496     }
2497
2498   adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index);
2499
2500   h = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi);
2501
2502   hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
2503
2504   clib_memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address, sizeof (h->ip4_over_ethernet[0].ethernet));
2505
2506   h->ip4_over_ethernet[0].ip4 = src[0];
2507   h->ip4_over_ethernet[1].ip4 = dst[0];
2508
2509   b = vlib_get_buffer (vm, bi);
2510   vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
2511
2512   /* Add encapsulation string for software interface (e.g. ethernet header). */
2513   vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
2514   vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
2515
2516   {
2517     vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index);
2518     u32 * to_next = vlib_frame_vector_args (f);
2519     to_next[0] = bi;
2520     f->n_vectors = 1;
2521     vlib_put_frame_to_node (vm, hi->output_node_index, f);
2522   }
2523
2524   return /* no error */ 0;
2525 }
2526
/*
 * Next-node slots of the ip4-rewrite nodes.
 *
 * The values are spelled out because ip4_rewrite_inline evaluates
 * "next0 && next0_override": a packet whose next is DROP (0) is therefore
 * never redirected to ARP.
 */
typedef enum {
  IP4_REWRITE_NEXT_DROP = 0,
  IP4_REWRITE_NEXT_ARP = 1,
  IP4_REWRITE_NEXT_ICMP_ERROR = 2,
} ip4_rewrite_next_t;
2532
2533 always_inline uword
2534 ip4_rewrite_inline (vlib_main_t * vm,
2535                     vlib_node_runtime_t * node,
2536                     vlib_frame_t * frame,
2537                     int rewrite_for_locally_received_packets)
2538 {
2539   ip_lookup_main_t * lm = &ip4_main.lookup_main;
2540   u32 * from = vlib_frame_vector_args (frame);
2541   u32 n_left_from, n_left_to_next, * to_next, next_index;
2542   vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
2543   vlib_rx_or_tx_t adj_rx_tx = rewrite_for_locally_received_packets ? VLIB_RX : VLIB_TX;
2544
2545   n_left_from = frame->n_vectors;
2546   next_index = node->cached_next_index;
2547   u32 cpu_index = os_get_cpu_number();
2548   
2549   while (n_left_from > 0)
2550     {
2551       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2552
2553       while (n_left_from >= 4 && n_left_to_next >= 2)
2554         {
2555           ip_adjacency_t * adj0, * adj1;
2556           vlib_buffer_t * p0, * p1;
2557           ip4_header_t * ip0, * ip1;
2558           u32 pi0, rw_len0, next0, error0, checksum0, adj_index0;
2559           u32 pi1, rw_len1, next1, error1, checksum1, adj_index1;
2560           u32 next0_override, next1_override;
2561       
2562           if (rewrite_for_locally_received_packets)
2563               next0_override = next1_override = 0;
2564
2565           /* Prefetch next iteration. */
2566           {
2567             vlib_buffer_t * p2, * p3;
2568
2569             p2 = vlib_get_buffer (vm, from[2]);
2570             p3 = vlib_get_buffer (vm, from[3]);
2571
2572             vlib_prefetch_buffer_header (p2, STORE);
2573             vlib_prefetch_buffer_header (p3, STORE);
2574
2575             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
2576             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
2577           }
2578
2579           pi0 = to_next[0] = from[0];
2580           pi1 = to_next[1] = from[1];
2581
2582           from += 2;
2583           n_left_from -= 2;
2584           to_next += 2;
2585           n_left_to_next -= 2;
2586       
2587           p0 = vlib_get_buffer (vm, pi0);
2588           p1 = vlib_get_buffer (vm, pi1);
2589
2590           adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx];
2591           adj_index1 = vnet_buffer (p1)->ip.adj_index[adj_rx_tx];
2592
2593           /* We should never rewrite a pkt using the MISS adjacency */
2594           ASSERT(adj_index0 && adj_index1);
2595
2596           ip0 = vlib_buffer_get_current (p0);
2597           ip1 = vlib_buffer_get_current (p1);
2598
2599           error0 = error1 = IP4_ERROR_NONE;
2600           next0 = next1 = IP4_REWRITE_NEXT_DROP;
2601
2602           /* Decrement TTL & update checksum.
2603              Works either endian, so no need for byte swap. */
2604           if (! rewrite_for_locally_received_packets)
2605             {
2606               i32 ttl0 = ip0->ttl, ttl1 = ip1->ttl;
2607
2608               /* Input node should have reject packets with ttl 0. */
2609               ASSERT (ip0->ttl > 0);
2610               ASSERT (ip1->ttl > 0);
2611
2612               checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
2613               checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100);
2614
2615               checksum0 += checksum0 >= 0xffff;
2616               checksum1 += checksum1 >= 0xffff;
2617
2618               ip0->checksum = checksum0;
2619               ip1->checksum = checksum1;
2620
2621               ttl0 -= 1;
2622               ttl1 -= 1;
2623
2624               ip0->ttl = ttl0;
2625               ip1->ttl = ttl1;
2626
2627               /*
2628                * If the ttl drops below 1 when forwarding, generate
2629                * an ICMP response.
2630                */
2631               if (PREDICT_FALSE(ttl0 <= 0))
2632                 {
2633                   error0 = IP4_ERROR_TIME_EXPIRED;
2634                   vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32)~0;
2635                   icmp4_error_set_vnet_buffer(p0, ICMP4_time_exceeded,
2636                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2637                   next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
2638                 }
2639               if (PREDICT_FALSE(ttl1 <= 0))
2640                 {
2641                   error1 = IP4_ERROR_TIME_EXPIRED;
2642                   vnet_buffer (p1)->sw_if_index[VLIB_TX] = (u32)~0;
2643                   icmp4_error_set_vnet_buffer(p1, ICMP4_time_exceeded,
2644                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2645                   next1 = IP4_REWRITE_NEXT_ICMP_ERROR;
2646                 }
2647
2648               /* Verify checksum. */
2649               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2650               ASSERT (ip1->checksum == ip4_header_checksum (ip1));
2651             }
2652
2653           /* Rewrite packet header and updates lengths. */
2654           adj0 = ip_get_adjacency (lm, adj_index0);
2655           adj1 = ip_get_adjacency (lm, adj_index1);
2656       
2657           if (rewrite_for_locally_received_packets)
2658             {
2659               /*
2660                * If someone sends e.g. an icmp4 w/ src = dst = interface addr,
2661                * we end up here with a local adjacency in hand
2662                * The local adj rewrite data is 0xfefe on purpose.
2663                * Bad engineer, no donut for you.
2664                */
2665               if (PREDICT_FALSE(adj0->lookup_next_index 
2666                                 == IP_LOOKUP_NEXT_LOCAL))
2667                 error0 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
2668               if (PREDICT_FALSE(adj0->lookup_next_index
2669                                 == IP_LOOKUP_NEXT_ARP))
2670                 next0_override = IP4_REWRITE_NEXT_ARP;
2671               if (PREDICT_FALSE(adj1->lookup_next_index 
2672                                 == IP_LOOKUP_NEXT_LOCAL))
2673                 error1 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
2674               if (PREDICT_FALSE(adj1->lookup_next_index
2675                                 == IP_LOOKUP_NEXT_ARP))
2676                 next1_override = IP4_REWRITE_NEXT_ARP;
2677             }
2678
2679           /* Worth pipelining. No guarantee that adj0,1 are hot... */
2680           rw_len0 = adj0[0].rewrite_header.data_bytes;
2681           rw_len1 = adj1[0].rewrite_header.data_bytes;
2682
2683           /* Check MTU of outgoing interface. */
2684           error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes
2685                     ? IP4_ERROR_MTU_EXCEEDED
2686                     : error0);
2687           error1 = (vlib_buffer_length_in_chain (vm, p1) > adj1[0].rewrite_header.max_l3_packet_bytes
2688                     ? IP4_ERROR_MTU_EXCEEDED
2689                     : error1);
2690
2691           next0 = (error0 == IP4_ERROR_NONE)
2692             ? adj0[0].rewrite_header.next_index : next0;
2693
2694           if (rewrite_for_locally_received_packets)
2695               next0 = next0 && next0_override ? next0_override : next0;
2696
2697           next1 = (error1 == IP4_ERROR_NONE)
2698             ? adj1[0].rewrite_header.next_index : next1;
2699
2700           if (rewrite_for_locally_received_packets)
2701               next1 = next1 && next1_override ? next1_override : next1;
2702
2703           /* 
2704            * We've already accounted for an ethernet_header_t elsewhere
2705            */
2706           if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
2707               vlib_increment_combined_counter 
2708                   (&lm->adjacency_counters,
2709                    cpu_index, adj_index0, 
2710                    /* packet increment */ 0,
2711                    /* byte increment */ rw_len0-sizeof(ethernet_header_t));
2712
2713           if (PREDICT_FALSE (rw_len1 > sizeof(ethernet_header_t)))
2714               vlib_increment_combined_counter 
2715                   (&lm->adjacency_counters,
2716                    cpu_index, adj_index1, 
2717                    /* packet increment */ 0,
2718                    /* byte increment */ rw_len1-sizeof(ethernet_header_t));
2719
2720           /* Don't adjust the buffer for ttl issue; icmp-error node wants
2721            * to see the IP headerr */
2722           if (PREDICT_TRUE(error0 == IP4_ERROR_NONE))
2723             {
2724               p0->current_data -= rw_len0;
2725               p0->current_length += rw_len0;
2726               p0->error = error_node->errors[error0];
2727               vnet_buffer (p0)->sw_if_index[VLIB_TX] =
2728                   adj0[0].rewrite_header.sw_if_index;
2729             }
2730           if (PREDICT_TRUE(error1 == IP4_ERROR_NONE))
2731             {
2732               p1->current_data -= rw_len1;
2733               p1->current_length += rw_len1;
2734               p1->error = error_node->errors[error1];
2735               vnet_buffer (p1)->sw_if_index[VLIB_TX] =
2736                   adj1[0].rewrite_header.sw_if_index;
2737             }
2738
2739           /* Guess we are only writing on simple Ethernet header. */
2740           vnet_rewrite_two_headers (adj0[0], adj1[0],
2741                                     ip0, ip1,
2742                                     sizeof (ethernet_header_t));
2743       
2744           vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
2745                                            to_next, n_left_to_next,
2746                                            pi0, pi1, next0, next1);
2747         }
2748
2749       while (n_left_from > 0 && n_left_to_next > 0)
2750         {
2751           ip_adjacency_t * adj0;
2752           vlib_buffer_t * p0;
2753           ip4_header_t * ip0;
2754           u32 pi0, rw_len0, adj_index0, next0, error0, checksum0;
2755           u32 next0_override;
2756       
2757           if (rewrite_for_locally_received_packets)
2758               next0_override = 0;
2759
2760           pi0 = to_next[0] = from[0];
2761
2762           p0 = vlib_get_buffer (vm, pi0);
2763
2764           adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx];
2765
2766           /* We should never rewrite a pkt using the MISS adjacency */
2767           ASSERT(adj_index0);
2768
2769           adj0 = ip_get_adjacency (lm, adj_index0);
2770       
2771           ip0 = vlib_buffer_get_current (p0);
2772
2773           error0 = IP4_ERROR_NONE;
2774           next0 = IP4_REWRITE_NEXT_DROP;            /* drop on error */
2775
2776           /* Decrement TTL & update checksum. */
2777           if (! rewrite_for_locally_received_packets)
2778             {
2779               i32 ttl0 = ip0->ttl;
2780
2781               checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
2782
2783               checksum0 += checksum0 >= 0xffff;
2784
2785               ip0->checksum = checksum0;
2786
2787               ASSERT (ip0->ttl > 0);
2788
2789               ttl0 -= 1;
2790
2791               ip0->ttl = ttl0;
2792
2793               ASSERT (ip0->checksum == ip4_header_checksum (ip0));
2794
2795               if (PREDICT_FALSE(ttl0 <= 0))
2796                 {
2797                   /*
2798                    * If the ttl drops below 1 when forwarding, generate
2799                    * an ICMP response.
2800                    */
2801                   error0 = IP4_ERROR_TIME_EXPIRED;
2802                   next0 = IP4_REWRITE_NEXT_ICMP_ERROR;
2803                   vnet_buffer (p0)->sw_if_index[VLIB_TX] = (u32)~0;
2804                   icmp4_error_set_vnet_buffer(p0, ICMP4_time_exceeded,
2805                               ICMP4_time_exceeded_ttl_exceeded_in_transit, 0);
2806                 }
2807             }
2808
2809           if (rewrite_for_locally_received_packets)
2810             {
2811               /*
2812                * If someone sends e.g. an icmp4 w/ src = dst = interface addr,
2813                * we end up here with a local adjacency in hand
2814                * The local adj rewrite data is 0xfefe on purpose.
2815                * Bad engineer, no donut for you.
2816                */
2817               if (PREDICT_FALSE(adj0->lookup_next_index 
2818                                 == IP_LOOKUP_NEXT_LOCAL))
2819                 error0 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
2820               /* 
2821                * We have to override the next_index in ARP adjacencies,
2822                * because they're set up for ip4-arp, not this node...
2823                */
2824               if (PREDICT_FALSE(adj0->lookup_next_index
2825                                 == IP_LOOKUP_NEXT_ARP))
2826                 next0_override = IP4_REWRITE_NEXT_ARP;
2827             }
2828
2829           /* Guess we are only writing on simple Ethernet header. */
2830           vnet_rewrite_one_header (adj0[0], ip0, 
2831                                    sizeof (ethernet_header_t));
2832           
2833           /* Update packet buffer attributes/set output interface. */
2834           rw_len0 = adj0[0].rewrite_header.data_bytes;
2835           
2836           if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
2837               vlib_increment_combined_counter 
2838                   (&lm->adjacency_counters,
2839                    cpu_index, adj_index0, 
2840                    /* packet increment */ 0,
2841                    /* byte increment */ rw_len0-sizeof(ethernet_header_t));
2842           
2843           /* Check MTU of outgoing interface. */
2844           error0 = (vlib_buffer_length_in_chain (vm, p0) 
2845                     > adj0[0].rewrite_header.max_l3_packet_bytes
2846                     ? IP4_ERROR_MTU_EXCEEDED
2847                     : error0);
2848
2849           p0->error = error_node->errors[error0];
2850
2851           /* Don't adjust the buffer for ttl issue; icmp-error node wants
2852            * to see the IP headerr */
2853           if (PREDICT_TRUE(error0 == IP4_ERROR_NONE))
2854             {
2855               p0->current_data -= rw_len0;
2856               p0->current_length += rw_len0;
2857
2858               vnet_buffer (p0)->sw_if_index[VLIB_TX] =
2859                   adj0[0].rewrite_header.sw_if_index;
2860               next0 = adj0[0].rewrite_header.next_index;
2861             }
2862
2863           if (rewrite_for_locally_received_packets)
2864               next0 = next0 && next0_override ? next0_override : next0;
2865
2866           from += 1;
2867           n_left_from -= 1;
2868           to_next += 1;
2869           n_left_to_next -= 1;
2870       
2871           vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
2872                                            to_next, n_left_to_next,
2873                                            pi0, next0);
2874         }
2875   
2876       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2877     }
2878
2879   /* Need to do trace after rewrites to pick up new packet data. */
2880   if (node->flags & VLIB_NODE_FLAG_TRACE)
2881     ip4_forward_next_trace (vm, node, frame, adj_rx_tx);
2882
2883   return frame->n_vectors;
2884 }
2885
2886 static uword
2887 ip4_rewrite_transit (vlib_main_t * vm,
2888                      vlib_node_runtime_t * node,
2889                      vlib_frame_t * frame)
2890 {
2891   return ip4_rewrite_inline (vm, node, frame,
2892                              /* rewrite_for_locally_received_packets */ 0);
2893 }
2894
2895 static uword
2896 ip4_rewrite_local (vlib_main_t * vm,
2897                    vlib_node_runtime_t * node,
2898                    vlib_frame_t * frame)
2899 {
2900   return ip4_rewrite_inline (vm, node, frame,
2901                              /* rewrite_for_locally_received_packets */ 1);
2902 }
2903
2904 VLIB_REGISTER_NODE (ip4_rewrite_node) = {
2905   .function = ip4_rewrite_transit,
2906   .name = "ip4-rewrite-transit",
2907   .vector_size = sizeof (u32),
2908
2909   .format_trace = format_ip4_rewrite_trace,
2910
2911   .n_next_nodes = 3,
2912   .next_nodes = {
2913     [IP4_REWRITE_NEXT_DROP] = "error-drop",
2914     [IP4_REWRITE_NEXT_ARP] = "ip4-arp",
2915     [IP4_REWRITE_NEXT_ICMP_ERROR] = "ip4-icmp-error",
2916   },
2917 };
2918
2919 VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_node, ip4_rewrite_transit)
2920
2921 VLIB_REGISTER_NODE (ip4_rewrite_local_node) = {
2922   .function = ip4_rewrite_local,
2923   .name = "ip4-rewrite-local",
2924   .vector_size = sizeof (u32),
2925
2926   .sibling_of = "ip4-rewrite-transit",
2927
2928   .format_trace = format_ip4_rewrite_trace,
2929
2930   .n_next_nodes = 0,
2931 };
2932
2933 VLIB_NODE_FUNCTION_MULTIARCH (ip4_rewrite_local_node, ip4_rewrite_local)
2934
2935 static clib_error_t *
2936 add_del_interface_table (vlib_main_t * vm,
2937                          unformat_input_t * input,
2938                          vlib_cli_command_t * cmd)
2939 {
2940   vnet_main_t * vnm = vnet_get_main();
2941   clib_error_t * error = 0;
2942   u32 sw_if_index, table_id;
2943
2944   sw_if_index = ~0;
2945
2946   if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
2947     {
2948       error = clib_error_return (0, "unknown interface `%U'",
2949                                  format_unformat_error, input);
2950       goto done;
2951     }
2952
2953   if (unformat (input, "%d", &table_id))
2954     ;
2955   else
2956     {
2957       error = clib_error_return (0, "expected table id `%U'",
2958                                  format_unformat_error, input);
2959       goto done;
2960     }
2961
2962   {
2963     ip4_main_t * im = &ip4_main;
2964     ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_id, IP4_ROUTE_FLAG_TABLE_ID);
2965
2966     if (fib) 
2967       {
2968         vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
2969         im->fib_index_by_sw_if_index[sw_if_index] = fib->index;
2970     }
2971   }
2972
2973  done:
2974   return error;
2975 }
2976
2977 VLIB_CLI_COMMAND (set_interface_ip_table_command, static) = {
2978   .path = "set interface ip table",
2979   .function = add_del_interface_table,
2980   .short_help = "Add/delete FIB table id for interface",
2981 };
2982
2983
/*
 * Node function of "ip4-lookup-multicast".
 *
 * For each buffer in the frame:
 *   - pick a FIB index: the one configured for the RX interface, unless
 *     sw_if_index[VLIB_TX] is not ~0, in which case that value is used as
 *     the FIB index;
 *   - look up the destination address and fetch the resulting adjacency;
 *   - compute the flow hash with the FIB's flow_hash_config, store it in
 *     the buffer and use it to pick one of the adjacency's n_adj entries;
 *   - store the chosen adjacency index in ip.adj_index[VLIB_TX] and bump
 *     its packet/byte counter;
 *   - enqueue to the adjacency's lookup_next_index.
 *
 * Buffers are first written speculatively into the frame for `next`; when
 * a packet's real next differs, the enqueue is undone and redone.
 * Returns frame->n_vectors.
 */
2984 static uword
2985 ip4_lookup_multicast (vlib_main_t * vm,
2986                       vlib_node_runtime_t * node,
2987                       vlib_frame_t * frame)
2988 {
2989   ip4_main_t * im = &ip4_main;
2990   ip_lookup_main_t * lm = &im->lookup_main;
2991   vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
2992   u32 n_left_from, n_left_to_next, * from, * to_next;
2993   ip_lookup_next_t next;
2994   u32 cpu_index = os_get_cpu_number();
2995
2996   from = vlib_frame_vector_args (frame);
2997   n_left_from = frame->n_vectors;
2998   next = node->cached_next_index;
2999
3000   while (n_left_from > 0)
3001     {
3002       vlib_get_next_frame (vm, node, next,
3003                            to_next, n_left_to_next);
3004
/* Dual loop: two packets per iteration while at least four remain. */
3005       while (n_left_from >= 4 && n_left_to_next >= 2)
3006         {
3007           vlib_buffer_t * p0, * p1;
3008           u32 pi0, pi1, adj_index0, adj_index1, wrong_next;
3009           ip_lookup_next_t next0, next1;
3010           ip4_header_t * ip0, * ip1;
3011           ip_adjacency_t * adj0, * adj1;
3012           u32 fib_index0, fib_index1;
3013           u32 flow_hash_config0, flow_hash_config1;
3014
3015           /* Prefetch next iteration. */
3016           {
3017             vlib_buffer_t * p2, * p3;
3018
3019             p2 = vlib_get_buffer (vm, from[2]);
3020             p3 = vlib_get_buffer (vm, from[3]);
3021
3022             vlib_prefetch_buffer_header (p2, LOAD);
3023             vlib_prefetch_buffer_header (p3, LOAD);
3024
3025             CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
3026             CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
3027           }
3028
/* Speculative enqueue of both packets to the current next frame. */
3029           pi0 = to_next[0] = from[0];
3030           pi1 = to_next[1] = from[1];
3031
3032           p0 = vlib_get_buffer (vm, pi0);
3033           p1 = vlib_get_buffer (vm, pi1);
3034
3035           ip0 = vlib_buffer_get_current (p0);
3036           ip1 = vlib_buffer_get_current (p1);
3037
/* FIB of the RX interface, unless sw_if_index[VLIB_TX] names one. */
3038           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
3039           fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
3040           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
3041             fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
3042           fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
3043             fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
3044
3045           adj_index0 = ip4_fib_lookup_buffer (im, fib_index0, 
3046                                               &ip0->dst_address, p0);
3047           adj_index1 = ip4_fib_lookup_buffer (im, fib_index1, 
3048                                               &ip1->dst_address, p1);
3049
3050           adj0 = ip_get_adjacency (lm, adj_index0);
3051           adj1 = ip_get_adjacency (lm, adj_index1);
3052
3053           next0 = adj0->lookup_next_index;
3054           next1 = adj1->lookup_next_index;
3055
3056           flow_hash_config0 = 
3057               vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
3058
3059           flow_hash_config1 = 
3060               vec_elt_at_index (im->fibs, fib_index1)->flow_hash_config;
3061
3062           vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash 
3063               (ip0, flow_hash_config0);
3064                                                                   
3065           vnet_buffer (p1)->ip.flow_hash = ip4_compute_flow_hash 
3066               (ip1, flow_hash_config1);
3067
/* n_adj is asserted to be a power of two, so the mask below selects one
   of the n_adj consecutive adjacencies starting at the looked-up index. */
3068           ASSERT (adj0->n_adj > 0);
3069           ASSERT (adj1->n_adj > 0);
3070           ASSERT (is_pow2 (adj0->n_adj));
3071           ASSERT (is_pow2 (adj1->n_adj));
3072           adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
3073           adj_index1 += (vnet_buffer (p1)->ip.flow_hash & (adj1->n_adj - 1));
3074
3075           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
3076           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
3077
/* One packet plus the buffer chain length in bytes, per adjacency. */
3078           if (1) /* $$$$$$ HACK FIXME */
3079           vlib_increment_combined_counter 
3080               (cm, cpu_index, adj_index0, 1,
3081                vlib_buffer_length_in_chain (vm, p0));
3082           if (1) /* $$$$$$ HACK FIXME */
3083           vlib_increment_combined_counter 
3084               (cm, cpu_index, adj_index1, 1,
3085                vlib_buffer_length_in_chain (vm, p1));
3086
3087           from += 2;
3088           to_next += 2;
3089           n_left_to_next -= 2;
3090           n_left_from -= 2;
3091
/* Bit 0: packet 0 was mis-speculated; bit 1: packet 1 was. */
3092           wrong_next = (next0 != next) + 2*(next1 != next);
3093           if (PREDICT_FALSE (wrong_next != 0))
3094             {
3095               switch (wrong_next)
3096                 {
3097                 case 1:
3098                   /* A B A */
/* Only p0 is wrong: move p1 into p0's slot, give back one slot, and
   enqueue p0 to its own next. */
3099                   to_next[-2] = pi1;
3100                   to_next -= 1;
3101                   n_left_to_next += 1;
3102                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
3103                   break;
3104
3105                 case 2:
3106                   /* A A B */
/* Only p1 is wrong: give back its slot and enqueue it to its own next. */
3107                   to_next -= 1;
3108                   n_left_to_next += 1;
3109                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
3110                   break;
3111
3112                 case 3:
3113                   /* A B C */
/* Both are wrong: give back both slots and enqueue each individually. */
3114                   to_next -= 2;
3115                   n_left_to_next += 2;
3116                   vlib_set_next_frame_buffer (vm, node, next0, pi0);
3117                   vlib_set_next_frame_buffer (vm, node, next1, pi1);
3118                   if (next0 == next1)
3119                     {
3120                       /* A B B */
/* Both packets agree on a different next: switch the speculation to it. */
3121                       vlib_put_next_frame (vm, node, next, n_left_to_next);
3122                       next = next1;
3123                       vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
3124                     }
3125                 }
3126             }
3127         }
3128     
/* Single loop: remaining packets, one at a time; same steps as above. */
3129       while (n_left_from > 0 && n_left_to_next > 0)
3130         {
3131           vlib_buffer_t * p0;
3132           ip4_header_t * ip0;
3133           u32 pi0, adj_index0;
3134           ip_lookup_next_t next0;
3135           ip_adjacency_t * adj0;
3136           u32 fib_index0;
3137           u32 flow_hash_config0;
3138
3139           pi0 = from[0];
3140           to_next[0] = pi0;
3141
3142           p0 = vlib_get_buffer (vm, pi0);
3143
3144           ip0 = vlib_buffer_get_current (p0);
3145
3146           fib_index0 = vec_elt (im->fib_index_by_sw_if_index, 
3147                                 vnet_buffer (p0)->sw_if_index[VLIB_RX]);
3148           fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
3149               fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
3150           
3151           adj_index0 = ip4_fib_lookup_buffer (im, fib_index0, 
3152                                               &ip0->dst_address, p0);
3153
3154           adj0 = ip_get_adjacency (lm, adj_index0);
3155
3156           next0 = adj0->lookup_next_index;
3157
3158           flow_hash_config0 = 
3159               vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
3160
3161           vnet_buffer (p0)->ip.flow_hash = 
3162             ip4_compute_flow_hash (ip0, flow_hash_config0);
3163
3164           ASSERT (adj0->n_adj > 0);
3165           ASSERT (is_pow2 (adj0->n_adj));
3166           adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
3167
3168           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
3169
3170           if (1) /* $$$$$$ HACK FIXME */
3171               vlib_increment_combined_counter 
3172                   (cm, cpu_index, adj_index0, 1,
3173                    vlib_buffer_length_in_chain (vm, p0));
3174
3175           from += 1;
3176           to_next += 1;
3177           n_left_to_next -= 1;
3178           n_left_from -= 1;
3179
/* Mis-speculated: take the packet back out of the current frame, switch
   to the frame for next0 and enqueue it there instead. */
3180           if (PREDICT_FALSE (next0 != next))
3181             {
3182               n_left_to_next += 1;
3183               vlib_put_next_frame (vm, node, next, n_left_to_next);
3184               next = next0;
3185               vlib_get_next_frame (vm, node, next,
3186                                    to_next, n_left_to_next);
3187               to_next[0] = pi0;
3188               to_next += 1;
3189               n_left_to_next -= 1;
3190             }
3191         }
3192
3193       vlib_put_next_frame (vm, node, next, n_left_to_next);
3194     }
3195
/* Trace after processing, keyed on the VLIB_TX adjacency index. */
3196   if (node->flags & VLIB_NODE_FLAG_TRACE)
3197       ip4_forward_next_trace(vm, node, frame, VLIB_TX);
3198
3199   return frame->n_vectors;
3200 }
3201
3202 VLIB_REGISTER_NODE (ip4_lookup_multicast_node,static) = {
3203   .function = ip4_lookup_multicast,
3204   .name = "ip4-lookup-multicast",
3205   .vector_size = sizeof (u32),
3206   .sibling_of = "ip4-lookup",
3207   .format_trace = format_ip4_lookup_trace,
3208
3209   .n_next_nodes = 0,
3210 };
3211
3212 VLIB_NODE_FUNCTION_MULTIARCH (ip4_lookup_multicast_node, ip4_lookup_multicast)
3213
3214 VLIB_REGISTER_NODE (ip4_multicast_node,static) = {
3215   .function = ip4_drop,
3216   .name = "ip4-multicast",
3217   .vector_size = sizeof (u32),
3218
3219   .format_trace = format_ip4_forward_next_trace,
3220
3221   .n_next_nodes = 1,
3222   .next_nodes = {
3223     [0] = "error-drop",
3224   },
3225 };
3226
3227 int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0)
3228 {
3229   ip4_main_t * im = &ip4_main;
3230   ip4_fib_mtrie_t * mtrie0;
3231   ip4_fib_mtrie_leaf_t leaf0;
3232   u32 adj_index0;
3233     
3234   mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
3235
3236   leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
3237   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 0);
3238   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 1);
3239   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
3240   leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);
3241   
3242   /* Handle default route. */
3243   leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
3244   
3245   adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
3246   
3247   return adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
3248                                                   a, 
3249                                                   /* no_default_route */ 0);
3250 }
3251  
3252 static clib_error_t *
3253 test_lookup_command_fn (vlib_main_t * vm,
3254                         unformat_input_t * input,
3255                         vlib_cli_command_t * cmd)
3256 {
3257   u32 table_id = 0;
3258   f64 count = 1;
3259   u32 n;
3260   int i;
3261   ip4_address_t ip4_base_address;
3262   u64 errors = 0;
3263
3264   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
3265       if (unformat (input, "table %d", &table_id))
3266         ;
3267       else if (unformat (input, "count %f", &count))
3268         ;
3269
3270       else if (unformat (input, "%U",
3271                          unformat_ip4_address, &ip4_base_address))
3272         ;
3273       else
3274         return clib_error_return (0, "unknown input `%U'",
3275                                   format_unformat_error, input);
3276   }
3277
3278   n = count;
3279
3280   for (i = 0; i < n; i++)
3281     {
3282       if (!ip4_lookup_validate (&ip4_base_address, table_id))
3283         errors++;
3284
3285       ip4_base_address.as_u32 = 
3286         clib_host_to_net_u32 (1 + 
3287                               clib_net_to_host_u32 (ip4_base_address.as_u32));
3288     }
3289
3290   if (errors) 
3291     vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
3292   else
3293     vlib_cli_output (vm, "No errors in %d lookups\n", n);
3294
3295   return 0;
3296 }
3297
3298 VLIB_CLI_COMMAND (lookup_test_command, static) = {
3299     .path = "test lookup",
3300     .short_help = "test lookup",
3301     .function = test_lookup_command_fn,
3302 };
3303
3304 int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
3305 {
3306   ip4_main_t * im4 = &ip4_main;
3307   ip4_fib_t * fib;
3308   uword * p = hash_get (im4->fib_index_by_table_id, table_id);
3309
3310   if (p == 0)
3311     return VNET_API_ERROR_NO_SUCH_FIB;
3312
3313   fib = vec_elt_at_index (im4->fibs, p[0]);
3314
3315   fib->flow_hash_config = flow_hash_config;
3316   return 0;
3317 }
3318  
3319 static clib_error_t *
3320 set_ip_flow_hash_command_fn (vlib_main_t * vm,
3321                              unformat_input_t * input,
3322                              vlib_cli_command_t * cmd)
3323 {
3324   int matched = 0;
3325   u32 table_id = 0;
3326   u32 flow_hash_config = 0;
3327   int rv;
3328
3329   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
3330     if (unformat (input, "table %d", &table_id))
3331       matched = 1;
3332 #define _(a,v) \
3333     else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
3334     foreach_flow_hash_bit
3335 #undef _
3336     else break;
3337   }
3338   
3339   if (matched == 0)
3340     return clib_error_return (0, "unknown input `%U'",
3341                               format_unformat_error, input);
3342   
3343   rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
3344   switch (rv)
3345     {
3346     case 0:
3347       break;
3348       
3349     case VNET_API_ERROR_NO_SUCH_FIB:
3350       return clib_error_return (0, "no such FIB table %d", table_id);
3351       
3352     default:
3353       clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
3354       break;
3355     }
3356   
3357   return 0;
3358 }
3359  
3360 VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) = {
3361   .path = "set ip flow-hash",
3362   .short_help = 
3363   "set ip table flow-hash table <fib-id> src dst sport dport proto reverse",
3364   .function = set_ip_flow_hash_command_fn,
3365 };
3366  
3367 int vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index, 
3368                                  u32 table_index)
3369 {
3370   vnet_main_t * vnm = vnet_get_main();
3371   vnet_interface_main_t * im = &vnm->interface_main;
3372   ip4_main_t * ipm = &ip4_main;
3373   ip_lookup_main_t * lm = &ipm->lookup_main;
3374   vnet_classify_main_t * cm = &vnet_classify_main;
3375
3376   if (pool_is_free_index (im->sw_interfaces, sw_if_index))
3377     return VNET_API_ERROR_NO_MATCHING_INTERFACE;
3378
3379   if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
3380     return VNET_API_ERROR_NO_SUCH_ENTRY;
3381
3382   vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
3383   lm->classify_table_index_by_sw_if_index [sw_if_index] = table_index;
3384
3385   return 0;
3386 }
3387
3388 static clib_error_t *
3389 set_ip_classify_command_fn (vlib_main_t * vm,
3390                             unformat_input_t * input,
3391                             vlib_cli_command_t * cmd)
3392 {
3393   u32 table_index = ~0;
3394   int table_index_set = 0;
3395   u32 sw_if_index = ~0;
3396   int rv;
3397   
3398   while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
3399     if (unformat (input, "table-index %d", &table_index))
3400       table_index_set = 1;
3401     else if (unformat (input, "intfc %U", unformat_vnet_sw_interface, 
3402                        vnet_get_main(), &sw_if_index))
3403       ;
3404     else
3405       break;
3406   }
3407       
3408   if (table_index_set == 0)
3409     return clib_error_return (0, "classify table-index must be specified");
3410
3411   if (sw_if_index == ~0)
3412     return clib_error_return (0, "interface / subif must be specified");
3413
3414   rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
3415
3416   switch (rv)
3417     {
3418     case 0:
3419       break;
3420
3421     case VNET_API_ERROR_NO_MATCHING_INTERFACE:
3422       return clib_error_return (0, "No such interface");
3423
3424     case VNET_API_ERROR_NO_SUCH_ENTRY:
3425       return clib_error_return (0, "No such classifier table");
3426     }
3427   return 0;
3428 }
3429
3430 VLIB_CLI_COMMAND (set_ip_classify_command, static) = {
3431     .path = "set ip classify",
3432     .short_help = 
3433     "set ip classify intfc <int> table-index <index>",
3434     .function = set_ip_classify_command_fn,
3435 };
3436