Adjust buffer data offset based on rte_mbuf data_off
vnet/vnet/devices/dpdk/node.c
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <vnet/vnet.h>
#include <vppinfra/vec.h>
#include <vppinfra/error.h>
#include <vppinfra/format.h>
#include <vppinfra/xxhash.h>

#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/dpdk/dpdk.h>
#include <vnet/classify/vnet_classify.h>
#include <vnet/mpls-gre/packet.h>

#include "dpdk_priv.h"

#ifndef MAX
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif

#ifndef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif

/*
 * At least in certain versions of ESXi, vmware e1000's don't honor the
 * "strip rx CRC" bit. Set this flag to work around that bug FOR UNIT TEST ONLY.
 *
 * If wireshark complains like so:
 *
 * "Frame check sequence: 0x00000000 [incorrect, should be <hex-num>]"
 * and you're using ESXi emulated e1000's, set this flag FOR UNIT TEST ONLY.
 *
 * Note: do NOT check in this file with this workaround enabled! You'll lose
 * actual data from e.g. 10xGE interfaces. The extra 4 bytes annoy
 * wireshark, but they're harmless...
 */
#define VMWARE_LENGTH_BUG_WORKAROUND 0

typedef struct {
  u32 cached_next_index;

  /* convenience variables */
  vlib_main_t * vlib_main;
  vnet_main_t * vnet_main;
} handoff_dispatch_main_t;

typedef struct {
  u32 buffer_index;
  u32 next_index;
  u32 sw_if_index;
} handoff_dispatch_trace_t;

/* packet trace format function */
static u8 * format_handoff_dispatch_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  handoff_dispatch_trace_t * t = va_arg (*args, handoff_dispatch_trace_t *);

  s = format (s, "HANDOFF_DISPATCH: sw_if_index %d next_index %d buffer 0x%x",
      t->sw_if_index,
      t->next_index,
      t->buffer_index);
  return s;
}

handoff_dispatch_main_t handoff_dispatch_main;

vlib_node_registration_t handoff_dispatch_node;

#define foreach_handoff_dispatch_error \
_(EXAMPLE, "example packets")

typedef enum {
#define _(sym,str) HANDOFF_DISPATCH_ERROR_##sym,
  foreach_handoff_dispatch_error
#undef _
  HANDOFF_DISPATCH_N_ERROR,
} handoff_dispatch_error_t;

static char * handoff_dispatch_error_strings[] = {
#define _(sym,string) string,
  foreach_handoff_dispatch_error
#undef _
};

static inline
void vlib_put_handoff_queue_elt (vlib_frame_queue_elt_t * hf)
{
  CLIB_MEMORY_BARRIER();
  hf->valid = 1;
}
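
/*
 * Consumers of the handoff queue poll the valid flag before reading the
 * frame, pairing with the write barrier above. Roughly (illustrative
 * sketch; the actual dequeue lives in the worker dispatch loop):
 *
 *   while (!elt->valid)
 *     ;
 *   CLIB_MEMORY_BARRIER ();
 *   process elt->buffer_index[0 .. elt->n_vectors-1]
 */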

static uword
handoff_dispatch_node_fn (vlib_main_t * vm,
                  vlib_node_runtime_t * node,
                  vlib_frame_t * frame)
{
  u32 n_left_from, * from, * to_next;
  dpdk_rx_next_t next_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index,
                           to_next, n_left_to_next);

      while (n_left_from >= 4 && n_left_to_next >= 2)
        {
          u32 bi0, bi1;
          vlib_buffer_t * b0, * b1;
          u32 next0, next1;
          u32 sw_if_index0, sw_if_index1;

          /* Prefetch next iteration. */
          {
            vlib_buffer_t * p2, * p3;

            p2 = vlib_get_buffer (vm, from[2]);
            p3 = vlib_get_buffer (vm, from[3]);

            vlib_prefetch_buffer_header (p2, LOAD);
            vlib_prefetch_buffer_header (p3, LOAD);
          }

          /* speculatively enqueue b0 and b1 to the current next frame */
          to_next[0] = bi0 = from[0];
          to_next[1] = bi1 = from[1];
          from += 2;
          to_next += 2;
          n_left_from -= 2;
          n_left_to_next -= 2;

          b0 = vlib_get_buffer (vm, bi0);
          b1 = vlib_get_buffer (vm, bi1);

          next0 = vnet_buffer(b0)->io_handoff.next_index;
          next1 = vnet_buffer(b1)->io_handoff.next_index;

          if (PREDICT_FALSE(vm->trace_main.trace_active_hint))
            {
            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
              {
                vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
                handoff_dispatch_trace_t *t =
                  vlib_add_trace (vm, node, b0, sizeof (*t));
                sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
                t->sw_if_index = sw_if_index0;
                t->next_index = next0;
                t->buffer_index = bi0;
              }
            if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
              {
                vlib_trace_buffer (vm, node, next1, b1, /* follow_chain */ 0);
                handoff_dispatch_trace_t *t =
                  vlib_add_trace (vm, node, b1, sizeof (*t));
                sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
                t->sw_if_index = sw_if_index1;
                t->next_index = next1;
                t->buffer_index = bi1;
              }
            }

          /* verify speculative enqueues, maybe switch current next frame */
          vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, bi1, next0, next1);
        }

      while (n_left_from > 0 && n_left_to_next > 0)
        {
          u32 bi0;
          vlib_buffer_t * b0;
          u32 next0;
          u32 sw_if_index0;

          /* speculatively enqueue b0 to the current next frame */
          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);

          next0 = vnet_buffer(b0)->io_handoff.next_index;

          if (PREDICT_FALSE(vm->trace_main.trace_active_hint))
            {
            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
              {
                vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
                handoff_dispatch_trace_t *t =
                  vlib_add_trace (vm, node, b0, sizeof (*t));
                sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
                t->sw_if_index = sw_if_index0;
                t->next_index = next0;
                t->buffer_index = bi0;
              }
            }

          /* verify speculative enqueue, maybe switch current next frame */
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}

VLIB_REGISTER_NODE (handoff_dispatch_node) = {
  .function = handoff_dispatch_node_fn,
  .name = "handoff-dispatch",
  .vector_size = sizeof (u32),
  .format_trace = format_handoff_dispatch_trace,
  .type = VLIB_NODE_TYPE_INTERNAL,
  .flags = VLIB_NODE_FLAG_IS_HANDOFF,

  .n_errors = ARRAY_LEN(handoff_dispatch_error_strings),
  .error_strings = handoff_dispatch_error_strings,

  .n_next_nodes = DPDK_RX_N_NEXT,

  .next_nodes = {
        [DPDK_RX_NEXT_DROP] = "error-drop",
        [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
        [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input",
        [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
        [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
  },
};

VLIB_NODE_FUNCTION_MULTIARCH (handoff_dispatch_node, handoff_dispatch_node_fn)

clib_error_t *handoff_dispatch_init (vlib_main_t *vm)
{
  handoff_dispatch_main_t * mp = &handoff_dispatch_main;

  mp->vlib_main = vm;
  mp->vnet_main = &vnet_main;

  return 0;
}

VLIB_INIT_FUNCTION (handoff_dispatch_init);

u32 dpdk_get_handoff_node_index (void)
{
  return handoff_dispatch_node.index;
}

static char * dpdk_error_strings[] = {
#define _(n,s) s,
    foreach_dpdk_error
#undef _
};

always_inline void
dpdk_rx_next_and_error_from_mb_flags_x1 (dpdk_device_t *xd, struct rte_mbuf *mb,
                                         vlib_buffer_t *b0,
                                         u8 * next0, u8 * error0)
{
  u8 is0_ip4, is0_ip6, is0_mpls, n0;
  uint64_t mb_flags = mb->ol_flags; /* ol_flags is 64-bit; don't truncate */

  if (PREDICT_FALSE(mb_flags & (
#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
       PKT_EXT_RX_PKT_ERROR | PKT_EXT_RX_BAD_FCS   |
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
        PKT_RX_IP_CKSUM_BAD  | PKT_RX_L4_CKSUM_BAD
    )))
    {
      /* some error was flagged. determine the drop reason */
      n0 = DPDK_RX_NEXT_DROP;
      *error0 =
#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
        (mb_flags & PKT_EXT_RX_PKT_ERROR) ? DPDK_ERROR_RX_PACKET_ERROR :
        (mb_flags & PKT_EXT_RX_BAD_FCS) ? DPDK_ERROR_RX_BAD_FCS :
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
        (mb_flags & PKT_RX_IP_CKSUM_BAD) ? DPDK_ERROR_IP_CHECKSUM_ERROR :
        (mb_flags & PKT_RX_L4_CKSUM_BAD) ? DPDK_ERROR_L4_CHECKSUM_ERROR :
        DPDK_ERROR_NONE;
    }
  else
    {
      *error0 = DPDK_ERROR_NONE;
      if (xd->per_interface_next_index != ~0)
        n0 = xd->per_interface_next_index;
      else if (mb_flags & PKT_RX_VLAN_PKT)
        n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
      else
        {
          n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
          is0_ip4 = RTE_ETH_IS_IPV4_HDR(mb->packet_type) != 0;
#else
          is0_ip4 = (mb_flags & (PKT_RX_IPV4_HDR | PKT_RX_IPV4_HDR_EXT)) != 0;
#endif

          if (PREDICT_TRUE(is0_ip4))
            n0 = DPDK_RX_NEXT_IP4_INPUT;
          else
            {
#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
              is0_ip6 = RTE_ETH_IS_IPV6_HDR(mb->packet_type) != 0;
#else
              is0_ip6 =
                      (mb_flags & (PKT_RX_IPV6_HDR | PKT_RX_IPV6_HDR_EXT)) != 0;
#endif
              if (PREDICT_TRUE(is0_ip6))
                n0 = DPDK_RX_NEXT_IP6_INPUT;
              else
                {
                  ethernet_header_t *h0 = (ethernet_header_t *) b0->data;
                  is0_mpls = (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST));
                  n0 = is0_mpls ? DPDK_RX_NEXT_MPLS_INPUT : n0;
                }
            }
        }
    }
  *next0 = n0;
}
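
/*
 * Summary of the mapping above (informal):
 *
 *   any rx error flag set          -> DPDK_RX_NEXT_DROP + specific counter
 *   xd->per_interface_next_index   -> forced next index, if set
 *   VLAN tagged (PKT_RX_VLAN_PKT)  -> ethernet-input, so the tag is parsed
 *   IPv4 / IPv6 per mbuf metadata  -> ip4 / ip6 input
 *   ethertype MPLS unicast         -> mpls-gre-input
 *   everything else                -> ethernet-input
 */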

void dpdk_rx_trace (dpdk_main_t * dm,
                    vlib_node_runtime_t * node,
                    dpdk_device_t * xd,
                    u16 queue_id,
                    u32 * buffers,
                    uword n_buffers)
{
  vlib_main_t * vm = vlib_get_main();
  u32 * b, n_left;
  u8 next0;

  n_left = n_buffers;
  b = buffers;

  while (n_left >= 1)
    {
      u32 bi0;
      vlib_buffer_t * b0;
      dpdk_rx_dma_trace_t * t0;
      struct rte_mbuf *mb;
      u8 error0;

      bi0 = b[0];
      n_left -= 1;

      b0 = vlib_get_buffer (vm, bi0);
      mb = rte_mbuf_from_vlib_buffer(b0);
      dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
                                               &next0, &error0);
      vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
      t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
      t0->queue_index = queue_id;
      t0->device_index = xd->device_index;
      t0->buffer_index = bi0;

      clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
      clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
      clib_memcpy (t0->buffer.pre_data, b0->data, sizeof (t0->buffer.pre_data));

#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
      /*
       * Clear overloaded TX offload flags when a DPDK driver
       * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
       */
      mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */

      b += 1;
    }
}

/*
 * dpdk_efd_update_counters()
 * Update EFD (early-fast-discard) counters
 */
void dpdk_efd_update_counters (dpdk_device_t *xd,
                               u32 n_buffers,
                               u16 enabled)
{
  if (enabled & DPDK_EFD_MONITOR_ENABLED)
    {
      u64 now = clib_cpu_time_now();
      if (xd->efd_agent.last_poll_time > 0)
        {
          u64 elapsed_time = (now - xd->efd_agent.last_poll_time);
          if (elapsed_time > xd->efd_agent.max_poll_delay)
            xd->efd_agent.max_poll_delay = elapsed_time;
        }
      xd->efd_agent.last_poll_time = now;
    }

  xd->efd_agent.total_packet_cnt += n_buffers;
  xd->efd_agent.last_burst_sz = n_buffers;

  if (n_buffers > xd->efd_agent.max_burst_sz)
    xd->efd_agent.max_burst_sz = n_buffers;

  if (PREDICT_FALSE(n_buffers == VLIB_FRAME_SIZE))
    {
      xd->efd_agent.full_frames_cnt++;
      xd->efd_agent.consec_full_frames_cnt++;
    }
  else
    {
      xd->efd_agent.consec_full_frames_cnt = 0;
    }
}
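
/*
 * Example (illustrative): with dm->efd.consec_full_frames_hi_thresh == 3,
 * three back-to-back bursts of exactly VLIB_FRAME_SIZE packets raise
 * consec_full_frames_cnt to the threshold; the input paths below then
 * also check the device queue depth before arming efd_discard_burst.
 */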

/* is_efd_discardable()
 *   returns a non-zero DPDK error if the packet meets early-fast-discard
 *   criteria, zero otherwise
 */
u32 is_efd_discardable (vlib_thread_main_t *tm,
                        vlib_buffer_t * b0,
                        struct rte_mbuf *mb)
{
  ethernet_header_t *eh = (ethernet_header_t *) b0->data;

  if (eh->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))
    {
      ip4_header_t *ipv4 =
          (ip4_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
      u8 pkt_prec = (ipv4->tos >> 5);

      return (tm->efd.ip_prec_bitmap & (1 << pkt_prec) ?
                  DPDK_ERROR_IPV4_EFD_DROP_PKTS : DPDK_ERROR_NONE);
    }
  else if (eh->type == clib_host_to_net_u16(ETHERNET_TYPE_IP6))
    {
      ip6_header_t *ipv6 =
          (ip6_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
      /* fields are in network byte order; convert before shifting */
      u8 pkt_tclass = ((clib_net_to_host_u32(
          ipv6->ip_version_traffic_class_and_flow_label) >> 20) & 0xff);

      return (tm->efd.ip_prec_bitmap & (1 << pkt_tclass) ?
                  DPDK_ERROR_IPV6_EFD_DROP_PKTS : DPDK_ERROR_NONE);
    }
  else if (eh->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST))
    {
      mpls_unicast_header_t *mpls =
          (mpls_unicast_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
      u8 pkt_exp = ((clib_net_to_host_u32(mpls->label_exp_s_ttl) >> 9) & 0x07);

      return (tm->efd.mpls_exp_bitmap & (1 << pkt_exp) ?
                  DPDK_ERROR_MPLS_EFD_DROP_PKTS : DPDK_ERROR_NONE);
    }
  else if ((eh->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ||
           (eh->type == clib_host_to_net_u16(ETHERNET_TYPE_DOT1AD)))
    {
      ethernet_vlan_header_t *vlan =
          (ethernet_vlan_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
      u8 pkt_cos = ((clib_net_to_host_u16(vlan->priority_cfi_and_id) >> 13) & 0x07);

      return (tm->efd.vlan_cos_bitmap & (1 << pkt_cos) ?
                  DPDK_ERROR_VLAN_EFD_DROP_PKTS : DPDK_ERROR_NONE);
    }

  return DPDK_ERROR_NONE;
}
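
/*
 * Worked example (values illustrative): with tm->efd.ip_prec_bitmap ==
 * 0x3f, IPv4 packets with precedence 0..5 are discardable while 6..7
 * (network control) survive:
 *
 *   tos 0xb8 -> pkt_prec 5 -> bit 5 set   -> DPDK_ERROR_IPV4_EFD_DROP_PKTS
 *   tos 0xc0 -> pkt_prec 6 -> bit 6 clear -> DPDK_ERROR_NONE
 */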

/*
 * This function is used when there are no worker threads.
 * The main thread performs IO and forwards the packets.
 */
static inline u32 dpdk_device_input ( dpdk_main_t * dm,
                                      dpdk_device_t * xd,
                                      vlib_node_runtime_t * node,
                                      u32 cpu_index,
                                      u16 queue_id,
                                      int use_efd)
{
  u32 n_buffers;
  u32 next_index = DPDK_RX_NEXT_ETHERNET_INPUT;
  u32 n_left_to_next, * to_next;
  u32 mb_index;
  vlib_main_t * vm = vlib_get_main();
  uword n_rx_bytes = 0;
  u32 n_trace, trace_cnt __attribute__((unused));
  vlib_buffer_free_list_t * fl;
  u8 efd_discard_burst = 0;
  u32 buffer_flags_template;

  if (xd->admin_up == 0)
    return 0;

  n_buffers = dpdk_rx_burst(dm, xd, queue_id);

  if (n_buffers == 0)
    {
      /* check if EFD (dpdk) is enabled */
      if (PREDICT_FALSE(use_efd && dm->efd.enabled))
        {
          /* reset a few stats */
          xd->efd_agent.last_poll_time = 0;
          xd->efd_agent.last_burst_sz = 0;
        }
      return 0;
    }

  buffer_flags_template = dm->buffer_flags_template;

  vec_reset_length (xd->d_trace_buffers);
  trace_cnt = n_trace = vlib_get_trace_count (vm, node);

  fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);

  /*
   * DAW-FIXME: VMXNET3 device stop/start doesn't work,
   * therefore fake the stop in the dpdk driver by
   * silently dropping all of the incoming pkts instead of
   * stopping the driver / hardware.
   */
  if (PREDICT_FALSE(xd->admin_up != 1))
    {
      for (mb_index = 0; mb_index < n_buffers; mb_index++)
        rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);

      return 0;
    }

  /* Check for congestion if EFD (Early-Fast-Discard) is enabled
   * in any mode (e.g. dpdk, monitor, or drop_all)
   */
  if (PREDICT_FALSE(use_efd && dm->efd.enabled))
    {
      /* update EFD counters */
      dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);

      if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
        {
          /* discard all received packets */
          for (mb_index = 0; mb_index < n_buffers; mb_index++)
            rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);

          xd->efd_agent.discard_cnt += n_buffers;
          increment_efd_drop_counter(vm,
                                     DPDK_ERROR_VLAN_EFD_DROP_PKTS,
                                     n_buffers);

          return 0;
        }

      if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
                        dm->efd.consec_full_frames_hi_thresh))
        {
          u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
                                                       queue_id);
          if (device_queue_sz >= dm->efd.queue_hi_thresh)
            {
              /* dpdk device queue has reached the critical threshold */
              xd->efd_agent.congestion_cnt++;

              /* apply EFD to packets from the burst */
              efd_discard_burst = 1;
            }
        }
    }

  mb_index = 0;

  while (n_buffers > 0)
    {
      u32 bi0;
      u8 next0, error0;
      u32 l3_offset0;
      vlib_buffer_t * b0, * b_seg, * b_chain = 0;
      u32 cntr_type;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_buffers > 0 && n_left_to_next > 0)
        {
          u8 nb_seg = 1;
          struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
          struct rte_mbuf *mb_seg = mb->next;

          if (PREDICT_TRUE(n_buffers > 2))
          {
              struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
              vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
              CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, STORE);
              CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
          }

          ASSERT(mb);

          b0 = vlib_buffer_from_rte_mbuf(mb);

          /* check whether EFD is looking for packets to discard */
          if (PREDICT_FALSE(efd_discard_burst))
            {
              vlib_thread_main_t * tm = vlib_get_thread_main();

              if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
                {
                  rte_pktmbuf_free(mb);
                  xd->efd_agent.discard_cnt++;
                  increment_efd_drop_counter(vm,
                                             cntr_type,
                                             1);
                  n_buffers--;
                  mb_index++;
                  continue;
                }
            }

          /* Prefetch one next segment if it exists. */
          if (PREDICT_FALSE(mb->nb_segs > 1))
            {
              struct rte_mbuf *pfmb = mb->next;
              vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
              CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
              CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
              b_chain = b0;
            }

          vlib_buffer_init_for_free_list (b0, fl);
          b0->clone_count = 0;

          bi0 = vlib_get_buffer_index (vm, b0);

          to_next[0] = bi0;
          to_next++;
          n_left_to_next--;

          dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
                                                   &next0, &error0);
#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
          /*
           * Clear overloaded TX offload flags when a DPDK driver
           * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
           */

          if (PREDICT_TRUE(trace_cnt == 0))
            mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
          else
            trace_cnt--;
#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */

          b0->error = node->errors[error0];

          l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
                         next0 == DPDK_RX_NEXT_IP6_INPUT ||
                         next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
                        sizeof (ethernet_header_t) : 0);

          b0->current_data = l3_offset0;
          /* Some drivers like fm10k receive frames with
             mb->data_off > RTE_PKTMBUF_HEADROOM */
          b0->current_data += mb->data_off - RTE_PKTMBUF_HEADROOM;
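          /* Example (values illustrative): a driver that sets
             mb->data_off == 192 while RTE_PKTMBUF_HEADROOM == 128 leaves
             the frame 64 bytes past b0->data, so current_data becomes
             l3_offset0 + 64. */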
          b0->current_length = mb->data_len - l3_offset0;

          b0->flags = buffer_flags_template;

          if (VMWARE_LENGTH_BUG_WORKAROUND)
              b0->current_length -= 4;

          vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
          vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
          n_rx_bytes += mb->pkt_len;

          /* Process subsequent segments of multi-segment packets */
          while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
            {
              ASSERT(mb_seg != 0);

              b_seg = vlib_buffer_from_rte_mbuf(mb_seg);
              vlib_buffer_init_for_free_list (b_seg, fl);
              b_seg->clone_count = 0;

              ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
              ASSERT(b_seg->current_data == 0);

              /*
               * The driver (e.g. virtio) may not put the packet data at the start
               * of the segment, so don't assume b_seg->current_data == 0 is correct.
               */
              b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;

              b_seg->current_length = mb_seg->data_len;
              b0->total_length_not_including_first_buffer +=
                mb_seg->data_len;

              b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
              b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);

              b_chain = b_seg;
              mb_seg = mb_seg->next;
              nb_seg++;
            }

          /*
           * Turn this on if you run into
           * "bad monkey" contexts, and you want to know exactly
           * which nodes they've visited... See main.c...
           */
          VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);

          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);
          if (PREDICT_FALSE (n_trace > mb_index))
            vec_add1 (xd->d_trace_buffers, bi0);
          n_buffers--;
          mb_index++;
        }
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
    {
      dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers,
                     vec_len (xd->d_trace_buffers));
      vlib_set_trace_count (vm, node, n_trace - vec_len (xd->d_trace_buffers));
    }

  vlib_increment_combined_counter
    (vnet_get_main()->interface_main.combined_sw_if_counters
     + VNET_INTERFACE_COUNTER_RX,
     cpu_index,
     xd->vlib_sw_if_index,
     mb_index, n_rx_bytes);

  dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
  dw->aggregate_rx_packets += mb_index;

  return mb_index;
}

static inline void poll_rate_limit(dpdk_main_t * dm)
{
  /* Limit the poll rate by sleeping for N msec between polls */
  if (PREDICT_FALSE (dm->poll_sleep != 0))
  {
    struct timespec ts, tsrem;

    ts.tv_sec = 0;
    ts.tv_nsec = 1000*1000*dm->poll_sleep; /* poll_sleep is in msec */

    while (nanosleep(&ts, &tsrem) < 0)
      {
        ts = tsrem;
      }
  }
}
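
/*
 * Example (illustrative): with dm->poll_sleep == 10, each pass through
 * the input node sleeps ~10 msec, capping the device poll rate at roughly
 * 100 polls/sec per thread. The nanosleep() retry loop resumes from the
 * remaining time if the sleep is interrupted by a signal.
 */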

static uword
dpdk_input (vlib_main_t * vm,
            vlib_node_runtime_t * node,
            vlib_frame_t * f)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  uword n_rx_packets = 0;
  dpdk_device_and_queue_t * dq;
  u32 cpu_index = os_get_cpu_number();

  /*
   * Poll all devices on this cpu for input/interrupts.
   */
  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
    {
      xd = vec_elt_at_index(dm->devices, dq->device);
      ASSERT(dq->queue_id == 0);
      n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, 0, 0);
    }

  poll_rate_limit(dm);

  return n_rx_packets;
}

uword
dpdk_input_rss (vlib_main_t * vm,
      vlib_node_runtime_t * node,
      vlib_frame_t * f)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  uword n_rx_packets = 0;
  dpdk_device_and_queue_t * dq;
  u32 cpu_index = os_get_cpu_number();

  /*
   * Poll all devices on this cpu for input/interrupts.
   */
  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
    {
      xd = vec_elt_at_index(dm->devices, dq->device);
      n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, 0);
    }

  poll_rate_limit(dm);

  return n_rx_packets;
}

uword
dpdk_input_efd (vlib_main_t * vm,
      vlib_node_runtime_t * node,
      vlib_frame_t * f)
{
  dpdk_main_t * dm = &dpdk_main;
  dpdk_device_t * xd;
  uword n_rx_packets = 0;
  dpdk_device_and_queue_t * dq;
  u32 cpu_index = os_get_cpu_number();

  /*
   * Poll all devices on this cpu for input/interrupts.
   */
  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
    {
      xd = vec_elt_at_index(dm->devices, dq->device);
      n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, 1);
    }

  poll_rate_limit(dm);

  return n_rx_packets;
}


VLIB_REGISTER_NODE (dpdk_input_node) = {
  .function = dpdk_input,
  .type = VLIB_NODE_TYPE_INPUT,
  .name = "dpdk-input",

  /* Will be enabled if/when hardware is detected. */
  .state = VLIB_NODE_STATE_DISABLED,

  .format_buffer = format_ethernet_header_with_length,
  .format_trace = format_dpdk_rx_dma_trace,

  .n_errors = DPDK_N_ERROR,
  .error_strings = dpdk_error_strings,

  .n_next_nodes = DPDK_RX_N_NEXT,
  .next_nodes = {
    [DPDK_RX_NEXT_DROP] = "error-drop",
    [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
    [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
    [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
    [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
  },
};


/* handle dpdk_input_rss alternative function */
VLIB_NODE_FUNCTION_MULTIARCH_CLONE(dpdk_input)
VLIB_NODE_FUNCTION_MULTIARCH_CLONE(dpdk_input_rss)
VLIB_NODE_FUNCTION_MULTIARCH_CLONE(dpdk_input_efd)

/* these macros define dpdk_input{,_rss,_efd}_multiarch_select() */
CLIB_MULTIARCH_SELECT_FN(dpdk_input);
CLIB_MULTIARCH_SELECT_FN(dpdk_input_rss);
CLIB_MULTIARCH_SELECT_FN(dpdk_input_efd);

/*
 * Override the next nodes for the dpdk input nodes.
 * Must be invoked prior to VLIB_INIT_FUNCTION calls.
 */
void dpdk_set_next_node (dpdk_rx_next_t next, char *name)
{
  vlib_node_registration_t *r = &dpdk_input_node;
  vlib_node_registration_t *r_io = &dpdk_io_input_node;
  vlib_node_registration_t *r_handoff = &handoff_dispatch_node;

  switch (next)
    {
    case DPDK_RX_NEXT_IP4_INPUT:
    case DPDK_RX_NEXT_IP6_INPUT:
    case DPDK_RX_NEXT_MPLS_INPUT:
    case DPDK_RX_NEXT_ETHERNET_INPUT:
      r->next_nodes[next] = name;
      r_io->next_nodes[next] = name;
      r_handoff->next_nodes[next] = name;
      break;

    default:
      clib_warning ("%s: illegal next %d\n", __FUNCTION__, next);
      break;
    }
}

inline vlib_frame_queue_elt_t *
vlib_get_handoff_queue_elt (u32 vlib_worker_index)
{
  vlib_frame_queue_t *fq;
  vlib_frame_queue_elt_t *elt;
  u64 new_tail;

  fq = vlib_frame_queues[vlib_worker_index];
  ASSERT (fq);

  new_tail = __sync_add_and_fetch (&fq->tail, 1);

  /* Wait until a ring slot is available */
  while (new_tail >= fq->head_hint + fq->nelts)
      vlib_worker_thread_barrier_check ();

  elt = fq->elts + (new_tail & (fq->nelts-1));

  /* this would be very bad... */
  while (elt->valid)
    ;

  elt->msg_type = VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME;
  elt->last_n_vectors = elt->n_vectors = 0;

  return elt;
}
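
/*
 * Producer-side usage, as in dpdk_io_thread() below (sketch):
 *
 *   elt = vlib_get_handoff_queue_elt (worker_index);
 *   elt->buffer_index[elt->n_vectors++] = bi0;
 *   ...
 *   vlib_put_handoff_queue_elt (elt);   (marks the element valid)
 */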

static inline vlib_frame_queue_elt_t *
dpdk_get_handoff_queue_elt (
    u32 vlib_worker_index,
    vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index)
{
  vlib_frame_queue_elt_t *elt;

  if (handoff_queue_elt_by_worker_index [vlib_worker_index])
      return handoff_queue_elt_by_worker_index [vlib_worker_index];

  elt = vlib_get_handoff_queue_elt (vlib_worker_index);

  handoff_queue_elt_by_worker_index [vlib_worker_index] = elt;

  return elt;
}

static inline vlib_frame_queue_t *
is_vlib_handoff_queue_congested (
    u32 vlib_worker_index,
    u32 queue_hi_thresh,
    vlib_frame_queue_t ** handoff_queue_by_worker_index)
{
  vlib_frame_queue_t *fq;

  fq = handoff_queue_by_worker_index [vlib_worker_index];
  if (fq != (vlib_frame_queue_t *)(~0))
      return fq;

  fq = vlib_frame_queues[vlib_worker_index];
  ASSERT (fq);

  if (PREDICT_FALSE(fq->tail >= (fq->head_hint + queue_hi_thresh))) {
    /* a valid entry in the array will indicate the queue has reached
     * the specified threshold and is congested
     */
    handoff_queue_by_worker_index [vlib_worker_index] = fq;
    fq->enqueue_full_events++;
    return fq;
  }

  return NULL;
}

static inline u64 ipv4_get_key (ip4_header_t *ip)
{
   u64  hash_key;

   hash_key = *((u64*)(&ip->address_pair)) ^ ip->protocol;

   return hash_key;
}

static inline u64 ipv6_get_key (ip6_header_t *ip)
{
   u64  hash_key;

   hash_key = ip->src_address.as_u64[0] ^
              rotate_left(ip->src_address.as_u64[1],13) ^
              rotate_left(ip->dst_address.as_u64[0],26) ^
              rotate_left(ip->dst_address.as_u64[1],39) ^
              ip->protocol;

   return hash_key;
}
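
/*
 * The rotations stagger the four address words so that symmetric flows
 * (src/dst swapped) and addresses with long runs of identical bits do
 * not XOR-cancel into the same key before clib_xxhash() is applied.
 */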


#define MPLS_BOTTOM_OF_STACK_BIT_MASK   0x00000100U
#define MPLS_LABEL_MASK                 0xFFFFF000U

static inline u64 mpls_get_key (mpls_unicast_header_t *m)
{
   u64                     hash_key;
   u8                      ip_ver;


   /* find the bottom of the MPLS label stack. */
   if (PREDICT_TRUE(m->label_exp_s_ttl &
                    clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) {
       goto bottom_lbl_found;
   }
   m++;

   if (PREDICT_TRUE(m->label_exp_s_ttl &
                    clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) {
       goto bottom_lbl_found;
   }
   m++;

   if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
       goto bottom_lbl_found;
   }
   m++;

   if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
       goto bottom_lbl_found;
   }
   m++;

   if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
       goto bottom_lbl_found;
   }

   /* the bottom label was not found - use the last label */
   hash_key = m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK);

   return hash_key;


bottom_lbl_found:
   m++;
   ip_ver = (*((u8 *)m) >> 4);

   /* find out if it is IPV4 or IPV6 header */
   if (PREDICT_TRUE(ip_ver == 4)) {
       hash_key = ipv4_get_key((ip4_header_t *)m);
   } else if (PREDICT_TRUE(ip_ver == 6)) {
       hash_key = ipv6_get_key((ip6_header_t *)m);
   } else {
       /* use the bottom label */
       hash_key = (m-1)->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK);
   }

   return hash_key;

}
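
/*
 * Note: the unrolled search above probes at most five labels; deeper
 * stacks hash the last label probed instead (the fall-through case).
 * Example (illustrative): for a two-label stack {L1, L2 with S=1} the
 * bottom is found on the second probe and the IPv4/IPv6 header that
 * follows L2 supplies the key.
 */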

static inline u64 eth_get_key (ethernet_header_t *h0)
{
   u64 hash_key;


   if (PREDICT_TRUE(h0->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))) {
       hash_key = ipv4_get_key((ip4_header_t *)(h0+1));
   } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_IP6)) {
       hash_key = ipv6_get_key((ip6_header_t *)(h0+1));
   } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
       hash_key = mpls_get_key((mpls_unicast_header_t *)(h0+1));
   } else if ((h0->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ||
              (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_DOT1AD))) {
       ethernet_vlan_header_t * outer = (ethernet_vlan_header_t *)(h0 + 1);

       outer = (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ?
                                  outer+1 : outer;
       if (PREDICT_TRUE(outer->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))) {
           hash_key = ipv4_get_key((ip4_header_t *)(outer+1));
       } else if (outer->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)) {
           hash_key = ipv6_get_key((ip6_header_t *)(outer+1));
       } else if (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
           hash_key = mpls_get_key((mpls_unicast_header_t *)(outer+1));
       }  else {
           hash_key = outer->type;
       }
   } else {
       hash_key  = 0;
   }

   return hash_key;
}

/*
 * This function is used when dedicated IO threads feed the worker threads.
 *
 * Devices are allocated to this thread based on instances and instance_id.
 * If instances==0 then the function automatically determines the number
 * of instances of this thread, and allocates devices between them.
 * If instances != 0, then instance_id must be in the range 0..instances-1.
 * The function allocates devices among the specified number of instances,
 * with this thread having the given instance id. This option is used for
 * splitting devices among differently named "io"-type threads.
 */
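
/*
 * Example (illustrative): with instances == 2, instance 0 takes devices
 * 0, 2, 4, ... and instance 1 takes devices 1, 3, 5, ..., because each
 * device is matched by (device_index % instances) == instance_id below.
 */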
void dpdk_io_thread (vlib_worker_thread_t * w,
                     u32 instances,
                     u32 instance_id,
                     char *worker_name,
                     dpdk_io_thread_callback_t callback)
{
  vlib_main_t * vm = vlib_get_main();
  vlib_thread_main_t * tm = vlib_get_thread_main();
  vlib_thread_registration_t * tr;
  dpdk_main_t * dm = &dpdk_main;
  char *io_name = w->registration->name;
  dpdk_device_t * xd;
  dpdk_device_t ** my_devices = 0;
  vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index = 0;
  vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
  vlib_frame_queue_elt_t * hf = 0;
  int i;
  u32 n_left_to_next_worker = 0, * to_next_worker = 0;
  u32 next_worker_index = 0;
  u32 current_worker_index = ~0;
  u32 cpu_index = os_get_cpu_number();
  u32 num_workers = 0;
  u32 num_devices = 0;
  uword * p;
  u16 queue_id = 0;
  vlib_node_runtime_t * node_trace = 0;
  u32 first_worker_index = 0;
  u32 buffer_flags_template;

  /* Wait until the dpdk init sequence is complete */
  while (dm->io_thread_release == 0)
    vlib_worker_thread_barrier_check();

  clib_time_init (&vm->clib_time);

  p = hash_get_mem (tm->thread_registrations_by_name, worker_name);
  ASSERT (p);
  tr = (vlib_thread_registration_t *) p[0];
  if (tr)
    {
      num_workers = tr->count;
      first_worker_index = tr->first_index;
    }

  /* Allocate devices to this thread */
  if (instances == 0)
    {
      /* auto-assign */
      instance_id = w->instance_id;

      p = hash_get_mem (tm->thread_registrations_by_name, io_name);
      tr = (vlib_thread_registration_t *) p[0];
      /* Otherwise, how did we get here */
      ASSERT (tr && tr->count);
      instances = tr->count;
    }
  else
    {
      /* manually assign */
      ASSERT (instance_id < instances);
    }

  vec_validate (handoff_queue_elt_by_worker_index,
                first_worker_index + num_workers - 1);

  vec_validate_init_empty (congested_handoff_queue_by_worker_index,
                           first_worker_index + num_workers - 1,
                           (vlib_frame_queue_t *)(~0));

  buffer_flags_template = dm->buffer_flags_template;

  /* And handle them... */
  while (1)
    {
      u32 n_buffers;
      u32 mb_index;
      uword n_rx_bytes = 0;
      u32 n_trace, trace_cnt __attribute__((unused));
      vlib_buffer_free_list_t * fl;
      u32 hash;
      u64 hash_key;
      u8 efd_discard_burst;

      vlib_worker_thread_barrier_check ();

      /* Invoke callback if supplied */
      if (PREDICT_FALSE(callback != NULL))
          callback(vm);

      if (PREDICT_FALSE(vec_len(dm->devices) != num_devices))
      {
        vec_reset_length(my_devices);
        vec_foreach (xd, dm->devices)
          {
            if (((xd - dm->devices) % instances) == instance_id)
              {
                fprintf(stderr, "i/o thread %d (cpu %d) takes port %d\n",
                        instance_id, (int) os_get_cpu_number(), (int) (xd - dm->devices));
                vec_add1 (my_devices, xd);
              }
          }
        num_devices = vec_len(dm->devices);
      }

      for (i = 0; i < vec_len (my_devices); i++)
      {
          xd = my_devices[i];

          if (!xd->admin_up)
            continue;

          n_buffers = dpdk_rx_burst(dm, xd, 0 /* queue_id */);

          if (n_buffers == 0)
            {
              /* check if EFD (dpdk) is enabled */
              if (PREDICT_FALSE(dm->efd.enabled))
                {
                  /* reset a few stats */
                  xd->efd_agent.last_poll_time = 0;
                  xd->efd_agent.last_burst_sz = 0;
                }
              continue;
            }

          trace_cnt = n_trace = 0;
          if (PREDICT_FALSE(vm->trace_main.trace_active_hint))
            {
              /*
               * packet tracing is triggered on the dpdk-input node for
               * ease-of-use. Re-fetch the node_runtime for dpdk-input
               * in case it has changed.
               */
              node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);

              vec_reset_length (xd->d_trace_buffers);
              trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);
            }

          /*
           * DAW-FIXME: VMXNET3 device stop/start doesn't work,
           * therefore fake the stop in the dpdk driver by
           * silently dropping all of the incoming pkts instead of
           * stopping the driver / hardware.
           */
          if (PREDICT_FALSE(xd->admin_up != 1))
            {
              for (mb_index = 0; mb_index < n_buffers; mb_index++)
                rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
              continue;
            }

          /* reset EFD action for the burst */
          efd_discard_burst = 0;

          /* Check for congestion if EFD (Early-Fast-Discard) is enabled
           * in any mode (e.g. dpdk, monitor, or drop_all)
           */
          if (PREDICT_FALSE(dm->efd.enabled))
            {
              /* update EFD counters */
              dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);

              if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
                {
                  /* drop all received packets */
                  for (mb_index = 0; mb_index < n_buffers; mb_index++)
                    rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);

                  xd->efd_agent.discard_cnt += n_buffers;
                  increment_efd_drop_counter(vm,
                                             DPDK_ERROR_VLAN_EFD_DROP_PKTS,
                                             n_buffers);

                  continue;
                }

              if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
                                dm->efd.consec_full_frames_hi_thresh))
                {
                  u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
                                                               queue_id);
                  if (device_queue_sz >= dm->efd.queue_hi_thresh)
                    {
                      /* dpdk device queue has reached the critical threshold */
                      xd->efd_agent.congestion_cnt++;

                      /* apply EFD to packets from the burst */
                      efd_discard_burst = 1;
                    }
                }
            }

          fl = vlib_buffer_get_free_list
            (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);

          mb_index = 0;

          while (n_buffers > 0)
            {
              u32 bi0;
              u8 next0, error0;
              u32 l3_offset0;
              vlib_buffer_t * b0, * b_seg, * b_chain = 0;
              ethernet_header_t * h0;
              u8 nb_seg = 1;
              struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
              struct rte_mbuf *mb_seg = mb->next;

              if (PREDICT_TRUE(n_buffers > 2)) /* prefetch reads mb_index + 2 */
1319                 {
1320                   struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
1321                   vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
1322                   CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
1323                   CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
1324                   CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
1325                 }
1326                 
1327               b0 = vlib_buffer_from_rte_mbuf(mb);
1328
1329               /* check whether EFD is looking for packets to discard */
1330               if (PREDICT_FALSE(efd_discard_burst))
1331                 {
1332                   u32 cntr_type;
1333                   if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
1334                     {
1335                       rte_pktmbuf_free(mb);
1336                       xd->efd_agent.discard_cnt++;
1337                       increment_efd_drop_counter(vm, 
1338                                                  cntr_type,
1339                                                  1);
1340
1341                       n_buffers--;
1342                       mb_index++;
1343                       continue;
1344                     }
1345                 }
1346               
1347               /* Prefetch one next segment if it exists */
1348               if (PREDICT_FALSE(mb->nb_segs > 1))
1349                 {
1350                   struct rte_mbuf *pfmb = mb->next;
1351                   vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
1352                   CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
1353                   CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
1354                   b_chain = b0;
1355                 }
1356
1357               bi0 = vlib_get_buffer_index (vm, b0);
1358               vlib_buffer_init_for_free_list (b0, fl);
1359               b0->clone_count = 0;
1360
1361               dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
1362                                                        &next0, &error0);
1363 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
1364               /*
1365                * Clear overloaded TX offload flags when a DPDK driver
1366                * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
1367                */
1368               if (PREDICT_TRUE(trace_cnt == 0))
1369                 mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
1370               else
1371                 trace_cnt--;
1372 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
1373
1374               if (error0)
1375                   clib_warning ("bi %d error %d", bi0, error0);
1376
1377               b0->error = 0;
1378
1379               l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
1380                              next0 == DPDK_RX_NEXT_IP6_INPUT || 
1381                              next0 == DPDK_RX_NEXT_MPLS_INPUT) ? 
1382                             sizeof (ethernet_header_t) : 0);
1383
1384               b0->current_data = l3_offset0;
1385               /* Some drivers like fm10k receive frames with
1386                  mb->data_off > RTE_PKTMBUF_HEADROOM */
1387               b0->current_data += mb->data_off - RTE_PKTMBUF_HEADROOM;
1388               b0->current_length = mb->data_len - l3_offset0;
1389
1390               b0->flags = buffer_flags_template;
1391
1392               if (VMWARE_LENGTH_BUG_WORKAROUND)
1393                   b0->current_length -= 4;
1394                 
1395               vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
1396               vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
1397               vnet_buffer(b0)->io_handoff.next_index = next0;
1398               n_rx_bytes += mb->pkt_len;
1399
1400               /* Process subsequent segments of multi-segment packets */
1401               while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
1402                 {
1403                   ASSERT(mb_seg != 0);
1404  
1405                   b_seg = vlib_buffer_from_rte_mbuf(mb_seg);
1406                   vlib_buffer_init_for_free_list (b_seg, fl);
1407                   b_seg->clone_count = 0;
1408  
1409                   ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
1410                   ASSERT(b_seg->current_data == 0);
1411  
1412                   /*
1413                    * The driver (e.g. virtio) may not put the packet data at the start
1414                    * of the segment, so don't assume b_seg->current_data == 0 is correct.
1415                    */
1416                   b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
1417
1418                   b_seg->current_length = mb_seg->data_len;
1419                   b0->total_length_not_including_first_buffer +=
1420                     mb_seg->data_len;
1421  
1422                   b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
1423                   b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
1424  
1425                   b_chain = b_seg;
1426                   mb_seg = mb_seg->next;
1427                   nb_seg++;
1428                 }
1429
1430               /*
1431                * Turn this on if you run into
1432                * "bad monkey" contexts, and you want to know exactly
1433                * which nodes they've visited... See main.c...
1434                */
1435               VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
1436  
1437               if (PREDICT_FALSE (n_trace > mb_index))
1438                 vec_add1 (xd->d_trace_buffers, bi0);
1439
1440               next_worker_index = first_worker_index;
1441
1442               /* 
1443                * Force unknown traffic onto worker 0, 
1444                * and into ethernet-input. $$$$ add more hashes.
1445                */
1446               h0 = (ethernet_header_t *) b0->data;
1447
1448               /* Compute ingress LB hash */
1449               hash_key = eth_get_key(h0);
1450               hash = (u32)clib_xxhash(hash_key);
1451
1452               if (PREDICT_TRUE (is_pow2(num_workers)))
1453                 next_worker_index += hash & (num_workers - 1);
1454               else
1455                 next_worker_index += hash % num_workers;
1456
1457               /* if EFD is enabled and not already discarding from dpdk,
1458                * check the worker ring/queue for congestion
1459                */
1460               if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
1461                 {
1462                   vlib_frame_queue_t *fq;
1463
1464                   /* fq will be valid if the ring is congested */
1465                   fq = is_vlib_handoff_queue_congested(
1466                       next_worker_index, tm->efd.queue_hi_thresh,
1467                       congested_handoff_queue_by_worker_index);
1468                   
1469                   if (PREDICT_FALSE(fq != NULL))
1470                     {
1471                       u32 cntr_type;
1472                       if (PREDICT_TRUE(cntr_type =
1473                                        is_efd_discardable(tm, b0, mb)))
1474                         {
1475                           /* discard the packet */
1476                           fq->enqueue_efd_discards++;
1477                           increment_efd_drop_counter(vm, cntr_type, 1);
1478                           rte_pktmbuf_free(mb);
1479                           n_buffers--;
1480                           mb_index++;
1481                           continue;
1482                         }
1483                     }
1484                 }
1485               
1486               if (next_worker_index != current_worker_index)
1487                 {
1488                   if (hf)
1489                     hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
1490
1491                   hf = dpdk_get_handoff_queue_elt(
1492                            next_worker_index,
1493                            handoff_queue_elt_by_worker_index);
1494                       
1495                   n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
1496                   to_next_worker = &hf->buffer_index[hf->n_vectors];
1497                   current_worker_index = next_worker_index;
1498                 }
1499               
1500               /* enqueue to correct worker thread */
1501               to_next_worker[0] = bi0;
1502               to_next_worker++;
1503               n_left_to_next_worker--;
1504
1505               if (n_left_to_next_worker == 0)
1506                 {
1507                   hf->n_vectors = VLIB_FRAME_SIZE;
1508                   vlib_put_handoff_queue_elt(hf);
1509                   current_worker_index = ~0;
1510                   handoff_queue_elt_by_worker_index[next_worker_index] = 0;
1511                   hf = 0;
1512                 }
1513                   
1514               n_buffers--;
1515               mb_index++;
1516             }
1517
1518           if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
1519             {
1520               /* credit the trace to the trace node */
1521               dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
1522                              vec_len (xd->d_trace_buffers));
1523               vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
1524             }
1525
1526           vlib_increment_combined_counter 
1527             (vnet_get_main()->interface_main.combined_sw_if_counters
1528              + VNET_INTERFACE_COUNTER_RX,
1529              cpu_index, 
1530              xd->vlib_sw_if_index,
1531              mb_index, n_rx_bytes);
1532
1533           dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
1534           dw->aggregate_rx_packets += mb_index;
1535         }
1536
1537       if (hf)
1538         hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
1539
1540       /* Ship frames to the worker nodes */
1541       for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
1542         {
1543           if (handoff_queue_elt_by_worker_index[i])
1544             {
1545               hf = handoff_queue_elt_by_worker_index[i];
1546               /* 
1547                * It works better to let the handoff node
1548                * rate-adapt, always ship the handoff queue element.
1549                */
1550               if (1 || hf->n_vectors == hf->last_n_vectors)
1551                 {
1552                   vlib_put_handoff_queue_elt(hf);
1553                   handoff_queue_elt_by_worker_index[i] = 0;
1554                 }
1555               else
1556                 hf->last_n_vectors = hf->n_vectors;
1557             }
1558           congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
1559         }
1560       hf = 0;
1561       current_worker_index = ~0;
1562
1563       vlib_increment_main_loop_counter (vm);
1564     }
1565 }
1566
1567 /*
1568  * This function is used when the main thread performs IO and feeds the
1569  * worker threads.
1570  */
1571 static uword
1572 dpdk_io_input (vlib_main_t * vm,
1573                vlib_node_runtime_t * node,
1574                vlib_frame_t * f)
1575 {
1576   dpdk_main_t * dm = &dpdk_main;
1577   dpdk_device_t * xd;
1578   vlib_thread_main_t * tm = vlib_get_thread_main();
1579   uword n_rx_packets = 0;
1580   static vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index;
1581   static vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
1582   vlib_frame_queue_elt_t * hf = 0;
1583   int i;
1584   u32 n_left_to_next_worker = 0, * to_next_worker = 0;
1585   u32 next_worker_index = 0;
1586   u32 current_worker_index = ~0;
1587   u32 cpu_index = os_get_cpu_number();
1588   static int num_workers_set;
1589   static u32 num_workers;
1590   u16 queue_id = 0;
1591   vlib_node_runtime_t * node_trace;
1592   static u32 first_worker_index;
1593   u32 buffer_flags_template;
1594
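  /*
   * One-time discovery of the worker thread pool: cache how many
   * worker threads exist and where they start in the thread index
   * space, so the per-packet hash below can be mapped to a worker.
   */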
1595   if (PREDICT_FALSE(num_workers_set == 0))
1596     {
1597       uword * p;
1598       vlib_thread_registration_t * tr;
1599       /* Only the standard vnet worker threads are supported */
1600       p = hash_get_mem (tm->thread_registrations_by_name, "workers");
1601       tr = p ? (vlib_thread_registration_t *) p[0] : 0;
1602       if (tr) 
1603         {
1604           num_workers = tr->count;
1605           first_worker_index = tr->first_index;
1606         }
1607       num_workers_set = 1;
1608     }
1609
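  /*
   * Lazily size the per-worker handoff state: one pending frame-queue
   * element slot per vlib main, plus a per-worker congestion cache
   * seeded with a sentinel value.
   */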
1610   if (PREDICT_FALSE(handoff_queue_elt_by_worker_index == 0))
1611     {
1612       vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1);
1613       
1614       vec_validate_init_empty (congested_handoff_queue_by_worker_index,
1615                                first_worker_index + num_workers - 1,
1616                                (vlib_frame_queue_t *)(~0));
1617     }
1618
1619   /* packet tracing is triggered on the dpdk-input node for ease-of-use */
1620   node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);
1621
1622   buffer_flags_template = dm->buffer_flags_template;
1623
1624   vec_foreach (xd, dm->devices)
1625     {
1626       u32 n_buffers;
1627       u32 mb_index;
1628       uword n_rx_bytes = 0;
1629       u32 n_trace, trace_cnt __attribute__((unused));
1630       vlib_buffer_free_list_t * fl;
1631       u32 hash;
1632       u64 hash_key;
1633       u8 efd_discard_burst = 0;
1634
1635       if (!xd->admin_up)
1636         continue;
1637
1638       n_buffers = dpdk_rx_burst(dm, xd, queue_id);
1639
1640       if (n_buffers == 0)
1641         {
1642           /* check if EFD (dpdk) is enabled */
1643           if (PREDICT_FALSE(dm->efd.enabled))
1644             {
1645               /* reset a few stats */
1646               xd->efd_agent.last_poll_time = 0;
1647               xd->efd_agent.last_burst_sz = 0;
1648             }
1649           continue;
1650         }
1651
1652       vec_reset_length (xd->d_trace_buffers);
1653       trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);
1654         
1655       /*
1656        * DAW-FIXME: VMXNET3 device stop/start doesn't work, 
1657        * therefore fake the stop in the dpdk driver by
1658        * silently dropping all of the incoming pkts instead of 
1659        * stopping the driver / hardware.
1660        */
1661       if (PREDICT_FALSE(xd->admin_up != 1))
1662         {
1663           for (mb_index = 0; mb_index < n_buffers; mb_index++)
1664             rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
1665           continue;
1666         }
1667
1668       /* Check for congestion if EFD (Early-Fast-Discard) is enabled
1669        * in any mode (e.g. dpdk, monitor, or drop_all)
1670        */
1671       if (PREDICT_FALSE(dm->efd.enabled))
1672         {
1673           /* update EFD counters */
1674           dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
1675
1676           if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
1677             {
1678               /* discard all received packets */
1679               for (mb_index = 0; mb_index < n_buffers; mb_index++)
1680                 rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
1681
1682               xd->efd_agent.discard_cnt += n_buffers;
1683               increment_efd_drop_counter(vm, 
1684                                          DPDK_ERROR_VLAN_EFD_DROP_PKTS,
1685                                          n_buffers);
1686             
1687               continue;
1688             }
1689           
1690           if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
1691                             dm->efd.consec_full_frames_hi_thresh))
1692             {
1693               u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
1694                                                            queue_id);
1695               if (device_queue_sz >= dm->efd.queue_hi_thresh)
1696                 {
1697                   /* dpdk device queue has reached the critical threshold */
1698                   xd->efd_agent.congestion_cnt++;
1699
1700                   /* apply EFD to packets from the burst */
1701                   efd_discard_burst = 1;
1702                 }
1703             }
1704         }
1705       
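      /*
       * Each rte_mbuf carries a vlib_buffer_t alongside it; grab the
       * default free list so buffer metadata can be (re)initialized
       * from the free list template as packets are handed to VPP.
       */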
1706       fl = vlib_buffer_get_free_list 
1707         (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
1708           
1709       mb_index = 0;
1710
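      /*
       * Per-packet loop: turn each rte_mbuf into a vlib buffer, pick
       * the next node from the mbuf flags, hash the ethernet header to
       * choose a worker, and enqueue the buffer on that worker's
       * handoff queue.
       */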
1711       while (n_buffers > 0)
1712         {
1713           u32 bi0;
1714           u8 next0, error0;
1715           u32 l3_offset0;
1716           vlib_buffer_t * b0, * b_seg, * b_chain = 0;
1717           ethernet_header_t * h0;
1718           u8 nb_seg = 1;
1719           struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
1720           struct rte_mbuf *mb_seg = mb->next;
1721
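          /*
           * Prefetch the mbuf and buffer metadata two packets ahead so
           * they are warm in cache by the time we touch them.
           */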
1722           if (PREDICT_TRUE(n_buffers > 2))
1723             {
1724               struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
1725               vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
1726               CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
1727               CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
1728               CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
1729             }
1730
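          /*
           * The vlib_buffer_t lives in the same buffer as the rte_mbuf,
           * so this conversion is pointer arithmetic, not a copy.
           */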
1731           b0 = vlib_buffer_from_rte_mbuf(mb);
1732
1733           /* check whether EFD is looking for packets to discard */
1734           if (PREDICT_FALSE(efd_discard_burst))
1735             {
1736               u32 cntr_type;
1737               if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
1738                 {
1739                   rte_pktmbuf_free(mb);
1740                   xd->efd_agent.discard_cnt++;
1741                   increment_efd_drop_counter(vm, 
1742                                              cntr_type,
1743                                              1);
1744
1745                   n_buffers--;
1746                   mb_index++;
1747                   continue;
1748                 }
1749             }
1750
1751           /* Prefetch the next segment, if there is one */
1752           if (PREDICT_FALSE(mb->nb_segs > 1))
1753             {
1754               struct rte_mbuf *pfmb = mb->next;
1755               vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
1756               CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
1757               CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
1758               b_chain = b0;
1759             }
1760
1761           bi0 = vlib_get_buffer_index (vm, b0);
1762           vlib_buffer_init_for_free_list (b0, fl);
1763           b0->clone_count = 0;
1764
1765           dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
1766                                                    &next0, &error0);
1767 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
1768           /*
1769            * Clear overloaded TX offload flags when a DPDK driver
1770            * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
1771            */
1772           if (PREDICT_TRUE(trace_cnt == 0))
1773             mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
1774           else
1775             trace_cnt--;
1776 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
1777
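          /*
           * On this IO path errors are only logged; the buffer is still
           * handed off to a worker, so the error field is cleared below.
           */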
1778           if (error0)
1779             clib_warning ("bi %d error %d", bi0, error0);
1780
1781           b0->error = 0;
1782
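          /*
           * When the mbuf flags let us bypass ethernet-input, start the
           * buffer at the L3 header; otherwise leave the offset at 0 so
           * ethernet-input sees the full frame.
           */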
1783           l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
1784                          next0 == DPDK_RX_NEXT_IP6_INPUT || 
1785                          next0 == DPDK_RX_NEXT_MPLS_INPUT) ? 
1786                         sizeof (ethernet_header_t) : 0);
1787
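          /*
           * Note: this assumes the first segment's data_off is the
           * default RTE_PKTMBUF_HEADROOM; only subsequent segments are
           * adjusted for a driver-supplied data_off (see below).
           */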
1788           b0->current_data = l3_offset0;
1789           b0->current_length = mb->data_len - l3_offset0;
1790
1791           b0->flags = buffer_flags_template;
1792                 
1793           if (VMWARE_LENGTH_BUG_WORKAROUND)
1794               b0->current_length -= 4;
1795
1796           vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
1797           vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
1798           vnet_buffer(b0)->io_handoff.next_index = next0;
1799           n_rx_bytes += mb->pkt_len;
1800
1801           /* Process subsequent segments of multi-segment packets */
1802           while (nb_seg < mb->nb_segs)
1803             {
1804               ASSERT(mb_seg != 0);
1805  
1806               b_seg = vlib_buffer_from_rte_mbuf(mb_seg);
1807               vlib_buffer_init_for_free_list (b_seg, fl);
1808               b_seg->clone_count = 0;
1809  
1810               ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
1812  
1813               /*
1814                * The driver (e.g. virtio) may not put the packet data at the start
1815                * of the segment, so don't assume b_seg->current_data == 0 is correct.
1816                */
1817               b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
1818
1819               b_seg->current_length = mb_seg->data_len;
1820               b0->total_length_not_including_first_buffer +=
1821                 mb_seg->data_len;
1822  
1823               b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
1824               b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
1825  
1826               b_chain = b_seg;
1827               mb_seg = mb_seg->next;
1828               nb_seg++;
1829             }
1830  
1831           /*
1832            * Turn this on if you run into
1833            * "bad monkey" contexts, and you want to know exactly
1834            * which nodes they've visited... See main.c...
1835            */
1836           VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
1837  
1838           if (PREDICT_FALSE (n_trace > mb_index))
1839             vec_add1 (xd->d_trace_buffers, bi0);
1840
1841           next_worker_index = first_worker_index;
1842
1843           /* 
1844            * Force unknown traffic onto worker 0, 
1845            * and into ethernet-input. $$$$ add more hashes.
1846            */
1847           h0 = (ethernet_header_t *) b0->data;
1848
1849           /* Compute ingress LB hash */
1850           hash_key = eth_get_key(h0);
1851           hash = (u32)clib_xxhash(hash_key);
1852
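          /* For a power-of-two worker count, (hash & (num_workers - 1))
           * equals (hash % num_workers) and avoids the division. */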
1853           if (PREDICT_TRUE (is_pow2(num_workers)))
1854             next_worker_index += hash & (num_workers - 1);
1855           else
1856             next_worker_index += hash % num_workers;
1857
1858           /* if EFD is enabled and not already discarding from dpdk,
1859            * check the worker ring/queue for congestion
1860            */
1861           if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
1862             {
1863               vlib_frame_queue_t *fq;
1864
1865               /* fq will be valid if the ring is congested */
1866               fq = is_vlib_handoff_queue_congested(
1867                   next_worker_index, tm->efd.queue_hi_thresh,
1868                   congested_handoff_queue_by_worker_index);
1869               
1870               if (PREDICT_FALSE(fq != NULL))
1871                 {
1872                   u32 cntr_type;
1873                   if (PREDICT_TRUE(cntr_type =
1874                                    is_efd_discardable(tm, b0, mb)))
1875                     {
1876                       /* discard the packet */
1877                       fq->enqueue_efd_discards++;
1878                       increment_efd_drop_counter(vm, cntr_type, 1);
1879                       rte_pktmbuf_free(mb);
1880                       n_buffers--;
1881                       mb_index++;
1882                       continue;
1883                     }
1884                 }
1885             }
1886           
1887           if (next_worker_index != current_worker_index)
1888             {
1889               if (hf)
1890                 hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
1891
1892               hf = dpdk_get_handoff_queue_elt(
1893                      next_worker_index,
1894                      handoff_queue_elt_by_worker_index);
1895
1896               n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
1897               to_next_worker = &hf->buffer_index[hf->n_vectors];
1898               current_worker_index = next_worker_index;
1899             }
1900           
1901           /* enqueue to correct worker thread */
1902           to_next_worker[0] = bi0;
1903           to_next_worker++;
1904           n_left_to_next_worker--;
1905
1906           if (n_left_to_next_worker == 0)
1907             {
1908               hf->n_vectors = VLIB_FRAME_SIZE;
1909               vlib_put_handoff_queue_elt(hf);
1910               current_worker_index = ~0;
1911               handoff_queue_elt_by_worker_index[next_worker_index] = 0;
1912               hf = 0;
1913             }
1914           
1915           n_buffers--;
1916           mb_index++;
1917         }
1918
1919       if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
1920         {
1921           /* credit the trace to the trace node */
1922           dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
1923                          vec_len (xd->d_trace_buffers));
1924           vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
1925         }
1926
1927       vlib_increment_combined_counter 
1928         (vnet_get_main()->interface_main.combined_sw_if_counters
1929          + VNET_INTERFACE_COUNTER_RX,
1930          cpu_index, 
1931          xd->vlib_sw_if_index,
1932          mb_index, n_rx_bytes);
1933
1934       dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
1935       dw->aggregate_rx_packets += mb_index;
1936       n_rx_packets += mb_index;
1937     }
1938
1939   if (hf)
1940     hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
1941   
1942   /* Ship frames to the worker nodes */
1943   for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
1944     {
1945       if (handoff_queue_elt_by_worker_index[i])
1946         {
1947           hf = handoff_queue_elt_by_worker_index[i];
1948           /* 
1949            * It works better to let the handoff node
1950            * rate-adapt, always ship the handoff queue element.
1951            */
1952           if (1 || hf->n_vectors == hf->last_n_vectors)
1953             {
1954               vlib_put_handoff_queue_elt(hf);
1955               handoff_queue_elt_by_worker_index[i] = 0;
1956             }
1957           else
1958             hf->last_n_vectors = hf->n_vectors;
1959         }
1960       congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
1961     }
1962   hf = 0;
1963   current_worker_index = ~0;
1964   return n_rx_packets;
1965 }
1966
1967 VLIB_REGISTER_NODE (dpdk_io_input_node) = {
1968   .function = dpdk_io_input,
1969   .type = VLIB_NODE_TYPE_INPUT,
1970   .name = "dpdk-io-input",
1971
1972   /* Will be enabled if/when hardware is detected. */
1973   .state = VLIB_NODE_STATE_DISABLED,
1974
1975   .format_buffer = format_ethernet_header_with_length,
1976   .format_trace = format_dpdk_rx_dma_trace,
1977
1978   .n_errors = DPDK_N_ERROR,
1979   .error_strings = dpdk_error_strings,
1980
1981   .n_next_nodes = DPDK_RX_N_NEXT,
1982   .next_nodes = {
1983     [DPDK_RX_NEXT_DROP] = "error-drop",
1984     [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
1985     [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
1986     [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
1987     [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
1988   },
1989 };
1990
1991 /*
1992  * set_efd_bitmap()
1993  * Based on the operation type, set lower/upper bits for the given index value
1994  */
1995 void
1996 set_efd_bitmap (u8 *bitmap, u32 value, u32 op)
1997 {
1998     int ix;
1999
2000     *bitmap = 0;
2001     for (ix = 0; ix < 8; ix++) {
2002         if (((op == EFD_OPERATION_LESS_THAN) && (ix < value)) ||
2003             ((op == EFD_OPERATION_GREATER_OR_EQUAL) && (ix >= value))){
2004             (*bitmap) |= (1 << ix);
2005         }
2006     }
2007 }
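/*
 * Worked example: set_efd_bitmap (&bm, 3, EFD_OPERATION_LESS_THAN)
 * leaves bm == 0x07 (bits 0..2 set, one bit per matching value).
 */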
2008
2009 void
2010 efd_config (u32 enabled, 
2011             u32 ip_prec,  u32 ip_op,
2012             u32 mpls_exp, u32 mpls_op,
2013             u32 vlan_cos, u32 vlan_op)
2014 {
2015    vlib_thread_main_t * tm = vlib_get_thread_main();
2016    dpdk_main_t * dm = &dpdk_main;
2017
2018    if (enabled) {
2019        tm->efd.enabled |= VLIB_EFD_DISCARD_ENABLED;
2020        dm->efd.enabled |= DPDK_EFD_DISCARD_ENABLED;
2021    } else {
2022        tm->efd.enabled &= ~VLIB_EFD_DISCARD_ENABLED;
2023        dm->efd.enabled &= ~DPDK_EFD_DISCARD_ENABLED;
2024    }
2025
2026    set_efd_bitmap(&tm->efd.ip_prec_bitmap, ip_prec, ip_op);
2027    set_efd_bitmap(&tm->efd.mpls_exp_bitmap, mpls_exp, mpls_op);
2028    set_efd_bitmap(&tm->efd.vlan_cos_bitmap, vlan_cos, vlan_op);
2029
2030 }
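/*
 * Illustrative call (hypothetical values): enable discards and match
 * IP precedence 0..1, with the MPLS EXP and VLAN CoS bitmaps cleared:
 *
 *   efd_config (1,
 *               2, EFD_OPERATION_LESS_THAN,
 *               0, EFD_OPERATION_LESS_THAN,
 *               0, EFD_OPERATION_LESS_THAN);
 */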