+/* Store the low 8 bits of ol_flags of each of the first 'count' mbufs in
+   flags[] and return the bitwise OR of every value stored. */
+static_always_inline u8
+dpdk_ol_flags_extract (struct rte_mbuf **mb, u8 * flags, int count)
+{
+  u8 combined = 0;
+  int idx = 0;
+
+  while (idx < count)
+    {
+      /* every flag we are interested in currently sits in the lower
+         8 bits, but that might change */
+      u8 low = (u8) mb[idx]->ol_flags;
+
+      flags[idx] = low;
+      combined |= low;
+      idx++;
+    }
+
+  return combined;
+}
+
+/* Per-packet offset bookkeeping shared by the quad and scalar loops of
+   dpdk_process_rx_burst: stash the offset of the ethertype field in
+   next[0], then set l2_hdr_offset and current_data of 'b' to the mbuf
+   data_off less RTE_PKTMBUF_HEADROOM. */
+static_always_inline void
+dpdk_rx_store_offsets (vlib_buffer_t * b, struct rte_mbuf *m, u16 * next)
+{
+  i16 off = m->data_off;
+
+  /* next[0] temporarily holds the relative offset of the ethertype so
+     it can be prefetched and fetched faster later */
+  next[0] = off + STRUCT_OFFSET_OF (ethernet_header_t, type);
+  off -= RTE_PKTMBUF_HEADROOM;
+  vnet_buffer (b)->l2_hdr_offset = off;
+  b->current_data = off;
+}
+
+/* Initialise the vlib buffer that belongs to each of the first
+   n_rx_packets entries of ptd->mbufs.
+
+   For every packet: ptd->buffer_template is copied onto the buffer, the
+   low ol_flags byte is saved in ptd->flags[], the ethertype offset is
+   saved in ptd->next[], and current_data / l2_hdr_offset /
+   current_length are filled in from the mbuf.  When maybe_multiseg is
+   non-zero dpdk_process_subseq_segs is called for each packet and its
+   return value is added to the byte count.
+
+   The OR of all saved flag bytes is written to *or_flagsp.
+   Returns the accumulated byte count. */
+static_always_inline uword
+dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
+                       uword n_rx_packets, int maybe_multiseg, u8 * or_flagsp)
+{
+  struct rte_mbuf **mbp = ptd->mbufs;
+  u8 *flagp = ptd->flags;
+  u16 *nextp = ptd->next;
+  u32 remaining = n_rx_packets;
+  uword total_bytes = 0;
+  u8 flags_seen = 0;
+  vlib_buffer_t *bufs[4];
+  vlib_buffer_free_list_t *free_list;
+
+  free_list =
+    vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+  /* four packets per iteration; the loop only runs while at least eight
+     remain, and the x4 prefetches below are issued for mbp + 4 */
+  while (remaining >= 8)
+    {
+      CLIB_PREFETCH (mbp + 8, CLIB_CACHE_LINE_BYTES, LOAD);
+
+      dpdk_prefetch_buffer_x4 (mbp + 4);
+
+      bufs[0] = vlib_buffer_from_rte_mbuf (mbp[0]);
+      bufs[1] = vlib_buffer_from_rte_mbuf (mbp[1]);
+      bufs[2] = vlib_buffer_from_rte_mbuf (mbp[2]);
+      bufs[3] = vlib_buffer_from_rte_mbuf (mbp[3]);
+
+      clib_memcpy64_x4 (bufs[0], bufs[1], bufs[2], bufs[3],
+                        &ptd->buffer_template);
+
+      dpdk_prefetch_mbuf_x4 (mbp + 4);
+
+      flags_seen |= dpdk_ol_flags_extract (mbp, flagp, 4);
+
+      dpdk_rx_store_offsets (bufs[0], mbp[0], nextp + 0);
+      dpdk_rx_store_offsets (bufs[1], mbp[1], nextp + 1);
+      dpdk_rx_store_offsets (bufs[2], mbp[2], nextp + 2);
+      dpdk_rx_store_offsets (bufs[3], mbp[3], nextp + 3);
+
+      bufs[0]->current_length = mbp[0]->data_len;
+      bufs[1]->current_length = mbp[1]->data_len;
+      bufs[2]->current_length = mbp[2]->data_len;
+      bufs[3]->current_length = mbp[3]->data_len;
+
+      total_bytes += (uword) mbp[0]->data_len + mbp[1]->data_len
+        + mbp[2]->data_len + mbp[3]->data_len;
+
+      if (maybe_multiseg)
+        {
+          total_bytes +=
+            dpdk_process_subseq_segs (vm, bufs[0], mbp[0], free_list);
+          total_bytes +=
+            dpdk_process_subseq_segs (vm, bufs[1], mbp[1], free_list);
+          total_bytes +=
+            dpdk_process_subseq_segs (vm, bufs[2], mbp[2], free_list);
+          total_bytes +=
+            dpdk_process_subseq_segs (vm, bufs[3], mbp[3], free_list);
+        }
+
+      VLIB_BUFFER_TRACE_TRAJECTORY_INIT (bufs[0]);
+      VLIB_BUFFER_TRACE_TRAJECTORY_INIT (bufs[1]);
+      VLIB_BUFFER_TRACE_TRAJECTORY_INIT (bufs[2]);
+      VLIB_BUFFER_TRACE_TRAJECTORY_INIT (bufs[3]);
+
+      /* advance all cursors together */
+      mbp += 4;
+      flagp += 4;
+      nextp += 4;
+      remaining -= 4;
+    }
+
+  /* leftovers, one packet at a time */
+  while (remaining)
+    {
+      bufs[0] = vlib_buffer_from_rte_mbuf (mbp[0]);
+      clib_memcpy (bufs[0], &ptd->buffer_template, 64);
+      flags_seen |= dpdk_ol_flags_extract (mbp, flagp, 1);
+
+      dpdk_rx_store_offsets (bufs[0], mbp[0], nextp);
+      bufs[0]->current_length = mbp[0]->data_len;
+      total_bytes += mbp[0]->data_len;
+
+      if (maybe_multiseg)
+        total_bytes +=
+          dpdk_process_subseq_segs (vm, bufs[0], mbp[0], free_list);
+
+      VLIB_BUFFER_TRACE_TRAJECTORY_INIT (bufs[0]);
+
+      mbp += 1;
+      flagp += 1;
+      nextp += 1;
+      remaining -= 1;
+    }
+
+  *or_flagsp = flags_seen;
+  return total_bytes;
+}
+
+/* Load the 16-bit value located 'off' + sizeof (vlib_buffer_t) bytes past
+   the start of mbuf 'm'.  Callers pass the ethertype offset that the rx
+   burst processing stashed in ptd->next[]. */
+static_always_inline u16
+dpdk_rx_peek_etype (struct rte_mbuf *m, u16 off)
+{
+  return *(u16 *) ((u8 *) m + off + sizeof (vlib_buffer_t));
+}
+
+/* Replace each stashed ethertype offset in ptd->next[] with the next
+   node index chosen by dpdk_rx_next from the packet's ethertype and its
+   saved flag byte, then move current_data forward (and shrink
+   current_length) by device_input_next_node_advance[] for that next
+   node.  Operates on the first n_rx_packets entries of ptd. */
+static_always_inline void
+dpdk_set_next_from_etype (vlib_main_t * vm, vlib_node_runtime_t * node,
+                          dpdk_per_thread_data_t * ptd, uword n_rx_packets)
+{
+  struct rte_mbuf **mbp = ptd->mbufs;
+  u16 *nextp = ptd->next;
+  u8 *flagp = ptd->flags;
+  u32 remaining = n_rx_packets;
+  vlib_buffer_t *bufs[4];
+  u16 et[4];
+  i16 skip[4];
+
+  /* four packets per iteration while at least twelve remain; the
+     prefetches are issued for the four mbufs starting at mbp + 8 */
+  while (remaining >= 12)
+    {
+      dpdk_prefetch_buffer_data_x4 (mbp + 8);
+      dpdk_prefetch_buffer_x4 (mbp + 8);
+
+      bufs[0] = vlib_buffer_from_rte_mbuf (mbp[0]);
+      bufs[1] = vlib_buffer_from_rte_mbuf (mbp[1]);
+      bufs[2] = vlib_buffer_from_rte_mbuf (mbp[2]);
+      bufs[3] = vlib_buffer_from_rte_mbuf (mbp[3]);
+
+      et[0] = dpdk_rx_peek_etype (mbp[0], nextp[0]);
+      et[1] = dpdk_rx_peek_etype (mbp[1], nextp[1]);
+      et[2] = dpdk_rx_peek_etype (mbp[2], nextp[2]);
+      et[3] = dpdk_rx_peek_etype (mbp[3], nextp[3]);
+
+      nextp[0] = dpdk_rx_next (node, et[0], flagp[0]);
+      nextp[1] = dpdk_rx_next (node, et[1], flagp[1]);
+      nextp[2] = dpdk_rx_next (node, et[2], flagp[2]);
+      nextp[3] = dpdk_rx_next (node, et[3], flagp[3]);
+
+      skip[0] = device_input_next_node_advance[nextp[0]];
+      skip[1] = device_input_next_node_advance[nextp[1]];
+      skip[2] = device_input_next_node_advance[nextp[2]];
+      skip[3] = device_input_next_node_advance[nextp[3]];
+
+      bufs[0]->current_data += skip[0];
+      bufs[1]->current_data += skip[1];
+      bufs[2]->current_data += skip[2];
+      bufs[3]->current_data += skip[3];
+
+      bufs[0]->current_length -= skip[0];
+      bufs[1]->current_length -= skip[1];
+      bufs[2]->current_length -= skip[2];
+      bufs[3]->current_length -= skip[3];
+
+      /* advance all cursors together */
+      mbp += 4;
+      nextp += 4;
+      flagp += 4;
+      remaining -= 4;
+    }
+
+  /* leftovers, one packet at a time */
+  while (remaining)
+    {
+      bufs[0] = vlib_buffer_from_rte_mbuf (mbp[0]);
+      et[0] = dpdk_rx_peek_etype (mbp[0], nextp[0]);
+      nextp[0] = dpdk_rx_next (node, et[0], flagp[0]);
+      skip[0] = device_input_next_node_advance[nextp[0]];
+      bufs[0]->current_data += skip[0];
+      bufs[0]->current_length -= skip[0];
+
+      mbp += 1;
+      nextp += 1;
+      flagp += 1;
+      remaining -= 1;
+    }
+}
+
+/* Apply flow-offload results to the first n_rx_packets packets of ptd.
+
+   Packets whose saved flag byte has the DPDK_RX_F_FDIR bit set are
+   looked up in xd->flow_lookup_entries using the mbuf's hash.fdir.hi as
+   index.  Each field of the entry that is not all-ones is applied:
+   next_index overrides ptd->next[], flow_id is stored in the buffer, and
+   buffer_advance is passed to vlib_buffer_advance. */
+static_always_inline void
+dpdk_process_flow_offload (dpdk_device_t * xd, dpdk_per_thread_data_t * ptd,
+                           uword n_rx_packets)
+{
+  const int fdir_mask = 1 << DPDK_RX_F_FDIR;
+  uword i;
+
+  /* TODO prefetch and quad-loop */
+  for (i = 0; i < n_rx_packets; i++)
+    {
+      struct rte_mbuf *m = ptd->mbufs[i];
+      dpdk_flow_lookup_entry_t *entry;
+      vlib_buffer_t *buf;
+
+      /* packet was not marked, nothing to do */
+      if (!(ptd->flags[i] & fdir_mask))
+        continue;
+
+      entry = vec_elt_at_index (xd->flow_lookup_entries, m->hash.fdir.hi);
+      buf = vlib_buffer_from_rte_mbuf (m);
+
+      if (entry->next_index != (u16) ~ 0)
+        ptd->next[i] = entry->next_index;
+
+      if (entry->flow_id != ~0)
+        buf->flow_id = entry->flow_id;
+
+      if (entry->buffer_advance != ~0)
+        vlib_buffer_advance (buf, entry->buffer_advance);
+    }
+}
+
+/* Poll one RX queue of a DPDK device and hand the received packets to
+   the next graph node(s).
+
+   vm           - vlib main of the calling thread
+   dm           - dpdk main; supplies per-thread data and pcap state
+   xd           - device to poll
+   node         - runtime of the input node (errors, trace count, enqueue)
+   thread_index - index into dm->per_thread_data and the counters
+   queue_id     - RX queue passed to rte_eth_rx_burst
+
+   Returns the number of packets received; 0 when the device is not
+   admin-up or the PMD returned nothing. */
+static_always_inline u32
+dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd,
+                   vlib_node_runtime_t * node, u32 thread_index, u16 queue_id)
+{
+  uword n_rx_packets = 0, n_rx_bytes;
+  u32 n_left, n_trace;
+  u32 *buffers;
+  u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+  struct rte_mbuf **mb;
+  vlib_buffer_t *b0;
+  int known_next = 0;
+  u16 *next;
+  u8 or_flags;
+  u32 n;
+
+  dpdk_per_thread_data_t *ptd = vec_elt_at_index (dm->per_thread_data,
+                                                  thread_index);
+  vlib_buffer_t *bt = &ptd->buffer_template;
+
+  if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
+    return 0;
+
+  /* receive burst of packets from DPDK PMD: get up to DPDK_RX_BURST_SZ
+     buffers, and stop polling as soon as a single call returns fewer
+     than 32 packets */
+  while (n_rx_packets < DPDK_RX_BURST_SZ)
+    {
+      n = rte_eth_rx_burst (xd->port_id, queue_id,
+                            ptd->mbufs + n_rx_packets,
+                            DPDK_RX_BURST_SZ - n_rx_packets);
+      n_rx_packets += n;
+
+      if (n < 32)
+        break;
+    }
+
+  if (n_rx_packets == 0)
+    return 0;
+
+  /* Update buffer template */
+  vnet_buffer (bt)->sw_if_index[VLIB_RX] = xd->sw_if_index;
+  bt->error = node->errors[DPDK_ERROR_NONE];
+  /* as DPDK is allocating empty buffers from mempool provided before interface
+     start for each queue, it is safe to store this in the template */
+  bt->buffer_pool_index = xd->buffer_pool_for_queue[queue_id];
+
+  /* a per-interface next index, when configured, overrides the default
+     next node for every packet of the burst */
+  if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+    {
+      known_next = 1;
+      next_index = xd->per_interface_next_index;
+    }
+
+  /* as all packets belong to the same interface, the feature arc lookup
+     can be done once and the result stored in the buffer template */
+  if (PREDICT_FALSE (vnet_device_input_have_features (xd->sw_if_index)))
+    {
+      vnet_feature_start_device_input_x1 (xd->sw_if_index, &next_index, bt);
+      known_next = 1;
+    }
+
+  /* maybe_multiseg is passed as a literal in each call; presumably this
+     lets the always-inline callee be specialised per branch */
+  if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG)
+    n_rx_bytes = dpdk_process_rx_burst (vm, ptd, n_rx_packets, 1, &or_flags);
+  else
+    n_rx_bytes = dpdk_process_rx_burst (vm, ptd, n_rx_packets, 0, &or_flags);
+
+  if (PREDICT_FALSE (known_next))
+    {
+      for (n = 0; n < n_rx_packets; n++)
+        ptd->next[n] = next_index;
+
+      /* reset the feature arc fields of the template; presumably so the
+         values written by the feature arc lookup above do not carry
+         over into a later burst - TODO confirm */
+      vnet_buffer (bt)->feature_arc_index = 0;
+      bt->current_config_index = 0;
+    }
+  else
+    dpdk_set_next_from_etype (vm, node, ptd, n_rx_packets);
+
+  /* flow offload - process if rx flow offload is enabled and at least one
+     packet is marked */
+  if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_RX_FLOW_OFFLOAD) &&
+                     (or_flags & (1 << DPDK_RX_F_FDIR))))
+    dpdk_process_flow_offload (xd, ptd, n_rx_packets);
+
+  /* is at least one packet marked as ip4 checksum bad? */
+  if (PREDICT_FALSE (or_flags & (1 << DPDK_RX_F_CKSUM_BAD)))
+    for (n = 0; n < n_rx_packets; n++)
+      {
+        if ((ptd->flags[n] & (1 << DPDK_RX_F_CKSUM_BAD)) == 0)
+          continue;
+        /* only packets headed for ip4-input are dropped here */
+        if (ptd->next[n] != VNET_DEVICE_INPUT_NEXT_IP4_INPUT)
+          continue;
+
+        b0 = vlib_buffer_from_rte_mbuf (ptd->mbufs[n]);
+        b0->error = node->errors[DPDK_ERROR_IP_CHECKSUM_ERROR];
+        ptd->next[n] = VNET_DEVICE_INPUT_NEXT_DROP;
+      }
+
+  /* enqueue buffers to the next node */
+  vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, ptd->buffers,
+                                       n_rx_packets,
+                                       sizeof (struct rte_mbuf));
+
+  vlib_buffer_enqueue_to_next (vm, node, ptd->buffers, ptd->next,
+                               n_rx_packets);
+
+  /* packet trace if enabled */
+  if (PREDICT_FALSE ((n_trace = vlib_get_trace_count (vm, node))))
+    {
+      n_left = n_rx_packets;
+      buffers = ptd->buffers;
+      mb = ptd->mbufs;
+      next = ptd->next;
+      while (n_trace && n_left)
+        {
+          b0 = vlib_get_buffer (vm, buffers[0]);
+          vlib_trace_buffer (vm, node, next[0], b0, /* follow_chain */ 0);
+
+          dpdk_rx_trace_t *t0 = vlib_add_trace (vm, node, b0, sizeof t0[0]);
+          t0->queue_index = queue_id;
+          t0->device_index = xd->device_index;
+          t0->buffer_index = vlib_get_buffer_index (vm, b0);
+
+          /* snapshot the mbuf, the buffer header, and the leading bytes
+             of buffer and packet data into the trace record */
+          clib_memcpy (&t0->mb, mb[0], sizeof t0->mb);
+          clib_memcpy (&t0->buffer, b0, sizeof b0[0] - sizeof b0->pre_data);
+          clib_memcpy (t0->buffer.pre_data, b0->data,
+                       sizeof t0->buffer.pre_data);
+          clib_memcpy (&t0->data, mb[0]->buf_addr + mb[0]->data_off,
+                       sizeof t0->data);
+          n_trace--;
+          n_left--;
+          buffers++;
+          mb++;
+          next++;
+        }
+      vlib_set_trace_count (vm, node, n_trace);
+    }
+
+  /* rx pcap capture if enabled */
+  if (PREDICT_FALSE (dm->pcap[VLIB_RX].pcap_enable))
+    {
+      u32 bi0;
+      n_left = n_rx_packets;
+      buffers = ptd->buffers;
+      while (n_left)
+        {
+          bi0 = buffers[0];
+          b0 = vlib_get_buffer (vm, bi0);
+          buffers++;
+
+          /* a pcap_sw_if_index of 0 captures every interface */
+          if (dm->pcap[VLIB_RX].pcap_sw_if_index == 0 ||
+              dm->pcap[VLIB_RX].pcap_sw_if_index
+              == vnet_buffer (b0)->sw_if_index[VLIB_RX])
+            {
+              /* named m0 so it does not shadow the function-scope
+                 'mb' cursor */
+              struct rte_mbuf *m0;
+              i16 data_start;
+              i32 temp_advance;
+
+              /*
+               * Note: current_data will have advanced
+               * when we skip ethernet input.
+               * Temporarily back up to the original DMA
+               * target, so we capture a valid ethernet frame
+               */
+              m0 = rte_mbuf_from_vlib_buffer (b0);
+
+              /* Figure out the original data_start */
+              data_start = (m0->buf_addr + m0->data_off) - (void *) b0->data;
+              /* Back up that far */
+              temp_advance = b0->current_data - data_start;
+              vlib_buffer_advance (b0, -temp_advance);
+              /* Trace the packet */
+              pcap_add_buffer (&dm->pcap[VLIB_RX].pcap_main, vm, bi0, 512);
+              /* and advance again */
+              vlib_buffer_advance (b0, temp_advance);
+            }
+          n_left--;
+        }
+    }
+
+  vlib_increment_combined_counter
+    (vnet_get_main ()->interface_main.combined_sw_if_counters
+     + VNET_INTERFACE_COUNTER_RX, thread_index, xd->sw_if_index,
+     n_rx_packets, n_rx_bytes);
+
+  vnet_device_increment_rx_packets (thread_index, n_rx_packets);
+
+  return n_rx_packets;
+}
+
+VLIB_NODE_FN (dpdk_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,