#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vppinfra/ring.h>
+#include <vppinfra/vector/ip_csum.h>
#include <vnet/ethernet/ethernet.h>
#include <vnet/ip/ip4_packet.h>
vnet_buffer_oflags_t oflags = vnet_buffer (b)->oflags;
u32 is_tcp = is_tso || oflags & VNET_BUFFER_OFFLOAD_F_TCP_CKSUM;
u32 is_udp = !is_tso && oflags & VNET_BUFFER_OFFLOAD_F_UDP_CKSUM;
+
+ if (!is_tcp && !is_udp)
+ return 0;
+
u32 is_ip4 = b->flags & VNET_BUFFER_F_IS_IP4;
u32 is_ip6 = b->flags & VNET_BUFFER_F_IS_IP6;
- ASSERT (!is_tcp || !is_udp);
+
+ ASSERT (!(is_tcp && is_udp));
ASSERT (is_ip4 || is_ip6);
i16 l2_hdr_offset = b->current_data;
i16 l3_hdr_offset = vnet_buffer (b)->l3_hdr_offset;
ip6_header_t *ip6 = (void *) (b->data + l3_hdr_offset);
tcp_header_t *tcp = (void *) (b->data + l4_hdr_offset);
udp_header_t *udp = (void *) (b->data + l4_hdr_offset);
- u16 l4_len =
- is_tcp ? tcp_header_bytes (tcp) : is_udp ? sizeof (udp_header_t) : 0;
+ u16 l4_len = is_tcp ? tcp_header_bytes (tcp) : sizeof (udp_header_t);
u16 sum = 0;
flags |= AVF_TXD_OFFSET_MACLEN (l2_len) |
AVF_TXD_OFFSET_IPLEN (l3_len) | AVF_TXD_OFFSET_L4LEN (l4_len);
flags |= is_ip4 ? AVF_TXD_CMD_IIPT_IPV4 : AVF_TXD_CMD_IIPT_IPV6;
- flags |= is_tcp ? AVF_TXD_CMD_L4T_TCP : is_udp ? AVF_TXD_CMD_L4T_UDP : 0;
+ flags |= is_tcp ? AVF_TXD_CMD_L4T_TCP : AVF_TXD_CMD_L4T_UDP;
if (is_ip4)
ip4->checksum = 0;
ip6->payload_length = 0;
}
- if (is_tcp || is_udp)
- {
if (is_ip4)
{
struct avf_ip4_psh psh = { 0 };
is_tso ? 0 :
clib_host_to_net_u16 (clib_net_to_host_u16 (ip4->length) -
(l4_hdr_offset - l3_hdr_offset));
- sum = ~ip_csum (&psh, sizeof (psh));
+ sum = ~clib_ip_csum ((u8 *) &psh, sizeof (psh));
}
else
{
psh.dst = ip6->dst_address;
psh.proto = clib_host_to_net_u32 ((u32) ip6->protocol);
psh.l4len = is_tso ? 0 : ip6->payload_length;
- sum = ~ip_csum (&psh, sizeof (psh));
+ sum = ~clib_ip_csum ((u8 *) &psh, sizeof (psh));
}
- }
- /* ip_csum does a byte swap for some reason... */
- sum = clib_net_to_host_u16 (sum);
+
if (is_tcp)
tcp->checksum = sum;
- else if (is_udp)
+ else
udp->checksum = sum;
return flags;
}
{
const u64 cmd_eop = AVF_TXD_CMD_EOP;
u16 n_free_desc, n_desc_left, n_packets_left = n_packets;
+#if defined CLIB_HAVE_VEC512
+ vlib_buffer_t *b[8];
+#else
vlib_buffer_t *b[4];
+#endif
avf_tx_desc_t *d = txq->tmp_descs;
u32 *tb = txq->tmp_bufs;
while (n_packets_left && n_desc_left)
{
+#if defined CLIB_HAVE_VEC512
+ u32 flags;
+ u64x8 or_flags_vec512;
+ u64x8 flags_mask_vec512;
+#else
u32 flags, or_flags;
+#endif
+#if defined CLIB_HAVE_VEC512
+ if (n_packets_left < 8 || n_desc_left < 8)
+#else
if (n_packets_left < 8 || n_desc_left < 4)
+#endif
goto one_by_one;
+#if defined CLIB_HAVE_VEC512
+ u64x8 base_ptr = u64x8_splat (vm->buffer_main->buffer_mem_start);
+ u32x8 buf_indices = u32x8_load_unaligned (buffers);
+
+ *(u64x8 *) &b = base_ptr + u64x8_from_u32x8 (
+ buf_indices << CLIB_LOG2_CACHE_LINE_BYTES);
+
+ or_flags_vec512 = u64x8_i64gather (u64x8_load_unaligned (b), 0, 1);
+#else
vlib_prefetch_buffer_with_index (vm, buffers[4], LOAD);
vlib_prefetch_buffer_with_index (vm, buffers[5], LOAD);
vlib_prefetch_buffer_with_index (vm, buffers[6], LOAD);
b[3] = vlib_get_buffer (vm, buffers[3]);
or_flags = b[0]->flags | b[1]->flags | b[2]->flags | b[3]->flags;
+#endif
+#if defined CLIB_HAVE_VEC512
+ flags_mask_vec512 = u64x8_splat (
+ VLIB_BUFFER_NEXT_PRESENT | VNET_BUFFER_F_OFFLOAD | VNET_BUFFER_F_GSO);
+ if (PREDICT_FALSE (
+ !u64x8_is_all_zero (or_flags_vec512 & flags_mask_vec512)))
+#else
if (PREDICT_FALSE (or_flags &
(VLIB_BUFFER_NEXT_PRESENT | VNET_BUFFER_F_OFFLOAD |
VNET_BUFFER_F_GSO)))
+#endif
goto one_by_one;
+#if defined CLIB_HAVE_VEC512
+ vlib_buffer_copy_indices (tb, buffers, 8);
+ avf_tx_fill_data_desc (vm, d + 0, b[0], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 1, b[1], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 2, b[2], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 3, b[3], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 4, b[4], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 5, b[5], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 6, b[6], cmd_eop, use_va_dma);
+ avf_tx_fill_data_desc (vm, d + 7, b[7], cmd_eop, use_va_dma);
+
+ buffers += 8;
+ n_packets_left -= 8;
+ n_desc_left -= 8;
+ d += 8;
+ tb += 8;
+#else
vlib_buffer_copy_indices (tb, buffers, 4);
avf_tx_fill_data_desc (vm, d + 0, b[0], cmd_eop, use_va_dma);
n_desc_left -= 4;
d += 4;
tb += 4;
+#endif
+
continue;
one_by_one:
{
vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
avf_device_t *ad = avf_get_device (rd->dev_instance);
- u32 thread_index = vm->thread_index;
- u8 qid = thread_index;
- avf_txq_t *txq = vec_elt_at_index (ad->txqs, qid % ad->num_queue_pairs);
+ vnet_hw_if_tx_frame_t *tf = vlib_frame_scalar_args (frame);
+ u8 qid = tf->queue_id;
+ avf_txq_t *txq = vec_elt_at_index (ad->txqs, qid);
u16 next;
u16 mask = txq->size - 1;
u32 *buffers = vlib_frame_vector_args (frame);
u16 n_enq, n_left, n_desc, *slot;
u16 n_retry = 2;
- clib_spinlock_lock_if_init (&txq->lock);
+ if (tf->shared_queue)
+ clib_spinlock_lock (&txq->lock);
n_left = frame->n_vectors;
AVF_TX_ERROR_NO_FREE_SLOTS, n_left);
}
- clib_spinlock_unlock_if_init (&txq->lock);
+ if (tf->shared_queue)
+ clib_spinlock_unlock (&txq->lock);
return frame->n_vectors - n_left;
}