From ca1812dbe714fc8e4de13f88df2d3b830d95a2c9 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 8 Apr 2021 16:34:28 +0200 Subject: [PATCH] avf: avoid ring wrap in the tx path Type: improvement Change-Id: I91ecf0bff2ddd43ab5cf0f03ed2882882154557b Signed-off-by: Damjan Marion --- src/plugins/avf/avf.h | 5 +- src/plugins/avf/device.c | 5 + src/plugins/avf/output.c | 251 +++++++++++++++++++++-------------------------- 3 files changed, 122 insertions(+), 139 deletions(-) diff --git a/src/plugins/avf/avf.h b/src/plugins/avf/avf.h index 6538ff9e41d..f7ea407c698 100644 --- a/src/plugins/avf/avf.h +++ b/src/plugins/avf/avf.h @@ -149,7 +149,7 @@ typedef volatile struct STATIC_ASSERT_SIZEOF (avf_rx_desc_t, 32); -typedef volatile struct +typedef struct { union { @@ -188,6 +188,9 @@ typedef struct u32 *bufs; u16 n_enqueued; u16 *rs_slots; + + avf_tx_desc_t *tmp_descs; + u32 *tmp_bufs; } avf_txq_t; typedef struct diff --git a/src/plugins/avf/device.c b/src/plugins/avf/device.c index 14ad47c9854..e0c3c99453a 100644 --- a/src/plugins/avf/device.c +++ b/src/plugins/avf/device.c @@ -336,6 +336,9 @@ avf_txq_init (vlib_main_t * vm, avf_device_t * ad, u16 qid, u16 txq_size) /* initialize ring of pending RS slots */ clib_ring_new_aligned (txq->rs_slots, 32, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (txq->tmp_descs, txq->size, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (txq->tmp_bufs, txq->size, CLIB_CACHE_LINE_BYTES); + ad->n_tx_queues = clib_min (ad->num_queue_pairs, qid + 1); return 0; } @@ -1518,6 +1521,8 @@ avf_delete_if (vlib_main_t * vm, avf_device_t * ad, int with_barrier) vlib_buffer_free_one(vm, txq->ctx_desc_placeholder_bi); vec_free (txq->bufs); clib_ring_free (txq->rs_slots); + vec_free (txq->tmp_bufs); + vec_free (txq->tmp_descs); } /* *INDENT-ON* */ vec_free (ad->txqs); diff --git a/src/plugins/avf/output.c b/src/plugins/avf/output.c index 5bcb68cc95c..3f361f0067f 100644 --- a/src/plugins/avf/output.c +++ b/src/plugins/avf/output.c @@ -166,31 +166,81 @@ avf_tx_fill_ctx_desc (vlib_main_t * vm, avf_txq_t * txq, avf_tx_desc_t * d, return 0; } +static_always_inline void +avf_tx_copy_desc (avf_tx_desc_t *d, avf_tx_desc_t *s, u32 n_descs) +{ +#if defined CLIB_HAVE_VEC512 + while (n_descs >= 8) + { + u64x8u *dv = (u64x8u *) d; + u64x8u *sv = (u64x8u *) s; + + dv[0] = sv[0]; + dv[1] = sv[1]; + + /* next */ + d += 8; + s += 8; + n_descs -= 8; + } +#elif defined CLIB_HAVE_VEC256 + while (n_descs >= 4) + { + u64x4u *dv = (u64x4u *) d; + u64x4u *sv = (u64x4u *) s; + + dv[0] = sv[0]; + dv[1] = sv[1]; + + /* next */ + d += 4; + s += 4; + n_descs -= 4; + } +#elif defined CLIB_HAVE_VEC128 + while (n_descs >= 2) + { + u64x2u *dv = (u64x2u *) d; + u64x2u *sv = (u64x2u *) s; + + dv[0] = sv[0]; + dv[1] = sv[1]; + + /* next */ + d += 2; + s += 2; + n_descs -= 2; + } +#endif + while (n_descs) + { + d[0].qword[0] = s[0].qword[0]; + d[0].qword[1] = s[0].qword[1]; + d++; + s++; + n_descs--; + } +} static_always_inline u16 -avf_tx_enqueue (vlib_main_t * vm, vlib_node_runtime_t * node, avf_txq_t * txq, - u32 * buffers, u32 n_packets, int use_va_dma) +avf_tx_prepare (vlib_main_t *vm, vlib_node_runtime_t *node, avf_txq_t *txq, + u32 *buffers, u32 n_packets, u16 *n_enq_descs, int use_va_dma) { - u16 next = txq->next; u64 bits = AVF_TXD_CMD_EOP | AVF_TXD_CMD_RSV; const u32 offload_mask = VNET_BUFFER_F_OFFLOAD | VNET_BUFFER_F_GSO; u64 one_by_one_offload_flags = 0; int is_tso; u16 n_desc = 0; - u16 *slot, n_desc_left, n_packets_left = n_packets; - u16 mask = txq->size - 1; + u16 n_desc_left, n_packets_left = 
n_packets; vlib_buffer_t *b[4]; - avf_tx_desc_t *d = txq->descs + next; - u16 n_desc_needed; - vlib_buffer_t *b0; + avf_tx_desc_t *d = txq->tmp_descs; + u32 *tb = txq->tmp_bufs; - /* avoid ring wrap */ - n_desc_left = txq->size - clib_max (txq->next, txq->n_enqueued + 8); + n_desc_left = txq->size - txq->n_enqueued - 8; if (n_desc_left == 0) return 0; - /* Fast path, no ring wrap */ while (n_packets_left && n_desc_left) { u32 or_flags; @@ -212,7 +262,7 @@ avf_tx_enqueue (vlib_main_t * vm, vlib_node_runtime_t * node, avf_txq_t * txq, if (or_flags & (VLIB_BUFFER_NEXT_PRESENT | offload_mask)) goto one_by_one; - vlib_buffer_copy_indices (txq->bufs + next, buffers, 4); + vlib_buffer_copy_indices (tb, buffers, 4); if (use_va_dma) { @@ -234,17 +284,17 @@ avf_tx_enqueue (vlib_main_t * vm, vlib_node_runtime_t * node, avf_txq_t * txq, d[2].qword[1] = ((u64) b[2]->current_length) << 34 | bits; d[3].qword[1] = ((u64) b[3]->current_length) << 34 | bits; - next += 4; n_desc += 4; buffers += 4; n_packets_left -= 4; n_desc_left -= 4; d += 4; + tb += 4; continue; one_by_one: one_by_one_offload_flags = 0; - txq->bufs[next] = buffers[0]; + tb[0] = buffers[0]; b[0] = vlib_get_buffer (vm, buffers[0]); is_tso = ! !(b[0]->flags & VNET_BUFFER_F_GSO); if (PREDICT_FALSE (is_tso || b[0]->flags & offload_mask)) @@ -253,8 +303,8 @@ avf_tx_enqueue (vlib_main_t * vm, vlib_node_runtime_t * node, avf_txq_t * txq, /* Deal with chain buffer if present */ if (is_tso || b[0]->flags & VLIB_BUFFER_NEXT_PRESENT) { - n_desc_needed = 1 + is_tso; - b0 = b[0]; + u16 n_desc_needed = 1 + is_tso; + vlib_buffer_t *b0 = b[0]; /* Wish there were a buffer count for chain buffer */ while (b0->flags & VLIB_BUFFER_NEXT_PRESENT) @@ -287,12 +337,12 @@ avf_tx_enqueue (vlib_main_t * vm, vlib_node_runtime_t * node, avf_txq_t * txq, if (avf_tx_fill_ctx_desc (vm, txq, d, b[0])) /* Failure to acquire ref on ctx placeholder */ break; - txq->bufs[next + 1] = txq->bufs[next]; - txq->bufs[next] = txq->ctx_desc_placeholder_bi; - next += 1; + tb[1] = tb[0]; + tb[0] = txq->ctx_desc_placeholder_bi; n_desc += 1; n_desc_left -= 1; d += 1; + tb += 1; } while (b[0]->flags & VLIB_BUFFER_NEXT_PRESENT) { @@ -304,12 +354,12 @@ avf_tx_enqueue (vlib_main_t * vm, vlib_node_runtime_t * node, avf_txq_t * txq, d[0].qword[1] = (((u64) b[0]->current_length) << 34) | AVF_TXD_CMD_RSV | one_by_one_offload_flags; - next += 1; n_desc += 1; n_desc_left -= 1; d += 1; + tb += 1; - txq->bufs[next] = b[0]->next_buffer; + tb[0] = b[0]->next_buffer; b[0] = vlib_get_buffer (vm, b[0]->next_buffer); } } @@ -320,127 +370,17 @@ avf_tx_enqueue (vlib_main_t * vm, vlib_node_runtime_t * node, avf_txq_t * txq, d[0].qword[0] = vlib_buffer_get_current_pa (vm, b[0]); d[0].qword[1] = - (((u64) b[0]->current_length) << 34) | bits | - one_by_one_offload_flags; + (((u64) b[0]->current_length) << 34) | bits | one_by_one_offload_flags; - next += 1; n_desc += 1; buffers += 1; n_packets_left -= 1; n_desc_left -= 1; d += 1; + tb += 1; } - /* Slow path to support ring wrap */ - if (PREDICT_FALSE (n_packets_left)) - { - txq->n_enqueued += n_desc; - - n_desc = 0; - d = txq->descs + (next & mask); - - /* +8 to be consistent with fast path */ - n_desc_left = txq->size - (txq->n_enqueued + 8); - - while (n_packets_left && n_desc_left) - { - - txq->bufs[next & mask] = buffers[0]; - b[0] = vlib_get_buffer (vm, buffers[0]); - - one_by_one_offload_flags = 0; - is_tso = ! 
!(b[0]->flags & VNET_BUFFER_F_GSO); - if (PREDICT_FALSE (is_tso || b[0]->flags & offload_mask)) - one_by_one_offload_flags |= avf_tx_prepare_cksum (b[0], is_tso); - - /* Deal with chain buffer if present */ - if (is_tso || b[0]->flags & VLIB_BUFFER_NEXT_PRESENT) - { - n_desc_needed = 1 + is_tso; - b0 = b[0]; - - while (b0->flags & VLIB_BUFFER_NEXT_PRESENT) - { - b0 = vlib_get_buffer (vm, b0->next_buffer); - n_desc_needed++; - } - - /* Spec says data descriptor is limited to 8 segments */ - if (PREDICT_FALSE (!is_tso && n_desc_needed > 8)) - { - vlib_buffer_free_one (vm, buffers[0]); - vlib_error_count (vm, node->node_index, - AVF_TX_ERROR_SEGMENT_SIZE_EXCEEDED, 1); - n_packets_left -= 1; - buffers += 1; - continue; - } - - if (PREDICT_FALSE (n_desc_left < n_desc_needed)) - break; - - /* Enqueue a context descriptor if needed */ - if (PREDICT_FALSE (is_tso)) - { - if (avf_tx_fill_ctx_desc (vm, txq, d, b[0])) - /* Failure to acquire ref on ctx placeholder */ - break; - - txq->bufs[(next + 1) & mask] = txq->bufs[next & mask]; - txq->bufs[next & mask] = txq->ctx_desc_placeholder_bi; - next += 1; - n_desc += 1; - n_desc_left -= 1; - d = txq->descs + (next & mask); - } - while (b[0]->flags & VLIB_BUFFER_NEXT_PRESENT) - { - if (use_va_dma) - d[0].qword[0] = vlib_buffer_get_current_va (b[0]); - else - d[0].qword[0] = vlib_buffer_get_current_pa (vm, b[0]); - - d[0].qword[1] = (((u64) b[0]->current_length) << 34) | - AVF_TXD_CMD_RSV | one_by_one_offload_flags; - - next += 1; - n_desc += 1; - n_desc_left -= 1; - d = txq->descs + (next & mask); - - txq->bufs[next & mask] = b[0]->next_buffer; - b[0] = vlib_get_buffer (vm, b[0]->next_buffer); - } - } - - if (use_va_dma) - d[0].qword[0] = vlib_buffer_get_current_va (b[0]); - else - d[0].qword[0] = vlib_buffer_get_current_pa (vm, b[0]); - - d[0].qword[1] = - (((u64) b[0]->current_length) << 34) | bits | - one_by_one_offload_flags; - - next += 1; - n_desc += 1; - buffers += 1; - n_packets_left -= 1; - n_desc_left -= 1; - d = txq->descs + (next & mask); - } - } - - if ((slot = clib_ring_enq (txq->rs_slots))) - { - u16 rs_slot = slot[0] = (next - 1) & mask; - d = txq->descs + rs_slot; - d[0].qword[1] |= AVF_TXD_CMD_RS; - } - - txq->next = next & mask; - avf_tail_write (txq->qtx_tail, txq->next); - txq->n_enqueued += n_desc; + *n_enq_descs = n_desc; return n_packets - n_packets_left; } @@ -453,8 +393,10 @@ VNET_DEVICE_CLASS_TX_FN (avf_device_class) (vlib_main_t * vm, u32 thread_index = vm->thread_index; u8 qid = thread_index; avf_txq_t *txq = vec_elt_at_index (ad->txqs, qid % ad->num_queue_pairs); + u16 next = txq->next; + u16 mask = txq->size - 1; u32 *buffers = vlib_frame_vector_args (frame); - u16 n_enq, n_left; + u16 n_enq, n_left, n_desc, *slot; u16 n_retry = 2; clib_spinlock_lock_if_init (&txq->lock); @@ -494,12 +436,45 @@ retry: } } + n_desc = 0; if (ad->flags & AVF_DEVICE_F_VA_DMA) - n_enq = avf_tx_enqueue (vm, node, txq, buffers, n_left, 1); + n_enq = avf_tx_prepare (vm, node, txq, buffers, n_left, &n_desc, 1); else - n_enq = avf_tx_enqueue (vm, node, txq, buffers, n_left, 0); + n_enq = avf_tx_prepare (vm, node, txq, buffers, n_left, &n_desc, 0); - n_left -= n_enq; + if (n_desc) + { + if (PREDICT_TRUE (next + n_desc <= txq->size)) + { + /* no wrap */ + avf_tx_copy_desc (txq->descs + next, txq->tmp_descs, n_desc); + vlib_buffer_copy_indices (txq->bufs + next, txq->tmp_bufs, n_desc); + } + else + { + /* wrap */ + u32 n_not_wrap = txq->size - next; + avf_tx_copy_desc (txq->descs + next, txq->tmp_descs, n_not_wrap); + avf_tx_copy_desc (txq->descs, 
txq->tmp_descs + n_not_wrap, + n_desc - n_not_wrap); + vlib_buffer_copy_indices (txq->bufs + next, txq->tmp_bufs, + n_not_wrap); + vlib_buffer_copy_indices (txq->bufs, txq->tmp_bufs + n_not_wrap, + n_desc - n_not_wrap); + } + + next += n_desc; + if ((slot = clib_ring_enq (txq->rs_slots))) + { + u16 rs_slot = slot[0] = (next - 1) & mask; + txq->descs[rs_slot].qword[1] |= AVF_TXD_CMD_RS; + } + + txq->next = next & mask; + avf_tail_write (txq->qtx_tail, txq->next); + txq->n_enqueued += n_desc; + n_left -= n_enq; + } if (n_left) { -- 2.16.6
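
For context, the idea of the change is easiest to see in VNET_DEVICE_CLASS_TX_FN: avf_tx_prepare() now builds descriptors and buffer indices into the per-queue scratch arrays txq->tmp_descs / txq->tmp_bufs, and the result is landed in the descriptor ring with at most two contiguous avf_tx_copy_desc() / vlib_buffer_copy_indices() calls. That is what allows the old "slow path to support ring wrap" to be deleted. Below is a minimal, self-contained sketch of that prepare-then-copy pattern. All names in it (txq_t, desc_t, tx_copy_to_ring, RING_SIZE) are hypothetical, chosen only for illustration; this is not the VPP/AVF API, just the shape of the technique.

/* Illustrative sketch only -- hypothetical types and names, not the AVF code.
 * Descriptors are built in a linear scratch array first, then landed in the
 * power-of-2 ring with at most two contiguous copies, so the hot preparation
 * loop never has to mask ring indices or special-case the wrap. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define RING_SIZE 16 /* must be a power of 2, like the real TX ring */

typedef struct
{
  uint64_t qword[2]; /* same shape as a 16-byte NIC TX descriptor */
} desc_t;

typedef struct
{
  desc_t ring[RING_SIZE]; /* device-visible descriptor ring */
  desc_t tmp[RING_SIZE];  /* scratch: descriptors are prepared here first */
  uint16_t next;          /* ring slot where the next copy starts */
  uint16_t n_enqueued;    /* descriptors currently owned by the device */
} txq_t;

/* Copy n freshly prepared descriptors into the ring, splitting the copy
 * once if it would run past the end of the ring.  The caller is assumed
 * to have checked that n free slots are available. */
static void
tx_copy_to_ring (txq_t *q, uint16_t n)
{
  uint16_t next = q->next;

  if (next + n <= RING_SIZE)
    {
      /* no wrap: one contiguous copy */
      memcpy (q->ring + next, q->tmp, n * sizeof (desc_t));
    }
  else
    {
      /* wrap: fill the tail of the ring, then continue at slot 0 */
      uint16_t n_before_wrap = RING_SIZE - next;
      memcpy (q->ring + next, q->tmp, n_before_wrap * sizeof (desc_t));
      memcpy (q->ring, q->tmp + n_before_wrap,
	      (n - n_before_wrap) * sizeof (desc_t));
    }

  q->next = (next + n) & (RING_SIZE - 1);
  q->n_enqueued += n;
}

int
main (void)
{
  txq_t q = { .next = 14 }; /* start near the end so the copy wraps */

  /* "prepare" stage: fill 5 descriptors linearly, no index masking needed */
  for (uint16_t i = 0; i < 5; i++)
    q.tmp[i] = (desc_t){ .qword = { i, i } };

  tx_copy_to_ring (&q, 5);

  /* prints "next=3 n_enqueued=5": 2 descriptors before the wrap, 3 after */
  printf ("next=%u n_enqueued=%u\n", (unsigned) q.next, (unsigned) q.n_enqueued);
  return 0;
}

The trade-off is a second pass over the prepared descriptors (the copy) in exchange for a preparation loop that never masks indices and for copies that are always contiguous, and therefore easy to vectorize -- which is what the new avf_tx_copy_desc() does with 512-, 256- or 128-bit loads and stores when the corresponding CLIB_HAVE_VEC* support is available.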