src/plugins/dev_ena/tx_node.c

   1 /* SPDX-License-Identifier: Apache-2.0
   2  * Copyright (c) 2023 Cisco Systems, Inc.
   3  */
   4
   5 #include <vnet/vnet.h>
   6 #include <vnet/dev/dev.h>
   7 #include <dev_ena/ena.h>
   8 #include <vnet/ethernet/ethernet.h>
   9 #include <dev_ena/ena.h>
  10 #include <dev_ena/ena_inlines.h>
  11
/* maximum number of descriptors staged on the stack and copied to the
 * submission queue by a single ena_txq_enq () call */
#define ENA_TX_ENQ_BATCH_SZ   64
/* number of low req_id bits which hold the SQE index; the req_id bits above
 * them hold the number of buffers in the chain */
#define ENA_MAX_LOG2_TXQ_SIZE 11
/* packets with more than this many buffers following the first one are
 * dropped by ena_txq_enq_one () */
#define ENA_TX_MAX_TAIL_LEN   5
  15
/* Per-call working state of the TX node, built on the stack by the node
 * function and passed to the dequeue / enqueue helpers. */
typedef struct
{
  u32 n_bytes;	      /* sum of buffer lengths written into descriptors */
  ena_device_t *ed;   /* not referenced by the code in this file */
  u16 n_desc;	      /* queue size (number of descriptors in the ring) */
  u32 mask;	      /* n_desc - 1, turns a running counter into ring offset */
  u16 n_packets_left; /* packets of the frame not yet consumed */
  u16 n_free_slots;   /* free submission queue slots left to fill */
  u32 *from;	      /* next unconsumed buffer index of the frame */
  u32 *sq_buffer_indices; /* buffer index held by each SQ slot,
			   * VLIB_BUFFER_INVALID_INDEX once completed */
  u32 tmp_bi[VLIB_FRAME_SIZE]; /* scratch array of buffer indices shared by
				* the dequeue and enqueue paths */
  ena_tx_desc_t *sqes;	       /* submission queue descriptor ring */
  u64 *sqe_templates; /* per-slot 64-bit template OR-ed into each descriptor,
		       * its phase bit is flipped after every use */
  u16 n_dropped_chain_too_long; /* packets dropped due to too long chain */
  u8 llq;   /* non-zero selects the LLQ path in ena_txq_copy_sqes () */
  void *bd; /* not referenced by the code in this file */
} ena_tx_ctx_t;
  33
/* bits inside req_id which represent SQE index, remaining upper bits of
 * req_id carry the number of buffers in the chain */
static const u16 reqid_sqe_idx_mask = (1U << ENA_MAX_LOG2_TXQ_SIZE) - 1;
  36
  37 static_always_inline void
  38 ena_txq_adv_sq_tail (ena_tx_ctx_t *ctx, ena_txq_t *eq)
  39 {
  40   /* CQEs can arrive out of order, so we cannot blindly advance SQ tail for
  41    * number of free slots, instead we need to check if slot contains invalid
  42    * buffer index */
  43
  44   u32 sq_head = eq->sq_head;
  45   u32 sq_tail = eq->sq_tail;
  46   u16 n, offset = sq_tail & ctx->mask;
  47   u32 *bi = ctx->sq_buffer_indices + offset;
  48   u16 n_to_check = clib_min (sq_head - sq_tail, ctx->n_desc - offset);
  49
  50 advance_sq_tail:
  51   n = n_to_check;
  52
  53 #ifdef CLIB_HAVE_VEC256
  54   for (; n >= 8; n -= 8, bi += 8)
  55     if (!u32x8_is_all_equal (*(u32x8u *) bi, VLIB_BUFFER_INVALID_INDEX))
  56       break;
  57 #elif defined(CLIB_HAVE_VEC128)
  58   for (; n >= 4; n -= 4, bi += 4)
  59     if (!u32x4_is_all_equal (*(u32x4u *) bi, VLIB_BUFFER_INVALID_INDEX))
  60       break;
  61 #endif
  62
  63   for (; n > 0; n -= 1, bi += 1)
  64     if (bi[0] != VLIB_BUFFER_INVALID_INDEX)
  65       break;
  66
  67   sq_tail += n_to_check - n;
  68
  69   if (n == 0 && sq_tail < sq_head)
  70     {
  71       n_to_check = sq_head - sq_tail;
  72       bi = ctx->sq_buffer_indices;
  73       goto advance_sq_tail;
  74     }
  75
  76   eq->sq_tail = sq_tail;
  77 }
  78
  79 static_always_inline void
  80 ena_txq_deq (vlib_main_t *vm, ena_tx_ctx_t *ctx, ena_txq_t *txq)
  81 {
  82   /* dequeue CQ, extract SQ slot and number of chained buffers from
  83    * req_id, move completed buffer indices to temp array */
  84   const ena_tx_cdesc_t mask_phase = { .phase = 1 };
  85   ena_tx_cdesc_t *cqes = txq->cqes, *cd, match_phase = {};
  86   u32 cq_next = txq->cq_next;
  87   u32 offset, n = 0;
  88   u32 n_to_check;
  89   u32 *buffers_to_free = ctx->tmp_bi;
  90   u32 n_buffers_to_free = 0;
  91
  92   offset = cq_next & ctx->mask;
  93   cd = cqes + offset;
  94   n_to_check = ctx->n_desc - offset;
  95   match_phase.phase = ~(cq_next & (ctx->n_desc << 1)) != 0;
  96
  97 #ifdef CLIB_HAVE_VEC256
  98   const u16 reqid_nic1 = 1U << ENA_MAX_LOG2_TXQ_SIZE;
  99   const ena_tx_cdesc_t mask_reqid = { .req_id = reqid_sqe_idx_mask },
 100                        match_ph0_nic1 = { .req_id = reqid_nic1, .phase = 0 },
 101                        match_ph1_nic1 = { .req_id = reqid_nic1, .phase = 1 },
 102                        mask_ph_nic = { .req_id = ~reqid_sqe_idx_mask,
 103                                        .phase = 1 };
 104   /* both phase and req_id are in lower 32 bits */
 105   u32x8 mask_ph_nic_x8 = u32x8_splat (mask_ph_nic.as_u64);
 106   u32x8 mask_reqid_x8 = u32x8_splat (mask_reqid.as_u64);
 107   u32x8 match_ph_nic1_x8 = u32x8_splat (
 108     match_phase.phase ? match_ph1_nic1.as_u64 : match_ph0_nic1.as_u64);
 109   u32x8 buf_inv_idx_x8 = u32x8_splat (VLIB_BUFFER_INVALID_INDEX);
 110 #endif
 111
 112 more:
 113   while (n < n_to_check)
 114     {
 115       u16 req_id, n_in_chain;
 116
 117 #ifdef CLIB_HAVE_VEC256
 118       while (n + 7 < n_to_check)
 119         {
 120           u32x8 r, v;
 121
 122           /* load lower 32-bits of 8 CQEs in 256-bit register */
 123           r = u32x8_shuffle2 (*(u32x8u *) cd, *(u32x8u *) (cd + 4), 0, 2, 4, 6,
 124                               8, 10, 12, 14);
 125
 126           /* check if all 8 CQEs are completed and there is no chained bufs */
 127           if (u32x8_is_equal (r & mask_ph_nic_x8, match_ph_nic1_x8) == 0)
 128             goto one_by_one;
 129
 130           r &= mask_reqid_x8;
 131
 132           /* take consumed buffer indices from ring */
 133           v = u32x8_gather_u32 (ctx->sq_buffer_indices, r,
 134                                 sizeof (ctx->sq_buffer_indices[0]));
 135           u32x8_scatter_u32 (ctx->sq_buffer_indices, r, buf_inv_idx_x8,
 136                              sizeof (ctx->sq_buffer_indices[0]));
 137           *(u32x8u *) (buffers_to_free + n_buffers_to_free) = v;
 138           n_buffers_to_free += 8;
 139
 140           n += 8;
 141           cd += 8;
 142           continue;
 143         }
 144     one_by_one:
 145 #endif
 146
 147       if ((cd->as_u64 & mask_phase.as_u64) != match_phase.as_u64)
 148         goto done;
 149
 150       req_id = cd->req_id;
 151       n_in_chain = req_id >> ENA_MAX_LOG2_TXQ_SIZE;
 152       req_id &= reqid_sqe_idx_mask;
 153
 154       buffers_to_free[n_buffers_to_free++] = ctx->sq_buffer_indices[req_id];
 155       ctx->sq_buffer_indices[req_id] = VLIB_BUFFER_INVALID_INDEX;
 156
 157       if (PREDICT_FALSE (n_in_chain > 1))
 158         while (n_in_chain-- > 1)
 159           {
 160             req_id = (req_id + 1) & ctx->mask;
 161             buffers_to_free[n_buffers_to_free++] =
 162               ctx->sq_buffer_indices[req_id];
 163             ctx->sq_buffer_indices[req_id] = VLIB_BUFFER_INVALID_INDEX;
 164           }
 165
 166       n++;
 167       cd++;
 168     }
 169
 170   if (PREDICT_FALSE (n == n_to_check))
 171     {
 172       cq_next += n;
 173       n = 0;
 174       cd = cqes;
 175       match_phase.phase ^= 1;
 176 #ifdef CLIB_HAVE_VEC256
 177       match_ph_nic1_x8 ^= u32x8_splat (mask_phase.as_u64);
 178 #endif
 179       n_to_check = ctx->n_desc;
 180       goto more;
 181     }
 182
 183 done:
 184
 185   if (n_buffers_to_free)
 186     {
 187       cq_next += n;
 188
 189       /* part two - free buffers stored in temporary array */
 190       vlib_buffer_free_no_next (vm, buffers_to_free, n_buffers_to_free);
 191       txq->cq_next = cq_next;
 192
 193       ena_txq_adv_sq_tail (ctx, txq);
 194     }
 195 }
 196
 197 static_always_inline u16
 198 ena_txq_wr_sqe (vlib_main_t *vm, vlib_buffer_t *b, int use_iova,
 199                 ena_tx_desc_t *dp, u32 n_in_chain, ena_tx_desc_t desc)
 200 {
 201   uword dma_addr = use_iova ? vlib_buffer_get_current_va (b) :
 202                                     vlib_buffer_get_current_pa (vm, b);
 203   u16 len = b->current_length;
 204
 205   desc.req_id_hi = n_in_chain << (ENA_MAX_LOG2_TXQ_SIZE - 10);
 206   desc.as_u16x8[0] = len;
 207   ASSERT (dma_addr < 0xffffffffffff); /* > 48bit - should never happen */
 208   desc.as_u64x2[1] = dma_addr;        /* this also overwrites header_length */
 209
 210   /* write descriptor as single 128-bit store */
 211   dp->as_u64x2 = desc.as_u64x2;
 212   return len;
 213 }
 214
 215 static_always_inline void
 216 ena_txq_copy_sqes (ena_tx_ctx_t *ctx, u32 off, ena_tx_desc_t *s, u32 n_desc)
 217 {
 218   const u64 temp_phase_xor = (ena_tx_desc_t){ .phase = 1 }.as_u64x2[0];
 219   u32 n = 0;
 220
 221   if (ctx->llq)
 222     {
 223       ena_tx_llq_desc128_t *llq_descs = (ena_tx_llq_desc128_t *) ctx->sqes;
 224       for (; n < n_desc; n += 1, s += 1, off += 1)
 225         {
 226           ena_tx_llq_desc128_t td = {};
 227           u64 t = ctx->sqe_templates[off];
 228           u64x2 v = { t, 0 };
 229           ctx->sqe_templates[off] = t ^ temp_phase_xor;
 230           td.desc[0].as_u64x2 = v | s->as_u64x2;
 231           td.desc[0].phase = 1;
 232           td.desc[0].header_length = 96;
 233           td.desc[0].length -= 96;
 234           td.desc[0].buff_addr_lo += 96;
 235           vlib_buffer_t *b =
 236             vlib_get_buffer (vlib_get_main (), ctx->sq_buffer_indices[off]);
 237           clib_memcpy_fast (td.data, vlib_buffer_get_current (b), 96);
 238           fformat (stderr, "%U\n", format_hexdump_u32, &td, 32);
 239           fformat (stderr, "%U\n", format_ena_tx_desc, &td);
 240           clib_memcpy_fast (llq_descs + off, &td, 128);
 241         }
 242       return;
 243     }
 244
 245 #ifdef CLIB_HAVE_VEC512
 246   u64x8 temp_phase_xor_x8 = u64x8_splat (temp_phase_xor);
 247   for (; n + 7 < n_desc; n += 8, s += 8, off += 8)
 248     {
 249       u64x8 t8 = *(u64x8u *) (ctx->sqe_templates + off);
 250       *(u64x8u *) (ctx->sqe_templates + off) = t8 ^ temp_phase_xor_x8;
 251       u64x8 r0 = *(u64x8u *) s;
 252       u64x8 r1 = *(u64x8u *) (s + 4);
 253       r0 |= u64x8_shuffle2 (t8, (u64x8){}, 0, 9, 1, 11, 2, 13, 3, 15);
 254       r1 |= u64x8_shuffle2 (t8, (u64x8){}, 4, 9, 5, 11, 6, 13, 7, 15);
 255       *((u64x8u *) (ctx->sqes + off)) = r0;
 256       *((u64x8u *) (ctx->sqes + off + 4)) = r1;
 257     }
 258 #elif defined(CLIB_HAVE_VEC256)
 259   u64x4 temp_phase_xor_x4 = u64x4_splat (temp_phase_xor);
 260   for (; n + 3 < n_desc; n += 4, s += 4, off += 4)
 261     {
 262       u64x4 t4 = *(u64x4u *) (ctx->sqe_templates + off);
 263       *(u64x4u *) (ctx->sqe_templates + off) = t4 ^ temp_phase_xor_x4;
 264       u64x4 r0 = *(u64x4u *) s;
 265       u64x4 r1 = *(u64x4u *) (s + 2);
 266       r0 |= u64x4_shuffle2 (t4, (u64x4){}, 0, 5, 1, 7);
 267       r1 |= u64x4_shuffle2 (t4, (u64x4){}, 2, 5, 3, 7);
 268       *((u64x4u *) (ctx->sqes + off)) = r0;
 269       *((u64x4u *) (ctx->sqes + off + 2)) = r1;
 270     }
 271 #endif
 272
 273   for (; n < n_desc; n += 1, s += 1, off += 1)
 274     {
 275       u64 t = ctx->sqe_templates[off];
 276       u64x2 v = { t, 0 };
 277       ctx->sqe_templates[off] = t ^ temp_phase_xor;
 278       ctx->sqes[off].as_u64x2 = v | s->as_u64x2;
 279     }
 280 }
 281
 282 static_always_inline u32
 283 ena_txq_enq_one (vlib_main_t *vm, ena_tx_ctx_t *ctx, vlib_buffer_t *b0,
 284                  ena_tx_desc_t *d, u16 n_free_desc, u32 *f, int use_iova)
 285 {
 286   const ena_tx_desc_t single = { .first = 1, .last = 1 };
 287   vlib_buffer_t *b;
 288   u32 i, n;
 289
 290   /* non-chained buffer */
 291   if ((b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0)
 292     {
 293       ctx->n_bytes += ena_txq_wr_sqe (vm, b0, use_iova, d, 1, single);
 294       f[0] = ctx->from[0];
 295       ctx->from += 1;
 296       ctx->n_packets_left -= 1;
 297       return 1;
 298     }
 299
 300   /* count number of buffers in chain */
 301   for (n = 1, b = b0; b->flags & VLIB_BUFFER_NEXT_PRESENT; n++)
 302     b = vlib_get_buffer (vm, b->next_buffer);
 303
 304   /* if chain is too long, drop packet */
 305   if (n > ENA_TX_MAX_TAIL_LEN + 1)
 306     {
 307       vlib_buffer_free_one (vm, ctx->from[0]);
 308       ctx->from += 1;
 309       ctx->n_packets_left -= 1;
 310       ctx->n_dropped_chain_too_long++;
 311       return 0;
 312     }
 313
 314   /* no enough descriptors to accomodate? */
 315   if (n > n_free_desc)
 316     return 0;
 317
 318   /* first */
 319   f++[0] = ctx->from[0];
 320   ctx->from += 1;
 321   ctx->n_packets_left -= 1;
 322   ctx->n_bytes +=
 323     ena_txq_wr_sqe (vm, b0, use_iova, d++, n, (ena_tx_desc_t){ .first = 1 });
 324
 325   /* mid */
 326   for (i = 1, b = b0; i < n - 1; i++)
 327     {
 328       f++[0] = b->next_buffer;
 329       b = vlib_get_buffer (vm, b->next_buffer);
 330       ctx->n_bytes +=
 331         ena_txq_wr_sqe (vm, b, use_iova, d++, 0, (ena_tx_desc_t){});
 332     }
 333
 334   /* last */
 335   f[0] = b->next_buffer;
 336   b = vlib_get_buffer (vm, b->next_buffer);
 337   ctx->n_bytes +=
 338     ena_txq_wr_sqe (vm, b, use_iova, d, 0, (ena_tx_desc_t){ .last = 1 });
 339
 340   return n;
 341 }
 342
 343 static_always_inline uword
 344 ena_txq_enq (vlib_main_t *vm, ena_tx_ctx_t *ctx, ena_txq_t *txq, int use_iova)
 345 {
 346   vlib_buffer_t *b0, *b1, *b2, *b3;
 347   u32 *f = ctx->tmp_bi;
 348   ena_tx_desc_t desc[ENA_TX_ENQ_BATCH_SZ], *d = desc;
 349   const ena_tx_desc_t single = { .first = 1, .last = 1 };
 350   u32 n_desc_left, n;
 351
 352   if (ctx->n_packets_left == 0)
 353     return 0;
 354
 355   if (ctx->n_free_slots == 0)
 356     return 0;
 357
 358   n_desc_left = clib_min (ENA_TX_ENQ_BATCH_SZ, ctx->n_free_slots);
 359
 360   while (n_desc_left >= 4 && ctx->n_packets_left >= 8)
 361     {
 362       clib_prefetch_load (vlib_get_buffer (vm, ctx->from[4]));
 363       b0 = vlib_get_buffer (vm, ctx->from[0]);
 364       clib_prefetch_load (vlib_get_buffer (vm, ctx->from[5]));
 365       b1 = vlib_get_buffer (vm, ctx->from[1]);
 366       clib_prefetch_load (vlib_get_buffer (vm, ctx->from[6]));
 367       b2 = vlib_get_buffer (vm, ctx->from[2]);
 368       clib_prefetch_load (vlib_get_buffer (vm, ctx->from[7]));
 369       b3 = vlib_get_buffer (vm, ctx->from[3]);
 370
 371       if (PREDICT_FALSE (((b0->flags | b1->flags | b2->flags | b3->flags) &
 372                           VLIB_BUFFER_NEXT_PRESENT) == 0))
 373         {
 374           ctx->n_bytes += ena_txq_wr_sqe (vm, b0, use_iova, d++, 1, single);
 375           ctx->n_bytes += ena_txq_wr_sqe (vm, b1, use_iova, d++, 1, single);
 376           ctx->n_bytes += ena_txq_wr_sqe (vm, b2, use_iova, d++, 1, single);
 377           ctx->n_bytes += ena_txq_wr_sqe (vm, b3, use_iova, d++, 1, single);
 378           vlib_buffer_copy_indices (f, ctx->from, 4);
 379           ctx->from += 4;
 380           ctx->n_packets_left -= 4;
 381
 382           n_desc_left -= 4;
 383           f += 4;
 384         }
 385       else
 386         {
 387           n = ena_txq_enq_one (vm, ctx, b0, d, n_desc_left, f, use_iova);
 388           if (n == 0)
 389             break;
 390           n_desc_left -= n;
 391           f += n;
 392           d += n;
 393         }
 394     }
 395
 396   while (n_desc_left > 0 && ctx->n_packets_left > 0)
 397     {
 398       vlib_buffer_t *b0;
 399
 400       b0 = vlib_get_buffer (vm, ctx->from[0]);
 401       n = ena_txq_enq_one (vm, ctx, b0, d, n_desc_left, f, use_iova);
 402       if (n == 0)
 403         break;
 404       n_desc_left -= n;
 405       f += n;
 406       d += n;
 407     }
 408
 409   n = d - desc;
 410
 411   if (n)
 412     {
 413       u32 head = txq->sq_head;
 414       u32 offset = head & ctx->mask;
 415       u32 n_before_wrap = ctx->n_desc - offset;
 416       u32 n_copy;
 417
 418       d = desc;
 419       f = ctx->tmp_bi;
 420
 421       if (n_before_wrap >= n)
 422         {
 423           n_copy = n;
 424           vlib_buffer_copy_indices (ctx->sq_buffer_indices + offset, f,
 425                                     n_copy);
 426           ena_txq_copy_sqes (ctx, offset, d, n_copy);
 427         }
 428       else
 429         {
 430           n_copy = n_before_wrap;
 431           vlib_buffer_copy_indices (ctx->sq_buffer_indices + offset, f,
 432                                     n_copy);
 433           ena_txq_copy_sqes (ctx, offset, d, n_copy);
 434
 435           n_copy = n - n_before_wrap;
 436           vlib_buffer_copy_indices (ctx->sq_buffer_indices, f + n_before_wrap,
 437                                     n_copy);
 438           ena_txq_copy_sqes (ctx, 0, d + n_before_wrap, n_copy);
 439         }
 440
 441       head += n;
 442       __atomic_store_n (txq->sq_db, head, __ATOMIC_RELEASE);
 443       txq->sq_head = head;
 444       ctx->n_free_slots -= n;
 445
 446       return n;
 447     }
 448   return 0;
 449 }
 450
 451 VNET_DEV_NODE_FN (ena_tx_node)
 452 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 453 {
 454   vnet_dev_tx_node_runtime_t *tnr = vnet_dev_get_tx_node_runtime (node);
 455   vnet_dev_tx_queue_t *txq = tnr->tx_queue;
 456   vnet_dev_t *dev = txq->port->dev;
 457   ena_device_t *ed = vnet_dev_get_data (dev);
 458   ena_txq_t *eq = vnet_dev_get_tx_queue_data (txq);
 459   u32 n_pkts = 0;
 460
 461   ena_tx_ctx_t ctx = { .mask = txq->size - 1,
 462                        .n_desc = txq->size,
 463                        .n_packets_left = frame->n_vectors,
 464                        .from = vlib_frame_vector_args (frame),
 465                        .sqe_templates = eq->sqe_templates,
 466                        .sqes = eq->sqes,
 467                        .sq_buffer_indices = eq->buffer_indices,
 468                        .llq = ed->llq };
 469
 470   vnet_dev_tx_queue_lock_if_needed (txq);
 471
 472   /* try 3 times to enquee packets by first freeing consumed from the ring
 473    * and then trying to enqueue as much as possible */
 474   for (int i = 0; i < 3; i++)
 475     {
 476       /* free buffers consumed by ENA */
 477       if (eq->sq_head != eq->sq_tail)
 478         ena_txq_deq (vm, &ctx, eq);
 479
 480       /* enqueue new buffers, try until last attempt enqueues 0 packets */
 481       ctx.n_free_slots = ctx.n_desc - (eq->sq_head - eq->sq_tail);
 482
 483       if (dev->va_dma)
 484         while (ena_txq_enq (vm, &ctx, eq, /* va */ 1) > 0)
 485           ;
 486       else
 487         while (ena_txq_enq (vm, &ctx, eq, /* va */ 0) > 0)
 488           ;
 489
 490       if (ctx.n_packets_left == 0)
 491         break;
 492     }
 493
 494   vnet_dev_tx_queue_unlock_if_needed (txq);
 495
 496   if (ctx.n_dropped_chain_too_long)
 497     vlib_error_count (vm, node->node_index, ENA_TX_NODE_CTR_CHAIN_TOO_LONG,
 498                       ctx.n_dropped_chain_too_long);
 499
 500   n_pkts = frame->n_vectors - ctx.n_packets_left;
 501   vlib_increment_combined_counter (
 502     vnet_get_main ()->interface_main.combined_sw_if_counters +
 503       VNET_INTERFACE_COUNTER_TX,
 504     vm->thread_index, tnr->hw_if_index, n_pkts, ctx.n_bytes);
 505
 506   if (ctx.n_packets_left)
 507     {
 508       vlib_buffer_free (vm, ctx.from, ctx.n_packets_left);
 509       vlib_error_count (vm, node->node_index, ENA_TX_NODE_CTR_NO_FREE_SLOTS,
 510                         ctx.n_packets_left);
 511     }
 512
 513   return n_pkts;
 514 }