src/plugins/dev_octeon/tx_node.c

   1 /* SPDX-License-Identifier: Apache-2.0
   2  * Copyright (c) 2023 Cisco Systems, Inc.
   3  */
   4
   5 #include <vlib/vlib.h>
   6 #include <vppinfra/ring.h>
   7 #include <vppinfra/vector/ip_csum.h>
   8
   9 #include <vnet/dev/dev.h>
  10 #include <vnet/ethernet/ethernet.h>
  11 #include <vnet/ip/ip4_packet.h>
  12 #include <vnet/ip/ip6_packet.h>
  13 #include <vnet/udp/udp_packet.h>
  14 #include <vnet/tcp/tcp_packet.h>
  15
  16 #include <dev_octeon/octeon.h>
  17
  18 typedef struct
  19 {
  20   union nix_send_hdr_w0_u hdr_w0_teplate;
  21   vlib_node_runtime_t *node;
  22   u32 n_tx_bytes;
  23   u32 n_drop;
  24   vlib_buffer_t *drop[VLIB_FRAME_SIZE];
  25   u32 batch_alloc_not_ready;
  26   u32 batch_alloc_issue_fail;
  27   u16 lmt_id;
  28   u64 lmt_ioaddr;
  29   lmt_line_t *lmt_lines;
  30 } oct_tx_ctx_t;
  31
  32 static_always_inline u32
  33 oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
  34 {
  35   oct_txq_t *ctq = vnet_dev_get_tx_queue_data (txq);
  36   u8 num_cl;
  37   u64 ah;
  38   u32 n_freed = 0, n;
  39   oct_npa_batch_alloc_cl128_t *cl;
  40
  41   num_cl = ctq->ba_num_cl;
  42   if (num_cl)
  43     {
  44       u16 off = ctq->hdr_off;
  45       u32 *bi = (u32 *) ctq->ba_buffer;
  46
  47       for (cl = ctq->ba_buffer + ctq->ba_first_cl; num_cl > 0; num_cl--, cl++)
  48         {
  49           oct_npa_batch_alloc_status_t st;
  50
  51           if ((st.as_u64 = __atomic_load_n (cl->iova, __ATOMIC_RELAXED)) ==
  52               OCT_BATCH_ALLOC_IOVA0_MASK + ALLOC_CCODE_INVAL)
  53             {
  54             cl_not_ready:
  55               ctx->batch_alloc_not_ready++;
  56               n_freed = bi - (u32 *) ctq->ba_buffer;
  57               if (n_freed > 0)
  58                 {
  59                   vlib_buffer_free_no_next (vm, (u32 *) ctq->ba_buffer,
  60                                             n_freed);
  61                   ctq->ba_num_cl = num_cl;
  62                   ctq->ba_first_cl = cl - ctq->ba_buffer;
  63                   return n_freed;
  64                 }
  65
  66               return 0;
  67             }
  68
  69           if (st.status.count > 8 &&
  70               __atomic_load_n (cl->iova + 8, __ATOMIC_RELAXED) ==
  71                 OCT_BATCH_ALLOC_IOVA0_MASK)
  72             goto cl_not_ready;
  73
  74 #if (CLIB_DEBUG > 0)
  75           cl->iova[0] &= OCT_BATCH_ALLOC_IOVA0_MASK;
  76 #endif
  77           if (PREDICT_TRUE (st.status.count == 16))
  78             {
  79               /* optimize for likely case where cacheline is full */
  80               vlib_get_buffer_indices_with_offset (vm, (void **) cl, bi, 16,
  81                                                    off);
  82               bi += 16;
  83             }
  84           else
  85             {
  86               vlib_get_buffer_indices_with_offset (vm, (void **) cl, bi,
  87                                                    st.status.count, off);
  88               bi += st.status.count;
  89             }
  90         }
  91
  92       n_freed = bi - (u32 *) ctq->ba_buffer;
  93       if (n_freed > 0)
  94         vlib_buffer_free_no_next (vm, (u32 *) ctq->ba_buffer, n_freed);
  95
  96       /* clear status bits in each cacheline */
  97       n = cl - ctq->ba_buffer;
  98       for (u32 i = 0; i < n; i++)
  99         ctq->ba_buffer[i].iova[0] = ctq->ba_buffer[i].iova[8] =
 100           OCT_BATCH_ALLOC_IOVA0_MASK;
 101
 102       ctq->ba_num_cl = ctq->ba_first_cl = 0;
 103     }
 104
 105   ah = ctq->aura_handle;
 106
 107   if ((n = roc_npa_aura_op_available (ah)) >= 32)
 108     {
 109       u64 addr, res;
 110
 111       n = clib_min (n, ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS);
 112
 113       oct_npa_batch_alloc_compare_t cmp = {
 114         .compare_s = { .aura = roc_npa_aura_handle_to_aura (ah),
 115                        .stype = ALLOC_STYPE_STF,
 116                        .count = n }
 117       };
 118
 119       addr = roc_npa_aura_handle_to_base (ah) + NPA_LF_AURA_BATCH_ALLOC;
 120       res = roc_atomic64_casl (cmp.as_u64, (uint64_t) ctq->ba_buffer,
 121                                (i64 *) addr);
 122       if (res == ALLOC_RESULT_ACCEPTED || res == ALLOC_RESULT_NOCORE)
 123         {
 124           ctq->ba_num_cl = (n + 15) / 16;
 125           ctq->ba_first_cl = 0;
 126         }
 127       else
 128         ctx->batch_alloc_issue_fail++;
 129     }
 130
 131   return n_freed;
 132 }
 133
 134 static_always_inline u8
 135 oct_tx_enq1 (vlib_main_t *vm, oct_tx_ctx_t *ctx, vlib_buffer_t *b,
 136              lmt_line_t *line, u32 flags, int simple, int trace, u32 *n,
 137              u8 *dpl)
 138 {
 139   u8 n_dwords = 2;
 140   u32 total_len = 0;
 141   oct_tx_desc_t d = {
 142     .hdr_w0 = ctx->hdr_w0_teplate,
 143     .sg[0] = {
 144       .segs = 1,
 145       .subdc = NIX_SUBDC_SG,
 146     },
 147     .sg[4] = {
 148       .subdc = NIX_SUBDC_SG,
 149     },
 150   };
 151
 152   if (!simple && flags & VLIB_BUFFER_NEXT_PRESENT)
 153     {
 154       u8 n_tail_segs = 0;
 155       vlib_buffer_t *tail_segs[5], *t = b;
 156
 157       while (t->flags & VLIB_BUFFER_NEXT_PRESENT)
 158         {
 159           t = vlib_get_buffer (vm, t->next_buffer);
 160           tail_segs[n_tail_segs++] = t;
 161           if (n_tail_segs > 5)
 162             {
 163               ctx->drop[ctx->n_drop++] = b;
 164               return 0;
 165             }
 166         }
 167
 168       switch (n_tail_segs)
 169         {
 170         case 5:
 171           d.sg[7].u = (u64) vlib_buffer_get_current (tail_segs[4]);
 172           total_len += d.sg[4].seg3_size = tail_segs[4]->current_length;
 173           d.sg[4].segs++;
 174         case 4:
 175           d.sg[6].u = (u64) vlib_buffer_get_current (tail_segs[3]);
 176           total_len += d.sg[4].seg2_size = tail_segs[3]->current_length;
 177           d.sg[4].segs++;
 178           n_dwords++;
 179         case 3:
 180           d.sg[5].u = (u64) vlib_buffer_get_current (tail_segs[2]);
 181           total_len += d.sg[4].seg1_size = tail_segs[2]->current_length;
 182           d.sg[4].segs++;
 183           n_dwords++;
 184         case 2:
 185           d.sg[3].u = (u64) vlib_buffer_get_current (tail_segs[1]);
 186           total_len += d.sg[0].seg3_size = tail_segs[1]->current_length;
 187           d.sg[0].segs++;
 188         case 1:
 189           d.sg[2].u = (u64) vlib_buffer_get_current (tail_segs[0]);
 190           total_len += d.sg[0].seg2_size = tail_segs[0]->current_length;
 191           d.sg[0].segs++;
 192           n_dwords++;
 193         default:
 194           break;
 195         };
 196       d.hdr_w0.sizem1 = n_dwords - 1;
 197     }
 198
 199   if (!simple && flags & VNET_BUFFER_F_OFFLOAD)
 200     {
 201       vnet_buffer_oflags_t oflags = vnet_buffer (b)->oflags;
 202       if (oflags & VNET_BUFFER_OFFLOAD_F_IP_CKSUM)
 203         {
 204           d.hdr_w1.ol3type = NIX_SENDL3TYPE_IP4_CKSUM;
 205           d.hdr_w1.ol3ptr = vnet_buffer (b)->l3_hdr_offset;
 206           d.hdr_w1.ol4ptr =
 207             vnet_buffer (b)->l3_hdr_offset + sizeof (ip4_header_t);
 208         }
 209       if (oflags & VNET_BUFFER_OFFLOAD_F_UDP_CKSUM)
 210         {
 211           d.hdr_w1.ol4type = NIX_SENDL4TYPE_UDP_CKSUM;
 212           d.hdr_w1.ol4ptr = vnet_buffer (b)->l4_hdr_offset;
 213         }
 214       else if (oflags & VNET_BUFFER_OFFLOAD_F_TCP_CKSUM)
 215         {
 216           d.hdr_w1.ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
 217           d.hdr_w1.ol4ptr = vnet_buffer (b)->l4_hdr_offset;
 218         }
 219     }
 220
 221   total_len += d.sg[0].seg1_size = b->current_length;
 222   d.hdr_w0.total = total_len;
 223   d.sg[1].u = (u64) vlib_buffer_get_current (b);
 224
 225   if (trace && flags & VLIB_BUFFER_IS_TRACED)
 226     {
 227       oct_tx_trace_t *t = vlib_add_trace (vm, ctx->node, b, sizeof (*t));
 228       t->desc = d;
 229       t->sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_TX];
 230     }
 231
 232   for (u32 i = 0; i < n_dwords; i++)
 233     line->dwords[i] = d.as_u128[i];
 234
 235   *dpl = n_dwords;
 236   *n = *n + 1;
 237
 238   return n_dwords;
 239 }
 240
 241 static_always_inline u32
 242 oct_tx_enq16 (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq,
 243               vlib_buffer_t **b, u32 n_pkts, int trace)
 244 {
 245   u8 dwords_per_line[16], *dpl = dwords_per_line;
 246   u64 lmt_arg, ioaddr, n_lines;
 247   u32 n_left, or_flags_16 = 0, n = 0;
 248   const u32 not_simple_flags =
 249     VLIB_BUFFER_NEXT_PRESENT | VNET_BUFFER_F_OFFLOAD;
 250   lmt_line_t *l = ctx->lmt_lines;
 251
 252   /* Data Store Memory Barrier - outer shareable domain */
 253   asm volatile("dmb oshst" ::: "memory");
 254
 255   for (n_left = n_pkts; n_left >= 8; n_left -= 8, b += 8)
 256     {
 257       u32 f0, f1, f2, f3, f4, f5, f6, f7, or_f = 0;
 258       vlib_prefetch_buffer_header (b[8], LOAD);
 259       or_f |= f0 = b[0]->flags;
 260       or_f |= f1 = b[1]->flags;
 261       vlib_prefetch_buffer_header (b[9], LOAD);
 262       or_f |= f2 = b[2]->flags;
 263       or_f |= f3 = b[3]->flags;
 264       vlib_prefetch_buffer_header (b[10], LOAD);
 265       or_f |= f4 = b[4]->flags;
 266       or_f |= f5 = b[5]->flags;
 267       vlib_prefetch_buffer_header (b[11], LOAD);
 268       or_f |= f6 = b[6]->flags;
 269       or_f |= f7 = b[7]->flags;
 270       vlib_prefetch_buffer_header (b[12], LOAD);
 271       or_flags_16 |= or_f;
 272
 273       if ((or_f & not_simple_flags) == 0)
 274         {
 275           int simple = 1;
 276           oct_tx_enq1 (vm, ctx, b[0], l, f0, simple, trace, &n, &dpl[n]);
 277           oct_tx_enq1 (vm, ctx, b[1], l + n, f1, simple, trace, &n, &dpl[n]);
 278           vlib_prefetch_buffer_header (b[13], LOAD);
 279           oct_tx_enq1 (vm, ctx, b[2], l + n, f2, simple, trace, &n, &dpl[n]);
 280           oct_tx_enq1 (vm, ctx, b[3], l + n, f3, simple, trace, &n, &dpl[n]);
 281           vlib_prefetch_buffer_header (b[14], LOAD);
 282           oct_tx_enq1 (vm, ctx, b[4], l + n, f4, simple, trace, &n, &dpl[n]);
 283           oct_tx_enq1 (vm, ctx, b[5], l + n, f5, simple, trace, &n, &dpl[n]);
 284           vlib_prefetch_buffer_header (b[15], LOAD);
 285           oct_tx_enq1 (vm, ctx, b[6], l + n, f6, simple, trace, &n, &dpl[n]);
 286           oct_tx_enq1 (vm, ctx, b[7], l + n, f7, simple, trace, &n, &dpl[n]);
 287         }
 288       else
 289         {
 290           int simple = 0;
 291           oct_tx_enq1 (vm, ctx, b[0], l, f0, simple, trace, &n, &dpl[n]);
 292           oct_tx_enq1 (vm, ctx, b[1], l + n, f1, simple, trace, &n, &dpl[n]);
 293           vlib_prefetch_buffer_header (b[13], LOAD);
 294           oct_tx_enq1 (vm, ctx, b[2], l + n, f2, simple, trace, &n, &dpl[n]);
 295           oct_tx_enq1 (vm, ctx, b[3], l + n, f3, simple, trace, &n, &dpl[n]);
 296           vlib_prefetch_buffer_header (b[14], LOAD);
 297           oct_tx_enq1 (vm, ctx, b[4], l + n, f4, simple, trace, &n, &dpl[n]);
 298           oct_tx_enq1 (vm, ctx, b[5], l + n, f5, simple, trace, &n, &dpl[n]);
 299           vlib_prefetch_buffer_header (b[15], LOAD);
 300           oct_tx_enq1 (vm, ctx, b[6], l + n, f6, simple, trace, &n, &dpl[n]);
 301           oct_tx_enq1 (vm, ctx, b[7], l + n, f7, simple, trace, &n, &dpl[n]);
 302         }
 303       dpl += n;
 304       l += n;
 305       n = 0;
 306     }
 307
 308   for (; n_left > 0; n_left -= 1, b += 1)
 309     {
 310       u32 f0 = b[0]->flags;
 311       oct_tx_enq1 (vm, ctx, b[0], l, f0, 0, trace, &n, &dpl[n]);
 312       or_flags_16 |= f0;
 313       dpl += n;
 314       l += n;
 315       n = 0;
 316     }
 317
 318   lmt_arg = ctx->lmt_id;
 319   ioaddr = ctx->lmt_ioaddr;
 320   n_lines = dpl - dwords_per_line;
 321
 322   if (PREDICT_FALSE (!n_lines))
 323     return n_pkts;
 324
 325   if (PREDICT_FALSE (or_flags_16 & VLIB_BUFFER_NEXT_PRESENT))
 326     {
 327       dpl = dwords_per_line;
 328       ioaddr |= (dpl[0] - 1) << 4;
 329
 330       if (n_lines > 1)
 331         {
 332           lmt_arg |= (--n_lines) << 12;
 333
 334           for (u8 bit_off = 19; n_lines; n_lines--, bit_off += 3, dpl++)
 335             lmt_arg |= ((u64) dpl[1] - 1) << bit_off;
 336         }
 337     }
 338   else
 339     {
 340       const u64 n_dwords = 2;
 341       ioaddr |= (n_dwords - 1) << 4;
 342
 343       if (n_lines > 1)
 344         {
 345           lmt_arg |= (--n_lines) << 12;
 346
 347           for (u8 bit_off = 19; n_lines; n_lines--, bit_off += 3)
 348             lmt_arg |= (n_dwords - 1) << bit_off;
 349         }
 350     }
 351
 352   roc_lmt_submit_steorl (lmt_arg, ioaddr);
 353
 354   return n_pkts;
 355 }
 356
 357 VNET_DEV_NODE_FN (oct_tx_node)
 358 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 359 {
 360   vnet_dev_tx_node_runtime_t *rt = vnet_dev_get_tx_node_runtime (node);
 361   vnet_dev_tx_queue_t *txq = rt->tx_queue;
 362   oct_txq_t *ctq = vnet_dev_get_tx_queue_data (txq);
 363   u32 node_index = node->node_index;
 364   u32 *from = vlib_frame_vector_args (frame);
 365   u32 n, n_enq, n_left, n_pkts = frame->n_vectors;
 366   vlib_buffer_t *buffers[VLIB_FRAME_SIZE + 8], **b = buffers;
 367   u64 lmt_id = vm->thread_index << ROC_LMT_LINES_PER_CORE_LOG2;
 368
 369   oct_tx_ctx_t ctx = {
 370     .node = node,
 371     .hdr_w0_teplate = {
 372       .aura = roc_npa_aura_handle_to_aura (ctq->aura_handle),
 373       .sq = ctq->sq.qid,
 374       .sizem1 = 1,
 375     },
 376     .lmt_id = lmt_id,
 377     .lmt_ioaddr = ctq->io_addr,
 378     .lmt_lines = ctq->lmt_addr + (lmt_id << ROC_LMT_LINE_SIZE_LOG2),
 379   };
 380
 381   vlib_get_buffers (vm, vlib_frame_vector_args (frame), b, n_pkts);
 382   for (int i = 0; i < 8; i++)
 383     b[n_pkts + i] = b[n_pkts - 1];
 384
 385   vnet_dev_tx_queue_lock_if_needed (txq);
 386
 387   n_enq = ctq->n_enq;
 388   n_enq -= oct_batch_free (vm, &ctx, txq);
 389
 390   if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
 391     {
 392       for (n_left = clib_min (n_pkts, txq->size - n_enq), n = 0; n_left >= 16;
 393            n_left -= 16, b += 16)
 394         n += oct_tx_enq16 (vm, &ctx, txq, b, 16, /* trace */ 1);
 395
 396       if (n_left)
 397         n += oct_tx_enq16 (vm, &ctx, txq, b, n_left, /* trace */ 1);
 398     }
 399   else
 400     {
 401       for (n_left = clib_min (n_pkts, txq->size - n_enq), n = 0; n_left >= 16;
 402            n_left -= 16, b += 16)
 403         n += oct_tx_enq16 (vm, &ctx, txq, b, 16, /* trace */ 0);
 404
 405       if (n_left)
 406         n += oct_tx_enq16 (vm, &ctx, txq, b, n_left, /* trace */ 0);
 407     }
 408
 409   ctq->n_enq = n_enq + n - ctx.n_drop;
 410
 411   if (n < n_pkts)
 412     {
 413       u32 n_free = n_pkts - n;
 414       vlib_buffer_free (vm, from + n, n_free);
 415       vlib_error_count (vm, node->node_index, OCT_TX_NODE_CTR_NO_FREE_SLOTS,
 416                         n_free);
 417       n_pkts -= n_free;
 418     }
 419
 420   if (ctx.n_drop)
 421     vlib_error_count (vm, node->node_index, OCT_TX_NODE_CTR_CHAIN_TOO_LONG,
 422                       ctx.n_drop);
 423
 424   if (ctx.batch_alloc_not_ready)
 425     vlib_error_count (vm, node_index,
 426                       OCT_TX_NODE_CTR_AURA_BATCH_ALLOC_NOT_READY,
 427                       ctx.batch_alloc_not_ready);
 428
 429   if (ctx.batch_alloc_issue_fail)
 430     vlib_error_count (vm, node_index,
 431                       OCT_TX_NODE_CTR_AURA_BATCH_ALLOC_ISSUE_FAIL,
 432                       ctx.batch_alloc_issue_fail);
 433
 434   vnet_dev_tx_queue_unlock_if_needed (txq);
 435
 436   if (ctx.n_drop)
 437     {
 438       u32 bi[VLIB_FRAME_SIZE];
 439       vlib_get_buffer_indices (vm, ctx.drop, bi, ctx.n_drop);
 440       vlib_buffer_free (vm, bi, ctx.n_drop);
 441       n_pkts -= ctx.n_drop;
 442     }
 443
 444   return n_pkts;
 445 }