octeon: fix memory ordering issue in tx batch free 35/40635/4
author Damjan Marion <damarion@cisco.com>
Fri, 29 Mar 2024 20:33:03 +0000 (21:33 +0100)
committer Florin Coras <florin.coras@gmail.com>
Fri, 5 Apr 2024 19:33:32 +0000 (19:33 +0000)
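
The NPA batch-alloc engine completes asynchronously into the 128-byte
cachelines of ba_buffer, and completion was previously inferred from the
ccode field of the first word alone. The two 64-byte halves of a result
line may become visible to the CPU at different times, so the status
could already read as valid while the upper eight pointers were still
stale, leading to bogus buffer indices being freed. Instead, arm every
word of the buffer with a sentinel value that no hardware-written
pointer can match, load the status word atomically, and treat a line
with count > 8 as not ready until iova[8] no longer holds the sentinel.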
Type: fix
Fixes: 01fe7ab
Change-Id: I4425e809f0977521ddecf91b58b26fe4519dd6e0
Signed-off-by: Damjan Marion <damarion@cisco.com>
src/plugins/dev_octeon/octeon.h
src/plugins/dev_octeon/port.c
src/plugins/dev_octeon/queue.c
src/plugins/dev_octeon/tx_node.c

diff --git a/src/plugins/dev_octeon/octeon.h b/src/plugins/dev_octeon/octeon.h
index fd8a92c..72d2d56 100644
--- a/src/plugins/dev_octeon/octeon.h
+++ b/src/plugins/dev_octeon/octeon.h
@@ -15,6 +15,8 @@
 #include <base/roc_api.h>
 #include <dev_octeon/hw_defs.h>
 
+#define OCT_BATCH_ALLOC_IOVA0_MASK 0xFFFFFFFFFFFFFF80
+
 typedef enum
 {
   OCT_DEVICE_TYPE_UNKNOWN = 0,
@@ -72,13 +74,15 @@ typedef struct
 typedef struct
 {
   CLIB_ALIGN_MARK (cl, 128);
-  union
-  {
-    struct npa_batch_alloc_status_s status;
-    u64 iova[16];
-  };
+  u64 iova[16];
 } oct_npa_batch_alloc_cl128_t;
 
+typedef union
+{
+  struct npa_batch_alloc_status_s status;
+  u64 as_u64;
+} oct_npa_batch_alloc_status_t;
+
 STATIC_ASSERT_SIZEOF (oct_npa_batch_alloc_cl128_t, 128);
 
 typedef struct
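
With the union split out of the cacheline type, the status is obtained by
reinterpreting one atomically loaded u64 instead of type-punning the live
DMA memory. A minimal sketch of the intended access pattern, assuming the
ROC field layout in which count and ccode occupy the low bits of the first
word and the 128-byte-aligned first buffer pointer occupies the rest (the
helper name is hypothetical, not part of the patch):

/* hypothetical helper: decode the first word of one result line */
static_always_inline u32
oct_ba_cl_count (oct_npa_batch_alloc_cl128_t *cl, u64 *first_ptr)
{
  oct_npa_batch_alloc_status_t st;

  /* one 64-bit load snapshots count, ccode and the first pointer bits */
  st.as_u64 = __atomic_load_n (cl->iova, __ATOMIC_ACQUIRE);

  if (st.status.ccode == ALLOC_CCODE_INVAL)
    return 0; /* line not yet written by the NPA */

  /* the low bits carry the status; masking recovers the pointer */
  *first_ptr = st.as_u64 & OCT_BATCH_ALLOC_IOVA0_MASK;
  return st.status.count;
}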
diff --git a/src/plugins/dev_octeon/port.c b/src/plugins/dev_octeon/port.c
index 00ad8b9..a82e480 100644
--- a/src/plugins/dev_octeon/port.c
+++ b/src/plugins/dev_octeon/port.c
@@ -284,8 +284,11 @@ oct_txq_stop (vlib_main_t *vm, vnet_dev_tx_queue_t *txq)
     for (n = ctq->ba_num_cl, cl = ctq->ba_buffer + ctq->ba_first_cl; n;
         cl++, n--)
       {
-       if (cl->status.ccode != 0)
-         for (u32 i = 0; i < cl->status.count; i++)
+       oct_npa_batch_alloc_status_t st;
+
+       st.as_u64 = __atomic_load_n (cl->iova, __ATOMIC_ACQUIRE);
+       if (st.status.ccode != ALLOC_CCODE_INVAL)
+         for (u32 i = 0; i < st.status.count; i++)
            {
              vlib_buffer_t *b = (vlib_buffer_t *) (cl->iova[i] + off);
              vlib_buffer_free_one (vm, vlib_get_buffer_index (vm, b));
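
The stop path reads the pointers with plain loads immediately after the
status check, so the status load is an acquire: the reads of cl->iova[i]
cannot be satisfied before the status word is observed valid. The same
ordering written as an explicit fence, as a sketch:

/* sketch: an acquire load is equivalent to relaxed load + acquire fence */
st.as_u64 = __atomic_load_n (cl->iova, __ATOMIC_RELAXED);
__atomic_thread_fence (__ATOMIC_ACQUIRE);
/* plain reads of cl->iova[i] below now happen after the check above */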
diff --git a/src/plugins/dev_octeon/queue.c b/src/plugins/dev_octeon/queue.c
index 9378fc3..d6ae794 100644
--- a/src/plugins/dev_octeon/queue.c
+++ b/src/plugins/dev_octeon/queue.c
@@ -57,12 +57,20 @@ oct_tx_queue_alloc (vlib_main_t *vm, vnet_dev_tx_queue_t *txq)
   oct_txq_t *ctq = vnet_dev_get_tx_queue_data (txq);
   vnet_dev_port_t *port = txq->port;
   vnet_dev_t *dev = port->dev;
+  u32 sz = sizeof (void *) * ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS;
+  vnet_dev_rv_t rv;
 
   log_debug (dev, "tx_queue_alloc: queue %u allocated", txq->queue_id);
 
-  return vnet_dev_dma_mem_alloc (
-    vm, dev, sizeof (void *) * ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS, 128,
-    (void **) &ctq->ba_buffer);
+  rv = vnet_dev_dma_mem_alloc (vm, dev, sz, 128, (void **) &ctq->ba_buffer);
+
+  if (rv != VNET_DEV_OK)
+    return rv;
+
+  clib_memset_u64 (ctq->ba_buffer, OCT_BATCH_ALLOC_IOVA0_MASK,
+                  ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS);
+
+  return rv;
 }
 
 void
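
The clib_memset_u64 call arms every word of the freshly allocated DMA
buffer with the sentinel before the NPA ever writes to it. The value
serves both roles of the handshake: its low bits parse as count = 0 with
ccode = ALLOC_CCODE_INVAL, and since buffer pointers are 128-byte aligned
no plausible hardware-written IOVA equals 0xFFFFFFFFFFFFFF80. A plain-loop
sketch of what the call does:

/* sketch: equivalent of the clib_memset_u64 () call above */
u64 *w = (u64 *) ctq->ba_buffer;
for (u32 i = 0; i < ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS; i++)
  w[i] = OCT_BATCH_ALLOC_IOVA0_MASK; /* parses as ccode == ALLOC_CCODE_INVAL */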
diff --git a/src/plugins/dev_octeon/tx_node.c b/src/plugins/dev_octeon/tx_node.c
index 28e8f25..0dbf875 100644
--- a/src/plugins/dev_octeon/tx_node.c
+++ b/src/plugins/dev_octeon/tx_node.c
@@ -46,9 +46,12 @@ oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
 
       for (cl = ctq->ba_buffer + ctq->ba_first_cl; num_cl > 0; num_cl--, cl++)
        {
-         u8 count;
-         if (cl->status.ccode == ALLOC_CCODE_INVAL)
+         oct_npa_batch_alloc_status_t st;
+
+         if ((st.as_u64 = __atomic_load_n (cl->iova, __ATOMIC_RELAXED)) ==
+             OCT_BATCH_ALLOC_IOVA0_MASK + ALLOC_CCODE_INVAL)
            {
+           cl_not_ready:
              ctx->batch_alloc_not_ready++;
              n_freed = bi - (u32 *) ctq->ba_buffer;
              if (n_freed > 0)
@@ -63,11 +66,15 @@ oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
              return 0;
            }
 
-         count = cl->status.count;
+         if (st.status.count > 8 &&
+             __atomic_load_n (cl->iova + 8, __ATOMIC_RELAXED) ==
+               OCT_BATCH_ALLOC_IOVA0_MASK)
+           goto cl_not_ready;
+
 #if (CLIB_DEBUG > 0)
-         cl->status.count = cl->status.ccode = 0;
+         cl->iova[0] &= OCT_BATCH_ALLOC_IOVA0_MASK;
 #endif
-         if (PREDICT_TRUE (count == 16))
+         if (PREDICT_TRUE (st.status.count == 16))
            {
              /* optimize for likely case where cacheline is full */
              vlib_get_buffer_indices_with_offset (vm, (void **) cl, bi, 16,
@@ -76,9 +83,9 @@ oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
            }
          else
            {
-             vlib_get_buffer_indices_with_offset (vm, (void **) cl, bi, count,
-                                                  off);
-             bi += count;
+             vlib_get_buffer_indices_with_offset (vm, (void **) cl, bi,
+                                                  st.status.count, off);
+             bi += st.status.count;
            }
        }
 
@@ -89,7 +96,8 @@ oct_batch_free (vlib_main_t *vm, oct_tx_ctx_t *ctx, vnet_dev_tx_queue_t *txq)
       /* clear status bits in each cacheline */
       n = cl - ctq->ba_buffer;
       for (u32 i = 0; i < n; i++)
-       ctq->ba_buffer[i].iova[0] = 0;
+       ctq->ba_buffer[i].iova[0] = ctq->ba_buffer[i].iova[8] =
+         OCT_BATCH_ALLOC_IOVA0_MASK;
 
       ctq->ba_num_cl = ctq->ba_first_cl = 0;
     }
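
Taken together, the hot path now decides readiness from two sentinel
checks per 128-byte line: the first word must have changed (valid ccode),
and when more than eight pointers were returned, the first word of the
second 64-byte half must have changed as well. Relaxed loads suffice here
because each half is assumed to become visible as a unit once its sentinel
is gone, and the re-arm loop above restores the sentinel in both words so
the next batch-alloc round again starts from the not-ready state. The
predicate, distilled into a hypothetical helper (not part of the patch):

/* sketch: readiness test for one 128B batch-alloc result line */
static_always_inline int
oct_ba_cl_ready (oct_npa_batch_alloc_cl128_t *cl,
		 oct_npa_batch_alloc_status_t *st)
{
  st->as_u64 = __atomic_load_n (cl->iova, __ATOMIC_RELAXED);

  /* first half untouched: status word still holds the sentinel */
  if (st->as_u64 == OCT_BATCH_ALLOC_IOVA0_MASK + ALLOC_CCODE_INVAL)
    return 0;

  /* >8 pointers returned but the second 64B half is not visible yet */
  if (st->status.count > 8 &&
      __atomic_load_n (cl->iova + 8, __ATOMIC_RELAXED) ==
	OCT_BATCH_ALLOC_IOVA0_MASK)
    return 0;

  return 1;
}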