crypto-sw-scheduler: crypto-dispatch improvement
[vpp.git] src/plugins/crypto_sw_scheduler/main.c
index 7de84ff..73a158e 100644
@@ -25,14 +25,14 @@ crypto_sw_scheduler_set_worker_crypto (u32 worker_idx, u8 enabled)
   crypto_sw_scheduler_main_t *cm = &crypto_sw_scheduler_main;
   vlib_thread_main_t *tm = vlib_get_thread_main ();
   crypto_sw_scheduler_per_thread_data_t *ptd = 0;
-  u32 count = 0, i = vlib_num_workers () > 0;
+  u32 count = 0, i;
 
   if (worker_idx >= vlib_num_workers ())
     {
       return VNET_API_ERROR_INVALID_VALUE;
     }
 
-  for (; i < tm->n_vlib_mains; i++)
+  for (i = 0; i < tm->n_vlib_mains; i++)
     {
       ptd = cm->per_thread_data + i;
       count += ptd->self_crypto_enabled;
@@ -74,108 +74,94 @@ crypto_sw_scheduler_key_handler (vlib_main_t * vm, vnet_crypto_key_op_t kop,
 }
 
 static int
-crypto_sw_scheduler_frame_enqueue (vlib_main_t * vm,
-                                  vnet_crypto_async_frame_t * frame)
+crypto_sw_scheduler_frame_enqueue (vlib_main_t *vm,
+                                  vnet_crypto_async_frame_t *frame, u8 is_enc)
 {
   crypto_sw_scheduler_main_t *cm = &crypto_sw_scheduler_main;
-  crypto_sw_scheduler_per_thread_data_t *ptd
-    = vec_elt_at_index (cm->per_thread_data, vm->thread_index);
-  crypto_sw_scheduler_queue_t *q = ptd->queues[frame->op];
-  u64 head = q->head;
-
-  if (q->jobs[head & CRYPTO_SW_SCHEDULER_QUEUE_MASK])
+  crypto_sw_scheduler_per_thread_data_t *ptd =
+    vec_elt_at_index (cm->per_thread_data, vm->thread_index);
+  crypto_sw_scheduler_queue_t *current_queue =
+    is_enc ? &ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_ENCRYPT] :
+            &ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_DECRYPT];
+  u64 head = current_queue->head;
+
+  if (current_queue->jobs[head & CRYPTO_SW_SCHEDULER_QUEUE_MASK])
     {
       u32 n_elts = frame->n_elts, i;
       for (i = 0; i < n_elts; i++)
        frame->elts[i].status = VNET_CRYPTO_OP_STATUS_FAIL_ENGINE_ERR;
-      frame->state = VNET_CRYPTO_FRAME_STATE_ELT_ERROR;
       return -1;
     }
-  frame->state = VNET_CRYPTO_FRAME_STATE_NOT_PROCESSED;
-  q->jobs[head & CRYPTO_SW_SCHEDULER_QUEUE_MASK] = frame;
+
+  current_queue->jobs[head & CRYPTO_SW_SCHEDULER_QUEUE_MASK] = frame;
   head += 1;
   CLIB_MEMORY_STORE_BARRIER ();
-  q->head = head;
+  current_queue->head = head;
   return 0;
 }
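The enqueue path above is a single-producer ring: the caller fails fast if the slot at `head` has not been drained yet, writes the frame into the slot, issues a store barrier, and only then publishes the new `head`. The standalone sketch below models that discipline with plain C stand-ins (the queue size, types and barrier are assumptions, not the vlib/CLIB definitions).

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the plugin's queue constants; a power-of-two size keeps
 * the "index & mask" wrap cheap, as with CRYPTO_SW_SCHEDULER_QUEUE_MASK. */
#define DEMO_QUEUE_SIZE 8u
#define DEMO_QUEUE_MASK (DEMO_QUEUE_SIZE - 1)

typedef struct
{
  uint64_t head;               /* next slot the producer fills */
  uint64_t tail;               /* next slot the consumer drains */
  void *jobs[DEMO_QUEUE_SIZE];
} demo_queue_t;

/* Same shape as crypto_sw_scheduler_frame_enqueue: refuse to overwrite an
 * occupied slot, publish the job, fence, then advance head. */
static int
demo_enqueue (demo_queue_t *q, void *job)
{
  uint64_t head = q->head;

  if (q->jobs[head & DEMO_QUEUE_MASK]) /* ring full: consumer is behind */
    return -1;

  q->jobs[head & DEMO_QUEUE_MASK] = job;
  __atomic_thread_fence (__ATOMIC_RELEASE); /* stand-in for CLIB_MEMORY_STORE_BARRIER */
  q->head = head + 1;
  return 0;
}

int
main (void)
{
  demo_queue_t q = { 0 };
  int job = 42;

  for (int i = 0; i < 10; i++) /* the 9th and 10th attempts fail */
    printf ("enqueue %d -> %d\n", i, demo_enqueue (&q, &job));
  return 0;
}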
 
-static_always_inline vnet_crypto_async_frame_t *
-crypto_sw_scheduler_get_pending_frame (crypto_sw_scheduler_queue_t * q)
+static int
+crypto_sw_scheduler_frame_enqueue_decrypt (vlib_main_t *vm,
+                                          vnet_crypto_async_frame_t *frame)
 {
-  vnet_crypto_async_frame_t *f;
-  u32 i;
-  u32 tail = q->tail;
-  u32 head = q->head;
-
-  for (i = tail; i < head; i++)
-    {
-      f = q->jobs[i & CRYPTO_SW_SCHEDULER_QUEUE_MASK];
-      if (!f)
-       continue;
-      if (clib_atomic_bool_cmp_and_swap
-         (&f->state, VNET_CRYPTO_FRAME_STATE_PENDING,
-          VNET_CRYPTO_FRAME_STATE_WORK_IN_PROGRESS))
-       {
-         return f;
-       }
-    }
-  return NULL;
+  return crypto_sw_scheduler_frame_enqueue (vm, frame, 0);
 }
 
-static_always_inline vnet_crypto_async_frame_t *
-crypto_sw_scheduler_get_completed_frame (crypto_sw_scheduler_queue_t * q)
+static int
+crypto_sw_scheduler_frame_enqueue_encrypt (vlib_main_t *vm,
+                                          vnet_crypto_async_frame_t *frame)
 {
-  vnet_crypto_async_frame_t *f = 0;
-  if (q->jobs[q->tail & CRYPTO_SW_SCHEDULER_QUEUE_MASK]
-      && q->jobs[q->tail & CRYPTO_SW_SCHEDULER_QUEUE_MASK]->state
-      >= VNET_CRYPTO_FRAME_STATE_SUCCESS)
-    {
-      u32 tail = q->tail;
-      CLIB_MEMORY_STORE_BARRIER ();
-      q->tail++;
-      f = q->jobs[tail & CRYPTO_SW_SCHEDULER_QUEUE_MASK];
-      q->jobs[tail & CRYPTO_SW_SCHEDULER_QUEUE_MASK] = 0;
-    }
-  return f;
+  return crypto_sw_scheduler_frame_enqueue (vm, frame, 1);
 }
 
 static_always_inline void
-cryptodev_sw_scheduler_sgl (vlib_main_t * vm,
-                           crypto_sw_scheduler_per_thread_data_t * ptd,
-                           vlib_buffer_t * b, vnet_crypto_op_t * op,
-                           i32 offset, i32 len)
+cryptodev_sw_scheduler_sgl (vlib_main_t *vm,
+                           crypto_sw_scheduler_per_thread_data_t *ptd,
+                           vlib_buffer_t *b, vnet_crypto_op_t *op, i16 offset,
+                           u32 len)
 {
   vnet_crypto_op_chunk_t *ch;
-  vlib_buffer_t *nb = b;
-  u32 n_chunks = 0;
-  u32 chunk_index = vec_len (ptd->chunks);
+  u32 n_chunks;
+
+  /*
+   * offset is relative to b->data (can be negative if we stay in pre_data
+   * area). Make sure it does not go beyond the 1st buffer.
+   */
+  ASSERT (b->current_data + b->current_length > offset);
+  offset = clib_min (b->current_data + b->current_length, offset);
 
-  op->flags |= VNET_CRYPTO_OP_FLAG_CHAINED_BUFFERS;
+  op->chunk_index = vec_len (ptd->chunks);
 
-  while (len)
+  vec_add2 (ptd->chunks, ch, 1);
+  ch->src = ch->dst = b->data + offset;
+  ch->len = clib_min (b->current_data + b->current_length - offset, len);
+  len -= ch->len;
+  n_chunks = 1;
+
+  while (len && b->flags & VLIB_BUFFER_NEXT_PRESENT)
     {
-      if (nb->current_data + nb->current_length > offset)
-       {
-         vec_add2 (ptd->chunks, ch, 1);
-         ch->src = ch->dst = nb->data + offset;
-         ch->len
-           = clib_min (nb->current_data + nb->current_length - offset, len);
-         len -= ch->len;
-         offset = 0;
-         n_chunks++;
-         if (!len)
-           break;
-       }
-      if (offset)
-       offset -= nb->current_data + nb->current_length;
-      if (nb->flags & VLIB_BUFFER_NEXT_PRESENT)
-       nb = vlib_get_buffer (vm, nb->next_buffer);
-      else
-       break;
+      b = vlib_get_buffer (vm, b->next_buffer);
+      vec_add2 (ptd->chunks, ch, 1);
+      ch->src = ch->dst = vlib_buffer_get_current (b);
+      ch->len = clib_min (b->current_length, len);
+      len -= ch->len;
+      n_chunks++;
+    }
+
+  if (len)
+    {
+      /* Some async crypto users can use buffers in creative ways, let's allow
+       * some flexibility here...
+       * Current example is ESP decrypt with ESN in async mode: it will stash
+       * ESN at the end of the last buffer (if it can) because it must be part
+       * of the integrity check but it will not update the buffer length.
+       * Fixup the last operation chunk length if we have room.
+       */
+      ASSERT (vlib_buffer_space_left_at_end (vm, b) >= len);
+      if (vlib_buffer_space_left_at_end (vm, b) >= len)
+       ch->len += len;
     }
 
-  ASSERT (offset == 0 && len == 0);
-  op->chunk_index = chunk_index;
   op->n_chunks = n_chunks;
 }
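The rewritten cryptodev_sw_scheduler_sgl () walks a chained buffer and emits one (src, len) chunk per buffer, clamping the first chunk to the data the head buffer actually holds and only growing the last chunk when the ESN-style tail-room fixup applies. The sketch below reproduces the chunk walk with simplified stand-in types (demo_buf_t is not the vlib_buffer_t layout).

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Simplified stand-in for a chained packet buffer. */
typedef struct demo_buf
{
  uint8_t *data;           /* start of buffer memory */
  int16_t current_data;    /* offset of valid data within the buffer */
  uint32_t current_length; /* number of valid bytes */
  struct demo_buf *next;   /* next buffer in the chain, or NULL */
} demo_buf_t;

typedef struct
{
  uint8_t *src;
  uint32_t len;
} demo_chunk_t;

/* First chunk: from `offset` into the head buffer, clamped to what that
 * buffer holds. The remaining length comes from the follow-on buffers. */
static uint32_t
demo_build_chunks (demo_buf_t *b, int16_t offset, uint32_t len,
                   demo_chunk_t *out, uint32_t max_chunks)
{
  uint32_t avail = (uint32_t) (b->current_data + b->current_length - offset);
  uint32_t take = len < avail ? len : avail;
  uint32_t n = 0;

  out[n].src = b->data + offset;
  out[n].len = take;
  len -= take;
  n++;

  while (len && b->next && n < max_chunks)
    {
      b = b->next;
      take = len < b->current_length ? len : b->current_length;
      out[n].src = b->data + b->current_data;
      out[n].len = take;
      len -= take;
      n++;
    }

  /* In the plugin, any `len` still left here is folded into the last chunk
   * when the final buffer has tail room (the ESN fixup above). */
  return n;
}

int
main (void)
{
  uint8_t d0[64] = { 0 }, d1[64] = { 0 };
  demo_buf_t b1 = { d1, 0, 40, NULL };
  demo_buf_t b0 = { d0, 0, 32, &b1 };
  demo_chunk_t ch[4];
  uint32_t n = demo_build_chunks (&b0, 8, 50, ch, 4);

  for (uint32_t i = 0; i < n; i++) /* prints 24 and 26 */
    printf ("chunk %u: len %u\n", i, ch[i].len);
  return 0;
}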
 
@@ -253,14 +239,12 @@ crypto_sw_scheduler_convert_link_crypto (vlib_main_t * vm,
   crypto_op->iv = fe->iv;
   crypto_op->key_index = key->index_crypto;
   crypto_op->user_data = 0;
+  crypto_op->flags = fe->flags & ~VNET_CRYPTO_OP_FLAG_HMAC_CHECK;
   integ_op->op = integ_op_id;
   integ_op->digest = fe->digest;
   integ_op->digest_len = digest_len;
   integ_op->key_index = key->index_integ;
-  if (is_enc)
-    crypto_op->flags |= VNET_CRYPTO_OP_FLAG_INIT_IV;
-  else
-    integ_op->flags |= VNET_CRYPTO_OP_FLAG_HMAC_CHECK;
+  integ_op->flags = fe->flags;
   crypto_op->user_data = integ_op->user_data = index;
 }
 
@@ -276,17 +260,22 @@ process_ops (vlib_main_t * vm, vnet_crypto_async_frame_t * f,
 
   n_fail = n_ops - vnet_crypto_process_ops (vm, op, n_ops);
 
-  while (n_fail)
+  /*
+   * If we had a failure in the ops then we need to walk all the ops
+   * and set the status in the corresponding frame. This status is
+   * not set in the case with no failures, as in that case the overall
+   * frame status is success.
+   */
+  if (n_fail)
     {
-      ASSERT (op - ops < n_ops);
-
-      if (op->status != VNET_CRYPTO_OP_STATUS_COMPLETED)
+      for (int i = 0; i < n_ops; i++)
        {
+         ASSERT (op - ops < n_ops);
+
          f->elts[op->user_data].status = op->status;
-         *state = VNET_CRYPTO_FRAME_STATE_ELT_ERROR;
-         n_fail--;
+         op++;
        }
-      op++;
+      *state = VNET_CRYPTO_FRAME_STATE_ELT_ERROR;
     }
 }
 
@@ -303,170 +292,287 @@ process_chained_ops (vlib_main_t * vm, vnet_crypto_async_frame_t * f,
 
   n_fail = n_ops - vnet_crypto_process_chained_ops (vm, op, chunks, n_ops);
 
-  while (n_fail)
+  /*
+   * If we had a failure in the ops then we need to walk all the ops
+   * and set the status in the corresponding frame. This status is
+   * not set in the case with no failures, as in that case the overall
+   * frame status is success.
+   */
+  if (n_fail)
     {
-      ASSERT (op - ops < n_ops);
-
-      if (op->status != VNET_CRYPTO_OP_STATUS_COMPLETED)
+      for (int i = 0; i < n_ops; i++)
        {
+         ASSERT (op - ops < n_ops);
+
          f->elts[op->user_data].status = op->status;
-         *state = VNET_CRYPTO_FRAME_STATE_ELT_ERROR;
-         n_fail--;
+         op++;
        }
-      op++;
+      *state = VNET_CRYPTO_FRAME_STATE_ELT_ERROR;
     }
 }
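Both loops above implement the same contract: per-element status is written back only when at least one op failed; otherwise the frame keeps its overall SUCCESS state. A minimal model of that error path, with simplified stand-in types:

#include <stdio.h>

typedef struct { int status; unsigned user_data; } demo_op_t;
typedef struct { int status; } demo_elt_t;

#define DEMO_FRAME_SUCCESS 0
#define DEMO_FRAME_ELT_ERROR 1 /* stand-in for VNET_CRYPTO_FRAME_STATE_ELT_ERROR */

static void
demo_propagate (demo_elt_t *elts, demo_op_t *ops, int n_ops, int n_fail,
                int *frame_state)
{
  if (!n_fail)
    return;                    /* frame-level state stays SUCCESS */

  /* walk every op, copy its status back to the element it came from */
  for (int i = 0; i < n_ops; i++)
    elts[ops[i].user_data].status = ops[i].status;

  *frame_state = DEMO_FRAME_ELT_ERROR;
}

int
main (void)
{
  demo_op_t ops[3] = { { 0, 0 }, { -2, 1 }, { 0, 2 } }; /* op 1 failed */
  demo_elt_t elts[3] = { { 0 }, { 0 }, { 0 } };
  int frame_state = DEMO_FRAME_SUCCESS;

  demo_propagate (elts, ops, 3, 1, &frame_state);
  printf ("elt 1 status %d, frame state %d\n", elts[1].status, frame_state);
  return 0;
}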
 
-static_always_inline vnet_crypto_async_frame_t *
-crypto_sw_scheduler_dequeue_aead (vlib_main_t * vm,
-                                 vnet_crypto_async_op_id_t async_op_id,
-                                 vnet_crypto_op_id_t sync_op_id, u8 tag_len,
-                                 u8 aad_len, u32 * nb_elts_processed,
-                                 u32 * enqueue_thread_idx)
+static_always_inline void
+crypto_sw_scheduler_process_aead (vlib_main_t *vm,
+                                 crypto_sw_scheduler_per_thread_data_t *ptd,
+                                 vnet_crypto_async_frame_t *f, u32 aead_op,
+                                 u32 aad_len, u32 digest_len)
 {
-  crypto_sw_scheduler_main_t *cm = &crypto_sw_scheduler_main;
-  crypto_sw_scheduler_per_thread_data_t *ptd = 0;
-  crypto_sw_scheduler_queue_t *q = 0;
-  vnet_crypto_async_frame_t *f = 0;
   vnet_crypto_async_frame_elt_t *fe;
   u32 *bi;
-  u32 n_elts;
-  int i = 0;
+  u32 n_elts = f->n_elts;
   u8 state = VNET_CRYPTO_FRAME_STATE_SUCCESS;
 
-  if (cm->per_thread_data[vm->thread_index].self_crypto_enabled)
-    {
-      /* *INDENT-OFF* */
-      vec_foreach_index (i, cm->per_thread_data)
-      {
-        ptd = cm->per_thread_data + i;
-        q = ptd->queues[async_op_id];
-        f = crypto_sw_scheduler_get_pending_frame (q);
-        if (f)
-          break;
-      }
-      /* *INDENT-ON* */
-    }
+  vec_reset_length (ptd->crypto_ops);
+  vec_reset_length (ptd->integ_ops);
+  vec_reset_length (ptd->chained_crypto_ops);
+  vec_reset_length (ptd->chained_integ_ops);
+  vec_reset_length (ptd->chunks);
 
-  ptd = cm->per_thread_data + vm->thread_index;
+  fe = f->elts;
+  bi = f->buffer_indices;
 
-  if (f)
+  while (n_elts--)
     {
-      *nb_elts_processed = n_elts = f->n_elts;
-      fe = f->elts;
-      bi = f->buffer_indices;
-
-      vec_reset_length (ptd->crypto_ops);
-      vec_reset_length (ptd->chained_crypto_ops);
-      vec_reset_length (ptd->chunks);
-
-      while (n_elts--)
-       {
-         if (n_elts > 1)
-           CLIB_PREFETCH (fe + 1, CLIB_CACHE_LINE_BYTES, LOAD);
+      if (n_elts > 1)
+       clib_prefetch_load (fe + 1);
 
-         crypto_sw_scheduler_convert_aead (vm, ptd, fe, fe - f->elts, bi[0],
-                                           sync_op_id, aad_len, tag_len);
-         bi++;
-         fe++;
-       }
+      crypto_sw_scheduler_convert_aead (vm, ptd, fe, fe - f->elts, bi[0],
+                                       aead_op, aad_len, digest_len);
+      bi++;
+      fe++;
+    }
 
-      process_ops (vm, f, ptd->crypto_ops, &state);
-      process_chained_ops (vm, f, ptd->chained_crypto_ops, ptd->chunks,
-                           &state);
-      f->state = state;
-      *enqueue_thread_idx = f->enqueue_thread_index;
-    }
-
-  return crypto_sw_scheduler_get_completed_frame (ptd->queues[async_op_id]);
+  process_ops (vm, f, ptd->crypto_ops, &state);
+  process_chained_ops (vm, f, ptd->chained_crypto_ops, ptd->chunks,
+                      &state);
+  f->state = state;
 }
 
-static_always_inline vnet_crypto_async_frame_t *
-crypto_sw_scheduler_dequeue_link (vlib_main_t * vm,
-                                 vnet_crypto_async_op_id_t async_op_id,
-                                 vnet_crypto_op_id_t sync_crypto_op_id,
-                                 vnet_crypto_op_id_t sync_integ_op_id,
-                                 u16 digest_len, u8 is_enc,
-                                 u32 * nb_elts_processed,
-                                 u32 * enqueue_thread_idx)
+static_always_inline void
+crypto_sw_scheduler_process_link (vlib_main_t *vm,
+                                 crypto_sw_scheduler_main_t *cm,
+                                 crypto_sw_scheduler_per_thread_data_t *ptd,
+                                 vnet_crypto_async_frame_t *f, u32 crypto_op,
+                                 u32 auth_op, u16 digest_len, u8 is_enc)
 {
-  crypto_sw_scheduler_main_t *cm = &crypto_sw_scheduler_main;
-  crypto_sw_scheduler_per_thread_data_t *ptd = 0;
-  crypto_sw_scheduler_queue_t *q = 0;
-  vnet_crypto_async_frame_t *f = 0;
   vnet_crypto_async_frame_elt_t *fe;
   u32 *bi;
-  u32 n_elts;
-  int i = 0;
+  u32 n_elts = f->n_elts;
   u8 state = VNET_CRYPTO_FRAME_STATE_SUCCESS;
 
-  if (cm->per_thread_data[vm->thread_index].self_crypto_enabled)
+  vec_reset_length (ptd->crypto_ops);
+  vec_reset_length (ptd->integ_ops);
+  vec_reset_length (ptd->chained_crypto_ops);
+  vec_reset_length (ptd->chained_integ_ops);
+  vec_reset_length (ptd->chunks);
+  fe = f->elts;
+  bi = f->buffer_indices;
+
+  while (n_elts--)
+    {
+      if (n_elts > 1)
+       clib_prefetch_load (fe + 1);
+
+      crypto_sw_scheduler_convert_link_crypto (
+       vm, ptd, cm->keys + fe->key_index, fe, fe - f->elts, bi[0], crypto_op,
+       auth_op, digest_len, is_enc);
+      bi++;
+      fe++;
+    }
+
+  if (is_enc)
     {
-      /* *INDENT-OFF* */
-      vec_foreach_index (i, cm->per_thread_data)
-      {
-        ptd = cm->per_thread_data + i;
-        q = ptd->queues[async_op_id];
-        f = crypto_sw_scheduler_get_pending_frame (q);
-        if (f)
-          break;
-      }
-      /* *INDENT-ON* */
+      process_ops (vm, f, ptd->crypto_ops, &state);
+      process_chained_ops (vm, f, ptd->chained_crypto_ops, ptd->chunks,
+                          &state);
+      process_ops (vm, f, ptd->integ_ops, &state);
+      process_chained_ops (vm, f, ptd->chained_integ_ops, ptd->chunks, &state);
+    }
+  else
+    {
+      process_ops (vm, f, ptd->integ_ops, &state);
+      process_chained_ops (vm, f, ptd->chained_integ_ops, ptd->chunks, &state);
+      process_ops (vm, f, ptd->crypto_ops, &state);
+      process_chained_ops (vm, f, ptd->chained_crypto_ops, ptd->chunks,
+                          &state);
     }
 
-  ptd = cm->per_thread_data + vm->thread_index;
+  f->state = state;
+}
 
-  if (f)
+static_always_inline int
+convert_async_crypto_id (vnet_crypto_async_op_id_t async_op_id, u32 *crypto_op,
+                        u32 *auth_op_or_aad_len, u16 *digest_len, u8 *is_enc)
+{
+  switch (async_op_id)
     {
-      vec_reset_length (ptd->crypto_ops);
-      vec_reset_length (ptd->integ_ops);
-      vec_reset_length (ptd->chained_crypto_ops);
-      vec_reset_length (ptd->chained_integ_ops);
-      vec_reset_length (ptd->chunks);
+#define _(n, s, k, t, a)                                                      \
+  case VNET_CRYPTO_OP_##n##_TAG##t##_AAD##a##_ENC:                            \
+    *crypto_op = VNET_CRYPTO_OP_##n##_ENC;                                    \
+    *auth_op_or_aad_len = a;                                                  \
+    *digest_len = t;                                                          \
+    *is_enc = 1;                                                              \
+    return 1;                                                                 \
+  case VNET_CRYPTO_OP_##n##_TAG##t##_AAD##a##_DEC:                            \
+    *crypto_op = VNET_CRYPTO_OP_##n##_DEC;                                    \
+    *auth_op_or_aad_len = a;                                                  \
+    *digest_len = t;                                                          \
+    *is_enc = 0;                                                              \
+    return 1;
+      foreach_crypto_aead_async_alg
+#undef _
+
+#define _(c, h, s, k, d)                                                      \
+  case VNET_CRYPTO_OP_##c##_##h##_TAG##d##_ENC:                               \
+    *crypto_op = VNET_CRYPTO_OP_##c##_ENC;                                    \
+    *auth_op_or_aad_len = VNET_CRYPTO_OP_##h##_HMAC;                          \
+    *digest_len = d;                                                          \
+    *is_enc = 1;                                                              \
+    return 0;                                                                 \
+  case VNET_CRYPTO_OP_##c##_##h##_TAG##d##_DEC:                               \
+    *crypto_op = VNET_CRYPTO_OP_##c##_DEC;                                    \
+    *auth_op_or_aad_len = VNET_CRYPTO_OP_##h##_HMAC;                          \
+    *digest_len = d;                                                          \
+    *is_enc = 0;                                                              \
+    return 0;
+       foreach_crypto_link_async_alg
+#undef _
 
-      *nb_elts_processed = n_elts = f->n_elts;
-      fe = f->elts;
-      bi = f->buffer_indices;
+       default: return -1;
+    }
 
-      while (n_elts--)
-       {
-         if (n_elts > 1)
-           CLIB_PREFETCH (fe + 1, CLIB_CACHE_LINE_BYTES, LOAD);
-
-         crypto_sw_scheduler_convert_link_crypto (vm, ptd,
-                                                  cm->keys + fe->key_index,
-                                                  fe, fe - f->elts, bi[0],
-                                                  sync_crypto_op_id,
-                                                  sync_integ_op_id,
-                                                  digest_len, is_enc);
-         bi++;
-         fe++;
-       }
+  return -1;
+}
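convert_async_crypto_id () collapses what used to be one generated dequeue function per algorithm into a single X-macro-driven switch: the async op id is mapped to the sync crypto op, the AAD length or HMAC op, the digest length and the direction, with the return value telling the caller whether the frame is AEAD (1), linked crypto+integ (0) or unknown (-1). The toy below shows the same X-macro pattern with made-up algorithm names, not the VPP op list.

#include <stdio.h>

/* Each list entry expands both into the enum and into matching switch
 * cases, so the two can never drift apart. */
#define foreach_demo_alg _ (AES_128_GCM, 16, 8) _ (AES_256_GCM, 16, 12)

typedef enum
{
#define _(n, t, a) DEMO_OP_##n##_ENC, DEMO_OP_##n##_DEC,
  foreach_demo_alg
#undef _
    DEMO_OP_N
} demo_op_id_t;

static int
demo_convert (demo_op_id_t op, unsigned *tag_len, unsigned *aad_len,
              int *is_enc)
{
  switch (op)
    {
#define _(n, t, a)                                                            \
  case DEMO_OP_##n##_ENC:                                                     \
    *tag_len = t; *aad_len = a; *is_enc = 1;                                  \
    return 1;                                                                 \
  case DEMO_OP_##n##_DEC:                                                     \
    *tag_len = t; *aad_len = a; *is_enc = 0;                                  \
    return 1;
      foreach_demo_alg
#undef _
    default:
      return -1;
    }
}

int
main (void)
{
  unsigned tag, aad;
  int enc;

  if (demo_convert (DEMO_OP_AES_256_GCM_DEC, &tag, &aad, &enc) == 1)
    printf ("tag %u aad %u enc %d\n", tag, aad, enc); /* tag 16 aad 12 enc 0 */
  return 0;
}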
 
-      if (is_enc)
-       {
-         process_ops (vm, f, ptd->crypto_ops, &state);
-         process_chained_ops (vm, f, ptd->chained_crypto_ops, ptd->chunks,
-                              &state);
-         process_ops (vm, f, ptd->integ_ops, &state);
-         process_chained_ops (vm, f, ptd->chained_integ_ops, ptd->chunks,
-                              &state);
-       }
-      else
+static_always_inline vnet_crypto_async_frame_t *
+crypto_sw_scheduler_dequeue (vlib_main_t *vm, u32 *nb_elts_processed,
+                            u32 *enqueue_thread_idx)
+{
+  crypto_sw_scheduler_main_t *cm = &crypto_sw_scheduler_main;
+  crypto_sw_scheduler_per_thread_data_t *ptd =
+    cm->per_thread_data + vm->thread_index;
+  vnet_crypto_async_frame_t *f = 0;
+  crypto_sw_scheduler_queue_t *current_queue = 0;
+  u32 tail, head;
+  u8 found = 0;
+  u8 recheck_queues = 1;
+
+run_next_queues:
+  /* get a pending frame to process */
+  if (ptd->self_crypto_enabled)
+    {
+      u32 i = ptd->last_serve_lcore_id + 1;
+
+      while (1)
        {
-         process_ops (vm, f, ptd->integ_ops, &state);
-         process_chained_ops (vm, f, ptd->chained_integ_ops, ptd->chunks,
-                              &state);
-         process_ops (vm, f, ptd->crypto_ops, &state);
-         process_chained_ops (vm, f, ptd->chained_crypto_ops, ptd->chunks,
-                              &state);
+         crypto_sw_scheduler_per_thread_data_t *st;
+         u32 j;
+
+         if (i >= vec_len (cm->per_thread_data))
+           i = 0;
+
+         st = cm->per_thread_data + i;
+
+         if (ptd->last_serve_encrypt)
+           current_queue = &st->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_DECRYPT];
+         else
+           current_queue = &st->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_ENCRYPT];
+
+         tail = current_queue->tail;
+         head = current_queue->head;
+
+         /* Skip this queue unless tail < head or head has overflowed
+          * and tail has not. At the point where tail overflows (== 0),
+          * the largest possible value of head is (queue size - 1).
+          * Prior to that, the largest possible value of head is
+          * (queue size - 2).
+          */
+         if ((tail > head) && (head >= CRYPTO_SW_SCHEDULER_QUEUE_MASK))
+           goto skip_queue;
+
+         for (j = tail; j != head; j++)
+           {
+
+             f = current_queue->jobs[j & CRYPTO_SW_SCHEDULER_QUEUE_MASK];
+
+             if (!f)
+               continue;
+
+             if (clib_atomic_bool_cmp_and_swap (
+                   &f->state, VNET_CRYPTO_FRAME_STATE_PENDING,
+                   VNET_CRYPTO_FRAME_STATE_WORK_IN_PROGRESS))
+               {
+                 found = 1;
+                 break;
+               }
+           }
+
+       skip_queue:
+         if (found || i == ptd->last_serve_lcore_id)
+           {
+             CLIB_MEMORY_STORE_BARRIER ();
+             ptd->last_serve_encrypt = !ptd->last_serve_encrypt;
+             break;
+           }
+
+         i++;
        }
 
-      f->state = state;
+      ptd->last_serve_lcore_id = i;
+    }
+
+  if (found)
+    {
+      u32 crypto_op, auth_op_or_aad_len;
+      u16 digest_len;
+      u8 is_enc;
+      int ret;
+
+      ret = convert_async_crypto_id (f->op, &crypto_op, &auth_op_or_aad_len,
+                                    &digest_len, &is_enc);
+
+      if (ret == 1)
+       crypto_sw_scheduler_process_aead (vm, ptd, f, crypto_op,
+                                         auth_op_or_aad_len, digest_len);
+      else if (ret == 0)
+       crypto_sw_scheduler_process_link (
+         vm, cm, ptd, f, crypto_op, auth_op_or_aad_len, digest_len, is_enc);
+
       *enqueue_thread_idx = f->enqueue_thread_index;
+      *nb_elts_processed = f->n_elts;
+    }
+
+  if (ptd->last_return_queue)
+    {
+      current_queue = &ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_DECRYPT];
+      ptd->last_return_queue = 0;
+    }
+  else
+    {
+      current_queue = &ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_ENCRYPT];
+      ptd->last_return_queue = 1;
+    }
+
+  tail = current_queue->tail & CRYPTO_SW_SCHEDULER_QUEUE_MASK;
+
+  if (current_queue->jobs[tail] &&
+      current_queue->jobs[tail]->state >= VNET_CRYPTO_FRAME_STATE_SUCCESS)
+    {
+
+      CLIB_MEMORY_STORE_BARRIER ();
+      current_queue->tail++;
+      f = current_queue->jobs[tail];
+      current_queue->jobs[tail] = 0;
+
+      return f;
     }
 
-  return crypto_sw_scheduler_get_completed_frame (ptd->queues[async_op_id]);
+  if (!found && recheck_queues)
+    {
+      recheck_queues = 0;
+      goto run_next_queues;
+    }
+  return 0;
 }
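The new dequeue handler replaces the per-algorithm dequeue functions: each call serves the per-thread queues round-robin starting one past the thread served last time, alternating between the encrypt and decrypt queues, and claims a frame with a compare-and-swap on its state before processing it; completed frames are then returned from the caller's own queues, again alternating. The sketch below isolates just the round-robin selection, using a stand-in "has pending work" flag per thread instead of the real queues.

#include <stdint.h>
#include <stdio.h>

#define DEMO_N_THREADS 4

/* Scan starts one past the thread served last time, wraps at the end and
 * stops when work is found or when the scan is back where it started, so
 * every thread's queue gets an equal chance over time. */
static int
demo_pick_next (uint8_t pending[DEMO_N_THREADS], uint32_t *last_served)
{
  uint32_t i = *last_served + 1;

  while (1)
    {
      if (i >= DEMO_N_THREADS)
        i = 0;

      if (pending[i])
        {
          *last_served = i;
          return (int) i;      /* found a queue with work */
        }

      if (i == *last_served)
        return -1;             /* full loop, nothing pending anywhere */

      i++;
    }
}

int
main (void)
{
  uint8_t pending[DEMO_N_THREADS] = { 0, 0, 1, 1 };
  uint32_t last = 0;

  printf ("%d\n", demo_pick_next (pending, &last)); /* 2 */
  printf ("%d\n", demo_pick_next (pending, &last)); /* 3 */
  printf ("%d\n", demo_pick_next (pending, &last)); /* 2 again */
  return 0;
}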
 
 static clib_error_t *
@@ -526,14 +632,12 @@ sw_scheduler_set_worker_crypto (vlib_main_t * vm, unformat_input_t * input,
  * @cliexstart{set sw_scheduler worker 0 crypto off}
  * @cliexend
  ?*/
-/* *INDENT-OFF* */
 VLIB_CLI_COMMAND (cmd_set_sw_scheduler_worker_crypto, static) = {
   .path = "set sw_scheduler",
   .short_help = "set sw_scheduler worker <idx> crypto <on|off>",
   .function = sw_scheduler_set_worker_crypto,
   .is_mp_safe = 1,
 };
-/* *INDENT-ON* */
 
 static clib_error_t *
 sw_scheduler_show_workers (vlib_main_t * vm, unformat_input_t * input,
@@ -562,14 +666,12 @@ sw_scheduler_show_workers (vlib_main_t * vm, unformat_input_t * input,
  * @cliexstart{show sw_scheduler workers}
  * @cliexend
  ?*/
-/* *INDENT-OFF* */
 VLIB_CLI_COMMAND (cmd_show_sw_scheduler_workers, static) = {
   .path = "show sw_scheduler workers",
   .short_help = "show sw_scheduler workers",
   .function = sw_scheduler_show_workers,
   .is_mp_safe = 1,
 };
-/* *INDENT-ON* */
 
 clib_error_t *
 sw_scheduler_cli_init (vlib_main_t * vm)
@@ -579,50 +681,6 @@ sw_scheduler_cli_init (vlib_main_t * vm)
 
 VLIB_INIT_FUNCTION (sw_scheduler_cli_init);
 
-/* *INDENT-OFF* */
-#define _(n, s, k, t, a)                                                      \
-  static vnet_crypto_async_frame_t                                            \
-      *crypto_sw_scheduler_frame_dequeue_##n##_TAG_##t##_AAD_##a##_enc (      \
-          vlib_main_t *vm, u32 *nb_elts_processed, u32 * thread_idx)          \
-  {                                                                           \
-    return crypto_sw_scheduler_dequeue_aead (                                 \
-        vm, VNET_CRYPTO_OP_##n##_TAG##t##_AAD##a##_ENC,                       \
-        VNET_CRYPTO_OP_##n##_ENC, t, a, nb_elts_processed, thread_idx);       \
-  }                                                                           \
-  static vnet_crypto_async_frame_t                                            \
-      *crypto_sw_scheduler_frame_dequeue_##n##_TAG_##t##_AAD_##a##_dec (      \
-          vlib_main_t *vm, u32 *nb_elts_processed, u32 * thread_idx)          \
-  {                                                                           \
-    return crypto_sw_scheduler_dequeue_aead (                                 \
-        vm, VNET_CRYPTO_OP_##n##_TAG##t##_AAD##a##_DEC,                       \
-        VNET_CRYPTO_OP_##n##_DEC, t, a, nb_elts_processed, thread_idx);       \
-  }
-foreach_crypto_aead_async_alg
-#undef _
-
-#define _(c, h, s, k, d)                                                      \
-  static vnet_crypto_async_frame_t                                            \
-      *crypto_sw_scheduler_frame_dequeue_##c##_##h##_TAG##d##_enc (           \
-          vlib_main_t *vm, u32 *nb_elts_processed, u32 * thread_idx)          \
-  {                                                                           \
-    return crypto_sw_scheduler_dequeue_link (                                 \
-        vm, VNET_CRYPTO_OP_##c##_##h##_TAG##d##_ENC,                          \
-        VNET_CRYPTO_OP_##c##_ENC, VNET_CRYPTO_OP_##h##_HMAC, d, 1,            \
-        nb_elts_processed, thread_idx);                                       \
-  }                                                                           \
-  static vnet_crypto_async_frame_t                                            \
-      *crypto_sw_scheduler_frame_dequeue_##c##_##h##_TAG##d##_dec (           \
-          vlib_main_t *vm, u32 *nb_elts_processed, u32 * thread_idx)          \
-  {                                                                           \
-    return crypto_sw_scheduler_dequeue_link (                                 \
-        vm, VNET_CRYPTO_OP_##c##_##h##_TAG##d##_DEC,                          \
-        VNET_CRYPTO_OP_##c##_DEC, VNET_CRYPTO_OP_##h##_HMAC, d, 0,            \
-        nb_elts_processed, thread_idx);                                       \
-  }
-    foreach_crypto_link_async_alg
-#undef _
-        /* *INDENT-ON* */
-
 crypto_sw_scheduler_main_t crypto_sw_scheduler_main;
 clib_error_t *
 crypto_sw_scheduler_init (vlib_main_t * vm)
@@ -631,26 +689,33 @@ crypto_sw_scheduler_init (vlib_main_t * vm)
   vlib_thread_main_t *tm = vlib_get_thread_main ();
   clib_error_t *error = 0;
   crypto_sw_scheduler_per_thread_data_t *ptd;
-
-  u32 queue_size = CRYPTO_SW_SCHEDULER_QUEUE_SIZE * sizeof (void *)
-    + sizeof (crypto_sw_scheduler_queue_t);
+  u32 i;
 
   vec_validate_aligned (cm->per_thread_data, tm->n_vlib_mains - 1,
                        CLIB_CACHE_LINE_BYTES);
 
-  vec_foreach (ptd, cm->per_thread_data)
-  {
-    ptd->self_crypto_enabled = 1;
-    u32 i;
-    for (i = 0; i < VNET_CRYPTO_ASYNC_OP_N_IDS; i++)
-      {
-       crypto_sw_scheduler_queue_t *q
-         = clib_mem_alloc_aligned (queue_size, CLIB_CACHE_LINE_BYTES);
-       ASSERT (q != 0);
-       ptd->queues[i] = q;
-       clib_memset_u8 (q, 0, queue_size);
-      }
-  }
+  for (i = 0; i < tm->n_vlib_mains; i++)
+    {
+      ptd = cm->per_thread_data + i;
+      ptd->self_crypto_enabled = i > 0 || vlib_num_workers () < 1;
+
+      ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_DECRYPT].head = 0;
+      ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_DECRYPT].tail = 0;
+
+      vec_validate_aligned (
+       ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_DECRYPT].jobs,
+       CRYPTO_SW_SCHEDULER_QUEUE_SIZE - 1, CLIB_CACHE_LINE_BYTES);
+
+      ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_ENCRYPT].head = 0;
+      ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_ENCRYPT].tail = 0;
+
+      ptd->last_serve_encrypt = 0;
+      ptd->last_return_queue = 0;
+
+      vec_validate_aligned (
+       ptd->queue[CRYPTO_SW_SCHED_QUEUE_TYPE_ENCRYPT].jobs,
+       CRYPTO_SW_SCHEDULER_QUEUE_SIZE - 1, CLIB_CACHE_LINE_BYTES);
+    }
 
   cm->crypto_engine_index =
     vnet_crypto_register_engine (vm, "sw_scheduler", 100,
@@ -661,33 +726,28 @@ crypto_sw_scheduler_init (vlib_main_t * vm)
 
   crypto_sw_scheduler_api_init (vm);
 
-  /* *INDENT-OFF* */
 #define _(n, s, k, t, a)                                                      \
-  vnet_crypto_register_async_handler (                                        \
-      vm, cm->crypto_engine_index,                                            \
-      VNET_CRYPTO_OP_##n##_TAG##t##_AAD##a##_ENC,                             \
-      crypto_sw_scheduler_frame_enqueue,                                      \
-      crypto_sw_scheduler_frame_dequeue_##n##_TAG_##t##_AAD_##a##_enc);       \
-  vnet_crypto_register_async_handler (                                        \
-      vm, cm->crypto_engine_index,                                            \
-      VNET_CRYPTO_OP_##n##_TAG##t##_AAD##a##_DEC,                             \
-      crypto_sw_scheduler_frame_enqueue,                                      \
-      crypto_sw_scheduler_frame_dequeue_##n##_TAG_##t##_AAD_##a##_dec);
+  vnet_crypto_register_enqueue_handler (                                      \
+    vm, cm->crypto_engine_index, VNET_CRYPTO_OP_##n##_TAG##t##_AAD##a##_ENC,  \
+    crypto_sw_scheduler_frame_enqueue_encrypt);                               \
+  vnet_crypto_register_enqueue_handler (                                      \
+    vm, cm->crypto_engine_index, VNET_CRYPTO_OP_##n##_TAG##t##_AAD##a##_DEC,  \
+    crypto_sw_scheduler_frame_enqueue_decrypt);
   foreach_crypto_aead_async_alg
 #undef _
 
 #define _(c, h, s, k, d)                                                      \
-  vnet_crypto_register_async_handler (                                        \
-      vm, cm->crypto_engine_index, VNET_CRYPTO_OP_##c##_##h##_TAG##d##_ENC,   \
-      crypto_sw_scheduler_frame_enqueue,                                      \
-      crypto_sw_scheduler_frame_dequeue_##c##_##h##_TAG##d##_enc);            \
-  vnet_crypto_register_async_handler (                                        \
-      vm, cm->crypto_engine_index, VNET_CRYPTO_OP_##c##_##h##_TAG##d##_DEC,   \
-      crypto_sw_scheduler_frame_enqueue,                                      \
-      crypto_sw_scheduler_frame_dequeue_##c##_##h##_TAG##d##_dec);
-      foreach_crypto_link_async_alg
+  vnet_crypto_register_enqueue_handler (                                      \
+    vm, cm->crypto_engine_index, VNET_CRYPTO_OP_##c##_##h##_TAG##d##_ENC,     \
+    crypto_sw_scheduler_frame_enqueue_encrypt);                               \
+  vnet_crypto_register_enqueue_handler (                                      \
+    vm, cm->crypto_engine_index, VNET_CRYPTO_OP_##c##_##h##_TAG##d##_DEC,     \
+    crypto_sw_scheduler_frame_enqueue_decrypt);
+    foreach_crypto_link_async_alg
 #undef _
-      /* *INDENT-ON* */
+
+  vnet_crypto_register_dequeue_handler (vm, cm->crypto_engine_index,
+                                       crypto_sw_scheduler_dequeue);
 
   if (error)
     vec_free (cm->per_thread_data);
@@ -695,7 +755,6 @@ crypto_sw_scheduler_init (vlib_main_t * vm)
   return error;
 }
 
-/* *INDENT-OFF* */
 VLIB_INIT_FUNCTION (crypto_sw_scheduler_init) = {
   .runs_after = VLIB_INITS ("vnet_crypto_init"),
 };
@@ -704,7 +763,6 @@ VLIB_PLUGIN_REGISTER () = {
   .version = VPP_BUILD_VER,
   .description = "SW Scheduler Crypto Async Engine plugin",
 };
-/* *INDENT-ON* */
 
 /*
  * fd.io coding-style-patch-verification: ON