crypto: use fixed crypto frame pool
[vpp.git] / src / plugins / wireguard / wireguard_output_tun.c
index 2feb057..a563081 100644 (file)
 #include <wireguard/wireguard.h>
 #include <wireguard/wireguard_send.h>
 
-#define foreach_wg_output_error                                         \
- _(NONE, "No error")                                                   \
- _(PEER, "Peer error")                                                  \
- _(KEYPAIR, "Keypair error")                                            \
- _(TOO_BIG, "packet too big")                                           \
+#define foreach_wg_output_error                                               \
+  _ (NONE, "No error")                                                        \
+  _ (PEER, "Peer error")                                                      \
+  _ (KEYPAIR, "Keypair error")                                                \
+  _ (NO_BUFFERS, "No buffers")                                                \
+  _ (CRYPTO_ENGINE_ERROR, "crypto engine error (packet dropped)") /* one _ (symbol, description) entry per error; symbols are used below as WG_OUTPUT_ERROR_<symbol> */
 
 typedef enum
 {
@@ -56,6 +57,12 @@ typedef struct
   u8 is_ip4;
 } wg_output_tun_trace_t;
 
+typedef struct /* trace record written by the wg output post nodes */
+{
+  index_t peer; /* peer index looked up from the buffer's TX adjacency */
+  u32 next_index; /* next node index read from the buffer's post data */
+} wg_output_tun_post_trace_t;
+
 u8 *
 format_ip4_udp_header (u8 * s, va_list * args)
 {
@@ -93,6 +100,123 @@ format_wg_output_tun_trace (u8 * s, va_list * args)
   return s;
 }
 
+/* Trace formatter for the wg output post nodes.
+ *
+ * Consumes (vlib_main_t *, vlib_node_t *, wg_output_tun_post_trace_t *) from
+ * args, in that order, and appends the recorded peer index and next node
+ * index to s. Returns the vector handed back by format (). */
+static u8 *
+format_wg_output_tun_post_trace (u8 *s, va_list *args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  wg_output_tun_post_trace_t *trace =
+    va_arg (*args, wg_output_tun_post_trace_t *);
+
+  s = format (s, "peer: %d\n", trace->peer);
+  return format (s, "  wg-post: next node index %u", trace->next_index);
+}
+
+static_always_inline void
+wg_output_chain_crypto (vlib_main_t *vm, wg_per_thread_data_t *ptd,
+                       vlib_buffer_t *b, vlib_buffer_t *lb, u8 *start,
+                       u32 start_len, u16 *n_ch)
+{ /* Append to ptd->chunks one in-place (src == dst) crypto chunk per buffer
+   * of the chain that starts at b: the first chunk covers start/start_len,
+   * every following buffer contributes its current data, and the chunk for
+   * lb is shortened by NOISE_AUTHTAG_LEN. The number of chunks appended is
+   * stored in *n_ch when n_ch is not NULL. The code unconditionally follows
+   * b->next_buffer, so b must have a next buffer (the visible caller only
+   * calls this when b != lb). */
+  vnet_crypto_op_chunk_t *ch;
+  vlib_buffer_t *cb = b;
+  u32 n_chunks = 1;
+
+  vec_add2 (ptd->chunks, ch, 1);
+  ch->len = start_len;
+  ch->src = ch->dst = start;
+  cb = vlib_get_buffer (vm, cb->next_buffer); /* first buffer is covered by start/start_len */
+
+  while (1)
+    {
+      vec_add2 (ptd->chunks, ch, 1);
+      n_chunks += 1;
+      if (lb == cb)
+       ch->len = cb->current_length - NOISE_AUTHTAG_LEN; /* leave the tag bytes out of the chunk */
+      else
+       ch->len = cb->current_length;
+
+      ch->src = ch->dst = vlib_buffer_get_current (cb);
+
+      if (!(cb->flags & VLIB_BUFFER_NEXT_PRESENT))
+       break;
+
+      cb = vlib_get_buffer (vm, cb->next_buffer);
+    }
+
+  if (n_ch)
+    *n_ch = n_chunks;
+}
+
+/* Append one synchronous ChaCha20-Poly1305 encrypt op for a packet.
+ *
+ * vm, ptd     - vlib main and per-thread data; ptd->chunks receives the chunk
+ *               list when the packet is chained
+ * b, lb       - first and last buffer of the packet (b == lb when unchained)
+ * crypto_ops  - op vector the new op is appended to
+ * src/src_len - data to encrypt in the first buffer
+ * dst         - output pointer, used for unchained packets only
+ * aad/aad_len - additional authenticated data
+ * nonce       - 64-bit counter copied to iv[4..11]; iv[0..3] are zeroed
+ * key_index   - crypto key index stored in the op
+ * bi          - value stored in op->user_data
+ * iv          - caller-provided storage, at least 12 bytes are written
+ *
+ * The tag pointer is set to the last NOISE_AUTHTAG_LEN bytes of lb's data.
+ */
+static_always_inline void
+wg_prepare_sync_enc_op (vlib_main_t *vm, wg_per_thread_data_t *ptd,
+                       vlib_buffer_t *b, vlib_buffer_t *lb,
+                       vnet_crypto_op_t **crypto_ops, u8 *src, u32 src_len,
+                       u8 *dst, u8 *aad, u32 aad_len, u64 nonce,
+                       vnet_crypto_key_index_t key_index, u32 bi, u8 *iv)
+{
+  /* Placeholder source used when src is NULL. It has static storage because
+   * the op is appended to a vector and outlives this call, so a pointer to
+   * an automatic array would dangle. */
+  static u8 empty_src[1];
+  vnet_crypto_op_t *op;
+
+  clib_memset (iv, 0, 4);
+  clib_memcpy (iv + 4, &nonce, sizeof (nonce));
+
+  /* op is set by the macro to the newly appended element */
+  vec_add2_aligned (crypto_ops[0], op, 1, CLIB_CACHE_LINE_BYTES);
+  vnet_crypto_op_init (op, VNET_CRYPTO_OP_CHACHA20_POLY1305_ENC);
+
+  op->tag_len = NOISE_AUTHTAG_LEN;
+  op->tag = vlib_buffer_get_tail (lb) - NOISE_AUTHTAG_LEN;
+  op->key_index = key_index;
+  op->aad = aad;
+  op->aad_len = aad_len;
+  op->iv = iv;
+  op->user_data = bi;
+
+  if (b != lb)
+    {
+      /* Chained buffers */
+      op->flags |= VNET_CRYPTO_OP_FLAG_CHAINED_BUFFERS;
+      op->chunk_index = vec_len (ptd->chunks);
+      wg_output_chain_crypto (vm, ptd, b, lb, src, src_len, &op->n_chunks);
+    }
+  else
+    {
+      op->src = !src ? empty_src : src;
+      op->len = src_len;
+      op->dst = dst;
+    }
+}
+
+/* Run the queued chained-buffer crypto ops and redirect failed packets.
+ *
+ * ops/chunks are handed to vnet_crypto_process_chained_ops (). For every op
+ * whose status is not VNET_CRYPTO_OP_STATUS_COMPLETED, the buffer selected by
+ * op->user_data gets the CRYPTO_ENGINE_ERROR error and its entry in nexts is
+ * set to drop_next. Nothing is done when ops is empty.
+ */
+static_always_inline void
+wg_output_process_chained_ops (vlib_main_t *vm, vlib_node_runtime_t *node,
+                              vnet_crypto_op_t *ops, vlib_buffer_t *b[],
+                              u16 *nexts, vnet_crypto_op_chunk_t *chunks,
+                              u16 drop_next)
+{
+  u32 n_total = vec_len (ops);
+  vnet_crypto_op_t *cur;
+  u32 n_bad;
+
+  if (0 == n_total)
+    return;
+
+  n_bad =
+    n_total - vnet_crypto_process_chained_ops (vm, ops, chunks, n_total);
+
+  /* Walk the ops until every reported failure has been accounted for */
+  for (cur = ops; n_bad != 0; cur++)
+    {
+      ASSERT (cur - ops < n_total);
+
+      if (VNET_CRYPTO_OP_STATUS_COMPLETED == cur->status)
+        continue;
+
+      u32 slot = cur->user_data;
+      b[slot]->error = node->errors[WG_OUTPUT_ERROR_CRYPTO_ENGINE_ERROR];
+      nexts[slot] = drop_next;
+      n_bad--;
+    }
+}
+
 static_always_inline void
 wg_output_process_ops (vlib_main_t *vm, vlib_node_runtime_t *node,
                       vnet_crypto_op_t *ops, vlib_buffer_t *b[], u16 *nexts,
@@ -113,7 +237,7 @@ wg_output_process_ops (vlib_main_t *vm, vlib_node_runtime_t *node,
       if (op->status != VNET_CRYPTO_OP_STATUS_COMPLETED)
        {
          u32 bi = op->user_data;
-         b[bi]->error = node->errors[WG_OUTPUT_ERROR_KEYPAIR];
+         b[bi]->error = node->errors[WG_OUTPUT_ERROR_CRYPTO_ENGINE_ERROR];
          nexts[bi] = drop_next;
          n_fail--;
        }
@@ -121,10 +245,183 @@ wg_output_process_ops (vlib_main_t *vm, vlib_node_runtime_t *node,
     }
 }
 
+/* Append one element to the async crypto frame f.
+ *
+ * The element takes the key index, crypto length/start offset, iv, tag and
+ * flags; the buffer index and next node are stored in the frame's parallel
+ * arrays at the same position. The frame must not be full (asserted).
+ */
+static_always_inline void
+wg_output_tun_add_to_frame (vlib_main_t *vm, vnet_crypto_async_frame_t *f,
+                           u32 key_index, u32 crypto_len,
+                           i16 crypto_start_offset, u32 buffer_index,
+                           u16 next_node, u8 *iv, u8 *tag, u8 flags)
+{
+  ASSERT (f->n_elts < VNET_CRYPTO_FRAME_SIZE);
+
+  u16 slot = f->n_elts;
+  vnet_crypto_async_frame_elt_t *elt = &f->elts[slot];
+
+  f->n_elts = slot + 1;
+  f->buffer_indices[slot] = buffer_index;
+  f->next_node_index[slot] = next_node;
+
+  elt->key_index = key_index;
+  elt->crypto_total_length = crypto_len;
+  elt->crypto_start_offset = crypto_start_offset;
+  elt->iv = iv;
+  elt->tag = tag;
+  elt->flags = flags;
+}
+
+static_always_inline enum noise_state_crypt
+wg_output_tun_process (vlib_main_t *vm, wg_per_thread_data_t *ptd,
+                      vlib_buffer_t *b, vlib_buffer_t *lb,
+                      vnet_crypto_op_t **crypto_ops, noise_remote_t *r,
+                      uint32_t *r_idx, uint64_t *nonce, uint8_t *src,
+                      size_t srclen, uint8_t *dst, u32 bi, u8 *iv, f64 time)
+{ /* Sync-mode encrypt step for one packet: validate the current keypair of
+   * r, take the next send counter into *nonce, store the keypair's remote
+   * index in *r_idx and queue an encrypt op on *crypto_ops.
+   *
+   * Returns SC_FAILED when there is no current keypair or a tolerance check
+   * fails (no op is queued), SC_KEEP_KEY_FRESH when the op was queued but a
+   * rekey threshold has been reached, and SC_OK otherwise. */
+  noise_keypair_t *kp;
+  enum noise_state_crypt ret = SC_FAILED;
+
+  if ((kp = r->r_current) == NULL)
+    goto error;
+
+  /* We confirm that our values are within our tolerances. We want:
+   *  - a valid keypair
+   *  - our keypair to be less than REJECT_AFTER_TIME seconds old
+   *  - our receive counter to be less than REJECT_AFTER_MESSAGES
+   *  - our send counter to be no greater than REJECT_AFTER_MESSAGES
+   *    (the check below only rejects values above the limit)
+   */
+  if (!kp->kp_valid ||
+      wg_birthdate_has_expired_opt (kp->kp_birthdate, REJECT_AFTER_TIME,
+                                   time) ||
+      kp->kp_ctr.c_recv >= REJECT_AFTER_MESSAGES ||
+      ((*nonce = noise_counter_send (&kp->kp_ctr)) > REJECT_AFTER_MESSAGES))
+    goto error;
+
+  /* We encrypt into the same buffer, so the caller must ensure that the
+   * last buffer has NOISE_AUTHTAG_LEN bytes to store the MAC. The nonce and
+   * index are passed back to the caller through nonce and r_idx. */
+  *r_idx = kp->kp_remote_index;
+
+  wg_prepare_sync_enc_op (vm, ptd, b, lb, crypto_ops, src, srclen, dst, NULL,
+                         0, *nonce, kp->kp_send_index, bi, iv);
+
+  /* If our values are still within tolerances, but we are approaching
+   * the tolerances, we notify the caller with SC_KEEP_KEY_FRESH that they
+   * should establish a new keypair. The encrypt op has already been queued
+   * at this point. We notify if:
+   *  - our send counter is valid and not less than REKEY_AFTER_MESSAGES
+   *  - we're the initiator and our keypair is older than
+   *    REKEY_AFTER_TIME seconds */
+  ret = SC_KEEP_KEY_FRESH;
+  if ((kp->kp_valid && *nonce >= REKEY_AFTER_MESSAGES) ||
+      (kp->kp_is_initiator && wg_birthdate_has_expired_opt (
+                               kp->kp_birthdate, REKEY_AFTER_TIME, time)))
+    goto error;
+
+  ret = SC_OK;
+error:
+  return ret;
+}
+
+static_always_inline enum noise_state_crypt
+wg_add_to_async_frame (vlib_main_t *vm, wg_per_thread_data_t *ptd,
+                      vnet_crypto_async_frame_t **async_frame,
+                      vlib_buffer_t *b, vlib_buffer_t *lb, u8 *payload,
+                      u32 payload_len, u32 bi, u16 next, u16 async_next,
+                      noise_remote_t *r, uint32_t *r_idx, uint64_t *nonce,
+                      u8 *iv, f64 time)
+{ /* Async-mode encrypt step for one packet: record next in the buffer's
+   * post data, validate the current keypair of r, take the next send counter
+   * into *nonce, store the keypair's remote index in *r_idx, build the IV and
+   * add the packet to *async_frame (a new frame is fetched and remembered in
+   * ptd->async_frames when there is none or the current one is full).
+   *
+   * Returns SC_FAILED when there is no current keypair, a tolerance check
+   * fails or no frame could be obtained; SC_KEEP_KEY_FRESH when the packet
+   * was added but a rekey threshold has been reached; SC_OK otherwise. */
+  wg_post_data_t *post = wg_post_data (b);
+  u8 flag = 0;
+  u8 *tag;
+  noise_keypair_t *kp;
+
+  post->next_index = next;
+
+  /* crypto */
+  enum noise_state_crypt ret = SC_FAILED;
+
+  if ((kp = r->r_current) == NULL)
+    goto error;
+
+  /* We confirm that our values are within our tolerances. We want:
+   *  - a valid keypair
+   *  - our keypair to be less than REJECT_AFTER_TIME seconds old
+   *  - our receive counter to be less than REJECT_AFTER_MESSAGES
+   *  - our send counter to be no greater than REJECT_AFTER_MESSAGES
+   *    (the check below only rejects values above the limit)
+   */
+  if (!kp->kp_valid ||
+      wg_birthdate_has_expired_opt (kp->kp_birthdate, REJECT_AFTER_TIME,
+                                   time) ||
+      kp->kp_ctr.c_recv >= REJECT_AFTER_MESSAGES ||
+      ((*nonce = noise_counter_send (&kp->kp_ctr)) > REJECT_AFTER_MESSAGES))
+    goto error;
+
+  /* We encrypt into the same buffer, so the caller must ensure that the
+   * last buffer has NOISE_AUTHTAG_LEN bytes to store the MAC. The nonce and
+   * index are passed back to the caller through nonce and r_idx. */
+  *r_idx = kp->kp_remote_index;
+
+  clib_memset (iv, 0, 4);
+  clib_memcpy (iv + 4, nonce, sizeof (*nonce));
+
+  /* get a frame for this op if we don't yet have one or it's full.
+   * NOTE(review): the send counter was already taken above; presumably it
+   * stays consumed when no frame can be obtained - confirm. */
+  if (NULL == *async_frame || vnet_crypto_async_frame_is_full (*async_frame))
+    {
+      *async_frame = vnet_crypto_async_get_frame (
+       vm, VNET_CRYPTO_OP_CHACHA20_POLY1305_TAG16_AAD0_ENC);
+      if (PREDICT_FALSE (NULL == *async_frame))
+       goto error;
+      /* Save the frame to the list we'll submit at the end */
+      vec_add1 (ptd->async_frames, *async_frame);
+    }
+
+  if (b != lb)
+    flag |= VNET_CRYPTO_OP_FLAG_CHAINED_BUFFERS;
+
+  tag = vlib_buffer_get_tail (lb) - NOISE_AUTHTAG_LEN;
+
+  /* this always succeeds because we know the frame is not full */
+  wg_output_tun_add_to_frame (vm, *async_frame, kp->kp_send_index, payload_len,
+                             payload - b->data, bi, async_next, iv, tag,
+                             flag);
+
+  /* If our values are still within tolerances, but we are approaching
+   * the tolerances, we notify the caller with SC_KEEP_KEY_FRESH that they
+   * should establish a new keypair. The packet has already been added to
+   * the frame at this point. We notify if:
+   *  - our send counter is valid and not less than REKEY_AFTER_MESSAGES
+   *  - we're the initiator and our keypair is older than
+   *    REKEY_AFTER_TIME seconds */
+  ret = SC_KEEP_KEY_FRESH;
+  if ((kp->kp_valid && *nonce >= REKEY_AFTER_MESSAGES) ||
+      (kp->kp_is_initiator && wg_birthdate_has_expired_opt (
+                               kp->kp_birthdate, REKEY_AFTER_TIME, time)))
+    goto error;
+
+  ret = SC_OK;
+error:
+  return ret;
+}
+
+/* Compute and store the UDP checksum when the buffer's current data starts
+ * with an IPv6 header (version nibble == 6); other packets are left
+ * untouched. */
+static_always_inline void
+wg_calc_checksum (vlib_main_t *vm, vlib_buffer_t *b)
+{
+  u8 *l3_hdr = (u8 *) vlib_buffer_get_current (b);
+  ip6_header_t *ip6;
+  udp_header_t *udp;
+  int bogus = 0;
+
+  /* IPv6 UDP checksum is mandatory */
+  if ((l3_hdr[0] >> 4) != 6)
+    return;
+
+  ip6 = (ip6_header_t *) l3_hdr;
+  udp = ip6_next_header (ip6);
+  udp->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+}
+
 /* is_ip4 - inner header flag */
 always_inline uword
 wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
-                     vlib_frame_t *frame, u8 is_ip4)
+                     vlib_frame_t *frame, u8 is_ip4, u16 async_next_node)
 {
   wg_main_t *wmp = &wg_main;
   wg_per_thread_data_t *ptd =
@@ -135,15 +432,26 @@ wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
   ip6_udp_wg_header_t *hdr6_out = NULL;
   message_data_t *message_data_wg = NULL;
   vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
-  vnet_crypto_op_t **crypto_ops = &ptd->crypto_ops;
+  vlib_buffer_t *lb;
+  vnet_crypto_op_t **crypto_ops;
   u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
   vlib_buffer_t *sync_bufs[VLIB_FRAME_SIZE];
   u32 thread_index = vm->thread_index;
   u16 n_sync = 0;
-  u16 drop_next = WG_OUTPUT_NEXT_ERROR;
+  const u16 drop_next = WG_OUTPUT_NEXT_ERROR;
+  const u8 is_async = wg_op_mode_is_set_ASYNC ();
+  vnet_crypto_async_frame_t *async_frame = NULL;
+  u16 n_async = 0;
+  u16 noop_nexts[VLIB_FRAME_SIZE], *noop_next = noop_nexts, n_noop = 0;
+  u16 err = !0;
+  u32 sync_bi[VLIB_FRAME_SIZE];
+  u32 noop_bi[VLIB_FRAME_SIZE];
 
   vlib_get_buffers (vm, from, bufs, n_left_from);
   vec_reset_length (ptd->crypto_ops);
+  vec_reset_length (ptd->chained_crypto_ops);
+  vec_reset_length (ptd->chunks);
+  vec_reset_length (ptd->async_frames);
 
   wg_peer_t *peer = NULL;
   u32 adj_index = 0;
@@ -158,6 +466,10 @@ wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
       u8 is_ip4_out = 1;
       u8 *plain_data;
       u16 plain_data_len;
+      u16 plain_data_len_total;
+      u16 n_bufs;
+      u16 b_space_left_at_beginning;
+      u32 bi = from[b - bufs];
 
       if (n_left_from > 2)
        {
@@ -169,13 +481,19 @@ wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                         LOAD);
        }
 
-      next[0] = WG_OUTPUT_NEXT_ERROR;
+      noop_next[0] = WG_OUTPUT_NEXT_ERROR;
+      err = WG_OUTPUT_NEXT_ERROR;
 
       adj_index = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
 
       if (PREDICT_FALSE (last_adj_index != adj_index))
        {
          peeri = wg_peer_get_by_adj_index (adj_index);
+         if (peeri == INDEX_INVALID)
+           {
+             b[0]->error = node->errors[WG_OUTPUT_ERROR_PEER];
+             goto out;
+           }
          peer = wg_peer_get (peeri);
        }
 
@@ -193,9 +511,10 @@ wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
                                    wg_peer_assign_thread (thread_index));
        }
 
-      if (PREDICT_TRUE (thread_index != peer->output_thread_index))
+      if (PREDICT_FALSE (thread_index != peer->output_thread_index))
        {
-         next[0] = WG_OUTPUT_NEXT_HANDOFF;
+         noop_next[0] = WG_OUTPUT_NEXT_HANDOFF;
+         err = WG_OUTPUT_NEXT_HANDOFF;
          goto next;
        }
 
@@ -206,35 +525,83 @@ wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
          goto out;
        }
 
-      is_ip4_out = ip46_address_is_ip4 (&peer->src.addr);
-      if (is_ip4_out)
+      lb = b[0];
+      n_bufs = vlib_buffer_chain_linearize (vm, b[0]);
+      if (n_bufs == 0)
        {
-         hdr4_out = vlib_buffer_get_current (b[0]);
-         message_data_wg = &hdr4_out->wg;
+         b[0]->error = node->errors[WG_OUTPUT_ERROR_NO_BUFFERS];
+         goto out;
        }
-      else
+
+      if (n_bufs > 1)
        {
-         hdr6_out = vlib_buffer_get_current (b[0]);
-         message_data_wg = &hdr6_out->wg;
+         /* Find last buffer in the chain */
+         while (lb->flags & VLIB_BUFFER_NEXT_PRESENT)
+           lb = vlib_get_buffer (vm, lb->next_buffer);
+       }
+
+      /* Ensure there is enough free space at the beginning of the first buffer
+       * to write ethernet header (e.g. IPv6 VxLAN over IPv6 Wireguard will
+       * trigger this)
+       */
+      ASSERT ((signed) b[0]->current_data >=
+             (signed) -VLIB_BUFFER_PRE_DATA_SIZE);
+      b_space_left_at_beginning =
+       b[0]->current_data + VLIB_BUFFER_PRE_DATA_SIZE;
+      if (PREDICT_FALSE (b_space_left_at_beginning <
+                        sizeof (ethernet_header_t)))
+       {
+         u32 size_diff =
+           sizeof (ethernet_header_t) - b_space_left_at_beginning;
+
+         /* The buffer can only be moved when it is the only one in the
+          * chain and has enough free space at the end */
+         if (lb == b[0] &&
+             vlib_buffer_space_left_at_end (vm, b[0]) >= size_diff)
+           {
+             vlib_buffer_move (vm, b[0],
+                               b[0]->current_data + (signed) size_diff);
+           }
+         else
+           {
+             b[0]->error = node->errors[WG_OUTPUT_ERROR_NO_BUFFERS];
+             goto out;
+           }
+       }
+
+      /*
+       * Ensure there is enough free space at the end of the last buffer to
+       * write auth tag */
+      if (PREDICT_FALSE (vlib_buffer_space_left_at_end (vm, lb) <
+                        NOISE_AUTHTAG_LEN))
+       {
+         u32 tmp_bi = 0;
+         if (vlib_buffer_alloc (vm, &tmp_bi, 1) != 1)
+           {
+             b[0]->error = node->errors[WG_OUTPUT_ERROR_NO_BUFFERS];
+             goto out;
+           }
+         lb = vlib_buffer_chain_buffer (vm, lb, tmp_bi);
        }
 
       iph_offset = vnet_buffer (b[0])->ip.save_rewrite_length;
       plain_data = vlib_buffer_get_current (b[0]) + iph_offset;
-      plain_data_len = vlib_buffer_length_in_chain (vm, b[0]) - iph_offset;
+      plain_data_len = b[0]->current_length - iph_offset;
+      plain_data_len_total =
+       vlib_buffer_length_in_chain (vm, b[0]) - iph_offset;
+      size_t encrypted_packet_len = message_data_len (plain_data_len_total);
+      vlib_buffer_chain_increase_length (b[0], lb, NOISE_AUTHTAG_LEN);
       u8 *iv_data = b[0]->pre_data;
 
-      size_t encrypted_packet_len = message_data_len (plain_data_len);
-
-      /*
-       * Ensure there is enough space to write the encrypted data
-       * into the packet
-       */
-      if (PREDICT_FALSE (encrypted_packet_len >= WG_DEFAULT_DATA_SIZE) ||
-         PREDICT_FALSE ((b[0]->current_data + encrypted_packet_len) >=
-                        vlib_buffer_get_default_data_size (vm)))
+      is_ip4_out = ip46_address_is_ip4 (&peer->src.addr);
+      if (is_ip4_out)
        {
-         b[0]->error = node->errors[WG_OUTPUT_ERROR_TOO_BIG];
-         goto out;
+         hdr4_out = vlib_buffer_get_current (b[0]);
+         message_data_wg = &hdr4_out->wg;
+       }
+      else
+       {
+         hdr6_out = vlib_buffer_get_current (b[0]);
+         message_data_wg = &hdr6_out->wg;
        }
 
       if (PREDICT_FALSE (last_adj_index != adj_index))
@@ -245,12 +612,31 @@ wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
          last_adj_index = adj_index;
        }
 
+      /* At this point the packet can be sent to the next node */
+      next[0] = WG_OUTPUT_NEXT_INTERFACE_OUTPUT;
+
+      if (lb != b[0])
+       crypto_ops = &ptd->chained_crypto_ops;
+      else
+       crypto_ops = &ptd->crypto_ops;
+
       enum noise_state_crypt state;
 
-      state = noise_sync_remote_encrypt (
-       vm, crypto_ops, &peer->remote, &message_data_wg->receiver_index,
-       &message_data_wg->counter, plain_data, plain_data_len, plain_data,
-       n_sync, iv_data, time);
+      if (is_async)
+       {
+         state = wg_add_to_async_frame (
+           vm, ptd, &async_frame, b[0], lb, plain_data, plain_data_len_total,
+           bi, next[0], async_next_node, &peer->remote,
+           &message_data_wg->receiver_index, &message_data_wg->counter,
+           iv_data, time);
+       }
+      else
+       {
+         state = wg_output_tun_process (
+           vm, ptd, b[0], lb, crypto_ops, &peer->remote,
+           &message_data_wg->receiver_index, &message_data_wg->counter,
+           plain_data, plain_data_len, plain_data, n_sync, iv_data, time);
+       }
 
       if (PREDICT_FALSE (state == SC_KEEP_KEY_FRESH))
        {
@@ -261,31 +647,27 @@ wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
          // TODO: Maybe wrong
          wg_send_handshake_from_mt (peeri, false);
          wg_peer_update_flags (peeri, WG_PEER_ESTABLISHED, false);
+         noop_next[0] = WG_OUTPUT_NEXT_ERROR;
          goto out;
        }
 
-      /* Here we are sure that can send packet to next node */
-      next[0] = WG_OUTPUT_NEXT_INTERFACE_OUTPUT;
+      err = WG_OUTPUT_NEXT_INTERFACE_OUTPUT;
 
       if (is_ip4_out)
        {
          hdr4_out->wg.header.type = MESSAGE_DATA;
          hdr4_out->udp.length = clib_host_to_net_u16 (encrypted_packet_len +
                                                       sizeof (udp_header_t));
-         b[0]->current_length =
-           (encrypted_packet_len + sizeof (ip4_udp_header_t));
          ip4_header_set_len_w_chksum (
-           &hdr4_out->ip4, clib_host_to_net_u16 (b[0]->current_length));
+           &hdr4_out->ip4, clib_host_to_net_u16 (encrypted_packet_len +
+                                                 sizeof (ip4_udp_header_t)));
        }
       else
        {
          hdr6_out->wg.header.type = MESSAGE_DATA;
-         hdr6_out->udp.length = clib_host_to_net_u16 (encrypted_packet_len +
-                                                      sizeof (udp_header_t));
-         b[0]->current_length =
-           (encrypted_packet_len + sizeof (ip6_udp_header_t));
-         hdr6_out->ip6.payload_length =
-           clib_host_to_net_u16 (b[0]->current_length);
+         hdr6_out->ip6.payload_length = hdr6_out->udp.length =
+           clib_host_to_net_u16 (encrypted_packet_len +
+                                 sizeof (udp_header_t));
        }
 
     out:
@@ -304,31 +686,231 @@ wg_output_tun_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
        }
 
     next:
-      sync_bufs[n_sync] = b[0];
-      n_sync += 1;
+      if (PREDICT_FALSE (err != WG_OUTPUT_NEXT_INTERFACE_OUTPUT))
+       {
+         noop_bi[n_noop] = bi;
+         n_noop++;
+         noop_next++;
+         goto next_left;
+       }
+      if (!is_async)
+       {
+         sync_bi[n_sync] = bi;
+         sync_bufs[n_sync] = b[0];
+         n_sync += 1;
+         next += 1;
+       }
+      else
+       {
+         n_async++;
+       }
+    next_left:
       n_left_from -= 1;
-      next += 1;
       b += 1;
     }
 
-  /* wg-output-process-ops */
-  wg_output_process_ops (vm, node, ptd->crypto_ops, sync_bufs, nexts,
-                        drop_next);
+  if (n_sync)
+    {
+      /* wg-output-process-ops */
+      wg_output_process_ops (vm, node, ptd->crypto_ops, sync_bufs, nexts,
+                            drop_next);
+      wg_output_process_chained_ops (vm, node, ptd->chained_crypto_ops,
+                                    sync_bufs, nexts, ptd->chunks, drop_next);
+
+      int n_left_from_sync_bufs = n_sync;
+      while (n_left_from_sync_bufs > 0)
+       {
+         n_left_from_sync_bufs--;
+         wg_calc_checksum (vm, sync_bufs[n_left_from_sync_bufs]);
+       }
+
+      vlib_buffer_enqueue_to_next (vm, node, sync_bi, nexts, n_sync);
+    }
+  if (n_async)
+    {
+      /* submit all of the open frames */
+      vnet_crypto_async_frame_t **async_frame;
+
+      vec_foreach (async_frame, ptd->async_frames)
+       {
+         if (PREDICT_FALSE (
+               vnet_crypto_async_submit_open_frame (vm, *async_frame) < 0))
+           {
+             u32 n_drop = (*async_frame)->n_elts;
+             u32 *bi = (*async_frame)->buffer_indices;
+             u16 index = n_noop;
+             while (n_drop--)
+               {
+                 noop_bi[index] = bi[0];
+                 vlib_buffer_t *b = vlib_get_buffer (vm, bi[0]);
+                 noop_nexts[index] = drop_next;
+                 b->error = node->errors[WG_OUTPUT_ERROR_CRYPTO_ENGINE_ERROR];
+                 bi++;
+                 index++;
+               }
+             n_noop += (*async_frame)->n_elts;
+
+             vnet_crypto_async_reset_frame (*async_frame);
+             vnet_crypto_async_free_frame (vm, *async_frame);
+           }
+       }
+    }
+  if (n_noop)
+    {
+      vlib_buffer_enqueue_to_next (vm, node, noop_bi, noop_nexts, n_noop);
+    }
+
+  return frame->n_vectors;
+}
+
+/* Add a post-node trace record for buffer b: the peer index resolved from
+ * the buffer's TX adjacency and the next node index chosen for it. */
+static_always_inline void
+wg_output_tun_post_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
+                              vlib_buffer_t *b, u16 next_index)
+{
+  wg_output_tun_post_trace_t *tr = vlib_add_trace (vm, node, b, sizeof (*tr));
+
+  tr->peer =
+    wg_peer_get_by_adj_index (vnet_buffer (b)->ip.adj_index[VLIB_TX]);
+  tr->next_index = next_index;
+}
+
+/* Body of the wg output post nodes.
+ *
+ * For every buffer in the frame: read the next node index stored in the
+ * buffer's post data, compute the checksum via wg_calc_checksum () and, when
+ * the buffer is traced, add a trace record. All buffers are then enqueued to
+ * their next nodes. Returns the number of buffers in the frame.
+ *
+ * Every trace record now carries the peer index; previously only the first
+ * buffer of each group of four had tr->peer filled in.
+ */
+always_inline uword
+wg_output_tun_post (vlib_main_t *vm, vlib_node_runtime_t *node,
+                   vlib_frame_t *frame)
+{
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
+  u16 nexts[VLIB_FRAME_SIZE], *next = nexts;
+  u32 *from = vlib_frame_vector_args (frame);
+  u32 n_left = frame->n_vectors;
+
+  vlib_get_buffers (vm, from, b, n_left);
+
+  if (n_left >= 4)
+    {
+      vlib_prefetch_buffer_header (b[0], LOAD);
+      vlib_prefetch_buffer_header (b[1], LOAD);
+      vlib_prefetch_buffer_header (b[2], LOAD);
+      vlib_prefetch_buffer_header (b[3], LOAD);
+    }
+
+  /* Four buffers per iteration while prefetching the following four */
+  while (n_left > 8)
+    {
+      vlib_prefetch_buffer_header (b[4], LOAD);
+      vlib_prefetch_buffer_header (b[5], LOAD);
+      vlib_prefetch_buffer_header (b[6], LOAD);
+      vlib_prefetch_buffer_header (b[7], LOAD);
+
+      next[0] = (wg_post_data (b[0]))->next_index;
+      next[1] = (wg_post_data (b[1]))->next_index;
+      next[2] = (wg_post_data (b[2]))->next_index;
+      next[3] = (wg_post_data (b[3]))->next_index;
+
+      wg_calc_checksum (vm, b[0]);
+      wg_calc_checksum (vm, b[1]);
+      wg_calc_checksum (vm, b[2]);
+      wg_calc_checksum (vm, b[3]);
+
+      if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
+        {
+          u32 i;
+
+          for (i = 0; i < 4; i++)
+            {
+              if (b[i]->flags & VLIB_BUFFER_IS_TRACED)
+                wg_output_tun_post_add_trace (vm, node, b[i], next[i]);
+            }
+        }
+
+      b += 4;
+      next += 4;
+      n_left -= 4;
+    }
+
+  /* Remaining buffers, one at a time */
+  while (n_left > 0)
+    {
+      wg_calc_checksum (vm, b[0]);
+
+      next[0] = (wg_post_data (b[0]))->next_index;
+      if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) &&
+                         (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
+        wg_output_tun_post_add_trace (vm, node, b[0], next[0]);
+
+      b += 1;
+      next += 1;
+      n_left -= 1;
+    }
 
-  vlib_buffer_enqueue_to_next (vm, node, from, nexts, n_sync);
+  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
   return frame->n_vectors;
 }
 
+VLIB_REGISTER_NODE (wg4_output_tun_post_node) = { /* Internal node registered
+   * as a sibling of "wg4-output-tun"; it reuses the wg output error strings
+   * and formats traces with format_wg_output_tun_post_trace. */
+  .name = "wg4-output-tun-post-node",
+  .vector_size = sizeof (u32),
+  .format_trace = format_wg_output_tun_post_trace,
+  .type = VLIB_NODE_TYPE_INTERNAL,
+  .sibling_of = "wg4-output-tun",
+  .n_errors = ARRAY_LEN (wg_output_error_strings),
+  .error_strings = wg_output_error_strings,
+};
+
+VLIB_REGISTER_NODE (wg6_output_tun_post_node) = { /* Internal node registered
+   * as a sibling of "wg6-output-tun"; it reuses the wg output error strings
+   * and formats traces with format_wg_output_tun_post_trace. */
+  .name = "wg6-output-tun-post-node",
+  .vector_size = sizeof (u32),
+  .format_trace = format_wg_output_tun_post_trace,
+  .type = VLIB_NODE_TYPE_INTERNAL,
+  .sibling_of = "wg6-output-tun",
+  .n_errors = ARRAY_LEN (wg_output_error_strings),
+  .error_strings = wg_output_error_strings,
+};
+
+VLIB_NODE_FN (wg4_output_tun_post_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
+{ /* Node function: hands the whole frame to wg_output_tun_post (). */
+  return wg_output_tun_post (vm, node, from_frame);
+}
+
+VLIB_NODE_FN (wg6_output_tun_post_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame)
+{ /* Node function: hands the whole frame to wg_output_tun_post (). */
+  return wg_output_tun_post (vm, node, from_frame);
+}
+
 VLIB_NODE_FN (wg4_output_tun_node)
 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
-  return wg_output_tun_inline (vm, node, frame, /* is_ip4 */ 1);
+  return wg_output_tun_inline (vm, node, frame, /* is_ip4 */ 1,
+                              /* async_next_node */ wg_encrypt_async_next.wg4_post_next);
 }
 
 VLIB_NODE_FN (wg6_output_tun_node)
 (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
 {
-  return wg_output_tun_inline (vm, node, frame, /* is_ip4 */ 0);
+  return wg_output_tun_inline (vm, node, frame, /* is_ip4 */ 0,
+                              /* async_next_node */ wg_encrypt_async_next.wg6_post_next);
 }
 
 /* *INDENT-OFF* */