dpdk/ipsec: rework plus improved cli commands
[vpp.git] / src / plugins / dpdk / ipsec / esp_decrypt.c
index c4f295d..90be466 100644 (file)
@@ -1,10 +1,10 @@
 /*
  * esp_decrypt.c : IPSec ESP Decrypt node using DPDK Cryptodev
  *
- * Copyright (c) 2016 Intel and/or its affiliates.
+ * Copyright (c) 2017 Intel and/or its affiliates.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
+ * You may obtain a copy of the License at:
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
 #include <vnet/ip/ip.h>
 
 #include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
 #include <dpdk/ipsec/ipsec.h>
-#include <dpdk/ipsec/esp.h>
 #include <dpdk/device/dpdk.h>
 #include <dpdk/device/dpdk_priv.h>
 
 #define foreach_esp_decrypt_next              \
 _(DROP, "error-drop")                         \
-_(IP4_INPUT, "ip4-input")                     \
+_(IP4_INPUT, "ip4-input-no-checksum")         \
 _(IP6_INPUT, "ip6-input")
 
 #define _(v, s) ESP_DECRYPT_NEXT_##v,
@@ -43,8 +43,10 @@ typedef enum {
  _(REPLAY, "SA replayed packet")                \
  _(NOT_IP, "Not IP packet (dropped)")           \
  _(ENQ_FAIL, "Enqueue failed (buffer full)")     \
- _(NO_CRYPTODEV, "Cryptodev not configured")     \
- _(BAD_LEN, "Invalid ciphertext length")
+ _(DISCARD, "Not enough crypto operations, discarding frame")  \
+ _(BAD_LEN, "Invalid ciphertext length")         \
+ _(SESSION, "Failed to get crypto session")      \
+ _(NOSUP, "Cipher/Auth not supported")
 
 
 typedef enum {
@@ -65,6 +67,7 @@ vlib_node_registration_t dpdk_esp_decrypt_node;
 typedef struct {
   ipsec_crypto_alg_t crypto_alg;
   ipsec_integ_alg_t integ_alg;
+  u8 packet_data[64];	/* raw bytes copied from the buffer's current position when the trace is taken */
 } esp_decrypt_trace_t;
 
 /* packet trace format function */
@@ -73,10 +76,14 @@ static u8 * format_esp_decrypt_trace (u8 * s, va_list * args)
   CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
   CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
   esp_decrypt_trace_t * t = va_arg (*args, esp_decrypt_trace_t *);
+  uword indent = format_get_indent (s);
 
-  s = format (s, "esp: crypto %U integrity %U",
+  s = format (s, "cipher %U auth %U\n",
              format_ipsec_crypto_alg, t->crypto_alg,
              format_ipsec_integ_alg, t->integ_alg);
+  s = format (s, "%U%U",
+             format_white_space, indent,
+             format_esp_header, t->packet_data);
   return s;
 }
 
@@ -87,30 +94,31 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm,
 {
   u32 n_left_from, *from, *to_next, next_index;
   ipsec_main_t *im = &ipsec_main;
-  u32 thread_index = vlib_get_thread_index();
-  dpdk_crypto_main_t * dcm = &dpdk_crypto_main;
-  dpdk_esp_main_t * em = &dpdk_esp_main;
-  u32 i;
+  u32 thread_idx = vlib_get_thread_index();
+  dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
+  crypto_resource_t *res = 0;
+  ipsec_sa_t *sa0 = 0;
+  crypto_alg_t *cipher_alg = 0, *auth_alg = 0;
+  struct rte_cryptodev_sym_session *session = 0;
+  u32 ret, last_sa_index = ~0;
+  u8 numa = rte_socket_id ();
+  u8 is_aead = 0;
+  crypto_worker_main_t *cwm =
+    vec_elt_at_index (dcm->workers_main, thread_idx);
+  struct rte_crypto_op **ops = cwm->ops;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
 
-  crypto_worker_main_t *cwm =
-    vec_elt_at_index(dcm->workers_main, thread_index);
-  u32 n_qps = vec_len(cwm->qp_data);
-  struct rte_crypto_op ** cops_to_enq[n_qps];
-  u32 n_cop_qp[n_qps], * bi_to_enq[n_qps];
-
-  for (i = 0; i < n_qps; i++)
+  ret = crypto_alloc_ops (numa, ops, n_left_from);
+  if (ret)
     {
-      bi_to_enq[i] = cwm->qp_data[i].bi;
-      cops_to_enq[i] = cwm->qp_data[i].cops;
+      vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                  ESP_DECRYPT_ERROR_DISCARD, 1);
+      /* Discard whole frame */
+      return n_left_from;
     }
 
-  memset(n_cop_qp, 0, n_qps * sizeof(u32));
-
-  crypto_alloc_cops();
-
   next_index = ESP_DECRYPT_NEXT_DROP;
 
   while (n_left_from > 0)
@@ -121,44 +129,79 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm,
 
       while (n_left_from > 0 && n_left_to_next > 0)
        {
-         u32 bi0, sa_index0 = ~0, seq, trunc_size, iv_size;
-         vlib_buffer_t * b0;
-         esp_header_t * esp0;
-         ipsec_sa_t * sa0;
-         struct rte_mbuf * mb0 = 0;
-         const int BLOCK_SIZE = 16;
-         crypto_sa_session_t * sa_sess;
-         void * sess;
-         u16 qp_index;
-         struct rte_crypto_op * cop = 0;
+         clib_error_t *error;
+         u32 bi0, sa_index0, seq, iv_size;
+         u8 trunc_size;
+         vlib_buffer_t *b0;
+         esp_header_t *esp0;
+         struct rte_mbuf *mb0;
+         struct rte_crypto_op *op;
+         u16 res_idx;
 
          bi0 = from[0];
          from += 1;
          n_left_from -= 1;
 
          b0 = vlib_get_buffer (vm, bi0);
+         mb0 = rte_mbuf_from_vlib_buffer(b0);
          esp0 = vlib_buffer_get_current (b0);
 
-         sa_index0 = vnet_buffer(b0)->ipsec.sad_index;
-         sa0 = pool_elt_at_index (im->sad, sa_index0);
+         /* esp0 */
+         CLIB_PREFETCH (esp0, sizeof (esp0[0]) + 16, LOAD);
+         /* mb0 */
+         CLIB_PREFETCH (mb0, CLIB_CACHE_LINE_BYTES, STORE);
 
-         seq = clib_host_to_net_u32(esp0->seq);
+         op = ops[0];
+         ops += 1;
+         ASSERT (op->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED);
 
-         /* anti-replay check */
-         if (sa0->use_anti_replay)
+         dpdk_op_priv_t *priv = crypto_op_get_priv (op);
+
+         u16 op_len =
+           sizeof (op[0]) + sizeof (op[0].sym[0]) + sizeof (priv[0]);
+         CLIB_PREFETCH (op, op_len, STORE);
+
+         sa_index0 = vnet_buffer(b0)->ipsec.sad_index;
+
+         if (sa_index0 != last_sa_index)
            {
-             int rv = 0;
+             last_sa_index = sa_index0;
 
-             if (PREDICT_TRUE(sa0->use_esn))
-               rv = esp_replay_check_esn(sa0, seq);
-             else
-               rv = esp_replay_check(sa0, seq);
+             sa0 = pool_elt_at_index (im->sad, sa_index0);
+
+             cipher_alg = vec_elt_at_index (dcm->cipher_algs, sa0->crypto_alg);
+             auth_alg = vec_elt_at_index (dcm->auth_algs, sa0->integ_alg);
+
+#if DPDK_NO_AEAD
+             is_aead = (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128 |
+                           sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_192 |
+                           sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_256);
+#else
+             is_aead = (cipher_alg->type == RTE_CRYPTO_SYM_XFORM_AEAD);
+#endif
+             if (is_aead)
+               auth_alg = cipher_alg;
 
-             if (PREDICT_FALSE(rv))
+             res_idx = get_resource (cwm, sa0);
+
+             if (PREDICT_FALSE (res_idx == (u16) ~0))
                {
-                 clib_warning ("anti-replay SPI %u seq %u", sa0->spi, seq);
+                 clib_warning ("unsupported SA by thread index %u", thread_idx);
                  vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
-                                              ESP_DECRYPT_ERROR_REPLAY, 1);
+                                              ESP_DECRYPT_ERROR_NOSUP, 1);
+                 to_next[0] = bi0;
+                 to_next += 1;
+                 n_left_to_next -= 1;
+                 goto trace;
+               }
+             res = vec_elt_at_index (dcm->resource, res_idx);
+
+             error = crypto_get_session (&session, sa_index0, res, cwm, 0);
+             if (PREDICT_FALSE (error || !session))
+               {
+                 clib_warning ("failed to get crypto session");
+                 vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                              ESP_DECRYPT_ERROR_SESSION, 1);
                  to_next[0] = bi0;
                  to_next += 1;
                  n_left_to_next -= 1;
@@ -166,16 +209,23 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm,
                }
            }
 
-         sa0->total_data_size += b0->current_length;
+         /* anti-replay check */
+         if (sa0->use_anti_replay)
+           {
+             int rv = 0;
 
-         sa_sess = pool_elt_at_index(cwm->sa_sess_d[0], sa_index0);
+             seq = clib_net_to_host_u32 (esp0->seq);
 
-         if (PREDICT_FALSE(!sa_sess->sess))
-           {
-             int ret = create_sym_sess(sa0, sa_sess, 0);
+             if (PREDICT_TRUE(sa0->use_esn))
+               rv = esp_replay_check_esn (sa0, seq);
+             else
+               rv = esp_replay_check (sa0, seq);
 
-             if (PREDICT_FALSE (ret))
+             if (PREDICT_FALSE (rv))
                {
+                 clib_warning ("failed anti-replay check");
+                 vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                              ESP_DECRYPT_ERROR_REPLAY, 1);
                  to_next[0] = bi0;
                  to_next += 1;
                  n_left_to_next -= 1;
@@ -183,112 +233,101 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm,
                }
            }
 
-         sess = sa_sess->sess;
-         qp_index = sa_sess->qp_index;
-
-         ASSERT (vec_len (vec_elt (cwm->qp_data, qp_index).free_cops) > 0);
-         cop = vec_pop (vec_elt (cwm->qp_data, qp_index).free_cops);
-         ASSERT (cop->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED);
-
-         cops_to_enq[qp_index][0] = cop;
-         cops_to_enq[qp_index] += 1;
-         n_cop_qp[qp_index] += 1;
-         bi_to_enq[qp_index][0] = bi0;
-         bi_to_enq[qp_index] += 1;
+         priv->next = DPDK_CRYPTO_INPUT_NEXT_DECRYPT_POST;
 
-         rte_crypto_op_attach_sym_session(cop, sess);
+         /* FIXME multi-seg */
+         sa0->total_data_size += b0->current_length;
 
-         if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128)
-           trunc_size = 16;
-         else
-           trunc_size = em->esp_integ_algs[sa0->integ_alg].trunc_size;
-         iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len;
+         res->ops[res->n_ops] = op;
+         res->bi[res->n_ops] = bi0;
+         res->n_ops += 1;
 
          /* Convert vlib buffer to mbuf */
-         mb0 = rte_mbuf_from_vlib_buffer(b0);
          mb0->data_len = b0->current_length;
          mb0->pkt_len = b0->current_length;
          mb0->data_off = RTE_PKTMBUF_HEADROOM + b0->current_data;
 
+         trunc_size = auth_alg->trunc_size;
+         iv_size = cipher_alg->iv_len;
+
          /* Outer IP header has already been stripped */
-         u16 payload_len = rte_pktmbuf_pkt_len(mb0) - sizeof (esp_header_t) -
-             iv_size - trunc_size;
+         u16 payload_len =
+           b0->current_length - sizeof (esp_header_t) - iv_size - trunc_size;
 
-         if ((payload_len & (BLOCK_SIZE - 1)) || (payload_len <= 0))
+         ASSERT (payload_len >= 4);
+
+         if (payload_len & (cipher_alg->boundary - 1))
            {
              clib_warning ("payload %u not multiple of %d\n",
-                           payload_len, BLOCK_SIZE);
+                           payload_len, cipher_alg->boundary);
              vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
                                           ESP_DECRYPT_ERROR_BAD_LEN, 1);
-             vec_add (vec_elt (cwm->qp_data, qp_index).free_cops, &cop, 1);
-             bi_to_enq[qp_index] -= 1;
-             cops_to_enq[qp_index] -= 1;
-             n_cop_qp[qp_index] -= 1;
+             res->n_ops -= 1;
              to_next[0] = bi0;
              to_next += 1;
              n_left_to_next -= 1;
              goto trace;
            }
 
-         struct rte_crypto_sym_op *sym_cop = (struct rte_crypto_sym_op *)(cop + 1);
-
-         u8 is_aead = sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128;
          u32 cipher_off, cipher_len;
-         u32 auth_off = 0, auth_len = 0, aad_size = 0;
+         u32 auth_len = 0, aad_size = 0;
          u8 *aad = NULL, *digest = NULL;
          u64 digest_paddr = 0;
 
-          u8 *iv = rte_pktmbuf_mtod_offset(mb0, void*, sizeof (esp_header_t));
-          dpdk_cop_priv_t *priv = (dpdk_cop_priv_t *)(sym_cop + 1);
+          u8 *iv = (u8 *) (esp0 + 1);
+
           dpdk_gcm_cnt_blk *icb = &priv->cb;
 
          cipher_off = sizeof (esp_header_t) + iv_size;
          cipher_len = payload_len;
 
-          digest =
-           vlib_buffer_get_current (b0) + sizeof(esp_header_t) +
-           iv_size + payload_len;
+          digest = vlib_buffer_get_tail (b0) - trunc_size;
 
-          if (is_aead)
-            {
+         if (cipher_alg->alg == RTE_CRYPTO_CIPHER_AES_CBC)
+           clib_memcpy(icb, iv, 16);
+         else /* CTR/GCM */
+           {
              u32 *_iv = (u32 *) iv;
 
              crypto_set_icb (icb, sa0->salt, _iv[0], _iv[1]);
+#if DPDK_NO_AEAD
              iv_size = 16;
+#else
+             iv_size = 12;
+#endif
+           }
 
+          if (is_aead)
+            {
               aad = priv->aad;
               clib_memcpy(aad, esp0, 8);
-             aad_size = 8;
-              if (sa0->use_esn)
+              if (PREDICT_FALSE (sa0->use_esn))
                {
                  *((u32*)&aad[8]) = sa0->seq_hi;
                  aad_size = 12;
                }
+             else
+               aad_size = 8;
             }
           else
             {
-             clib_memcpy(icb, iv, 16);
-
-             auth_off = 0;
              auth_len = sizeof(esp_header_t) + iv_size + payload_len;
 
               if (sa0->use_esn)
                 {
-                  dpdk_cop_priv_t* priv = (dpdk_cop_priv_t*) (sym_cop + 1);
-
                   clib_memcpy (priv->icv, digest, trunc_size);
                   *((u32*) digest) = sa0->seq_hi;
                  auth_len += sizeof(sa0->seq_hi);
 
                   digest = priv->icv;
                  digest_paddr =
-                   cop->phys_addr + (uintptr_t) priv->icv - (uintptr_t) cop;
+                   op->phys_addr + (uintptr_t) priv->icv - (uintptr_t) op;
                 }
             }
 
-         crypto_op_setup (is_aead, mb0, cop, sess,
+         crypto_op_setup (is_aead, mb0, op, session,
                           cipher_off, cipher_len, (u8 *) icb, iv_size,
-                          auth_off, auth_len, aad, aad_size,
+                          0, auth_len, aad, aad_size,
                           digest, digest_paddr, trunc_size);
 trace:
          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
@@ -296,38 +335,21 @@ trace:
              esp_decrypt_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
              tr->crypto_alg = sa0->crypto_alg;
              tr->integ_alg = sa0->integ_alg;
+             clib_memcpy (tr->packet_data, vlib_buffer_get_current (b0),
+                          sizeof (esp_header_t));
            }
        }
       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
     }
+
   vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
                               ESP_DECRYPT_ERROR_RX_PKTS,
                               from_frame->n_vectors);
-  crypto_qp_data_t *qpd;
-  /* *INDENT-OFF* */
-  vec_foreach_index (i, cwm->qp_data)
-    {
-      u32 enq;
-
-      if (!n_cop_qp[i])
-       continue;
 
-      qpd = vec_elt_at_index(cwm->qp_data, i);
-      enq = rte_cryptodev_enqueue_burst(qpd->dev_id, qpd->qp_id,
-                                       qpd->cops, n_cop_qp[i]);
-      qpd->inflights += enq;
+  crypto_enqueue_ops (vm, cwm, 0, dpdk_esp_decrypt_node.index,
+                     ESP_DECRYPT_ERROR_ENQ_FAIL, numa);
 
-      if (PREDICT_FALSE(enq < n_cop_qp[i]))
-       {
-         crypto_free_cop (qpd, &qpd->cops[enq], n_cop_qp[i] - enq);
-         vlib_buffer_free (vm, &qpd->bi[enq], n_cop_qp[i] - enq);
-
-         vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
-                                      ESP_DECRYPT_ERROR_ENQ_FAIL,
-                                      n_cop_qp[i] - enq);
-       }
-    }
-  /* *INDENT-ON* */
+  crypto_free_ops (numa, ops, cwm->ops + from_frame->n_vectors - ops);
 
   return from_frame->n_vectors;
 }
@@ -378,6 +400,21 @@ vlib_node_registration_t dpdk_esp_decrypt_post_node;
 
 static u8 * format_esp_decrypt_post_trace (u8 * s, va_list * args)
 {
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  esp_decrypt_trace_t * t = va_arg (*args, esp_decrypt_trace_t *);
+  uword indent = format_get_indent (s);
+  /* Post-decrypt trace formatter. First line: cipher and auth algorithm names. */
+  s = format (s, "cipher %U auth %U\n",
+             format_ipsec_crypto_alg, t->crypto_alg,
+             format_ipsec_integ_alg, t->integ_alg);
+  /* Then the captured header: IPv6 formatter if the version nibble is 6, else IPv4. */
+  ip4_header_t *ih4 = (ip4_header_t *) t->packet_data;
+  if ((ih4->ip_version_and_header_length & 0xF0) == 0x60)
+    s = format (s, "%U%U", format_white_space, indent, format_ip6_header, ih4);
+  else
+    s = format (s, "%U%U", format_white_space, indent, format_ip4_header, ih4);
+
   return s;
 }
 
@@ -390,7 +427,7 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
   ipsec_sa_t * sa0;
   u32 sa_index0 = ~0;
   ipsec_main_t *im = &ipsec_main;
-  dpdk_esp_main_t *em = &dpdk_esp_main;
+  dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -406,12 +443,13 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
       while (n_left_from > 0 && n_left_to_next > 0)
        {
          esp_footer_t * f0;
-         u32 bi0, next0, trunc_size, iv_size;
+         u32 bi0, iv_size, next0;
          vlib_buffer_t * b0 = 0;
          ip4_header_t *ih4 = 0, *oh4 = 0;
          ip6_header_t *ih6 = 0, *oh6 = 0;
-         u8 tunnel_mode = 1;
-         u8 transport_ip6 = 0;
+         crypto_alg_t *cipher_alg, *auth_alg;
+         esp_header_t *esp0;
+         u8 trunc_size, is_aead;
 
          next0 = ESP_DECRYPT_NEXT_DROP;
 
@@ -421,6 +459,7 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
          n_left_to_next -= 1;
 
          b0 = vlib_get_buffer (vm, bi0);
+         esp0 = vlib_buffer_get_current (b0);
 
          sa_index0 = vnet_buffer(b0)->ipsec.sad_index;
          sa0 = pool_elt_at_index (im->sad, sa_index0);
@@ -428,15 +467,24 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
          to_next[0] = bi0;
          to_next += 1;
 
-         if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128)
-           trunc_size = 16;
-         else
-           trunc_size = em->esp_integ_algs[sa0->integ_alg].trunc_size;
-         iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len;
+         cipher_alg = vec_elt_at_index (dcm->cipher_algs, sa0->crypto_alg);
+         auth_alg = vec_elt_at_index (dcm->auth_algs, sa0->integ_alg);
+#if DPDK_NO_AEAD
+         is_aead = (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128 |
+                       sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_192 |
+                       sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_256);
+#else
+         is_aead = cipher_alg->type == RTE_CRYPTO_SYM_XFORM_AEAD;
+#endif
+         if (is_aead)
+           auth_alg = cipher_alg;
+
+         trunc_size = auth_alg->trunc_size;
+
+         iv_size = cipher_alg->iv_len;
 
          if (sa0->use_anti_replay)
            {
-             esp_header_t * esp0 = vlib_buffer_get_current (b0);
              u32 seq;
              seq = clib_host_to_net_u32(esp0->seq);
              if (PREDICT_TRUE(sa0->use_esn))
@@ -445,39 +493,30 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
                esp_replay_advance(sa0, seq);
            }
 
+         /* FIXME ip header */
          ih4 = (ip4_header_t *) (b0->data + sizeof(ethernet_header_t));
          vlib_buffer_advance (b0, sizeof (esp_header_t) + iv_size);
 
-         b0->current_length -= (trunc_size + 2);
          b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
-         f0 = (esp_footer_t *) ((u8 *) vlib_buffer_get_current (b0) +
-                                b0->current_length);
-         b0->current_length -= f0->pad_length;
-
-         /* transport mode */
-         if (PREDICT_FALSE(!sa0->is_tunnel && !sa0->is_tunnel_ip6))
+         f0 = (esp_footer_t *) (vlib_buffer_get_tail (b0) - trunc_size - 2);
+         b0->current_length -= (f0->pad_length + trunc_size + 2);
+#if 0
+         /* check padding */
+         const u8 *padding = vlib_buffer_get_tail (b0);
+         if (PREDICT_FALSE (memcmp (padding, pad_data, f0->pad_length)))
            {
-             tunnel_mode = 0;
-
-             if (PREDICT_TRUE((ih4->ip_version_and_header_length & 0xF0) != 0x40))
-               {
-                 if (PREDICT_TRUE((ih4->ip_version_and_header_length & 0xF0) == 0x60))
-                   transport_ip6 = 1;
-                 else
-                   {
-                     clib_warning("next header: 0x%x", f0->next_header);
-                     vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
-                                                  ESP_DECRYPT_ERROR_NOT_IP, 1);
-                     goto trace;
-                   }
-               }
+             clib_warning("bad padding");
+             vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                          ESP_DECRYPT_ERROR_DECRYPTION_FAILED,
+                                          1);
+             goto trace;
            }
-
-         if (PREDICT_TRUE (tunnel_mode))
+#endif
+         if (sa0->is_tunnel)
            {
-             if (PREDICT_TRUE(f0->next_header == IP_PROTOCOL_IP_IN_IP))
+             if (f0->next_header == IP_PROTOCOL_IP_IN_IP)
                next0 = ESP_DECRYPT_NEXT_IP4_INPUT;
-             else if (f0->next_header == IP_PROTOCOL_IPV6)
+             else if (sa0->is_tunnel_ip6 && f0->next_header == IP_PROTOCOL_IPV6)
                next0 = ESP_DECRYPT_NEXT_IP6_INPUT;
              else
                {
@@ -488,11 +527,31 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
                  goto trace;
                }
            }
-         /* transport mode */
-         else
+         else /* transport mode */
            {
-             if (PREDICT_FALSE(transport_ip6))
+             if ((ih4->ip_version_and_header_length & 0xF0) == 0x40)
                {
+                 u16 ih4_len = ip4_header_bytes (ih4);
+                 vlib_buffer_advance (b0, - ih4_len);
+                 oh4 = vlib_buffer_get_current (b0);
+                 memmove(oh4, ih4, ih4_len);
+
+                 next0 = ESP_DECRYPT_NEXT_IP4_INPUT;
+                 u16 old_ttl_prot =
+                   ((u16) oh4->ttl) << 8 | (u16) oh4->protocol;
+                 u16 new_ttl_prot =
+                   ((u16) oh4->ttl) << 8 | (u16) f0->next_header;
+                 oh4->protocol = f0->next_header;
+                 u16 new_len = clib_host_to_net_u16 (b0->current_length);
+                 oh4->length = new_len;
+                 /* rfc1624 incremental checksum update */
+                 oh4->checksum = ~(~oh4->checksum + ~oh4->length + new_len +
+                                   ~old_ttl_prot + new_ttl_prot);
+
+               }
+             else if ((ih4->ip_version_and_header_length & 0xF0) == 0x60)
+               {
+                 /* FIXME find ip header */
                  ih6 = (ip6_header_t *) (b0->data + sizeof(ethernet_header_t));
                  vlib_buffer_advance (b0, -sizeof(ip6_header_t));
                  oh6 = vlib_buffer_get_current (b0);
@@ -500,36 +559,29 @@ dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm,
 
                  next0 = ESP_DECRYPT_NEXT_IP6_INPUT;
                  oh6->protocol = f0->next_header;
-                 oh6->payload_length =
-                     clib_host_to_net_u16 (
-                         vlib_buffer_length_in_chain(vm, b0) -
-                         sizeof (ip6_header_t));
+                 u16 len = b0->current_length - sizeof (ip6_header_t);
+                 oh6->payload_length = clib_host_to_net_u16 (len);
                }
              else
                {
-                 vlib_buffer_advance (b0, -sizeof(ip4_header_t));
-                 oh4 = vlib_buffer_get_current (b0);
-                 memmove(oh4, ih4, sizeof(ip4_header_t));
-
-                 next0 = ESP_DECRYPT_NEXT_IP4_INPUT;
-                 oh4->ip_version_and_header_length = 0x45;
-                 oh4->fragment_id = 0;
-                 oh4->flags_and_fragment_offset = 0;
-                 oh4->protocol = f0->next_header;
-                 oh4->length = clib_host_to_net_u16 (
-                     vlib_buffer_length_in_chain (vm, b0));
-                 oh4->checksum = ip4_header_checksum (oh4);
+                 clib_warning("next header: 0x%x", f0->next_header);
+                 vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index,
+                                              ESP_DECRYPT_ERROR_DECRYPTION_FAILED,
+                                              1);
+                 goto trace;
                }
            }
 
          vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32)~0;
 
-trace:
+       trace:
          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
            {
              esp_decrypt_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
              tr->crypto_alg = sa0->crypto_alg;
              tr->integ_alg = sa0->integ_alg;
+             ih4 = vlib_buffer_get_current (b0);
+             clib_memcpy (tr->packet_data, ih4, sizeof (ip6_header_t));
            }
 
          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
@@ -537,6 +589,7 @@ trace:
        }
       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
     }
+
   vlib_node_increment_counter (vm, dpdk_esp_decrypt_post_node.index,
                               ESP_DECRYPT_POST_ERROR_PKTS,
                               from_frame->n_vectors);