classify: performance improvements in classifiers
author Ray Kinsella <mdr@ashroe.eu>
Thu, 12 Mar 2020 15:52:41 +0000 (15:52 +0000)
committer Damjan Marion <dmarion@me.com>
Mon, 28 Sep 2020 16:40:56 +0000 (16:40 +0000)
Reworked the code to reduce line fill buffer pressure. Replaced the existing
complex hand-unrolling with simpler code that the compiler can unroll
effectively. Updated the code to use vlib_get_buffers &
vlib_buffer_enqueue_to_next.

Type: improvement

Signed-off-by: Ray Kinsella <mdr@ashroe.eu>
Change-Id: I7dca7515ba91672eaf50a6eecd13811210cf0006
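
Below is a minimal, self-contained sketch of the software-pipelining pattern
the rewrite adopts. It is not VPP code: toy_hash and process are illustrative
stand-ins for vnet_classify_hash_packet_inline and the node loop. Slots
[2]/[3] hold the precomputed lookahead pair and slots [0]/[1] the pair being
consumed, which lets small arrays replace the old hand-unrolled scalars:

    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-in for per-packet hashing. */
    static uint64_t
    toy_hash (uint32_t x)
    {
      return x * 0x9e3779b97f4a7c15ULL;
    }

    /* Process items in pairs; hash[2]/hash[3] always hold the
     * lookahead pair, hash[0]/hash[1] the pair being consumed. */
    static void
    process (const uint32_t *item, uint64_t *out, int n_left)
    {
      uint64_t hash[4];

      /* prologue: prime the lookahead slots for item[0] & item[1] */
      if (n_left >= 2)
        {
          hash[2] = toy_hash (item[0]);
          hash[3] = toy_hash (item[1]);
        }

      while (n_left >= 2)
        {
          /* rotate: the lookahead pair becomes the current pair */
          hash[0] = hash[2];
          hash[1] = hash[3];

          /* refill the lookahead slots for the next pair, if any */
          if (n_left >= 4)
            {
              hash[2] = toy_hash (item[2]);
              hash[3] = toy_hash (item[3]);
            }

          /* consume the current pair */
          out[0] = hash[0];
          out[1] = hash[1];

          item += 2;
          out += 2;
          n_left -= 2;
        }

      /* single-item tail */
      while (n_left > 0)
        {
          out[0] = toy_hash (item[0]);
          item++;
          out++;
          n_left--;
        }
    }

    int
    main (void)
    {
      uint32_t in[5] = { 1, 2, 3, 4, 5 };
      uint64_t out[5];
      process (in, out, 5);
      for (int i = 0; i < 5; i++)
        printf ("%llu\n", (unsigned long long) out[i]);
      return 0;
    }

In the patch the same rotation also carries h[], t[], sw_if_index[] and
table_index[], and the refill step doubles as the prefetch window.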

src/vnet/ip/ip_in_out_acl.c

index 8f550e2..2f73e13 100644
@@ -22,7 +22,8 @@ typedef struct
   u32 next_index;
   u32 table_index;
   u32 offset;
-} ip_in_out_acl_trace_t;
+}
+ip_in_out_acl_trace_t;
 
 /* packet trace format function */
 static u8 *
@@ -71,7 +72,8 @@ typedef enum
   foreach_ip_inacl_error
 #undef _
     IP_INACL_N_ERROR,
-} ip_inacl_error_t;
+}
+ip_inacl_error_t;
 
 static char *ip_inacl_error_strings[] = {
 #define _(sym,string) string,
@@ -85,7 +87,8 @@ typedef enum
   foreach_ip_outacl_error
 #undef _
     IP_OUTACL_N_ERROR,
-} ip_outacl_error_t;
+}
+ip_outacl_error_t;
 
 static char *ip_outacl_error_strings[] = {
 #define _(sym,string) string,
@@ -93,13 +96,12 @@ static char *ip_outacl_error_strings[] = {
 #undef _
 };
 
-static inline uword
+static_always_inline void
 ip_in_out_acl_inline (vlib_main_t * vm,
-                     vlib_node_runtime_t * node, vlib_frame_t * frame,
-                     int is_ip4, int is_output)
+                     vlib_node_runtime_t * node, vlib_buffer_t ** b,
+                     u16 * next, u32 n_left, int is_ip4, int is_output,
+                     int do_trace)
 {
-  u32 n_left_from, *from, *to_next;
-  acl_next_index_t next_index;
   in_out_acl_main_t *am = &in_out_acl_main;
   vnet_classify_main_t *vcm = am->vnet_classify_main;
   f64 now = vlib_time_now (vm);
@@ -110,6 +112,12 @@ ip_in_out_acl_inline (vlib_main_t * vm,
   vlib_node_runtime_t *error_node;
   u32 n_next_nodes;
 
+  u8 *h[4];
+  u32 sw_if_index[4];
+  u32 table_index[4];
+  vnet_classify_table_t *t[4] = { 0, 0 };
+  u64 hash[4];
+
   n_next_nodes = node->n_next_nodes;
 
   if (is_ip4)
@@ -123,355 +131,640 @@ ip_in_out_acl_inline (vlib_main_t * vm,
       error_node = vlib_node_get_runtime (vm, ip6_input_node.index);
     }
 
-  from = vlib_frame_vector_args (frame);
-  n_left_from = frame->n_vectors;
+  /* calculate hashes for b[0] & b[1] */
+  if (n_left >= 2)
+    {
+      sw_if_index[2] =
+       vnet_buffer (b[0])->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
+      sw_if_index[3] =
+       vnet_buffer (b[1])->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
+
+      table_index[2] =
+       am->classify_table_index_by_sw_if_index[is_output][tid]
+       [sw_if_index[2]];
+      table_index[3] =
+       am->classify_table_index_by_sw_if_index[is_output][tid]
+       [sw_if_index[3]];
+
+      t[2] = pool_elt_at_index (vcm->tables, table_index[2]);
+      t[3] = pool_elt_at_index (vcm->tables, table_index[3]);
+
+      if (t[2]->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
+       h[2] =
+         (void *) vlib_buffer_get_current (b[0]) + t[2]->current_data_offset;
+      else
+       h[2] = b[0]->data;
+
+      if (t[3]->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
+       h[3] =
+         (void *) vlib_buffer_get_current (b[1]) + t[3]->current_data_offset;
+      else
+       h[3] = b[1]->data;
+
+      if (is_output)
+       {
+         /* Save the rewrite length, since we are using the l2_classify struct */
+         vnet_buffer (b[0])->l2_classify.pad.l2_len =
+           vnet_buffer (b[0])->ip.save_rewrite_length;
+         /* advance the match pointer so the matching happens on IP header */
+         h[2] += vnet_buffer (b[0])->l2_classify.pad.l2_len;
+
+         /* Save the rewrite length, since we are using the l2_classify struct */
+         vnet_buffer (b[1])->l2_classify.pad.l2_len =
+           vnet_buffer (b[1])->ip.save_rewrite_length;
+         /* advance the match pointer so the matching happens on IP header */
+         h[3] += vnet_buffer (b[1])->l2_classify.pad.l2_len;
+       }
 
-  /* First pass: compute hashes */
+      hash[2] = vnet_classify_hash_packet_inline (t[2], (u8 *) h[2]);
+      hash[3] = vnet_classify_hash_packet_inline (t[3], (u8 *) h[3]);
 
-  while (n_left_from > 2)
+      vnet_buffer (b[0])->l2_classify.hash = hash[2];
+      vnet_buffer (b[1])->l2_classify.hash = hash[3];
+
+      vnet_buffer (b[0])->l2_classify.table_index = table_index[2];
+      vnet_buffer (b[1])->l2_classify.table_index = table_index[3];
+
+      vnet_buffer (b[0])->l2_classify.opaque_index = ~0;
+      vnet_buffer (b[1])->l2_classify.opaque_index = ~0;
+
+      vnet_classify_prefetch_bucket (t[2],
+                                    vnet_buffer (b[0])->l2_classify.hash);
+      vnet_classify_prefetch_bucket (t[3],
+                                    vnet_buffer (b[1])->l2_classify.hash);
+    }
+
+  while (n_left >= 2)
     {
-      vlib_buffer_t *b0, *b1;
-      u32 bi0, bi1;
-      u8 *h0, *h1;
-      u32 sw_if_index0, sw_if_index1;
-      u32 table_index0, table_index1;
-      vnet_classify_table_t *t0, *t1;
+      vnet_classify_entry_t *e[2] = { 0, 0 };
+      u32 _next[2] = { ACL_NEXT_INDEX_DENY, ACL_NEXT_INDEX_DENY };
+      u8 error[2];
+
+      h[0] = h[2];
+      h[1] = h[3];
+      t[0] = t[2];
+      t[1] = t[3];
+
+      sw_if_index[0] = sw_if_index[2];
+      sw_if_index[1] = sw_if_index[3];
+
+      table_index[0] = table_index[2];
+      table_index[1] = table_index[3];
+
+      hash[0] = hash[2];
+      hash[1] = hash[3];
 
       /* prefetch next iteration */
-      {
-       vlib_buffer_t *p1, *p2;
+      if (n_left >= 6)
+       {
+         vlib_prefetch_buffer_header (b[4], LOAD);
+         vlib_prefetch_buffer_header (b[5], LOAD);
 
-       p1 = vlib_get_buffer (vm, from[1]);
-       p2 = vlib_get_buffer (vm, from[2]);
+         CLIB_PREFETCH (b[4]->data, CLIB_CACHE_LINE_BYTES, LOAD);
+         CLIB_PREFETCH (b[5]->data, CLIB_CACHE_LINE_BYTES, LOAD);
+       }
 
-       vlib_prefetch_buffer_header (p1, STORE);
-       CLIB_PREFETCH (p1->data, CLIB_CACHE_LINE_BYTES, STORE);
-       vlib_prefetch_buffer_header (p2, STORE);
-       CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
-      }
+      /* calculate hashes for b[2] & b[3] */
+      if (n_left >= 4)
+       {
+         sw_if_index[2] =
+           vnet_buffer (b[2])->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
+         sw_if_index[3] =
+           vnet_buffer (b[3])->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
+
+         table_index[2] =
+           am->classify_table_index_by_sw_if_index[is_output][tid]
+           [sw_if_index[2]];
+         table_index[3] =
+           am->classify_table_index_by_sw_if_index[is_output][tid]
+           [sw_if_index[3]];
+
+         t[2] = pool_elt_at_index (vcm->tables, table_index[2]);
+         t[3] = pool_elt_at_index (vcm->tables, table_index[3]);
+
+         if (t[2]->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
+           h[2] =
+             (void *) vlib_buffer_get_current (b[2]) +
+             t[2]->current_data_offset;
+         else
+           h[2] = b[2]->data;
+
+         if (t[3]->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
+           h[3] =
+             (void *) vlib_buffer_get_current (b[3]) +
+             t[3]->current_data_offset;
+         else
+           h[3] = b[3]->data;
+
+         if (is_output)
+           {
+             /* Save the rewrite length, since we are using the l2_classify struct */
+             vnet_buffer (b[2])->l2_classify.pad.l2_len =
+               vnet_buffer (b[2])->ip.save_rewrite_length;
+             /* advance the match pointer so the matching happens on IP header */
+             h[2] += vnet_buffer (b[2])->l2_classify.pad.l2_len;
 
-      bi0 = from[0];
-      b0 = vlib_get_buffer (vm, bi0);
+             /* Save the rewrite length, since we are using the l2_classify struct */
+             vnet_buffer (b[3])->l2_classify.pad.l2_len =
+               vnet_buffer (b[3])->ip.save_rewrite_length;
+             /* advance the match pointer so the matching happens on IP header */
+             h[3] += vnet_buffer (b[3])->l2_classify.pad.l2_len;
+           }
 
-      bi1 = from[1];
-      b1 = vlib_get_buffer (vm, bi1);
+         hash[2] = vnet_classify_hash_packet_inline (t[2], (u8 *) h[2]);
+         hash[3] = vnet_classify_hash_packet_inline (t[3], (u8 *) h[3]);
 
-      sw_if_index0 =
-       vnet_buffer (b0)->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
-      table_index0 =
-       am->classify_table_index_by_sw_if_index[is_output][tid][sw_if_index0];
+         vnet_buffer (b[2])->l2_classify.hash = hash[2];
+         vnet_buffer (b[3])->l2_classify.hash = hash[3];
 
-      sw_if_index1 =
-       vnet_buffer (b1)->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
-      table_index1 =
-       am->classify_table_index_by_sw_if_index[is_output][tid][sw_if_index1];
+         vnet_buffer (b[2])->l2_classify.table_index = table_index[2];
+         vnet_buffer (b[3])->l2_classify.table_index = table_index[3];
 
-      t0 = pool_elt_at_index (vcm->tables, table_index0);
+         vnet_buffer (b[2])->l2_classify.opaque_index = ~0;
+         vnet_buffer (b[3])->l2_classify.opaque_index = ~0;
 
-      t1 = pool_elt_at_index (vcm->tables, table_index1);
+         vnet_classify_prefetch_bucket (t[2],
+                                        vnet_buffer (b[2])->
+                                        l2_classify.hash);
+         vnet_classify_prefetch_bucket (t[3],
+                                        vnet_buffer (b[3])->
+                                        l2_classify.hash);
+       }
 
-      if (t0->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
-       h0 = (void *) vlib_buffer_get_current (b0) + t0->current_data_offset;
-      else
-       h0 = b0->data;
+      /* find entry for b[0] & b[1] */
+      vnet_get_config_data (am->vnet_config_main[is_output][tid],
+                           &b[0]->current_config_index, &_next[0],
+                           /* # bytes of config data */ 0);
+      vnet_get_config_data (am->vnet_config_main[is_output][tid],
+                           &b[1]->current_config_index, &_next[1],
+                           /* # bytes of config data */ 0);
 
-      if (is_output)
+      if (PREDICT_TRUE (table_index[0] != ~0))
        {
-         /* Save the rewrite length, since we are using the l2_classify struct */
-         vnet_buffer (b0)->l2_classify.pad.l2_len =
-           vnet_buffer (b0)->ip.save_rewrite_length;
-         /* advance the match pointer so the matching happens on IP header */
-         h0 += vnet_buffer (b0)->l2_classify.pad.l2_len;
+         e[0] =
+           vnet_classify_find_entry_inline (t[0], (u8 *) h[0], hash[0], now);
+         if (e[0])
+           {
+             vnet_buffer (b[0])->l2_classify.opaque_index
+               = e[0]->opaque_index;
+             vlib_buffer_advance (b[0], e[0]->advance);
+
+             _next[0] = (e[0]->next_index < n_next_nodes) ?
+               e[0]->next_index : _next[0];
+
+             hits++;
+
+             if (is_ip4)
+               error[0] = (_next[0] == ACL_NEXT_INDEX_DENY) ?
+                 (is_output ? IP4_ERROR_OUTACL_SESSION_DENY :
+                  IP4_ERROR_INACL_SESSION_DENY) : IP4_ERROR_NONE;
+             else
+               error[0] = (_next[0] == ACL_NEXT_INDEX_DENY) ?
+                 (is_output ? IP6_ERROR_OUTACL_SESSION_DENY :
+                  IP6_ERROR_INACL_SESSION_DENY) : IP6_ERROR_NONE;
+             b[0]->error = error_node->errors[error[0]];
+
+             if (!is_output)
+               {
+                 if (e[0]->action == CLASSIFY_ACTION_SET_IP4_FIB_INDEX ||
+                     e[0]->action == CLASSIFY_ACTION_SET_IP6_FIB_INDEX)
+                   vnet_buffer (b[0])->sw_if_index[VLIB_TX] = e[0]->metadata;
+                 else if (e[0]->action == CLASSIFY_ACTION_SET_METADATA)
+                   vnet_buffer (b[0])->ip.adj_index[VLIB_TX] =
+                     e[0]->metadata;
+               }
+           }
+         else
+           {
+             while (1)
+               {
+                 if (PREDICT_TRUE (t[0]->next_table_index != ~0))
+                   t[0] = pool_elt_at_index (vcm->tables,
+                                             t[0]->next_table_index);
+                 else
+                   {
+                     _next[0] = (t[0]->miss_next_index < n_next_nodes) ?
+                       t[0]->miss_next_index : _next[0];
+
+                     misses++;
+
+                     if (is_ip4)
+                       error[0] = (_next[0] == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP4_ERROR_OUTACL_TABLE_MISS :
+                          IP4_ERROR_INACL_TABLE_MISS) : IP4_ERROR_NONE;
+                     else
+                       error[0] = (_next[0] == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP6_ERROR_OUTACL_TABLE_MISS :
+                          IP6_ERROR_INACL_TABLE_MISS) : IP6_ERROR_NONE;
+                     b[0]->error = error_node->errors[error[0]];
+                     break;
+                   }
+
+                 if (t[0]->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
+                   h[0] =
+                     (void *) vlib_buffer_get_current (b[0]) +
+                     t[0]->current_data_offset;
+                 else
+                   h[0] = b[0]->data;
+
+                 /* advance the match pointer so the matching happens on IP header */
+                 if (is_output)
+                   h[0] += vnet_buffer (b[0])->l2_classify.pad.l2_len;
+
+                 hash[0] =
+                   vnet_classify_hash_packet_inline (t[0], (u8 *) h[0]);
+                 e[0] =
+                   vnet_classify_find_entry_inline (t[0], (u8 *) h[0],
+                                                    hash[0], now);
+                 if (e[0])
+                   {
+                     vnet_buffer (b[0])->l2_classify.opaque_index
+                       = e[0]->opaque_index;
+                     vlib_buffer_advance (b[0], e[0]->advance);
+                     _next[0] = (e[0]->next_index < n_next_nodes) ?
+                       e[0]->next_index : _next[0];
+                     hits++;
+                     chain_hits++;
+
+                     if (is_ip4)
+                       error[0] = (_next[0] == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP4_ERROR_OUTACL_SESSION_DENY :
+                          IP4_ERROR_INACL_SESSION_DENY) : IP4_ERROR_NONE;
+                     else
+                       error[0] = (_next[0] == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP6_ERROR_OUTACL_SESSION_DENY :
+                          IP6_ERROR_INACL_SESSION_DENY) : IP6_ERROR_NONE;
+                     b[0]->error = error_node->errors[error[0]];
+
+                     if (!is_output)
+                       {
+                         if (e[0]->action ==
+                             CLASSIFY_ACTION_SET_IP4_FIB_INDEX
+                             || e[0]->action ==
+                             CLASSIFY_ACTION_SET_IP6_FIB_INDEX)
+                           vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
+                             e[0]->metadata;
+                         else if (e[0]->action ==
+                                  CLASSIFY_ACTION_SET_METADATA)
+                           vnet_buffer (b[0])->ip.adj_index[VLIB_TX] =
+                             e[0]->metadata;
+                       }
+                     break;
+                   }
+               }
+           }
        }
 
-      vnet_buffer (b0)->l2_classify.hash =
-       vnet_classify_hash_packet (t0, (u8 *) h0);
+      if (PREDICT_TRUE (table_index[1] != ~0))
+       {
+         e[1] =
+           vnet_classify_find_entry_inline (t[1], (u8 *) h[1], hash[1], now);
+         if (e[1])
+           {
+             vnet_buffer (b[1])->l2_classify.opaque_index
+               = e[1]->opaque_index;
+             vlib_buffer_advance (b[1], e[1]->advance);
 
-      vnet_classify_prefetch_bucket (t0, vnet_buffer (b0)->l2_classify.hash);
+             _next[1] = (e[1]->next_index < n_next_nodes) ?
+               e[1]->next_index : _next[1];
 
-      if (t1->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
-       h1 = (void *) vlib_buffer_get_current (b1) + t1->current_data_offset;
-      else
-       h1 = b1->data;
+             hits++;
 
-      if (is_output)
+             if (is_ip4)
+               error[1] = (_next[1] == ACL_NEXT_INDEX_DENY) ?
+                 (is_output ? IP4_ERROR_OUTACL_SESSION_DENY :
+                  IP4_ERROR_INACL_SESSION_DENY) : IP4_ERROR_NONE;
+             else
+               error[1] = (_next[1] == ACL_NEXT_INDEX_DENY) ?
+                 (is_output ? IP6_ERROR_OUTACL_SESSION_DENY :
+                  IP6_ERROR_INACL_SESSION_DENY) : IP6_ERROR_NONE;
+             b[1]->error = error_node->errors[error[1]];
+
+             if (!is_output)
+               {
+                 if (e[1]->action == CLASSIFY_ACTION_SET_IP4_FIB_INDEX ||
+                     e[1]->action == CLASSIFY_ACTION_SET_IP6_FIB_INDEX)
+                   vnet_buffer (b[1])->sw_if_index[VLIB_TX] = e[1]->metadata;
+                 else if (e[1]->action == CLASSIFY_ACTION_SET_METADATA)
+                   vnet_buffer (b[1])->ip.adj_index[VLIB_TX] =
+                     e[1]->metadata;
+               }
+           }
+         else
+           {
+             while (1)
+               {
+                 if (PREDICT_TRUE (t[1]->next_table_index != ~0))
+                   t[1] = pool_elt_at_index (vcm->tables,
+                                             t[1]->next_table_index);
+                 else
+                   {
+                     _next[1] = (t[1]->miss_next_index < n_next_nodes) ?
+                       t[1]->miss_next_index : _next[1];
+
+                     misses++;
+
+                     if (is_ip4)
+                       error[1] = (_next[1] == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP4_ERROR_OUTACL_TABLE_MISS :
+                          IP4_ERROR_INACL_TABLE_MISS) : IP4_ERROR_NONE;
+                     else
+                       error[1] = (_next[1] == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP6_ERROR_OUTACL_TABLE_MISS :
+                          IP6_ERROR_INACL_TABLE_MISS) : IP6_ERROR_NONE;
+                     b[1]->error = error_node->errors[error[1]];
+                     break;
+                   }
+
+                 if (t[1]->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
+                   h[1] =
+                     (void *) vlib_buffer_get_current (b[1]) +
+                     t[1]->current_data_offset;
+                 else
+                   h[1] = b[1]->data;
+
+                 /* advance the match pointer so the matching happens on IP header */
+                 if (is_output)
+                   h[1] += vnet_buffer (b[1])->l2_classify.pad.l2_len;
+
+                 hash[1] =
+                   vnet_classify_hash_packet_inline (t[1], (u8 *) h[1]);
+                 e[1] =
+                   vnet_classify_find_entry_inline (t[1], (u8 *) h[1],
+                                                    hash[1], now);
+                 if (e[1])
+                   {
+                     vnet_buffer (b[1])->l2_classify.opaque_index
+                       = e[1]->opaque_index;
+                     vlib_buffer_advance (b[1], e[1]->advance);
+                     _next[1] = (e[1]->next_index < n_next_nodes) ?
+                       e[1]->next_index : _next[1];
+                     hits++;
+                     chain_hits++;
+
+                     if (is_ip4)
+                       error[1] = (_next[1] == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP4_ERROR_OUTACL_SESSION_DENY :
+                          IP4_ERROR_INACL_SESSION_DENY) : IP4_ERROR_NONE;
+                     else
+                       error[1] = (_next[1] == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP6_ERROR_OUTACL_SESSION_DENY :
+                          IP6_ERROR_INACL_SESSION_DENY) : IP6_ERROR_NONE;
+                     b[1]->error = error_node->errors[error[1]];
+
+                     if (!is_output)
+                       {
+                         if (e[1]->action ==
+                             CLASSIFY_ACTION_SET_IP4_FIB_INDEX
+                             || e[1]->action ==
+                             CLASSIFY_ACTION_SET_IP6_FIB_INDEX)
+                           vnet_buffer (b[1])->sw_if_index[VLIB_TX] =
+                             e[1]->metadata;
+                         else if (e[1]->action ==
+                                  CLASSIFY_ACTION_SET_METADATA)
+                           vnet_buffer (b[1])->ip.adj_index[VLIB_TX] =
+                             e[1]->metadata;
+                       }
+                     break;
+                   }
+               }
+           }
+       }
+
+      if (do_trace && b[0]->flags & VLIB_BUFFER_IS_TRACED)
        {
-         /* Save the rewrite length, since we are using the l2_classify struct */
-         vnet_buffer (b1)->l2_classify.pad.l2_len =
-           vnet_buffer (b1)->ip.save_rewrite_length;
-         /* advance the match pointer so the matching happens on IP header */
-         h1 += vnet_buffer (b1)->l2_classify.pad.l2_len;
+         ip_in_out_acl_trace_t *_t =
+           vlib_add_trace (vm, node, b[0], sizeof (*_t));
+         _t->sw_if_index =
+           vnet_buffer (b[0])->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
+         _t->next_index = _next[0];
+         _t->table_index = t[0] ? t[0] - vcm->tables : ~0;
+         _t->offset = (e[0]
+                       && t[0]) ? vnet_classify_get_offset (t[0], e[0]) : ~0;
        }
 
-      vnet_buffer (b1)->l2_classify.hash =
-       vnet_classify_hash_packet (t1, (u8 *) h1);
+      if (do_trace && b[1]->flags & VLIB_BUFFER_IS_TRACED)
+       {
+         ip_in_out_acl_trace_t *_t =
+           vlib_add_trace (vm, node, b[1], sizeof (*_t));
+         _t->sw_if_index =
+           vnet_buffer (b[1])->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
+         _t->next_index = _next[1];
+         _t->table_index = t[1] ? t[1] - vcm->tables : ~0;
+         _t->offset = (e[1]
+                       && t[1]) ? vnet_classify_get_offset (t[1], e[1]) : ~0;
+       }
 
-      vnet_classify_prefetch_bucket (t1, vnet_buffer (b1)->l2_classify.hash);
+      if ((_next[0] == ACL_NEXT_INDEX_DENY) && is_output)
+       {
+         /* on output, for the drop node to work properly, go back to ip header */
+         vlib_buffer_advance (b[0], vnet_buffer (b[0])->l2.l2_len);
+       }
 
-      vnet_buffer (b0)->l2_classify.table_index = table_index0;
+      if ((_next[1] == ACL_NEXT_INDEX_DENY) && is_output)
+       {
+         /* on output, for the drop node to work properly, go back to ip header */
+         vlib_buffer_advance (b[1], vnet_buffer (b[1])->l2.l2_len);
+       }
 
-      vnet_buffer (b1)->l2_classify.table_index = table_index1;
+      next[0] = _next[0];
+      next[1] = _next[1];
 
-      from += 2;
-      n_left_from -= 2;
+      /* next */
+      next += 2;
+      b += 2;
+      n_left -= 2;
     }
 
-  while (n_left_from > 0)
+  while (n_left > 0)
     {
-      vlib_buffer_t *b0;
-      u32 bi0;
       u8 *h0;
       u32 sw_if_index0;
       u32 table_index0;
-      vnet_classify_table_t *t0;
-
-      bi0 = from[0];
-      b0 = vlib_get_buffer (vm, bi0);
+      vnet_classify_table_t *t0 = 0;
+      vnet_classify_entry_t *e0 = 0;
+      u32 next0 = ACL_NEXT_INDEX_DENY;
+      u64 hash0;
+      u8 error0;
 
       sw_if_index0 =
-       vnet_buffer (b0)->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
+       vnet_buffer (b[0])->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
       table_index0 =
        am->classify_table_index_by_sw_if_index[is_output][tid][sw_if_index0];
 
       t0 = pool_elt_at_index (vcm->tables, table_index0);
 
       if (t0->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
-       h0 = (void *) vlib_buffer_get_current (b0) + t0->current_data_offset;
+       h0 =
+         (void *) vlib_buffer_get_current (b[0]) + t0->current_data_offset;
       else
-       h0 = b0->data;
+       h0 = b[0]->data;
 
       if (is_output)
        {
          /* Save the rewrite length, since we are using the l2_classify struct */
-         vnet_buffer (b0)->l2_classify.pad.l2_len =
-           vnet_buffer (b0)->ip.save_rewrite_length;
+         vnet_buffer (b[0])->l2_classify.pad.l2_len =
+           vnet_buffer (b[0])->ip.save_rewrite_length;
          /* advance the match pointer so the matching happens on IP header */
-         h0 += vnet_buffer (b0)->l2_classify.pad.l2_len;
+         h0 += vnet_buffer (b[0])->l2_classify.pad.l2_len;
        }
 
-      vnet_buffer (b0)->l2_classify.hash =
+      vnet_buffer (b[0])->l2_classify.hash =
        vnet_classify_hash_packet (t0, (u8 *) h0);
 
-      vnet_buffer (b0)->l2_classify.table_index = table_index0;
-      vnet_classify_prefetch_bucket (t0, vnet_buffer (b0)->l2_classify.hash);
+      vnet_buffer (b[0])->l2_classify.table_index = table_index0;
+      vnet_buffer (b[0])->l2_classify.opaque_index = ~0;
 
-      from++;
-      n_left_from--;
-    }
+      vnet_get_config_data (am->vnet_config_main[is_output][tid],
+                           &b[0]->current_config_index, &next0,
+                           /* # bytes of config data */ 0);
 
-  next_index = node->cached_next_index;
-  from = vlib_frame_vector_args (frame);
-  n_left_from = frame->n_vectors;
+      if (PREDICT_TRUE (table_index0 != ~0))
+       {
+         hash0 = vnet_buffer (b[0])->l2_classify.hash;
+         t0 = pool_elt_at_index (vcm->tables, table_index0);
 
-  while (n_left_from > 0)
-    {
-      u32 n_left_to_next;
+         if (t0->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
+           h0 =
+             (void *) vlib_buffer_get_current (b[0]) +
+             t0->current_data_offset;
+         else
+           h0 = b[0]->data;
 
-      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+         /* advance the match pointer so the matching happens on IP header */
+         if (is_output)
+           h0 += vnet_buffer (b[0])->l2_classify.pad.l2_len;
 
-      /* Not enough load/store slots to dual loop... */
-      while (n_left_from > 0 && n_left_to_next > 0)
-       {
-         u32 bi0;
-         vlib_buffer_t *b0;
-         u32 next0 = ACL_NEXT_INDEX_DENY;
-         u32 table_index0;
-         vnet_classify_table_t *t0;
-         vnet_classify_entry_t *e0;
-         u64 hash0;
-         u8 *h0;
-         u8 error0;
-
-         /* Stride 3 seems to work best */
-         if (PREDICT_TRUE (n_left_from > 3))
+         e0 = vnet_classify_find_entry_inline (t0, (u8 *) h0, hash0, now);
+         if (e0)
            {
-             vlib_buffer_t *p1 = vlib_get_buffer (vm, from[3]);
-             vnet_classify_table_t *tp1;
-             u32 table_index1;
-             u64 phash1;
+             vnet_buffer (b[0])->l2_classify.opaque_index = e0->opaque_index;
+             vlib_buffer_advance (b[0], e0->advance);
 
-             table_index1 = vnet_buffer (p1)->l2_classify.table_index;
+             next0 = (e0->next_index < n_next_nodes) ?
+               e0->next_index : next0;
+
+             hits++;
+
+             if (is_ip4)
+               error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
+                 (is_output ? IP4_ERROR_OUTACL_SESSION_DENY :
+                  IP4_ERROR_INACL_SESSION_DENY) : IP4_ERROR_NONE;
+             else
+               error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
+                 (is_output ? IP6_ERROR_OUTACL_SESSION_DENY :
+                  IP6_ERROR_INACL_SESSION_DENY) : IP6_ERROR_NONE;
+             b[0]->error = error_node->errors[error0];
 
-             if (PREDICT_TRUE (table_index1 != ~0))
+             if (!is_output)
                {
-                 tp1 = pool_elt_at_index (vcm->tables, table_index1);
-                 phash1 = vnet_buffer (p1)->l2_classify.hash;
-                 vnet_classify_prefetch_entry (tp1, phash1);
+                 if (e0->action == CLASSIFY_ACTION_SET_IP4_FIB_INDEX ||
+                     e0->action == CLASSIFY_ACTION_SET_IP6_FIB_INDEX)
+                   vnet_buffer (b[0])->sw_if_index[VLIB_TX] = e0->metadata;
+                 else if (e0->action == CLASSIFY_ACTION_SET_METADATA)
+                   vnet_buffer (b[0])->ip.adj_index[VLIB_TX] = e0->metadata;
                }
            }
-
-
-         /* speculatively enqueue b0 to the current next frame */
-         bi0 = from[0];
-         to_next[0] = bi0;
-         from += 1;
-         to_next += 1;
-         n_left_from -= 1;
-         n_left_to_next -= 1;
-
-         b0 = vlib_get_buffer (vm, bi0);
-         table_index0 = vnet_buffer (b0)->l2_classify.table_index;
-         e0 = 0;
-         t0 = 0;
-         vnet_get_config_data (am->vnet_config_main[is_output][tid],
-                               &b0->current_config_index, &next0,
-                               /* # bytes of config data */ 0);
-
-         vnet_buffer (b0)->l2_classify.opaque_index = ~0;
-
-         if (PREDICT_TRUE (table_index0 != ~0))
+         else
            {
-             hash0 = vnet_buffer (b0)->l2_classify.hash;
-             t0 = pool_elt_at_index (vcm->tables, table_index0);
-
-             if (t0->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
-               h0 =
-                 (void *) vlib_buffer_get_current (b0) +
-                 t0->current_data_offset;
-             else
-               h0 = b0->data;
-
-             /* advance the match pointer so the matching happens on IP header */
-             if (is_output)
-               h0 += vnet_buffer (b0)->l2_classify.pad.l2_len;
-
-             e0 = vnet_classify_find_entry (t0, (u8 *) h0, hash0, now);
-             if (e0)
+             while (1)
                {
-                 vnet_buffer (b0)->l2_classify.opaque_index
-                   = e0->opaque_index;
-                 vlib_buffer_advance (b0, e0->advance);
+                 if (PREDICT_TRUE (t0->next_table_index != ~0))
+                   t0 =
+                     pool_elt_at_index (vcm->tables, t0->next_table_index);
+                 else
+                   {
+                     next0 = (t0->miss_next_index < n_next_nodes) ?
+                       t0->miss_next_index : next0;
 
-                 next0 = (e0->next_index < n_next_nodes) ?
-                   e0->next_index : next0;
+                     misses++;
 
-                 hits++;
+                     if (is_ip4)
+                       error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP4_ERROR_OUTACL_TABLE_MISS :
+                          IP4_ERROR_INACL_TABLE_MISS) : IP4_ERROR_NONE;
+                     else
+                       error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP6_ERROR_OUTACL_TABLE_MISS :
+                          IP6_ERROR_INACL_TABLE_MISS) : IP6_ERROR_NONE;
+                     b[0]->error = error_node->errors[error0];
+                     break;
+                   }
 
-                 if (is_ip4)
-                   error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
-                     (is_output ? IP4_ERROR_OUTACL_SESSION_DENY :
-                      IP4_ERROR_INACL_SESSION_DENY) : IP4_ERROR_NONE;
+                 if (t0->current_data_flag == CLASSIFY_FLAG_USE_CURR_DATA)
+                   h0 =
+                     (void *) vlib_buffer_get_current (b[0]) +
+                     t0->current_data_offset;
                  else
-                   error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
-                     (is_output ? IP6_ERROR_OUTACL_SESSION_DENY :
-                      IP6_ERROR_INACL_SESSION_DENY) : IP6_ERROR_NONE;
-                 b0->error = error_node->errors[error0];
+                   h0 = b[0]->data;
 
-                 if (!is_output)
-                   {
-                     if (e0->action == CLASSIFY_ACTION_SET_IP4_FIB_INDEX ||
-                         e0->action == CLASSIFY_ACTION_SET_IP6_FIB_INDEX)
-                       vnet_buffer (b0)->sw_if_index[VLIB_TX] = e0->metadata;
-                     else if (e0->action == CLASSIFY_ACTION_SET_METADATA)
-                       vnet_buffer (b0)->ip.adj_index[VLIB_TX] =
-                         e0->metadata;
-                   }
-               }
-             else
-               {
-                 while (1)
-                   {
-                     if (PREDICT_TRUE (t0->next_table_index != ~0))
-                       t0 = pool_elt_at_index (vcm->tables,
-                                               t0->next_table_index);
-                     else
-                       {
-                         next0 = (t0->miss_next_index < n_next_nodes) ?
-                           t0->miss_next_index : next0;
-
-                         misses++;
-
-                         if (is_ip4)
-                           error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
-                             (is_output ? IP4_ERROR_OUTACL_TABLE_MISS :
-                              IP4_ERROR_INACL_TABLE_MISS) : IP4_ERROR_NONE;
-                         else
-                           error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
-                             (is_output ? IP6_ERROR_OUTACL_TABLE_MISS :
-                              IP6_ERROR_INACL_TABLE_MISS) : IP6_ERROR_NONE;
-                         b0->error = error_node->errors[error0];
-                         break;
-                       }
+                 /* advance the match pointer so the matching happens on IP header */
+                 if (is_output)
+                   h0 += vnet_buffer (b[0])->l2_classify.pad.l2_len;
 
-                     if (t0->current_data_flag ==
-                         CLASSIFY_FLAG_USE_CURR_DATA)
-                       h0 =
-                         (void *) vlib_buffer_get_current (b0) +
-                         t0->current_data_offset;
+                 hash0 = vnet_classify_hash_packet_inline (t0, (u8 *) h0);
+                 e0 = vnet_classify_find_entry_inline
+                   (t0, (u8 *) h0, hash0, now);
+                 if (e0)
+                   {
+                     vnet_buffer (b[0])->l2_classify.opaque_index
+                       = e0->opaque_index;
+                     vlib_buffer_advance (b[0], e0->advance);
+                     next0 = (e0->next_index < n_next_nodes) ?
+                       e0->next_index : next0;
+                     hits++;
+
+                     if (is_ip4)
+                       error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP4_ERROR_OUTACL_SESSION_DENY :
+                          IP4_ERROR_INACL_SESSION_DENY) : IP4_ERROR_NONE;
                      else
-                       h0 = b0->data;
-
-                     /* advance the match pointer so the matching happens on IP header */
-                     if (is_output)
-                       h0 += vnet_buffer (b0)->l2_classify.pad.l2_len;
+                       error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
+                         (is_output ? IP6_ERROR_OUTACL_SESSION_DENY :
+                          IP6_ERROR_INACL_SESSION_DENY) : IP6_ERROR_NONE;
+                     b[0]->error = error_node->errors[error0];
 
-                     hash0 = vnet_classify_hash_packet (t0, (u8 *) h0);
-                     e0 = vnet_classify_find_entry
-                       (t0, (u8 *) h0, hash0, now);
-                     if (e0)
+                     if (!is_output)
                        {
-                         vnet_buffer (b0)->l2_classify.opaque_index
-                           = e0->opaque_index;
-                         vlib_buffer_advance (b0, e0->advance);
-                         next0 = (e0->next_index < n_next_nodes) ?
-                           e0->next_index : next0;
-                         hits++;
-                         chain_hits++;
-
-                         if (is_ip4)
-                           error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
-                             (is_output ? IP4_ERROR_OUTACL_SESSION_DENY :
-                              IP4_ERROR_INACL_SESSION_DENY) : IP4_ERROR_NONE;
-                         else
-                           error0 = (next0 == ACL_NEXT_INDEX_DENY) ?
-                             (is_output ? IP6_ERROR_OUTACL_SESSION_DENY :
-                              IP6_ERROR_INACL_SESSION_DENY) : IP6_ERROR_NONE;
-                         b0->error = error_node->errors[error0];
-
-                         if (!is_output)
-                           {
-                             if (e0->action ==
-                                 CLASSIFY_ACTION_SET_IP4_FIB_INDEX
-                                 || e0->action ==
-                                 CLASSIFY_ACTION_SET_IP6_FIB_INDEX)
-                               vnet_buffer (b0)->sw_if_index[VLIB_TX] =
-                                 e0->metadata;
-                             else if (e0->action ==
-                                      CLASSIFY_ACTION_SET_METADATA)
-                               vnet_buffer (b0)->ip.adj_index[VLIB_TX] =
-                                 e0->metadata;
-                           }
-                         break;
+                         if (e0->action ==
+                             CLASSIFY_ACTION_SET_IP4_FIB_INDEX
+                             || e0->action ==
+                             CLASSIFY_ACTION_SET_IP6_FIB_INDEX)
+                           vnet_buffer (b[0])->sw_if_index[VLIB_TX] =
+                             e0->metadata;
+                         else if (e0->action == CLASSIFY_ACTION_SET_METADATA)
+                           vnet_buffer (b[0])->ip.adj_index[VLIB_TX] =
+                             e0->metadata;
                        }
+                     break;
                    }
                }
            }
+       }
 
-         if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
-                            && (b0->flags & VLIB_BUFFER_IS_TRACED)))
-           {
-             ip_in_out_acl_trace_t *t =
-               vlib_add_trace (vm, node, b0, sizeof (*t));
-             t->sw_if_index =
-               vnet_buffer (b0)->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
-             t->next_index = next0;
-             t->table_index = t0 ? t0 - vcm->tables : ~0;
-             t->offset = (e0 && t0) ? vnet_classify_get_offset (t0, e0) : ~0;
-           }
-
-         if ((next0 == ACL_NEXT_INDEX_DENY) && is_output)
-           {
-             /* on output, for the drop node to work properly, go back to ip header */
-             vlib_buffer_advance (b0, vnet_buffer (b0)->l2.l2_len);
-           }
+      if (do_trace && b[0]->flags & VLIB_BUFFER_IS_TRACED)
+       {
+         ip_in_out_acl_trace_t *t =
+           vlib_add_trace (vm, node, b[0], sizeof (*t));
+         t->sw_if_index =
+           vnet_buffer (b[0])->sw_if_index[is_output ? VLIB_TX : VLIB_RX];
+         t->next_index = next0;
+         t->table_index = t0 ? t0 - vcm->tables : ~0;
+         t->offset = (e0 && t0) ? vnet_classify_get_offset (t0, e0) : ~0;
+       }
 
-         /* verify speculative enqueue, maybe switch current next frame */
-         vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
-                                          to_next, n_left_to_next,
-                                          bi0, next0);
+      if ((next0 == ACL_NEXT_INDEX_DENY) && is_output)
+       {
+         /* on output, for the drop node to work properly, go back to ip header */
+         vlib_buffer_advance (b[0], vnet_buffer (b[0])->l2.l2_len);
        }
 
-      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+      next[0] = next0;
+
+      /* next */
+      next++;
+      b++;
+      n_left--;
     }
 
   vlib_node_increment_counter (vm, node->node_index,
@@ -483,23 +776,58 @@ ip_in_out_acl_inline (vlib_main_t * vm,
   vlib_node_increment_counter (vm, node->node_index,
                               is_output ? IP_OUTACL_ERROR_CHAIN_HIT :
                               IP_INACL_ERROR_CHAIN_HIT, chain_hits);
-  return frame->n_vectors;
 }
 
 VLIB_NODE_FN (ip4_inacl_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
 {
-  return ip_in_out_acl_inline (vm, node, frame, 1 /* is_ip4 */ ,
-                              0 /* is_output */ );
+
+  u32 *from;
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+  u16 nexts[VLIB_FRAME_SIZE];
+
+  from = vlib_frame_vector_args (frame);
+
+  vlib_get_buffers (vm, from, bufs, frame->n_vectors);
+
+  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+    ip_in_out_acl_inline (vm, node, bufs, nexts, frame->n_vectors,
+                         1 /* is_ip4 */ ,
+                         0 /* is_output */ , 1 /* is_trace */ );
+  else
+    ip_in_out_acl_inline (vm, node, bufs, nexts, frame->n_vectors,
+                         1 /* is_ip4 */ ,
+                         0 /* is_output */ , 0 /* is_trace */ );
+
+  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
+
+  return frame->n_vectors;
 }
 
 VLIB_NODE_FN (ip4_outacl_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                                vlib_frame_t * frame)
 {
-  return ip_in_out_acl_inline (vm, node, frame, 1 /* is_ip4 */ ,
-                              1 /* is_output */ );
-}
+  u32 *from;
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+  u16 nexts[VLIB_FRAME_SIZE];
 
+  from = vlib_frame_vector_args (frame);
+
+  vlib_get_buffers (vm, from, bufs, frame->n_vectors);
+
+  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+    ip_in_out_acl_inline (vm, node, bufs, nexts, frame->n_vectors,
+                         1 /* is_ip4 */ ,
+                         1 /* is_output */ , 1 /* is_trace */ );
+  else
+    ip_in_out_acl_inline (vm, node, bufs, nexts, frame->n_vectors,
+                         1 /* is_ip4 */ ,
+                         1 /* is_output */ , 0 /* is_trace */ );
+
+  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
+
+  return frame->n_vectors;
+}
 
 /* *INDENT-OFF* */
 VLIB_REGISTER_NODE (ip4_inacl_node) = {
@@ -532,15 +860,51 @@ VLIB_REGISTER_NODE (ip4_outacl_node) = {
 VLIB_NODE_FN (ip6_inacl_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                               vlib_frame_t * frame)
 {
-  return ip_in_out_acl_inline (vm, node, frame, 0 /* is_ip4 */ ,
-                              0 /* is_output */ );
+  u32 *from;
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+  u16 nexts[VLIB_FRAME_SIZE];
+
+  from = vlib_frame_vector_args (frame);
+
+  vlib_get_buffers (vm, from, bufs, frame->n_vectors);
+
+  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+    ip_in_out_acl_inline (vm, node, bufs, nexts, frame->n_vectors,
+                         0 /* is_ip4 */ ,
+                         0 /* is_output */ , 1 /* is_trace */ );
+  else
+    ip_in_out_acl_inline (vm, node, bufs, nexts, frame->n_vectors,
+                         0 /* is_ip4 */ ,
+                         0 /* is_output */ , 0 /* is_trace */ );
+
+  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
+
+  return frame->n_vectors;
 }
 
 VLIB_NODE_FN (ip6_outacl_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
                                vlib_frame_t * frame)
 {
-  return ip_in_out_acl_inline (vm, node, frame, 0 /* is_ip4 */ ,
-                              1 /* is_output */ );
+  u32 *from;
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE];
+  u16 nexts[VLIB_FRAME_SIZE];
+
+  from = vlib_frame_vector_args (frame);
+
+  vlib_get_buffers (vm, from, bufs, frame->n_vectors);
+
+  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)))
+    ip_in_out_acl_inline (vm, node, bufs, nexts, frame->n_vectors,
+                         0 /* is_ip4 */ ,
+                         1 /* is_output */ , 1 /* is_trace */ );
+  else
+    ip_in_out_acl_inline (vm, node, bufs, nexts, frame->n_vectors,
+                         0 /* is_ip4 */ ,
+                         1 /* is_output */ , 0 /* is_trace */ );
+
+  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
+
+  return frame->n_vectors;
 }
 
 /* *INDENT-OFF* */
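
A note on the do_trace parameter added above: because ip_in_out_acl_inline is
static_always_inline and each node function passes do_trace as a compile-time
constant, the compiler emits separate traced and untraced copies of the loop,
keeping the per-packet trace check off the hot path. A minimal sketch of that
specialization pattern follows; worker and dispatch are illustrative names,
not VPP APIs:

    #include <stdio.h>

    /* With a compile-time-constant do_trace, the branch below folds
     * away and two specialized copies of the loop are emitted. */
    static inline void
    worker (const int *v, int n, int do_trace)
    {
      int i;
      for (i = 0; i < n; i++)
        {
          if (do_trace)          /* dead code in the untraced copy */
            printf ("item %d = %d\n", i, v[i]);
          /* ... hot-path work ... */
        }
    }

    static void
    dispatch (const int *v, int n, int tracing_enabled)
    {
      if (tracing_enabled)
        worker (v, n, 1 /* do_trace */ );
      else
        worker (v, n, 0 /* do_trace */ );
    }

    int
    main (void)
    {
      int v[3] = { 7, 8, 9 };
      dispatch (v, 3, 1);
      return 0;
    }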