Prefetching the next buffer's header and data one iteration ahead saves about 20 clocks/packet in both code paths.
Type: improvement
Signed-off-by: Klement Sekera <ksekera@cisco.com>
Change-Id: Ib559c74bf8168e3ddd764d51b7e5bcd2a557f591
ip_csum_t sum0;
b0 = *b;
+ b++;
+
+ /* Prefetch next iteration. */
+ if (PREDICT_TRUE (n_left_from >= 2))
+ {
+ vlib_buffer_t *p2;
+
+ p2 = *b;
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
if (is_output_feature)
{
n_left_from--;
next++;
- b++;
}
vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
ip_csum_t sum0;
b0 = *b;
+ b++;
+
+ /* Prefetch next iteration. */
+ if (PREDICT_TRUE (n_left_from >= 2))
+ {
+ vlib_buffer_t *p2;
+
+ p2 = *b;
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
next[0] = vnet_buffer2 (b0)->nat.arc_next;
vnet_buffer (b0)->snat.flags = 0;
n_left_from--;
next++;
- b++;
}
vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,