+#ifdef CLIB_HAVE_VEC256
+ /* only lower 16 bits of hash due to single precision fp arithmetic */
+ u32x8 mask8, sc8u, h8a, h8b;
+ f32x8 sc8f;
+
+ if (use_modulo_shortcut)
+ {
+ mask8 = u32x8_splat (mask);
+ }
+ else
+ {
+ mask8 = u32x8_splat (0xffff);
+ sc8u = u32x8_splat (n_slaves);
+ sc8f = f32x8_from_u32x8 (sc8u);
+ }
+
+ while (n_left > 16)
+ {
+ h8a = u32x8_load_unaligned (h) & mask8;
+ h8b = u32x8_load_unaligned (h + 8) & mask8;
+
+ if (use_modulo_shortcut == 0)
+ {
+ h8a -= sc8u * u32x8_from_f32x8 (f32x8_from_u32x8 (h8a) / sc8f);
+ h8b -= sc8u * u32x8_from_f32x8 (f32x8_from_u32x8 (h8b) / sc8f);
+ }
+
+ u32x8_store_unaligned (h8a, h);
+ u32x8_store_unaligned (h8b, h + 8);
+ n_left -= 16;
+ h += 16;
+ }
+#endif
+
+ while (n_left > 4)
+ {
+ if (use_modulo_shortcut)
+ {
+ h[0] &= mask;
+ h[1] &= mask;
+ h[2] &= mask;
+ h[3] &= mask;
+ }
+ else
+ {
+ h[0] %= n_slaves;
+ h[1] %= n_slaves;
+ h[2] %= n_slaves;
+ h[3] %= n_slaves;
+ }
+ n_left -= 4;
+ h += 4;
+ }
+ while (n_left)
+ {
+ if (use_modulo_shortcut)
+ h[0] &= mask;
+ else
+ h[0] %= n_slaves;
+ n_left -= 1;
+ h += 1;
+ }
+}
+
+/*
+ * Write the TX sw_if_index into each buffer and hand the matching buffer
+ * index to bond_tx_add_to_queue ().
+ *
+ * single_sw_if_index != 0: data[0] is used as the sw_if_index for every
+ *   buffer and 0 is passed as the second argument of
+ *   bond_tx_add_to_queue ().
+ * single_sw_if_index == 0: data[] supplies one value per buffer; that value
+ *   indexes bif->active_slaves to obtain the sw_if_index and is also passed
+ *   as the second argument of bond_tx_add_to_queue ().
+ *
+ * bi, b and data are consumed in lockstep for n_left entries.  Note that
+ * data[0] is read unconditionally, before n_left is examined.
+ */
+static_always_inline void
+bond_update_sw_if_index (bond_per_thread_data_t * ptd, bond_if_t * bif,
+			 u32 * bi, vlib_buffer_t ** b, u32 * data, u32 n_left,
+			 int single_sw_if_index)
+{
+  u32 common_sw_if_index = data[0];
+  u32 *port = data;
+  u32 i;
+
+  /* Four buffers per pass. */
+  for (; n_left >= 4; bi += 4, port += 4, b += 4, n_left -= 4)
+    {
+      /* Another full group follows: prefetch its buffer headers. */
+      if (n_left >= 8)
+	{
+	  for (i = 4; i < 8; i++)
+	    vlib_prefetch_buffer_header (b[i], LOAD);
+	}
+
+      if (PREDICT_FALSE (single_sw_if_index))
+	{
+	  for (i = 0; i < 4; i++)
+	    vnet_buffer (b[i])->sw_if_index[VLIB_TX] = common_sw_if_index;
+
+	  for (i = 0; i < 4; i++)
+	    bond_tx_add_to_queue (ptd, 0, bi[i]);
+	}
+      else
+	{
+	  u32 slave_sw_if_index[4];
+
+	  /* Resolve all four lookups first, then store, then enqueue. */
+	  for (i = 0; i < 4; i++)
+	    slave_sw_if_index[i] =
+	      *vec_elt_at_index (bif->active_slaves, port[i]);
+
+	  for (i = 0; i < 4; i++)
+	    vnet_buffer (b[i])->sw_if_index[VLIB_TX] = slave_sw_if_index[i];
+
+	  for (i = 0; i < 4; i++)
+	    bond_tx_add_to_queue (ptd, port[i], bi[i]);
+	}
+    }
+
+  /* Remaining 0..3 buffers, one at a time. */
+  for (; n_left; bi += 1, port += 1, b += 1, n_left -= 1)
+    {
+      if (PREDICT_FALSE (single_sw_if_index))
+	{
+	  vnet_buffer (b[0])->sw_if_index[VLIB_TX] = common_sw_if_index;
+	  bond_tx_add_to_queue (ptd, 0, bi[0]);
+	}
+      else
+	{
+	  u32 slave_sw_if_index =
+	    *vec_elt_at_index (bif->active_slaves, port[0]);
+
+	  vnet_buffer (b[0])->sw_if_index[VLIB_TX] = slave_sw_if_index;
+	  bond_tx_add_to_queue (ptd, port[0], bi[0]);
+	}
+    }
+}
+
+/*
+ * Add trace records for up to n_left buffers, stopping early once the
+ * node's trace count (vlib_get_trace_count) reaches zero.  The trace count
+ * is decremented and written back for every buffer traced.
+ *
+ * Each record captures the ethernet header at the buffer's current data
+ * pointer, the buffer's TX sw_if_index, and an entry of bif->active_slaves:
+ * entry 0 when h is NULL, otherwise the entry selected by the next value
+ * read from h (one value consumed per traced buffer).
+ */
+static_always_inline void
+bond_tx_trace (vlib_main_t * vm, vlib_node_runtime_t * node, bond_if_t * bif,
+	       vlib_buffer_t ** b, u32 n_left, u32 * h)
+{
+  uword traces_left = vlib_get_trace_count (vm, node);
+
+  for (; traces_left > 0 && n_left > 0; b++, n_left--)
+    {
+      bond_packet_trace_t *tr;
+      ethernet_header_t *eh;
+      u32 next_index = 0;
+      u32 slave_index = 0;
+
+      vlib_trace_buffer (vm, node, next_index, b[0], 0 /* follow_chain */ );
+      traces_left -= 1;
+      vlib_set_trace_count (vm, node, traces_left);
+
+      tr = vlib_add_trace (vm, node, b[0], sizeof (*tr));
+      eh = vlib_buffer_get_current (b[0]);
+      tr->ethernet = *eh;
+      tr->sw_if_index = vnet_buffer (b[0])->sw_if_index[VLIB_TX];
+
+      /* Without a per-buffer array, fall back to the first active slave. */
+      if (h)
+	{
+	  slave_index = h[0];
+	  h++;
+	}
+      tr->bond_sw_if_index =
+	*vec_elt_at_index (bif->active_slaves, slave_index);
+    }
+}
+
+VNET_DEVICE_CLASS_TX_FN (bond_dev_class) (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)