bond: tx perf improvement, part trois 86/14986/3
author Damjan Marion <damarion@cisco.com>
Wed, 26 Sep 2018 08:15:41 +0000 (10:15 +0200)
committer Steven <sluong@cisco.com>
Fri, 5 Oct 2018 17:00:12 +0000 (10:00 -0700)
Introduce bond_tx_inline, which takes lb as a constant so that gcc can do the optimization.

The numbers appear a tad better for 256-byte frames.

with the patch
--------------
Thread 2 vpp_wk_1 (lcore 3)
Time 4.3, average vectors/node 224.00, last 128 main loops 40.00 per node 222.61
  vector rates in 8.4836e6, out 1.6967e7, drop 0.0000e0, punt 0.0000e0
             Name                 State         Calls          Vectors        Suspends         Clocks       Vectors/Call
BondEthernet0-output             active             141054        36109824               0          2.51e1          256.00
BondEthernet0-tx                 active             141054        36109824               0          2.55e1          256.00
TenGigabitEthernet6/0/0-output   active             141054        18055469               0          9.43e0          128.00
TenGigabitEthernet6/0/0-tx       active             141054        18055469               0          6.97e1          128.00
TenGigabitEthernet6/0/1-output   active             141054        18054355               0          9.54e0          127.99
TenGigabitEthernet6/0/1-tx       active             141054        18054355               0          7.05e1          127.99
bond-input                       active             141054        36109824               0          1.76e1          256.00
dpdk-input                       polling             70527        36109824               0          5.03e1          512.00
ethernet-input                   active             141054        36109824               0          6.12e1          256.00
ip4-input                        active             141054        36109824               0          3.26e1          256.00
ip4-lookup                       active             141054        36109824               0          2.94e1          256.00
ip4-rewrite                      active             141054        36109824               0          3.27e1          256.00

without the patch
-----------------
Thread 2 vpp_wk_1 (lcore 3)
Time 4.3, average vectors/node 224.00, last 128 main loops 40.00 per node 222.61
  vector rates in 8.4443e6, out 1.6889e7, drop 0.0000e0, punt 0.0000e0
             Name                 State         Calls          Vectors        Suspends         Clocks       Vectors/Call
BondEthernet0-output             active             142744        36542464               0          2.51e1          256.00
BondEthernet0-tx                 active             142744        36542464               0          2.67e1          256.00
TenGigabitEthernet6/0/0-output   active             142744        18270813               0          9.19e0          127.99
TenGigabitEthernet6/0/0-tx       active             142744        18270813               0          6.98e1          127.99
TenGigabitEthernet6/0/1-output   active             142744        18271651               0          9.43e0          128.00
TenGigabitEthernet6/0/1-tx       active             142744        18271651               0          7.02e1          128.00
bond-input                       active             142744        36542464               0          1.76e1          256.00
dpdk-input                       polling             71372        36542464               0          5.08e1          512.00
ethernet-input                   active             142744        36542464               0          6.15e1          256.00
ip4-input                        active             142744        36542464               0          3.23e1          256.00
ip4-lookup                       active             142744        36542464               0          2.96e1          256.00
ip4-rewrite                      active             142744        36542464               0          3.28e1          256.00

Change-Id: I9fd43eda3c735cbff680ac6d2f01ecdae81f0eda
Signed-off-by: Damjan Marion <damarion@cisco.com>
src/vnet/bonding/device.c
src/vnet/bonding/node.h

index 79ca2fa..8a78728 100644 (file)
@@ -379,63 +379,28 @@ bond_load_balance_active_backup (vlib_main_t * vm,
   return 0;
 }
 
-static bond_load_balance_func_t bond_load_balance_table[] = {
-#define _(v,f,s, p) { bond_load_balance_##p },
-  foreach_bond_lb_algo
-#undef _
-};
-
-VNET_DEVICE_CLASS_TX_FN (bond_dev_class) (vlib_main_t * vm,
-                                         vlib_node_runtime_t * node,
-                                         vlib_frame_t * frame)
+static_always_inline void
+bond_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+               vlib_frame_t * frame, bond_if_t * bif,
+               uword slave_count, u32 lb_alg)
 {
-  vnet_interface_output_runtime_t *rund = (void *) node->runtime_data;
   bond_main_t *bm = &bond_main;
-  bond_if_t *bif = pool_elt_at_index (bm->interfaces, rund->dev_instance);
-  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
-  u32 *from = vlib_frame_vector_args (frame);
-  ethernet_header_t *eth;
-  u32 n_left;
-  u32 sw_if_index;
+  vnet_main_t *vnm = vnet_get_main ();
+  u16 thread_index = vm->thread_index;
   bond_packet_trace_t *t0;
   uword n_trace = vlib_get_trace_count (vm, node);
-  u16 thread_index = vm->thread_index;
-  vnet_main_t *vnm = vnet_get_main ();
   u32 *to_next;
   vlib_frame_t *f;
-  uword slave_count;
+  ethernet_header_t *eth;
+  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
+  u32 *from = vlib_frame_vector_args (frame);
+  u32 n_left = frame->n_vectors;
+  u32 sw_if_index;
   u32 port0 = 0, port1 = 0, port2 = 0, port3 = 0;
   bond_per_thread_data_t *ptd = vec_elt_at_index (bm->per_thread_data,
                                                  thread_index);
 
-  if (PREDICT_FALSE (bif->admin_up == 0))
-    {
-      vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
-      vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters +
-                                    VNET_INTERFACE_COUNTER_DROP,
-                                    thread_index, bif->sw_if_index,
-                                    frame->n_vectors);
-      vlib_error_count (vm, node->node_index, BOND_TX_ERROR_IF_DOWN,
-                       frame->n_vectors);
-      return frame->n_vectors;
-    }
-
-  n_left = frame->n_vectors;
   vlib_get_buffers (vm, from, bufs, n_left);
-
-  slave_count = vec_len (bif->active_slaves);
-  if (PREDICT_FALSE (slave_count == 0))
-    {
-      vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
-      vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters +
-                                    VNET_INTERFACE_COUNTER_DROP,
-                                    thread_index, bif->sw_if_index,
-                                    frame->n_vectors);
-      vlib_error_count (vm, node->node_index, BOND_TX_ERROR_NO_SLAVE,
-                       frame->n_vectors);
-      return frame->n_vectors;
-    }
-
   b = bufs;
   while (n_left >= 4)
     {
@@ -464,22 +429,72 @@ VNET_DEVICE_CLASS_TX_FN (bond_dev_class) (vlib_main_t * vm,
 
       if (PREDICT_TRUE (slave_count > 1))
        {
-         port0 =
-           (bond_load_balance_table[bif->lb]).load_balance (vm, node,
-                                                            bif, b[0],
-                                                            slave_count);
-         port1 =
-           (bond_load_balance_table[bif->lb]).load_balance (vm, node,
-                                                            bif, b[1],
-                                                            slave_count);
-         port2 =
-           (bond_load_balance_table[bif->lb]).load_balance (vm, node,
-                                                            bif, b[2],
-                                                            slave_count);
-         port3 =
-           (bond_load_balance_table[bif->lb]).load_balance (vm, node,
-                                                            bif, b[3],
-                                                            slave_count);
+         if (lb_alg == BOND_LB_L2)
+           {
+             port0 = bond_load_balance_l2 (vm, node, bif, b[0], slave_count);
+             port1 = bond_load_balance_l2 (vm, node, bif, b[1], slave_count);
+             port2 = bond_load_balance_l2 (vm, node, bif, b[2], slave_count);
+             port3 = bond_load_balance_l2 (vm, node, bif, b[3], slave_count);
+           }
+         else if (lb_alg == BOND_LB_L34)
+           {
+             port0 = bond_load_balance_l34 (vm, node, bif, b[0],
+                                            slave_count);
+             port1 = bond_load_balance_l34 (vm, node, bif, b[1],
+                                            slave_count);
+             port2 = bond_load_balance_l34 (vm, node, bif, b[2],
+                                            slave_count);
+             port3 = bond_load_balance_l34 (vm, node, bif, b[3],
+                                            slave_count);
+           }
+         else if (lb_alg == BOND_LB_L23)
+           {
+             port0 = bond_load_balance_l23 (vm, node, bif, b[0],
+                                            slave_count);
+             port1 = bond_load_balance_l23 (vm, node, bif, b[1],
+                                            slave_count);
+             port2 = bond_load_balance_l23 (vm, node, bif, b[2],
+                                            slave_count);
+             port3 = bond_load_balance_l23 (vm, node, bif, b[3],
+                                            slave_count);
+           }
+         else if (lb_alg == BOND_LB_RR)
+           {
+             port0 = bond_load_balance_round_robin (vm, node, bif, b[0],
+                                                    slave_count);
+             port1 = bond_load_balance_round_robin (vm, node, bif, b[1],
+                                                    slave_count);
+             port2 = bond_load_balance_round_robin (vm, node, bif, b[2],
+                                                    slave_count);
+             port3 = bond_load_balance_round_robin (vm, node, bif, b[3],
+                                                    slave_count);
+           }
+         else if (lb_alg == BOND_LB_BC)
+           {
+             port0 = bond_load_balance_broadcast (vm, node, bif, b[0],
+                                                  slave_count);
+             port1 = bond_load_balance_broadcast (vm, node, bif, b[1],
+                                                  slave_count);
+             port2 = bond_load_balance_broadcast (vm, node, bif, b[2],
+                                                  slave_count);
+             port3 = bond_load_balance_broadcast (vm, node, bif, b[3],
+                                                  slave_count);
+           }
+         else if (lb_alg == BOND_LB_AB)
+           {
+             port0 = bond_load_balance_active_backup (vm, node, bif, b[0],
+                                                      slave_count);
+             port1 = bond_load_balance_active_backup (vm, node, bif, b[1],
+                                                      slave_count);
+             port2 = bond_load_balance_active_backup (vm, node, bif, b[2],
+                                                      slave_count);
+             port3 = bond_load_balance_active_backup (vm, node, bif, b[3],
+                                                      slave_count);
+           }
+         else
+           {
+             ASSERT (0);
+           }
        }
 
       sif_if_index0 = *vec_elt_at_index (bif->active_slaves, port0);
@@ -574,9 +589,42 @@ VNET_DEVICE_CLASS_TX_FN (bond_dev_class) (vlib_main_t * vm,
       VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
 
       if (PREDICT_TRUE (slave_count > 1))
-       port0 =
-         (bond_load_balance_table[bif->lb]).load_balance (vm, node, bif,
-                                                          b[0], slave_count);
+       {
+         if (bif->lb == BOND_LB_L2)
+           {
+             port0 = bond_load_balance_l2 (vm, node, bif, b[0], slave_count);
+           }
+         else if (bif->lb == BOND_LB_L34)
+           {
+             port0 = bond_load_balance_l34 (vm, node, bif, b[0],
+                                            slave_count);
+           }
+         else if (bif->lb == BOND_LB_L23)
+           {
+             port0 = bond_load_balance_l23 (vm, node, bif, b[0],
+                                            slave_count);
+           }
+         else if (bif->lb == BOND_LB_RR)
+           {
+             port0 = bond_load_balance_round_robin (vm, node, bif, b[0],
+                                                    slave_count);
+           }
+         else if (bif->lb == BOND_LB_BC)
+           {
+             port0 = bond_load_balance_broadcast (vm, node, bif, b[0],
+                                                  slave_count);
+           }
+         else if (bif->lb == BOND_LB_AB)
+           {
+             port0 = bond_load_balance_active_backup (vm, node, bif, b[0],
+                                                      slave_count);
+           }
+         else
+           {
+             ASSERT (0);
+           }
+       }
+
       sif_if_index0 = *vec_elt_at_index (bif->active_slaves, port0);
 
       /* Do the tracing before the old interface is overwritten */
@@ -622,6 +670,57 @@ VNET_DEVICE_CLASS_TX_FN (bond_dev_class) (vlib_main_t * vm,
   vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters
                                 + VNET_INTERFACE_COUNTER_TX, thread_index,
                                 bif->sw_if_index, frame->n_vectors);
+}
+
+VNET_DEVICE_CLASS_TX_FN (bond_dev_class) (vlib_main_t * vm,
+                                         vlib_node_runtime_t * node,
+                                         vlib_frame_t * frame)
+{
+  vnet_interface_output_runtime_t *rund = (void *) node->runtime_data;
+  bond_main_t *bm = &bond_main;
+  u16 thread_index = vm->thread_index;
+  bond_if_t *bif = pool_elt_at_index (bm->interfaces, rund->dev_instance);
+  uword slave_count;
+
+  if (PREDICT_FALSE (bif->admin_up == 0))
+    {
+      vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+      vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters +
+                                    VNET_INTERFACE_COUNTER_DROP,
+                                    thread_index, bif->sw_if_index,
+                                    frame->n_vectors);
+      vlib_error_count (vm, node->node_index, BOND_TX_ERROR_IF_DOWN,
+                       frame->n_vectors);
+      return frame->n_vectors;
+    }
+
+  slave_count = vec_len (bif->active_slaves);
+  if (PREDICT_FALSE (slave_count == 0))
+    {
+      vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+      vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters +
+                                    VNET_INTERFACE_COUNTER_DROP,
+                                    thread_index, bif->sw_if_index,
+                                    frame->n_vectors);
+      vlib_error_count (vm, node->node_index, BOND_TX_ERROR_NO_SLAVE,
+                       frame->n_vectors);
+      return frame->n_vectors;
+    }
+
+  if (bif->lb == BOND_LB_L2)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_L2);
+  else if (bif->lb == BOND_LB_L34)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_L34);
+  else if (bif->lb == BOND_LB_L23)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_L23);
+  else if (bif->lb == BOND_LB_RR)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_RR);
+  else if (bif->lb == BOND_LB_BC)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_BC);
+  else if (bif->lb == BOND_LB_AB)
+    bond_tx_inline (vm, node, frame, bif, slave_count, BOND_LB_AB);
+  else
+    ASSERT (0);
 
   return frame->n_vectors;
 }
index 6b13a46..e1359d0 100644 (file)
@@ -51,13 +51,13 @@ typedef enum
 /* configurable load-balances */
 #define foreach_bond_lb          \
   _ (2, L23, "l23", l23)  \
-  _ (1, l34 , "l34", l34) \
+  _ (1, L34 , "l34", l34) \
   _ (0, L2, "l2", l2)
 
 /* load-balance functions implemented in bond-output */
 #define foreach_bond_lb_algo                    \
   _ (0, L2, "l2", l2)                            \
-  _ (1, l34 , "l34", l34)                        \
+  _ (1, L34 , "l34", l34)                        \
   _ (2, L23, "l23", l23)                         \
   _ (3, RR, "round-robin", round_robin)          \
   _ (4, BC, "broadcast", broadcast)              \