tap gso: experimental support 52/15052/36
authorAndrew Yourtchenko <ayourtch@gmail.com>
Fri, 12 Oct 2018 14:09:22 +0000 (16:09 +0200)
committerDamjan Marion <dmarion@me.com>
Tue, 19 Feb 2019 12:47:40 +0000 (12:47 +0000)
This commit adds a "gso" parameter to existing "create tap..." CLI,
and a "no-gso" parameter for forward compatibility,
in case the defaults change in the future.

It makes use of the lowest bit of the "tap_flags" field in the API call
in order to allow creation of GSO interfaces via API as well.

It does the necessary syscalls to enable the GSO
and checksum offload support on the kernel side and sets two flags
on the interface: virtio-specific virtio_if_t.gso_enabled,
and vnet_hw_interface_t.flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO.

The first one, if enabled, triggers the marking of the GSO-encapsulated
packets on ingress with VNET_BUFFER_F_GSO flag, and
setting vnet_buffer2(b)->gso_size to the desired L4 payload size.

VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO determines the egress packet
processing in interface-output for such packets:

When the flag is set, they are sent out almost as usual (just taking
care to set the vnet header for virtio).

When the flag is not enabled (the case for most interfaces),
the egress path performs the re-segmentation such that
the L4 payload of the transmitted packets equals gso_size.

The operations in the datapath are enabled only when there is at least
one GSO-compatible interface in the system - this is done by tracking
the count in interface_main.gso_interface_count. This way the impact
of conditional checks for the setups that do not use GSO is minimized.

"show tap" CLI shows the state of the GSO flag on the interface, and
the total count of GSO-enabled interfaces (which is used to enable
the GSO-related processing in the packet path).

This commit lacks IPv6 extension header traversal support of any kind -
the L4 payload is assumed to directly follow the IPv6 header. Also, it
performs the offloads only for TCP (TSO - TCP segmentation offload);
UDP fragmentation offload (UFO) is not part of it.

For debug purposes it also adds the debug CLI:

 "set tap gso {<interface> | sw_if_index <sw_idx>} <enable|disable>"

Change-Id: Ifd562db89adcc2208094b3d1032cee8c307aaef9
Signed-off-by: Andrew Yourtchenko <ayourtch@gmail.com>
14 files changed:
src/vnet/buffer.h
src/vnet/devices/tap/cli.c
src/vnet/devices/tap/tap.c
src/vnet/devices/tap/tap.h
src/vnet/devices/virtio/device.c
src/vnet/devices/virtio/node.c
src/vnet/devices/virtio/virtio.c
src/vnet/devices/virtio/virtio.h
src/vnet/interface.c
src/vnet/interface.h
src/vnet/interface_funcs.h
src/vnet/interface_output.c
src/vnet/ip/ip4_forward.c
src/vnet/ip/ip6_forward.c

index 0669651..ee04627 100644 (file)
   _(17, FLOW_REPORT, "flow-report", 1)                  \
   _(18, IS_DVR, "dvr", 1)                               \
   _(19, QOS_DATA_VALID, "qos-data-valid", 0)            \
-  _(20, AVAIL1, "avail1", 1)                            \
-  _(21, AVAIL2, "avail2", 1)                            \
-  _(22, AVAIL3, "avail3", 1)                            \
-  _(23, AVAIL4, "avail4", 1)                            \
-  _(24, AVAIL5, "avail5", 1)                            \
-  _(25, AVAIL6, "avail6", 1)                            \
-  _(26, AVAIL7, "avail7", 1)                            \
-  _(27, AVAIL8, "avail8", 1)
+  _(20, GSO, "gso", 0)                                  \
+  _(21, AVAIL1, "avail1", 1)                            \
+  _(22, AVAIL2, "avail2", 1)                            \
+  _(23, AVAIL3, "avail3", 1)                            \
+  _(24, AVAIL4, "avail4", 1)                            \
+  _(25, AVAIL5, "avail5", 1)                            \
+  _(26, AVAIL6, "avail6", 1)                            \
+  _(27, AVAIL7, "avail7", 1)
 
 /*
  * Please allocate the FIRST available bit, redefine
@@ -396,6 +396,20 @@ typedef struct
     };
   } gbp;
 
+  /**
+   * The L4 payload size, set on input for GSO-enabled interfaces
+   * when we receive a GSO packet (a chain of buffers with the first one
+   * having the GSO bit set). It needs to persist all the way to interface-output,
+   * in case the egress interface is not GSO-enabled - then we need to perform
+   * the segmentation, and use this value to cut the payload appropriately.
+   */
+  u16 gso_size;
+  /* size of the L4 protocol header */
+  u16 gso_l4_hdr_sz;
+
+  /* The union below has a u64 alignment, so this space is unused */
+  u32 __unused2[1];
+
   union
   {
     struct
@@ -410,7 +424,7 @@ typedef struct
       u64 pad[1];
       u64 pg_replay_timestamp;
     };
-    u32 unused[10];
+    u32 unused[8];
   };
 } vnet_buffer_opaque2_t;
 
@@ -424,6 +438,9 @@ STATIC_ASSERT (sizeof (vnet_buffer_opaque2_t) <=
               STRUCT_SIZE_OF (vlib_buffer_t, opaque2),
               "VNET buffer opaque2 meta-data too large for vlib_buffer");
 
+#define gso_mtu_sz(b) (vnet_buffer2(b)->gso_size + vnet_buffer2(b)->gso_l4_hdr_sz + vnet_buffer(b)->l4_hdr_offset)
+
+
 format_function_t format_vnet_buffer;
 
 #endif /* included_vnet_buffer_h */
index ee57a72..084fb90 100644 (file)
@@ -39,6 +39,7 @@ tap_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
   int ip_addr_set = 0;
 
   args.id = ~0;
+  args.tap_flags = 0;
 
   /* Get a line of input. */
   if (unformat_user (input, unformat_line_input, line_input))
@@ -75,6 +76,10 @@ tap_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
            ;
          else if (unformat (line_input, "tx-ring-size %d", &args.tx_ring_sz))
            ;
+         else if (unformat (line_input, "no-gso"))
+           args.tap_flags &= ~TAP_FLAG_GSO;
+         else if (unformat (line_input, "gso"))
+           args.tap_flags |= TAP_FLAG_GSO;
          else if (unformat (line_input, "hw-addr %U",
                             unformat_ethernet_address, args.mac_addr))
            args.mac_addr_set = 1;
@@ -109,7 +114,7 @@ VLIB_CLI_COMMAND (tap_create_command, static) = {
     "[rx-ring-size <size>] [tx-ring-size <size>] [host-ns <netns>] "
     "[host-bridge <bridge-name>] [host-ip4-addr <ip4addr/mask>] "
     "[host-ip6-addr <ip6-addr>] [host-ip4-gw <ip4-addr>] "
-    "[host-ip6-gw <ip6-addr>] [host-if-name <name>]",
+    "[host-ip6-gw <ip6-addr>] [host-if-name <name>] [no-gso|gso]",
   .function = tap_create_command_fn,
 };
 /* *INDENT-ON* */
@@ -162,6 +167,59 @@ VLIB_CLI_COMMAND (tap_delete__command, static) =
 };
 /* *INDENT-ON* */
 
+static clib_error_t *
+tap_gso_command_fn (vlib_main_t * vm, unformat_input_t * input,
+                   vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 sw_if_index = ~0;
+  vnet_main_t *vnm = vnet_get_main ();
+  int enable = 1;
+  int rv;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return clib_error_return (0, "Missing <interface>");
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+       ;
+      else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+                        vnm, &sw_if_index))
+       ;
+      else if (unformat (line_input, "enable"))
+       enable = 1;
+      else if (unformat (line_input, "disable"))
+       enable = 0;
+      else
+       return clib_error_return (0, "unknown input `%U'",
+                                 format_unformat_error, input);
+    }
+  unformat_free (line_input);
+
+  if (sw_if_index == ~0)
+    return clib_error_return (0,
+                             "please specify interface name or sw_if_index");
+
+  rv = tap_gso_enable_disable (vm, sw_if_index, enable);
+  if (rv == VNET_API_ERROR_INVALID_SW_IF_INDEX)
+    return clib_error_return (0, "not a tap interface");
+  else if (rv != 0)
+    return clib_error_return (0, "error on configuring GSO on tap interface");
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (tap_gso__command, static) =
+{
+  .path = "set tap gso",
+  .short_help = "set tap gso {<interface> | sw_if_index <sw_idx>} <enable|disable>",
+  .function = tap_gso_command_fn,
+};
+/* *INDENT-ON* */
+
 static clib_error_t *
 tap_show_command_fn (vlib_main_t * vm, unformat_input_t * input,
                     vlib_cli_command_t * cmd)
index 101576c..3739561 100644 (file)
@@ -176,6 +176,16 @@ tap_create_if (vlib_main_t * vm, tap_create_if_args_t * args)
 
   unsigned int offload = 0;
   hdrsz = sizeof (struct virtio_net_hdr_v1);
+  if (args->tap_flags & TAP_FLAG_GSO)
+    {
+      offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
+      vif->gso_enabled = 1;
+    }
+  else
+    {
+      vif->gso_enabled = 0;
+    }
+
   _IOCTL (vif->tap_fd, TUNSETOFFLOAD, offload);
   _IOCTL (vif->tap_fd, TUNSETVNETHDRSZ, &hdrsz);
   _IOCTL (vif->fd, VHOST_SET_OWNER, 0);
@@ -386,6 +396,11 @@ tap_create_if (vlib_main_t * vm, tap_create_if_args_t * args)
   args->sw_if_index = vif->sw_if_index;
   hw = vnet_get_hw_interface (vnm, vif->hw_if_index);
   hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
+  if (args->tap_flags & TAP_FLAG_GSO)
+    {
+      hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO;
+      vnm->interface_main.gso_interface_count++;
+    }
   vnet_hw_interface_set_input_node (vnm, vif->hw_if_index,
                                    virtio_input_node.index);
   vnet_hw_interface_assign_rx_thread (vnm, vif->hw_if_index, 0, ~0);
@@ -442,6 +457,10 @@ tap_delete_if (vlib_main_t * vm, u32 sw_if_index)
   if (vif->type != VIRTIO_IF_TYPE_TAP)
     return VNET_API_ERROR_INVALID_INTERFACE;
 
+  /* decrement if this was a GSO interface */
+  if (hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO)
+    vnm->interface_main.gso_interface_count--;
+
   /* bring down the interface */
   vnet_hw_interface_set_flags (vnm, vif->hw_if_index, 0);
   vnet_sw_interface_set_flags (vnm, vif->sw_if_index, 0);
@@ -466,6 +485,52 @@ tap_delete_if (vlib_main_t * vm, u32 sw_if_index)
   return 0;
 }
 
+int
+tap_gso_enable_disable (vlib_main_t * vm, u32 sw_if_index, int enable_disable)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  virtio_main_t *mm = &virtio_main;
+  virtio_if_t *vif;
+  vnet_hw_interface_t *hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+  clib_error_t *err = 0;
+
+  if (hw == NULL || virtio_device_class.index != hw->dev_class_index)
+    return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+  vif = pool_elt_at_index (mm->interfaces, hw->dev_instance);
+
+  const unsigned int gso_on = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
+  const unsigned int gso_off = 0;
+  unsigned int offload = enable_disable ? gso_on : gso_off;
+  _IOCTL (vif->tap_fd, TUNSETOFFLOAD, offload);
+  vif->gso_enabled = enable_disable ? 1 : 0;
+  if (enable_disable)
+    {
+      if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) == 0)
+       {
+         vnm->interface_main.gso_interface_count++;
+         hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO;
+       }
+    }
+  else
+    {
+      if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) != 0)
+       {
+         vnm->interface_main.gso_interface_count--;
+         hw->flags &= ~VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO;
+       }
+    }
+
+error:
+  if (err)
+    {
+      clib_warning ("Error %s gso on sw_if_index %d",
+                   enable_disable ? "enabling" : "disabling", sw_if_index);
+      return VNET_API_ERROR_SYSCALL_ERROR_3;
+    }
+  return 0;
+}
+
 int
 tap_dump_ifs (tap_interface_details_t ** out_tapids)
 {
index 19dc88d..745f9fc 100644 (file)
@@ -30,6 +30,7 @@ typedef struct
   u16 rx_ring_sz;
   u16 tx_ring_sz;
   u32 tap_flags;
+#define TAP_FLAG_GSO (1 << 0)
   u8 *host_namespace;
   u8 *host_if_name;
   u8 host_mac_addr[6];
@@ -78,6 +79,8 @@ typedef struct
 
 void tap_create_if (vlib_main_t * vm, tap_create_if_args_t * args);
 int tap_delete_if (vlib_main_t * vm, u32 sw_if_index);
+int tap_gso_enable_disable (vlib_main_t * vm, u32 sw_if_index,
+                           int enable_disable);
 int tap_dump_ifs (tap_interface_details_t ** out_tapids);
 
 #endif /* _VNET_DEVICES_VIRTIO_TAP_H_ */
index aa6a342..609ffb4 100644 (file)
@@ -117,7 +117,7 @@ virtio_free_used_desc (vlib_main_t * vm, virtio_vring_t * vring)
 static_always_inline u16
 add_buffer_to_slot (vlib_main_t * vm, virtio_if_t * vif,
                    virtio_vring_t * vring, u32 bi, u16 avail, u16 next,
-                   u16 mask)
+                   u16 mask, int do_gso)
 {
   u16 n_added = 0;
   int hdr_sz = vif->virtio_net_hdr_sz;
@@ -127,6 +127,25 @@ add_buffer_to_slot (vlib_main_t * vm, virtio_if_t * vif,
   struct virtio_net_hdr_v1 *hdr = vlib_buffer_get_current (b) - hdr_sz;
 
   clib_memset (hdr, 0, hdr_sz);
+  if (do_gso && (b->flags & VNET_BUFFER_F_GSO))
+    {
+      if (b->flags & VNET_BUFFER_F_IS_IP4)
+       {
+         hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+         hdr->gso_size = vnet_buffer2 (b)->gso_size;
+         hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+         hdr->csum_start = vnet_buffer (b)->l4_hdr_offset;     /* e.g. 0x22 for untagged IPv4 (14 + 20) */
+         hdr->csum_offset = 0x10;
+       }
+      else
+       {
+         hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+         hdr->gso_size = vnet_buffer2 (b)->gso_size;
+         hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+         hdr->csum_start = vnet_buffer (b)->l4_hdr_offset;     /* e.g. 0x36 for untagged IPv6 (14 + 40) */
+         hdr->csum_offset = 0x10;
+       }
+    }
 
   if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0))
     {
@@ -219,7 +238,8 @@ add_buffer_to_slot (vlib_main_t * vm, virtio_if_t * vif,
 
 static_always_inline uword
 virtio_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
-                           vlib_frame_t * frame, virtio_if_t * vif)
+                           vlib_frame_t * frame, virtio_if_t * vif,
+                           int do_gso)
 {
   u8 qid = 0;
   u16 n_left = frame->n_vectors;
@@ -246,7 +266,8 @@ virtio_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
     {
       u16 n_added = 0;
       n_added =
-       add_buffer_to_slot (vm, vif, vring, buffers[0], avail, next, mask);
+       add_buffer_to_slot (vm, vif, vring, buffers[0], avail, next, mask,
+                           do_gso);
       if (!n_added)
        break;
       avail += n_added;
@@ -286,7 +307,12 @@ virtio_interface_tx (vlib_main_t * vm,
   vnet_interface_output_runtime_t *rund = (void *) node->runtime_data;
   virtio_if_t *vif = pool_elt_at_index (nm->interfaces, rund->dev_instance);
 
-  return virtio_interface_tx_inline (vm, node, frame, vif);
+  vnet_main_t *vnm = vnet_get_main ();
+  if (vnm->interface_main.gso_interface_count > 0)
+    return virtio_interface_tx_inline (vm, node, frame, vif, 1 /* do_gso */ );
+  else
+    return virtio_interface_tx_inline (vm, node, frame, vif,
+                                      0 /* no do_gso */ );
 }
 
 static void
index 6b82c41..fcc0f8a 100644 (file)
@@ -30,6 +30,7 @@
 #include <vnet/feature/feature.h>
 #include <vnet/ip/ip4_packet.h>
 #include <vnet/ip/ip6_packet.h>
+#include <vnet/udp/udp_packet.h>
 #include <vnet/devices/virtio/virtio.h>
 
 
@@ -140,9 +141,86 @@ more:
   goto more;
 }
 
+static_always_inline void
+fill_gso_buffer_flags (vlib_buffer_t * b0, struct virtio_net_hdr_v1 *hdr)
+{
+  u8 l4_proto = 0;
+  u8 l4_hdr_sz = 0;
+  if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+
+    {
+      ethernet_header_t *eh = (ethernet_header_t *) b0->data;
+      u16 ethertype = clib_net_to_host_u16 (eh->type);
+      u16 l2hdr_sz = sizeof (ethernet_header_t);
+
+      vnet_buffer (b0)->l2_hdr_offset = 0;
+      vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz;
+      if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4))
+       {
+         ip4_header_t *ip4 = (ip4_header_t *) (b0->data + l2hdr_sz);
+         vnet_buffer (b0)->l4_hdr_offset = l2hdr_sz + ip4_header_bytes (ip4);
+         l4_proto = ip4->protocol;
+         b0->flags |=
+           (VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_L2_HDR_OFFSET_VALID
+            | VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
+            VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
+         b0->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
+       }
+      else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6))
+       {
+         ip6_header_t *ip6 = (ip6_header_t *) (b0->data + l2hdr_sz);
+         /* FIXME IPv6 EH traversal */
+         vnet_buffer (b0)->l4_hdr_offset = l2hdr_sz + sizeof (ip6_header_t);
+         l4_proto = ip6->protocol;
+         b0->flags |=
+           (VNET_BUFFER_F_IS_IP6 | VNET_BUFFER_F_L2_HDR_OFFSET_VALID
+            | VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
+            VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
+         b0->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
+       }
+      if (l4_proto == IP_PROTOCOL_TCP)
+       {
+         b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
+         tcp_header_t *tcp = (tcp_header_t *) (b0->data +
+                                               vnet_buffer
+                                               (b0)->l4_hdr_offset);
+         l4_hdr_sz = tcp_header_bytes (tcp);
+         tcp->checksum = 0;
+       }
+      else if (l4_proto == IP_PROTOCOL_UDP)
+       {
+         b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM;
+         udp_header_t *udp = (udp_header_t *) (b0->data +
+                                               vnet_buffer
+                                               (b0)->l4_hdr_offset);
+         l4_hdr_sz = sizeof (*udp);
+         udp->checksum = 0;
+       }
+    }
+
+  if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV4)
+    {
+      ASSERT (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM);
+      vnet_buffer2 (b0)->gso_size = hdr->gso_size;
+      vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
+      b0->flags |= VNET_BUFFER_F_GSO;
+      b0->flags |= VNET_BUFFER_F_IS_IP4;
+    }
+  if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV6)
+    {
+      ASSERT (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM);
+      vnet_buffer2 (b0)->gso_size = hdr->gso_size;
+      vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
+      b0->flags |= VNET_BUFFER_F_GSO;
+      b0->flags |= VNET_BUFFER_F_IS_IP6;
+    }
+}
+
+
 static_always_inline uword
 virtio_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
-                           vlib_frame_t * frame, virtio_if_t * vif, u16 qid)
+                           vlib_frame_t * frame, virtio_if_t * vif, u16 qid,
+                           int gso_enabled)
 {
   vnet_main_t *vnm = vnet_get_main ();
   u32 thread_index = vm->thread_index;
@@ -187,6 +265,10 @@ virtio_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
          b0->current_length = len;
          b0->total_length_not_including_first_buffer = 0;
          b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+         if (gso_enabled)
+           fill_gso_buffer_flags (b0, hdr);
+
          vnet_buffer (b0)->sw_if_index[VLIB_RX] = vif->sw_if_index;
          vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
 
@@ -286,8 +368,12 @@ virtio_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
     mif = vec_elt_at_index (nm->interfaces, dq->dev_instance);
     if (mif->flags & VIRTIO_IF_FLAG_ADMIN_UP)
       {
-       n_rx += virtio_device_input_inline (vm, node, frame, mif,
-                                           dq->queue_id);
+       if (mif->gso_enabled)
+         n_rx += virtio_device_input_inline (vm, node, frame, mif,
+                                             dq->queue_id, 1);
+       else
+         n_rx += virtio_device_input_inline (vm, node, frame, mif,
+                                             dq->queue_id, 0);
       }
   }
 
index cfeb302..2648f29 100644 (file)
@@ -277,6 +277,7 @@ virtio_show (vlib_main_t * vm, u32 * hw_if_indices, u8 show_descr, u32 type)
            vlib_cli_output (vm, "  host-ns \"%s\"", vif->net_ns);
          vlib_cli_output (vm, "  fd %d", vif->fd);
          vlib_cli_output (vm, "  tap-fd %d", vif->tap_fd);
+         vlib_cli_output (vm, "  gso-enabled %d", vif->gso_enabled);
        }
       vlib_cli_output (vm, "  Mac Address: %U", format_ethernet_address,
                       vif->mac_addr);
index af61ca5..f728196 100644 (file)
@@ -173,6 +173,7 @@ typedef struct
   u8 host_ip4_prefix_len;
   ip6_address_t host_ip6_addr;
   u8 host_ip6_prefix_len;
+  int gso_enabled;
   int ifindex;
 } virtio_if_t;
 
index 12204bd..dbfe496 100644 (file)
@@ -894,6 +894,7 @@ vnet_register_interface (vnet_main_t * vnm,
        static char *e[] = {
          "interface is down",
          "interface is deleted",
+         "no buffers to segment GSO",
        };
 
        r.n_errors = ARRAY_LEN (e);
@@ -1328,6 +1329,11 @@ vnet_interface_init (vlib_main_t * vm)
       }
   }
 
+  im->gso_interface_count = 0;
+  /* init per-thread data */
+  vec_validate_aligned (im->per_thread_data, vlib_num_workers (),
+                       CLIB_CACHE_LINE_BYTES);
+
   if ((error = vlib_call_init_function (vm, vnet_interface_cli_init)))
     return error;
 
index 174e534..5c41859 100644 (file)
@@ -475,6 +475,9 @@ typedef enum vnet_hw_interface_flags_t_
 
   /* tx checksum offload */
   VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD = (1 << 17),
+
+  /* gso */
+  VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO = (1 << 18),
 } vnet_hw_interface_flags_t;
 
 #define VNET_HW_INTERFACE_FLAG_DUPLEX_SHIFT 1
@@ -789,6 +792,12 @@ typedef struct
   u32 tx_node_index;
 } vnet_hw_interface_nodes_t;
 
+typedef struct
+{
+  u32 *split_buffers;
+  u32 padding[14];
+} vnet_interface_per_thread_data_t;
+
 typedef struct
 {
   /* Hardware interfaces. */
@@ -827,6 +836,12 @@ typedef struct
   u32 pcap_pkts_to_capture;
   uword *pcap_drop_filter_hash;
 
+  /* per-thread data */
+  vnet_interface_per_thread_data_t *per_thread_data;
+
+  /* enable GSO processing in packet path if this count is > 0 */
+  u32 gso_interface_count;
+
   /* feature_arc_index */
   u8 output_feature_arc_index;
 } vnet_interface_main_t;
index 9a674b1..ef1fc16 100644 (file)
@@ -442,6 +442,8 @@ typedef enum
 {
   VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DOWN,
   VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DELETED,
+  VNET_INTERFACE_OUTPUT_ERROR_NO_BUFFERS_FOR_GSO,
+  VNET_INTERFACE_OUTPUT_ERROR_UNHANDLED_GSO_TYPE,
 } vnet_interface_output_error_t;
 
 /* Format for interface output traces. */
index beeb62a..251ff34 100644 (file)
 typedef struct
 {
   u32 sw_if_index;
-  u8 data[128 - sizeof (u32)];
+  u32 flags;
+  u16 gso_size;
+  u8 gso_l4_hdr_sz;
+  u8 data[128 - 3 * sizeof (u32)];
 }
 interface_output_trace_t;
 
@@ -69,24 +72,30 @@ format_vnet_interface_output_trace (u8 * s, va_list * va)
          (vnm->interface_main.sw_interfaces, t->sw_if_index))
        {
          /* the interface may have been deleted by the time the trace is printed */
-         s = format (s, "sw_if_index: %d\n%U%U",
-                     t->sw_if_index,
-                     format_white_space, indent,
-                     node->format_buffer ? node->
-                     format_buffer : format_hex_bytes, t->data,
-                     sizeof (t->data));
+         s = format (s, "sw_if_index: %d ", t->sw_if_index);
        }
       else
        {
          si = vnet_get_sw_interface (vnm, t->sw_if_index);
-
-         s = format (s, "%U\n%U%U",
-                     format_vnet_sw_interface_name, vnm, si,
-                     format_white_space, indent,
-                     node->format_buffer ? node->
-                     format_buffer : format_hex_bytes, t->data,
-                     sizeof (t->data));
+         s =
+           format (s, "%U ", format_vnet_sw_interface_name, vnm, si,
+                   t->flags);
+       }
+#define _(bit, name, v, x) \
+          if (v && (t->flags & VNET_BUFFER_F_##name)) \
+            s = format (s, "%s ", v);
+      foreach_vnet_buffer_flag
+#undef _
+       if (t->flags & VNET_BUFFER_F_GSO)
+       {
+         s = format (s, "\n%Ugso_sz %d gso_l4_hdr_sz %d",
+                     format_white_space, indent + 2, t->gso_size,
+                     t->gso_l4_hdr_sz);
        }
+      s =
+       format (s, "\n%U%U", format_white_space, indent,
+               node->format_buffer ? node->format_buffer : format_hex_bytes,
+               t->data, sizeof (t->data));
     }
   return s;
 }
@@ -121,6 +130,9 @@ vnet_interface_output_trace (vlib_main_t * vm,
        {
          t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
          t0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX];
+         t0->flags = b0->flags;
+         t0->gso_size = vnet_buffer2 (b0)->gso_size;
+         t0->gso_l4_hdr_sz = vnet_buffer2 (b0)->gso_l4_hdr_sz;
          clib_memcpy_fast (t0->data, vlib_buffer_get_current (b0),
                            sizeof (t0->data));
        }
@@ -128,6 +140,9 @@ vnet_interface_output_trace (vlib_main_t * vm,
        {
          t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
          t1->sw_if_index = vnet_buffer (b1)->sw_if_index[VLIB_TX];
+         t1->flags = b1->flags;
+         t1->gso_size = vnet_buffer2 (b1)->gso_size;
+         t1->gso_l4_hdr_sz = vnet_buffer2 (b1)->gso_l4_hdr_sz;
          clib_memcpy_fast (t1->data, vlib_buffer_get_current (b1),
                            sizeof (t1->data));
        }
@@ -149,6 +164,9 @@ vnet_interface_output_trace (vlib_main_t * vm,
        {
          t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
          t0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX];
+         t0->flags = b0->flags;
+         t0->gso_size = vnet_buffer2 (b0)->gso_size;
+         t0->gso_l4_hdr_sz = vnet_buffer2 (b0)->gso_l4_hdr_sz;
          clib_memcpy_fast (t0->data, vlib_buffer_get_current (b0),
                            sizeof (t0->data));
        }
@@ -192,9 +210,17 @@ calc_checksums (vlib_main_t * vm, vlib_buffer_t * b)
     {
       int bogus;
       if (b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM)
-       th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+       {
+         th->checksum = 0;
+         th->checksum =
+           ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+       }
       if (b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM)
-       uh->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+       {
+         uh->checksum = 0;
+         uh->checksum =
+           ip6_tcp_udp_icmp_compute_checksum (vm, b, ip6, &bogus);
+       }
     }
 
   b->flags &= ~VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
@@ -202,12 +228,245 @@ calc_checksums (vlib_main_t * vm, vlib_buffer_t * b)
   b->flags &= ~VNET_BUFFER_F_OFFLOAD_IP_CKSUM;
 }
 
+static_always_inline u16
+tso_alloc_tx_bufs (vlib_main_t * vm,
+                  vnet_interface_per_thread_data_t * ptd,
+                  vlib_buffer_t * b0, u16 l4_hdr_sz)
+{
+  u32 n_bytes_b0 = vlib_buffer_length_in_chain (vm, b0);
+  u16 gso_size = vnet_buffer2 (b0)->gso_size;
+  u16 l234_sz = vnet_buffer (b0)->l4_hdr_offset + l4_hdr_sz;
+  /* rounded-up division */
+  u16 n_bufs = (n_bytes_b0 - l234_sz + (gso_size - 1)) / gso_size;
+  u16 n_alloc;
+
+  ASSERT (n_bufs > 0);
+  vec_validate (ptd->split_buffers, n_bufs - 1);
+
+  n_alloc = vlib_buffer_alloc (vm, ptd->split_buffers, n_bufs);
+  if (n_alloc < n_bufs)
+    {
+      vlib_buffer_free (vm, ptd->split_buffers, n_alloc);
+      return 0;
+    }
+  return 1;
+}
+
+static_always_inline void
+tso_init_buf_from_template_base (vlib_buffer_t * nb0, vlib_buffer_t * b0,
+                                u32 flags, u16 length)
+{
+  nb0->current_data = 0;
+  nb0->total_length_not_including_first_buffer = 0;
+  nb0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID | flags;
+  clib_memcpy_fast (&nb0->opaque, &b0->opaque, sizeof (nb0->opaque));
+  clib_memcpy_fast (nb0->data, b0->data, length);
+  nb0->current_length = length;
+}
+
+static_always_inline void
+tso_init_buf_from_template (vlib_main_t * vm, vlib_buffer_t * nb0,
+                           vlib_buffer_t * b0, u16 template_data_sz,
+                           u16 gso_size, u8 ** p_dst_ptr, u16 * p_dst_left,
+                           u32 next_tcp_seq, u32 flags)
+{
+  tso_init_buf_from_template_base (nb0, b0, flags, template_data_sz);
+
+  *p_dst_left =
+    clib_min (gso_size,
+             vlib_buffer_get_default_data_size (vm) - template_data_sz);
+  *p_dst_ptr = nb0->data + template_data_sz;
+
+  tcp_header_t *tcp =
+    (tcp_header_t *) (nb0->data + vnet_buffer (nb0)->l4_hdr_offset);
+  tcp->seq_number = clib_host_to_net_u32 (next_tcp_seq);
+}
+
+static_always_inline void
+tso_fixup_segmented_buf (vlib_buffer_t * b0, u8 tcp_flags, int is_ip6)
+{
+  u16 l3_hdr_offset = vnet_buffer (b0)->l3_hdr_offset;
+  u16 l4_hdr_offset = vnet_buffer (b0)->l4_hdr_offset;
+  ip4_header_t *ip4 = (ip4_header_t *) (b0->data + l3_hdr_offset);
+  ip6_header_t *ip6 = (ip6_header_t *) (b0->data + l3_hdr_offset);
+  tcp_header_t *tcp = (tcp_header_t *) (b0->data + l4_hdr_offset);
+
+  tcp->flags = tcp_flags;
+
+  if (is_ip6)
+    ip6->payload_length =
+      clib_host_to_net_u16 (b0->current_length -
+                           vnet_buffer (b0)->l4_hdr_offset);
+  else
+    ip4->length =
+      clib_host_to_net_u16 (b0->current_length -
+                           vnet_buffer (b0)->l3_hdr_offset);
+}
+
+/**
+ * Allocate the necessary number of ptd->split_buffers,
+ * and segment the possibly chained buffer(s) from b0 into
+ * there.
+ *
+ * Return the cumulative number of bytes sent or zero
+ * if allocation failed.
+ */
+
+static_always_inline u32
+tso_segment_buffer (vlib_main_t * vm, vnet_interface_per_thread_data_t * ptd,
+                   int do_tx_offloads, u32 sbi0, vlib_buffer_t * sb0,
+                   u32 n_bytes_b0)
+{
+  u32 n_tx_bytes = 0;
+  int is_ip4 = sb0->flags & VNET_BUFFER_F_IS_IP4;
+  int is_ip6 = sb0->flags & VNET_BUFFER_F_IS_IP6;
+  ASSERT (is_ip4 || is_ip6);
+  ASSERT (sb0->flags & VNET_BUFFER_F_L2_HDR_OFFSET_VALID);
+  ASSERT (sb0->flags & VNET_BUFFER_F_L3_HDR_OFFSET_VALID);
+  ASSERT (sb0->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
+  u16 gso_size = vnet_buffer2 (sb0)->gso_size;
+
+  int l4_hdr_sz = vnet_buffer2 (sb0)->gso_l4_hdr_sz;
+  u8 save_tcp_flags = 0;
+  u8 tcp_flags_no_fin_psh = 0;
+  u32 next_tcp_seq = 0;
+
+  tcp_header_t *tcp =
+    (tcp_header_t *) (sb0->data + vnet_buffer (sb0)->l4_hdr_offset);
+  next_tcp_seq = clib_net_to_host_u32 (tcp->seq_number);
+  /* store original flags for last packet and reset FIN and PSH */
+  save_tcp_flags = tcp->flags;
+  tcp_flags_no_fin_psh = tcp->flags & ~(TCP_FLAG_FIN | TCP_FLAG_PSH);
+  tcp->checksum = 0;
+
+  u32 default_bflags =
+    sb0->flags & ~(VNET_BUFFER_F_GSO | VLIB_BUFFER_NEXT_PRESENT);
+  u16 l234_sz = vnet_buffer (sb0)->l4_hdr_offset + l4_hdr_sz;
+  int first_data_size = clib_min (gso_size, sb0->current_length - l234_sz);
+  next_tcp_seq += first_data_size;
+
+  if (PREDICT_FALSE (!tso_alloc_tx_bufs (vm, ptd, sb0, l4_hdr_sz)))
+    return 0;
+
+  vlib_buffer_t *b0 = vlib_get_buffer (vm, ptd->split_buffers[0]);
+  tso_init_buf_from_template_base (b0, sb0, default_bflags,
+                                  l4_hdr_sz + first_data_size);
+
+  u32 total_src_left = n_bytes_b0 - l234_sz - first_data_size;
+  if (total_src_left)
+    {
+      /* Need to copy more segments */
+      u8 *src_ptr, *dst_ptr;
+      u16 src_left, dst_left;
+      /* current source buffer */
+      vlib_buffer_t *csb0 = sb0;
+      u32 csbi0 = sbi0;
+      /* current dest buffer */
+      vlib_buffer_t *cdb0;
+      u16 dbi = 1;             /* the buffer [0] is b0 */
+
+      src_ptr = sb0->data + l234_sz + first_data_size;
+      src_left = sb0->current_length - l234_sz - first_data_size;
+      b0->current_length = l234_sz + first_data_size;
+
+      tso_fixup_segmented_buf (b0, tcp_flags_no_fin_psh, is_ip6);
+      if (do_tx_offloads)
+       calc_checksums (vm, b0);
+
+      /* grab a second buffer and prepare the loop */
+      ASSERT (dbi < vec_len (ptd->split_buffers));
+      cdb0 = vlib_get_buffer (vm, ptd->split_buffers[dbi++]);
+      tso_init_buf_from_template (vm, cdb0, b0, l234_sz, gso_size, &dst_ptr,
+                                 &dst_left, next_tcp_seq, default_bflags);
+
+      /* an arbitrary large number to catch the runaway loops */
+      int nloops = 2000;
+      while (total_src_left)
+       {
+         ASSERT (nloops-- > 0);
+         u16 bytes_to_copy = clib_min (src_left, dst_left);
+
+         clib_memcpy_fast (dst_ptr, src_ptr, bytes_to_copy);
+
+         src_left -= bytes_to_copy;
+         src_ptr += bytes_to_copy;
+         total_src_left -= bytes_to_copy;
+         dst_left -= bytes_to_copy;
+         dst_ptr += bytes_to_copy;
+         next_tcp_seq += bytes_to_copy;
+         cdb0->current_length += bytes_to_copy;
+
+         if (0 == src_left)
+           {
+             int has_next = (csb0->flags & VLIB_BUFFER_NEXT_PRESENT);
+             u32 next_bi = csb0->next_buffer;
+
+             /* init src to the next buffer in chain */
+             if (has_next)
+               {
+                 csbi0 = next_bi;
+                 csb0 = vlib_get_buffer (vm, csbi0);
+                 src_left = csb0->current_length;
+                 src_ptr = csb0->data;
+               }
+             else
+               {
+                 ASSERT (total_src_left == 0);
+                 break;
+               }
+           }
+         if (0 == dst_left && total_src_left)
+           {
+             if (do_tx_offloads)
+               calc_checksums (vm, cdb0);
+             n_tx_bytes += cdb0->current_length;
+             ASSERT (dbi < vec_len (ptd->split_buffers));
+             cdb0 = vlib_get_buffer (vm, ptd->split_buffers[dbi++]);
+             tso_init_buf_from_template (vm, cdb0, b0, l234_sz,
+                                         gso_size, &dst_ptr, &dst_left,
+                                         next_tcp_seq, default_bflags);
+           }
+       }
+
+      tso_fixup_segmented_buf (cdb0, save_tcp_flags, is_ip6);
+      if (do_tx_offloads)
+       calc_checksums (vm, cdb0);
+
+      n_tx_bytes += cdb0->current_length;
+    }
+  n_tx_bytes += b0->current_length;
+  return n_tx_bytes;
+}
+
+static_always_inline void
+drop_one_buffer_and_count (vlib_main_t * vm, vnet_main_t * vnm,
+                          vlib_node_runtime_t * node, u32 * pbi0,
+                          u32 drop_error_code)
+{
+  u32 thread_index = vm->thread_index;
+  vnet_interface_output_runtime_t *rt = (void *) node->runtime_data;
+
+  vlib_simple_counter_main_t *cm;
+  cm =
+    vec_elt_at_index (vnm->interface_main.sw_if_counters,
+                     VNET_INTERFACE_COUNTER_TX_ERROR);
+  vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, 1);
+
+  vlib_error_drop_buffers (vm, node, pbi0,
+                          /* buffer stride */ 1,
+                          /* n_buffers */ 1,
+                          VNET_INTERFACE_OUTPUT_NEXT_DROP,
+                          node->node_index, drop_error_code);
+}
+
 static_always_inline uword
-vnet_interface_output_node_inline (vlib_main_t * vm,
-                                  vlib_node_runtime_t * node,
-                                  vlib_frame_t * frame, vnet_main_t * vnm,
-                                  vnet_hw_interface_t * hi,
-                                  int do_tx_offloads)
+vnet_interface_output_node_inline_gso (vlib_main_t * vm,
+                                      vlib_node_runtime_t * node,
+                                      vlib_frame_t * frame,
+                                      vnet_main_t * vnm,
+                                      vnet_hw_interface_t * hi,
+                                      int do_tx_offloads,
+                                      int do_segmentation)
 {
   vnet_interface_output_runtime_t *rt = (void *) node->runtime_data;
   vnet_sw_interface_t *si;
@@ -219,6 +478,8 @@ vnet_interface_output_node_inline (vlib_main_t * vm,
   u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX;
   u32 current_config_index = ~0;
   u8 arc = im->output_feature_arc_index;
+  vnet_interface_per_thread_data_t *ptd =
+    vec_elt_at_index (im->per_thread_data, thread_index);
 
   n_buffers = frame->n_vectors;
 
@@ -300,15 +561,30 @@ vnet_interface_output_node_inline (vlib_main_t * vm,
          to_tx[1] = bi1;
          to_tx[2] = bi2;
          to_tx[3] = bi3;
-         from += 4;
-         to_tx += 4;
-         n_left_to_tx -= 4;
+         if (!do_segmentation)
+           {
+             from += 4;
+             to_tx += 4;
+             n_left_to_tx -= 4;
+           }
 
          b0 = vlib_get_buffer (vm, bi0);
          b1 = vlib_get_buffer (vm, bi1);
          b2 = vlib_get_buffer (vm, bi2);
          b3 = vlib_get_buffer (vm, bi3);
 
+         if (do_segmentation)
+           {
+             or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
+
+             /* go to single loop if we need TSO segmentation */
+             if (PREDICT_FALSE (or_flags & VNET_BUFFER_F_GSO))
+               break;
+             from += 4;
+             to_tx += 4;
+             n_left_to_tx -= 4;
+           }
+
          /* Be grumpy about zero length buffers for benefit of
             driver tx function. */
          ASSERT (b0->current_length > 0);
@@ -376,7 +652,8 @@ vnet_interface_output_node_inline (vlib_main_t * vm,
                                               n_bytes_b3);
            }
 
-         or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
+         if (!do_segmentation)
+           or_flags = b0->flags | b1->flags | b2->flags | b3->flags;
 
          if (do_tx_offloads)
            {
@@ -422,6 +699,85 @@ vnet_interface_output_node_inline (vlib_main_t * vm,
              b0->current_config_index = current_config_index;
            }
 
+         if (do_segmentation)
+           {
+             if (PREDICT_FALSE (b0->flags & VNET_BUFFER_F_GSO))
+               {
+                 /*
+                  * Undo the enqueue of the b0 - it is not going anywhere,
+                  * and will be freed either after it's segmented or
+                  * when dropped, if there is no buffers to segment into.
+                  */
+                 to_tx -= 1;
+                 n_left_to_tx += 1;
+                 /* undo the counting. */
+                 n_bytes -= n_bytes_b0;
+                 n_packets -= 1;
+
+                 u32 n_tx_bytes = 0;
+
+                 n_tx_bytes =
+                   tso_segment_buffer (vm, ptd, do_tx_offloads, bi0, b0,
+                                       n_bytes_b0);
+
+                 if (PREDICT_FALSE (n_tx_bytes == 0))
+                   {
+                     drop_one_buffer_and_count (vm, vnm, node, from - 1,
+                                                VNET_INTERFACE_OUTPUT_ERROR_NO_BUFFERS_FOR_GSO);
+                     continue;
+                   }
+
+                 u16 n_tx_bufs = vec_len (ptd->split_buffers);
+                 u32 *from_tx_seg = ptd->split_buffers;
+
+                 while (n_tx_bufs > 0)
+                   {
+                     if (n_tx_bufs >= n_left_to_tx)
+                       {
+                         while (n_left_to_tx > 0)
+                           {
+                             to_tx[0] = from_tx_seg[0];
+                             to_tx += 1;
+                             from_tx_seg += 1;
+                             n_left_to_tx -= 1;
+                             n_tx_bufs -= 1;
+                             n_packets += 1;
+                           }
+                         vlib_put_next_frame (vm, node, next_index,
+                                              n_left_to_tx);
+                         vlib_get_new_next_frame (vm, node, next_index,
+                                                  to_tx, n_left_to_tx);
+                       }
+                     else
+                       {
+                         while (n_tx_bufs > 0)
+                           {
+                             to_tx[0] = from_tx_seg[0];
+                             to_tx += 1;
+                             from_tx_seg += 1;
+                             n_left_to_tx -= 1;
+                             n_tx_bufs -= 1;
+                             n_packets += 1;
+                           }
+                       }
+                   }
+                 n_bytes += n_tx_bytes;
+                 if (PREDICT_FALSE (tx_swif0 != rt->sw_if_index))
+                   {
+
+                     vlib_increment_combined_counter
+                       (im->combined_sw_if_counters +
+                        VNET_INTERFACE_COUNTER_TX, thread_index, tx_swif0,
+                        _vec_len (ptd->split_buffers), n_tx_bytes);
+                   }
+                 /* The buffers were enqueued. Reset the length */
+                 _vec_len (ptd->split_buffers) = 0;
+                 /* Free the now segmented buffer */
+                 vlib_buffer_free_one (vm, bi0);
+                 continue;
+               }
+           }
+
          if (PREDICT_FALSE (tx_swif0 != rt->sw_if_index))
            {
 
@@ -446,6 +802,33 @@ vnet_interface_output_node_inline (vlib_main_t * vm,
   return n_buffers;
 }
 
+static_always_inline uword
+vnet_interface_output_node_inline (vlib_main_t * vm,
+                                  vlib_node_runtime_t * node,
+                                  vlib_frame_t * frame, vnet_main_t * vnm,
+                                  vnet_hw_interface_t * hi,
+                                  int do_tx_offloads)
+{
+  /*
+   * The 3-headed "if" is here because we want to err on the side
+   * of not impacting the non-GSO performance - so for the more
+   * common case of no GSO interfaces we want to prevent the
+   * segmentation codepath from being there altogether.
+   */
+  if (PREDICT_TRUE (vnm->interface_main.gso_interface_count == 0))
+    return vnet_interface_output_node_inline_gso (vm, node, frame, vnm, hi,
+                                                 do_tx_offloads,
+                                                 /* do_segmentation */ 0);
+  else if (hi->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO)
+    return vnet_interface_output_node_inline_gso (vm, node, frame, vnm, hi,
+                                                 do_tx_offloads,
+                                                 /* do_segmentation */ 0);
+  else
+    return vnet_interface_output_node_inline_gso (vm, node, frame, vnm, hi,
+                                                 do_tx_offloads,
+                                                 /* do_segmentation */ 1);
+}
+
 uword
 vnet_interface_output_node (vlib_main_t * vm, vlib_node_runtime_t * node,
                            vlib_frame_t * frame)
index b3ae29a..ec4eda4 100644 (file)
@@ -2186,10 +2186,11 @@ ip4_ttl_and_checksum_check (vlib_buffer_t * b, ip4_header_t * ip, u16 * next,
 
 
 always_inline uword
-ip4_rewrite_inline (vlib_main_t * vm,
-                   vlib_node_runtime_t * node,
-                   vlib_frame_t * frame,
-                   int do_counters, int is_midchain, int is_mcast)
+ip4_rewrite_inline_with_gso (vlib_main_t * vm,
+                            vlib_node_runtime_t * node,
+                            vlib_frame_t * frame,
+                            int do_counters, int is_midchain, int is_mcast,
+                            int do_gso)
 {
   ip_lookup_main_t *lm = &ip4_main.lookup_main;
   u32 *from = vlib_frame_vector_args (frame);
@@ -2267,12 +2268,20 @@ ip4_rewrite_inline (vlib_main_t * vm,
       CLIB_PREFETCH (p, CLIB_CACHE_LINE_BYTES, LOAD);
 
       /* Check MTU of outgoing interface. */
-      ip4_mtu_check (b[0], clib_net_to_host_u16 (ip0->length),
+      u16 ip0_len = clib_net_to_host_u16 (ip0->length);
+      u16 ip1_len = clib_net_to_host_u16 (ip1->length);
+
+      if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
+       ip0_len = gso_mtu_sz (b[0]);
+      if (do_gso && (b[1]->flags & VNET_BUFFER_F_GSO))
+       ip1_len = gso_mtu_sz (b[1]);
+
+      ip4_mtu_check (b[0], ip0_len,
                     adj0[0].rewrite_header.max_l3_packet_bytes,
                     ip0->flags_and_fragment_offset &
                     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
                     next + 0, &error0);
-      ip4_mtu_check (b[1], clib_net_to_host_u16 (ip1->length),
+      ip4_mtu_check (b[1], ip1_len,
                     adj1[0].rewrite_header.max_l3_packet_bytes,
                     ip1->flags_and_fragment_offset &
                     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
@@ -2395,7 +2404,11 @@ ip4_rewrite_inline (vlib_main_t * vm,
       vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
 
       /* Check MTU of outgoing interface. */
-      ip4_mtu_check (b[0], clib_net_to_host_u16 (ip0->length),
+      u16 ip0_len = clib_net_to_host_u16 (ip0->length);
+      if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
+       ip0_len = gso_mtu_sz (b[0]);
+
+      ip4_mtu_check (b[0], ip0_len,
                     adj0[0].rewrite_header.max_l3_packet_bytes,
                     ip0->flags_and_fragment_offset &
                     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
@@ -2465,6 +2478,23 @@ ip4_rewrite_inline (vlib_main_t * vm,
   return frame->n_vectors;
 }
 
+always_inline uword
+ip4_rewrite_inline (vlib_main_t * vm,
+                   vlib_node_runtime_t * node,
+                   vlib_frame_t * frame,
+                   int do_counters, int is_midchain, int is_mcast)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  if (PREDICT_FALSE (vnm->interface_main.gso_interface_count > 0))
+    return ip4_rewrite_inline_with_gso (vm, node, frame, do_counters,
+                                       is_midchain, is_mcast,
+                                       1 /* do_gso */ );
+  else
+    return ip4_rewrite_inline_with_gso (vm, node, frame, do_counters,
+                                       is_midchain, is_mcast,
+                                       0 /* no do_gso */ );
+}
+
 
 /** @brief IPv4 rewrite node.
     @node ip4-rewrite
index 8e96647..f599392 100644 (file)
@@ -1622,10 +1622,11 @@ ip6_mtu_check (vlib_buffer_t * b, u16 packet_bytes,
 }
 
 always_inline uword
-ip6_rewrite_inline (vlib_main_t * vm,
-                   vlib_node_runtime_t * node,
-                   vlib_frame_t * frame,
-                   int do_counters, int is_midchain, int is_mcast)
+ip6_rewrite_inline_with_gso (vlib_main_t * vm,
+                            vlib_node_runtime_t * node,
+                            vlib_frame_t * frame,
+                            int do_counters, int is_midchain, int is_mcast,
+                            int do_gso)
 {
   ip_lookup_main_t *lm = &ip6_main.lookup_main;
   u32 *from = vlib_frame_vector_args (frame);
@@ -1771,12 +1772,23 @@ ip6_rewrite_inline (vlib_main_t * vm,
            }
 
          /* Check MTU of outgoing interface. */
-         ip6_mtu_check (p0, clib_net_to_host_u16 (ip0->payload_length) +
-                        sizeof (ip6_header_t),
+         u16 ip0_len =
+           clib_net_to_host_u16 (ip0->payload_length) +
+           sizeof (ip6_header_t);
+         u16 ip1_len =
+           clib_net_to_host_u16 (ip1->payload_length) +
+           sizeof (ip6_header_t);
+         if (do_gso && (p0->flags & VNET_BUFFER_F_GSO))
+           ip0_len = gso_mtu_sz (p0);
+         if (do_gso && (p1->flags & VNET_BUFFER_F_GSO))
+           ip1_len = gso_mtu_sz (p1);
+
+
+
+         ip6_mtu_check (p0, ip0_len,
                         adj0[0].rewrite_header.max_l3_packet_bytes,
                         is_locally_originated0, &next0, &error0);
-         ip6_mtu_check (p1, clib_net_to_host_u16 (ip1->payload_length) +
-                        sizeof (ip6_header_t),
+         ip6_mtu_check (p1, ip1_len,
                         adj1[0].rewrite_header.max_l3_packet_bytes,
                         is_locally_originated1, &next1, &error1);
 
@@ -1915,8 +1927,13 @@ ip6_rewrite_inline (vlib_main_t * vm,
            }
 
          /* Check MTU of outgoing interface. */
-         ip6_mtu_check (p0, clib_net_to_host_u16 (ip0->payload_length) +
-                        sizeof (ip6_header_t),
+         u16 ip0_len =
+           clib_net_to_host_u16 (ip0->payload_length) +
+           sizeof (ip6_header_t);
+         if (do_gso && (p0->flags & VNET_BUFFER_F_GSO))
+           ip0_len = gso_mtu_sz (p0);
+
+         ip6_mtu_check (p0, ip0_len,
                         adj0[0].rewrite_header.max_l3_packet_bytes,
                         is_locally_originated0, &next0, &error0);
 
@@ -1974,6 +1991,23 @@ ip6_rewrite_inline (vlib_main_t * vm,
   return frame->n_vectors;
 }
 
+always_inline uword
+ip6_rewrite_inline (vlib_main_t * vm,
+                   vlib_node_runtime_t * node,
+                   vlib_frame_t * frame,
+                   int do_counters, int is_midchain, int is_mcast)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  if (PREDICT_FALSE (vnm->interface_main.gso_interface_count > 0))
+    return ip6_rewrite_inline_with_gso (vm, node, frame, do_counters,
+                                       is_midchain, is_mcast,
+                                       1 /* do_gso */ );
+  else
+    return ip6_rewrite_inline_with_gso (vm, node, frame, do_counters,
+                                       is_midchain, is_mcast,
+                                       0 /* no do_gso */ );
+}
+
 VLIB_NODE_FN (ip6_rewrite_node) (vlib_main_t * vm,
                                 vlib_node_runtime_t * node,
                                 vlib_frame_t * frame)