VPP-1001 - update AF Packet Driver for modern kernels 28/8628/8
author Anton Ivanov <anton.ivanov@cambridgegreys.com>
Tue, 3 Oct 2017 09:08:05 +0000 (10:08 +0100)
committer Damjan Marion <dmarion.lists@gmail.com>
Wed, 4 Oct 2017 09:42:23 +0000 (09:42 +0000)
1. Add VNET headers support for checksumming - required
to operate correctly on any recent Linux

2. Bypass QDISC on transmit - improves performance by ~5%.
Enabled only if the PACKET_QDISC_BYPASS macro is detected -
apparently not present on archaic distributions.

This still does not solve all issues with TSO - it can be
fixed only by going to tpacket v3 and dynamic rx ring as
well as significant changes in the TX (sendmmsg?).

Change-Id: Iea14ade12586c0a8da49e6dd1012108a08bc85b3
Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
src/vnet/devices/af_packet/af_packet.c
src/vnet/devices/af_packet/af_packet.h
src/vnet/devices/af_packet/device.c
src/vnet/devices/af_packet/node.c

index 3269601..fbcd488 100644 (file)
@@ -23,6 +23,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <fcntl.h>
+#include <linux/virtio_net.h>
 
 #include <vppinfra/linux/sysfs.h>
 #include <vlib/vlib.h>
@@ -128,6 +129,7 @@ static int
 create_packet_v2_sock (int host_if_index, tpacket_req_t * rx_req,
                       tpacket_req_t * tx_req, int *fd, u8 ** ring)
 {
+  af_packet_main_t *apm = &af_packet_main;
   int ret, err;
   struct sockaddr_ll sll;
   int ver = TPACKET_V2;
@@ -141,7 +143,31 @@ create_packet_v2_sock (int host_if_index, tpacket_req_t * rx_req,
       ret = VNET_API_ERROR_SYSCALL_ERROR_1;
       goto error;
     }
-
+  int opt = 1;
+  if (setsockopt (*fd, SOL_PACKET, PACKET_VNET_HDR, &opt, sizeof (opt)) != 0)
+    {
+      DBG_SOCK ("Failed to enable vnet headers on the socket");
+      if ((apm->flags & AF_PACKET_USES_VNET_HEADERS) != 0)
+       {
+         /* Should never happen - vnet was already enabled once,
+          * but we fail to reenable it on a new interface
+          **/
+         ret = VNET_API_ERROR_SYSCALL_ERROR_1;
+         goto error;
+       }
+    }
+  else
+    {
+      apm->flags |= AF_PACKET_USES_VNET_HEADERS;
+    }
+#ifdef PACKET_QDISC_BYPASS
+  opt = 1;
+  if (setsockopt (*fd, SOL_PACKET, PACKET_QDISC_BYPASS, &opt, sizeof (opt)) !=
+      0)
+    {
+      DBG_SOCK ("Failed to bypass Linux QDISC");
+    }
+#endif
   if ((err =
        setsockopt (*fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof (ver))) < 0)
     {
@@ -150,7 +176,7 @@ create_packet_v2_sock (int host_if_index, tpacket_req_t * rx_req,
       goto error;
     }
 
-  int opt = 1;
+  opt = 1;
   if ((err =
        setsockopt (*fd, SOL_PACKET, PACKET_LOSS, &opt, sizeof (opt))) < 0)
     {
index 95c7e7c..f731427 100644 (file)
@@ -19,6 +19,8 @@
 
 #include <vppinfra/lock.h>
 
+#define AF_PACKET_USES_VNET_HEADERS 1
+
 typedef struct
 {
   CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
@@ -54,6 +56,7 @@ typedef struct
 
   /* hash of host interface names */
   mhash_t if_index_by_host_if_name;
+  u32 flags;
 } af_packet_main_t;
 
 af_packet_main_t af_packet_main;
index e01b1c7..a48ae5c 100644 (file)
@@ -23,6 +23,8 @@
 #include <net/if.h>
 #include <net/if_arp.h>
 
+#include <linux/virtio_net.h>
+
 #include <vlib/vlib.h>
 #include <vlib/unix/unix.h>
 #include <vnet/ip/ip.h>
@@ -50,7 +52,6 @@ static char *af_packet_tx_func_error_strings[] = {
 #undef _
 };
 
-
 static u8 *
 format_af_packet_device_name (u8 * s, va_list * args)
 {
@@ -76,6 +77,23 @@ format_af_packet_tx_trace (u8 * s, va_list * args)
   return s;
 }
 
+
+static_always_inline void
+af_packet_buffer_tx_offload (vlib_buffer_t * b, struct virtio_net_hdr *vhdr)
+{
+  /* Set the flags of the virtio-net header that accompanies a TX frame.
+   * For now every frame is just marked as data-valid: DPDK checksums on
+   * input, tap presently operates in legacy compatibility mode where the
+   * kernel checksums CSUM_PARTIAL for it, and the af_packet input is fixed.
+   * Only vhdr->flags is written here; the buffer argument b is not read.
+   * NOTE(review): other vhdr fields keep prior contents - confirm zeroed.
+   * In the future, locally originated frames, etc. can be made to fit
+   * this convention so that they are not checksummed unless needed.
+   **/
+  vhdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
+}
+
+
 static uword
 af_packet_interface_tx (vlib_main_t * vm,
                        vlib_node_runtime_t * node, vlib_frame_t * frame)
@@ -102,6 +120,10 @@ af_packet_interface_tx (vlib_main_t * vm,
     {
       u32 len;
       u32 offset = 0;
+      if (PREDICT_TRUE ((apm->flags & AF_PACKET_USES_VNET_HEADERS) != 0))
+       {
+         offset = sizeof (struct virtio_net_hdr);
+       }
       vlib_buffer_t *b0;
       n_left--;
       u32 bi = buffers[0];
@@ -119,6 +141,12 @@ af_packet_interface_tx (vlib_main_t * vm,
       do
        {
          b0 = vlib_get_buffer (vm, bi);
+         if (PREDICT_TRUE ((apm->flags & AF_PACKET_USES_VNET_HEADERS) != 0))
+           {
+             u8 *vh =
+               (u8 *) tph + TPACKET_ALIGN (sizeof (struct tpacket2_hdr));
+             af_packet_buffer_tx_offload (b0, (struct virtio_net_hdr *) vh);
+           }
          len = b0->current_length;
          clib_memcpy ((u8 *) tph +
                       TPACKET_ALIGN (sizeof (struct tpacket2_hdr)) + offset,
index 99c91f3..5301ad2 100644 (file)
@@ -1,5 +1,4 @@
-/*
- *------------------------------------------------------------------
+/*------------------------------------------------------------------
  * af_packet.c - linux kernel packet interface
  *
  * Copyright (c) 2016 Cisco and/or its affiliates.
@@ -18,6 +17,7 @@
  */
 
 #include <linux/if_packet.h>
+#include <linux/virtio_net.h>
 
 #include <vlib/vlib.h>
 #include <vlib/unix/unix.h>
@@ -155,9 +155,18 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
       while ((tph->tp_status & TP_STATUS_USER) && (n_free_bufs > min_bufs) &&
             n_left_to_next)
        {
+
+         struct virtio_net_hdr *vh =
+           (struct virtio_net_hdr *) (((u8 *) tph) + tph->tp_mac -
+                                      sizeof (struct virtio_net_hdr));
          u32 data_len = tph->tp_snaplen;
          u32 offset = 0;
          u32 bi0 = 0, first_bi0 = 0, prev_bi0;
+         u32 vlan_len = 0;
+         ip_csum_t wsum = 0;
+         u16 *wsum_addr = NULL;
+         u32 do_vnet = apm->flags & AF_PACKET_USES_VNET_HEADERS;
+         u32 do_csum = tph->tp_status & TP_STATUS_CSUMNOTREADY;
 
          while (data_len)
            {
@@ -173,7 +182,6 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
              /* copy data */
              u32 bytes_to_copy =
                data_len > n_buffer_bytes ? n_buffer_bytes : data_len;
-             u32 vlan_len = 0;
              u32 bytes_copied = 0;
              b0->current_data = 0;
              /* Kernel removes VLAN headers, so reconstruct VLAN */
@@ -195,10 +203,50 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
                      bytes_copied = sizeof (ethernet_header_t);
                    }
                }
-             clib_memcpy (((u8 *) vlib_buffer_get_current (b0)) +
-                          bytes_copied + vlan_len,
-                          (u8 *) tph + tph->tp_mac + offset + bytes_copied,
-                          (bytes_to_copy - bytes_copied));
+             /* Check if the incoming skb is marked as CSUM_PARTIAL,
+              * If VNET Headers are enabled TP_STATUS_CSUMNOTREADY is
+              * equivalent to the vnet csum flag.
+              **/
+             if (PREDICT_TRUE ((do_vnet != 0) && (do_csum != 0)))
+               {
+                 wsum_addr = (u16 *) (((u8 *) vlib_buffer_get_current (b0)) +
+                                      vlan_len + vh->csum_start +
+                                      vh->csum_offset);
+                 if (bytes_copied <= vh->csum_start)
+                   {
+                     clib_memcpy (((u8 *) vlib_buffer_get_current (b0)) +
+                                  bytes_copied + vlan_len,
+                                  (u8 *) tph + tph->tp_mac + offset +
+                                  bytes_copied,
+                                  (vh->csum_start - bytes_copied));
+                     wsum =
+                       ip_csum_and_memcpy (wsum,
+                                           ((u8 *)
+                                            vlib_buffer_get_current (b0)) +
+                                           vh->csum_start + vlan_len,
+                                           (u8 *) tph + tph->tp_mac +
+                                           offset + vh->csum_start,
+                                           (bytes_to_copy - vh->csum_start));
+                   }
+                 else
+                   {
+                     wsum =
+                       ip_csum_and_memcpy (wsum,
+                                           ((u8 *)
+                                            vlib_buffer_get_current (b0)) +
+                                           bytes_copied + vlan_len,
+                                           (u8 *) tph + tph->tp_mac +
+                                           offset + bytes_copied,
+                                           (bytes_to_copy - bytes_copied));
+                   }
+               }
+             else
+               {
+                 clib_memcpy (((u8 *) vlib_buffer_get_current (b0)) +
+                              bytes_copied + vlan_len,
+                              (u8 *) tph + tph->tp_mac + offset +
+                              bytes_copied, (bytes_to_copy - bytes_copied));
+               }
 
              /* fill buffer header */
              b0->current_length = bytes_to_copy + vlan_len;
@@ -218,6 +266,10 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
              offset += bytes_to_copy;
              data_len -= bytes_to_copy;
            }
+         if (PREDICT_TRUE ((do_vnet != 0) && (do_csum != 0)))
+           {
+             *wsum_addr = ~ip_csum_fold (wsum);
+           }
          n_rx_packets++;
          n_rx_bytes += tph->tp_snaplen;
          to_next[0] = first_bi0;