virtio: support virtio 1.1 packed ring in vhost
[vpp.git] / src / vnet / devices / virtio / vhost_user.c
index 7094a00..d24e516 100644 (file)
@@ -466,6 +466,8 @@ vhost_user_socket_read (clib_file_t * uf)
 
       if (vui->enable_gso)
        msg.u64 |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS;
+      if (vui->enable_packed)
+       msg.u64 |= (1ULL << FEAT_VIRTIO_F_RING_PACKED);
 
       msg.size = sizeof (msg.u64);
       vu_log_debug (vui, "if %d msg VHOST_USER_GET_FEATURES - reply "
@@ -655,7 +657,11 @@ vhost_user_socket_read (clib_file_t * uf)
        vui->vrings[msg.state.index].used->idx;
 
       /* tell driver that we don't want interrupts */
-      vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY;
+      if (vhost_user_is_packed_ring_supported (vui))
+       vui->vrings[msg.state.index].used_event->flags =
+         VRING_EVENT_F_DISABLE;
+      else
+       vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY;
       vlib_worker_thread_barrier_release (vm);
       vhost_user_update_iface_state (vui);
       break;
@@ -762,10 +768,47 @@ vhost_user_socket_read (clib_file_t * uf)
       break;
 
     case VHOST_USER_SET_VRING_BASE:
-      vu_log_debug (vui, "if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
+      vu_log_debug (vui,
+                   "if %d msg VHOST_USER_SET_VRING_BASE idx %d num 0x%x",
                    vui->hw_if_index, msg.state.index, msg.state.num);
       vlib_worker_thread_barrier_sync (vm);
       vui->vrings[msg.state.index].last_avail_idx = msg.state.num;
+      if (vhost_user_is_packed_ring_supported (vui))
+       {
+         /*
+          *  0                   1                   2                   3
+          *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+          * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+          * |    last avail idx           | |     last used idx           | |
+          * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+          *                                ^                               ^
+          *                                |                               |
+          *                         avail wrap counter       used wrap counter
+          */
+         /* last avail idx at bit 0-14. */
+         vui->vrings[msg.state.index].last_avail_idx =
+           msg.state.num & 0x7fff;
+         /* avail wrap counter at bit 15 */
+         vui->vrings[msg.state.index].avail_wrap_counter =
+           ! !(msg.state.num & (1 << 15));
+
+         /*
+          * Although last_used_idx is passed in the upper 16 bits in the qemu
+          * implementation, in practice, last_avail_idx and last_used_idx are
+          * usually the same. As a result, DPDK does not bother to pass us
+          * last_used_idx. The spec is not clear on the encoding. I figured
+          * it out by reading the qemu code. So let's just read last_avail_idx
+          * and set last_used_idx equal to last_avail_idx.
+          */
+         vui->vrings[msg.state.index].last_used_idx =
+           vui->vrings[msg.state.index].last_avail_idx;
+         vui->vrings[msg.state.index].used_wrap_counter =
+           vui->vrings[msg.state.index].avail_wrap_counter;
+
+         if (vui->vrings[msg.state.index].avail_wrap_counter == 1)
+           vui->vrings[msg.state.index].avail_wrap_counter =
+             VIRTQ_DESC_F_AVAIL;
+       }
       vlib_worker_thread_barrier_release (vm);
       break;
 
@@ -784,6 +827,15 @@ vhost_user_socket_read (clib_file_t * uf)
        * closing the vring also initializes the vring last_avail_idx
        */
       msg.state.num = vui->vrings[msg.state.index].last_avail_idx;
+      if (vhost_user_is_packed_ring_supported (vui))
+       {
+         msg.state.num =
+           (vui->vrings[msg.state.index].last_avail_idx & 0x7fff) |
+           (! !vui->vrings[msg.state.index].avail_wrap_counter << 15);
+         msg.state.num |=
+           ((vui->vrings[msg.state.index].last_used_idx & 0x7fff) |
+            (! !vui->vrings[msg.state.index].used_wrap_counter << 15)) << 16;
+       }
       msg.flags |= 4;
       msg.size = sizeof (msg.state);
 
@@ -793,7 +845,8 @@ vhost_user_socket_read (clib_file_t * uf)
        */
       vhost_user_vring_close (vui, msg.state.index);
       vlib_worker_thread_barrier_release (vm);
-      vu_log_debug (vui, "if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
+      vu_log_debug (vui,
+                   "if %d msg VHOST_USER_GET_VRING_BASE idx %d num 0x%x",
                    vui->hw_if_index, msg.state.index, msg.state.num);
       n =
        send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
@@ -1440,7 +1493,8 @@ vhost_user_vui_init (vnet_main_t * vnm,
                     vhost_user_intf_t * vui,
                     int server_sock_fd,
                     const char *sock_filename,
-                    u64 feature_mask, u32 * sw_if_index, u8 enable_gso)
+                    u64 feature_mask, u32 * sw_if_index, u8 enable_gso,
+                    u8 enable_packed)
 {
   vnet_sw_interface_t *sw;
   int q;
@@ -1472,6 +1526,7 @@ vhost_user_vui_init (vnet_main_t * vnm,
   vui->log_base_addr = 0;
   vui->if_index = vui - vum->vhost_user_interfaces;
   vui->enable_gso = enable_gso;
+  vui->enable_packed = enable_packed;
   /*
    * enable_gso takes precedence over configurable feature mask if there
    * is a clash.
@@ -1519,7 +1574,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
                      u32 * sw_if_index,
                      u64 feature_mask,
                      u8 renumber, u32 custom_dev_instance, u8 * hwaddr,
-                     u8 enable_gso)
+                     u8 enable_gso, u8 enable_packed)
 {
   vhost_user_intf_t *vui = NULL;
   u32 sw_if_idx = ~0;
@@ -1560,7 +1615,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
   vlib_worker_thread_barrier_release (vm);
 
   vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename,
-                      feature_mask, &sw_if_idx, enable_gso);
+                      feature_mask, &sw_if_idx, enable_gso, enable_packed);
   vnet_sw_interface_set_mtu (vnm, vui->sw_if_index, 9000);
   vhost_user_rx_thread_placement (vui, 1);
 
@@ -1582,7 +1637,7 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
                      u8 is_server,
                      u32 sw_if_index,
                      u64 feature_mask, u8 renumber, u32 custom_dev_instance,
-                     u8 enable_gso)
+                     u8 enable_gso, u8 enable_packed)
 {
   vhost_user_main_t *vum = &vhost_user_main;
   vhost_user_intf_t *vui = NULL;
@@ -1619,7 +1674,8 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
 
   vhost_user_term_if (vui);
   vhost_user_vui_init (vnm, vui, server_sock_fd,
-                      sock_filename, feature_mask, &sw_if_idx, enable_gso);
+                      sock_filename, feature_mask, &sw_if_idx, enable_gso,
+                      enable_packed);
 
   if (renumber)
     vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
@@ -1645,7 +1701,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm,
   u8 hwaddr[6];
   u8 *hw = NULL;
   clib_error_t *error = NULL;
-  u8 enable_gso = 0;
+  u8 enable_gso = 0, enable_packed = 0;
 
   /* Get a line of input. */
   if (!unformat_user (input, unformat_line_input, line_input))
@@ -1653,6 +1709,8 @@ vhost_user_connect_command_fn (vlib_main_t * vm,
 
   /* GSO feature is disable by default */
   feature_mask &= ~FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS;
+  /* packed-ring feature is disabled by default */
+  feature_mask &= ~(1ULL << FEAT_VIRTIO_F_RING_PACKED);
   while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
     {
       if (unformat (line_input, "socket %s", &sock_filename))
@@ -1661,6 +1719,8 @@ vhost_user_connect_command_fn (vlib_main_t * vm,
        is_server = 1;
       else if (unformat (line_input, "gso"))
        enable_gso = 1;
+      else if (unformat (line_input, "packed"))
+       enable_packed = 1;
       else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
        ;
       else
@@ -1685,7 +1745,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm,
   if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename,
                                  is_server, &sw_if_index, feature_mask,
                                  renumber, custom_dev_instance, hw,
-                                 enable_gso)))
+                                 enable_gso, enable_packed)))
     {
       error = clib_error_return (0, "vhost_user_create_if returned %d", rv);
       goto done;
@@ -1799,6 +1859,186 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm,
   return rv;
 }
 
+static u8 *
+format_vhost_user_desc (u8 * s, va_list * args)
+{
+  char *fmt = va_arg (*args, char *);
+  vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+  vring_desc_t *desc_table = va_arg (*args, vring_desc_t *);
+  int idx = va_arg (*args, int);
+  u32 *mem_hint = va_arg (*args, u32 *);
+
+  s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len,
+             desc_table[idx].flags, desc_table[idx].next,
+             pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr,
+                                              mem_hint)));
+  return s;
+}
+
+static u8 *
+format_vhost_user_vring (u8 * s, va_list * args)
+{
+  char *fmt = va_arg (*args, char *);
+  vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+  int q = va_arg (*args, int);
+
+  s = format (s, fmt, vui->vrings[q].avail->flags, vui->vrings[q].avail->idx,
+             vui->vrings[q].used->flags, vui->vrings[q].used->idx);
+  return s;
+}
+
+static void
+vhost_user_show_fds (vlib_main_t * vm, vhost_user_intf_t * vui, int q)
+{
+  int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx);
+  int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx);
+
+  vlib_cli_output (vm, "  kickfd %d callfd %d errfd %d\n", kickfd, callfd,
+                  vui->vrings[q].errfd);
+}
+
+static void
+vhost_user_show_desc (vlib_main_t * vm, vhost_user_intf_t * vui, int q,
+                     int show_descr, int show_verbose)
+{
+  int j;
+  u32 mem_hint = 0;
+  u32 idx;
+  u32 n_entries;
+  vring_desc_t *desc_table;
+
+  if (vui->vrings[q].avail && vui->vrings[q].used)
+    vlib_cli_output (vm, "%U", format_vhost_user_vring,
+                    "  avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
+                    vui, q);
+
+  vhost_user_show_fds (vm, vui, q);
+
+  if (show_descr)
+    {
+      vlib_cli_output (vm, "\n  descriptor table:\n");
+      vlib_cli_output (vm,
+                      "  slot         addr         len  flags  next      "
+                      "user_addr\n");
+      vlib_cli_output (vm,
+                      "  ===== ================== ===== ====== ===== "
+                      "==================\n");
+      for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
+       {
+         desc_table = vui->vrings[q].desc;
+         vlib_cli_output (vm, "%U", format_vhost_user_desc,
+                          "  %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", vui,
+                          desc_table, j, &mem_hint);
+         if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT))
+           {
+             n_entries = desc_table[j].len / sizeof (vring_desc_t);
+             desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint);
+             if (desc_table)
+               {
+                 for (idx = 0; idx < clib_min (20, n_entries); idx++)
+                   {
+                     vlib_cli_output
+                       (vm, "%U", format_vhost_user_desc,
+                        ">  %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+                        desc_table, idx, &mem_hint);
+                   }
+                 if (n_entries >= 20)
+                   vlib_cli_output (vm, "Skip displaying entries 20...%u\n",
+                                    n_entries);
+               }
+           }
+       }
+    }
+}
+
+static u8 *
+format_vhost_user_packed_desc (u8 * s, va_list * args)
+{
+  char *fmt = va_arg (*args, char *);
+  vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+  vring_packed_desc_t *desc_table = va_arg (*args, vring_packed_desc_t *);
+  int idx = va_arg (*args, int);
+  u32 *mem_hint = va_arg (*args, u32 *);
+
+  s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len,
+             desc_table[idx].flags, desc_table[idx].id,
+             pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr,
+                                              mem_hint)));
+  return s;
+}
+
+static u8 *
+format_vhost_user_vring_packed (u8 * s, va_list * args)
+{
+  char *fmt = va_arg (*args, char *);
+  vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *);
+  int q = va_arg (*args, int);
+
+  s = format (s, fmt, vui->vrings[q].avail_event->flags,
+             vui->vrings[q].avail_event->off_wrap,
+             vui->vrings[q].used_event->flags,
+             vui->vrings[q].used_event->off_wrap,
+             vui->vrings[q].avail_wrap_counter,
+             vui->vrings[q].used_wrap_counter);
+  return s;
+}
+
+static void
+vhost_user_show_desc_packed (vlib_main_t * vm, vhost_user_intf_t * vui, int q,
+                            int show_descr, int show_verbose)
+{
+  int j;
+  u32 mem_hint = 0;
+  u32 idx;
+  u32 n_entries;
+  vring_packed_desc_t *desc_table;
+
+  if (vui->vrings[q].avail_event && vui->vrings[q].used_event)
+    vlib_cli_output (vm, "%U", format_vhost_user_vring_packed,
+                    "  avail_event.flags %x avail_event.off_wrap %u "
+                    "used_event.flags %x used_event.off_wrap %u\n"
+                    "  avail wrap counter %u, used wrap counter %u\n",
+                    vui, q);
+
+  vhost_user_show_fds (vm, vui, q);
+
+  if (show_descr)
+    {
+      vlib_cli_output (vm, "\n  descriptor table:\n");
+      vlib_cli_output (vm,
+                      "  slot         addr         len  flags  id    "
+                      "user_addr\n");
+      vlib_cli_output (vm,
+                      "  ===== ================== ===== ====== ===== "
+                      "==================\n");
+      for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
+       {
+         desc_table = vui->vrings[q].packed_desc;
+         vlib_cli_output (vm, "%U", format_vhost_user_packed_desc,
+                          "  %-5u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+                          desc_table, j, &mem_hint);
+         if (show_verbose && (desc_table[j].flags & VIRTQ_DESC_F_INDIRECT))
+           {
+             n_entries = desc_table[j].len >> 4;
+             desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint);
+             if (desc_table)
+               {
+                 for (idx = 0; idx < clib_min (20, n_entries); idx++)
+                   {
+                     vlib_cli_output
+                       (vm, "%U", format_vhost_user_packed_desc,
+                        ">  %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui,
+                        desc_table, idx, &mem_hint);
+                   }
+                 if (n_entries >= 20)
+                   vlib_cli_output (vm, "Skip displaying entries 20...%u\n",
+                                    n_entries);
+               }
+           }
+       }
+    }
+}
+
 clib_error_t *
 show_vhost_user_command_fn (vlib_main_t * vm,
                            unformat_input_t * input,
@@ -1814,6 +2054,7 @@ show_vhost_user_command_fn (vlib_main_t * vm,
   u32 ci;
   int i, j, q;
   int show_descr = 0;
+  int show_verbose = 0;
   struct feat_struct
   {
     u8 bit;
@@ -1855,6 +2096,8 @@ show_vhost_user_command_fn (vlib_main_t * vm,
        }
       else if (unformat (input, "descriptors") || unformat (input, "desc"))
        show_descr = 1;
+      else if (unformat (input, "verbose"))
+       show_verbose = 1;
       else
        {
          error = clib_error_return (0, "unknown input `%U'",
@@ -1884,6 +2127,8 @@ show_vhost_user_command_fn (vlib_main_t * vm,
                       hw_if_indices[i]);
       if (vui->enable_gso)
        vlib_cli_output (vm, "  GSO enable");
+      if (vui->enable_packed)
+       vlib_cli_output (vm, "  Packed ring enable");
 
       vlib_cli_output (vm, "virtio_net_hdr_sz %d\n"
                       " features mask (0x%llx): \n"
@@ -1985,41 +2230,11 @@ show_vhost_user_command_fn (vlib_main_t * vm,
                           vui->vrings[q].last_avail_idx,
                           vui->vrings[q].last_used_idx);
 
-         if (vui->vrings[q].avail && vui->vrings[q].used)
-           vlib_cli_output (vm,
-                            "  avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
-                            vui->vrings[q].avail->flags,
-                            vui->vrings[q].avail->idx,
-                            vui->vrings[q].used->flags,
-                            vui->vrings[q].used->idx);
-
-         int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx);
-         int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx);
-         vlib_cli_output (vm, "  kickfd %d callfd %d errfd %d\n",
-                          kickfd, callfd, vui->vrings[q].errfd);
-
-         if (show_descr)
-           {
-             vlib_cli_output (vm, "\n  descriptor table:\n");
-             vlib_cli_output (vm,
-                              "   id          addr         len  flags  next      user_addr\n");
-             vlib_cli_output (vm,
-                              "  ===== ================== ===== ====== ===== ==================\n");
-             for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++)
-               {
-                 u32 mem_hint = 0;
-                 vlib_cli_output (vm,
-                                  "  %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
-                                  j, vui->vrings[q].desc[j].addr,
-                                  vui->vrings[q].desc[j].len,
-                                  vui->vrings[q].desc[j].flags,
-                                  vui->vrings[q].desc[j].next,
-                                  pointer_to_uword (map_guest_mem
-                                                    (vui,
-                                                     vui->vrings[q].desc[j].
-                                                     addr, &mem_hint)));
-               }
-           }
+         if (vhost_user_is_packed_ring_supported (vui))
+           vhost_user_show_desc_packed (vm, vui, q, show_descr,
+                                        show_verbose);
+         else
+           vhost_user_show_desc (vm, vui, q, show_descr, show_verbose);
        }
       vlib_cli_output (vm, "\n");
     }
@@ -2090,7 +2305,8 @@ done:
 VLIB_CLI_COMMAND (vhost_user_connect_command, static) = {
     .path = "create vhost-user",
     .short_help = "create vhost-user socket <socket-filename> [server] "
-    "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso]",
+    "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] [gso] "
+    "[packed]",
     .function = vhost_user_connect_command_fn,
     .is_mp_safe = 1,
 };
@@ -2251,7 +2467,8 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = {
 /* *INDENT-OFF* */
 VLIB_CLI_COMMAND (show_vhost_user_command, static) = {
     .path = "show vhost-user",
-    .short_help = "show vhost-user [<interface> [<interface> [..]]] [descriptors]",
+    .short_help = "show vhost-user [<interface> [<interface> [..]]] "
+    "[[descriptors] [verbose]]",
     .function = show_vhost_user_command_fn,
 };
 /* *INDENT-ON* */