X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Fdevices%2Fvirtio%2Fvhost-user.c;h=5460f10b74e1072347ca8a6019a2ee46ae958035;hb=fe7d4a2e31529eed5416b38b520fdc84687df03c;hp=f9bbae4f90f73ca7229ad46e023d907a314403e1;hpb=3cd9eed64b82bd50735434e0679e7fd085ec2884;p=vpp.git

diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c
index f9bbae4f90f..5460f10b74e 100644
--- a/src/vnet/devices/virtio/vhost-user.c
+++ b/src/vnet/devices/virtio/vhost-user.c
@@ -49,14 +49,14 @@
  */

-#define VHOST_USER_DEBUG_SOCKET 0
 #define VHOST_DEBUG_VQ 0

-#if VHOST_USER_DEBUG_SOCKET == 1
-#define DBG_SOCK(args...) clib_warning(args);
-#else
-#define DBG_SOCK(args...)
-#endif
+#define DBG_SOCK(args...)                        \
+  {                                              \
+    vhost_user_main_t *_vum = &vhost_user_main;  \
+    if (_vum->debug)                             \
+      clib_warning(args);                        \
+  };

 #if VHOST_DEBUG_VQ == 1
 #define DBG_VQ(args...) clib_warning(args);
@@ -86,11 +86,22 @@
  * The value 64 was obtained by testing (48 and 128 were not as good).
  */
 #define VHOST_USER_RX_COPY_THRESHOLD 64
+/*
+ * On the transmit side, we keep processing the buffers from vlib in the while
+ * loop and prepare the copy order to be executed later. However, the static
+ * array in which we keep the copy order is limited to VHOST_USER_COPY_ARRAY_N
+ * entries. To avoid corrupting memory, we have to execute the copies when the
+ * static array reaches the copy threshold. We subtract 40 in case the code
+ * goes into the inner loop for a maximum of 64k frames, which may require
+ * more array entries.
+ */
+#define VHOST_USER_TX_COPY_THRESHOLD (VHOST_USER_COPY_ARRAY_N - 40)
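
The UNIX_GET_FD rewrite just below switches to a GCC/clang statement expression with typeof, so the macro argument is evaluated exactly once and the macro yields a usable value (the old form also carried a stray trailing semicolon). A minimal standalone sketch of the idiom (SAFE_DOUBLE is a hypothetical name, not part of this patch):

#include <stdio.h>

/* Statement expression: the ({ ... }) block is an expression whose value
 * is that of its last statement; typeof snapshots the argument once. */
#define SAFE_DOUBLE(x) ({ typeof(x) __x = (x); __x + __x; })

int
main (void)
{
  int i = 1;
  int d = SAFE_DOUBLE (i++);	/* i evaluated once: d == 2, i == 2 */
  printf ("%d %d\n", d, i);
  return 0;
}

-#define UNIX_GET_FD(unixfd_idx) \
-    (unixfd_idx != ~0) ? \
-	pool_elt_at_index (unix_main.file_pool, \
-			   unixfd_idx)->file_descriptor : -1;
+#define UNIX_GET_FD(unixfd_idx) ({ \
+	typeof(unixfd_idx) __unixfd_idx = (unixfd_idx); \
+	(__unixfd_idx != ~0) ? 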
\ + pool_elt_at_index (file_main.file_pool, \ + __unixfd_idx)->file_descriptor : -1; }) #define foreach_virtio_trace_flags \ _ (SIMPLE_CHAINED, 0, "Simple descriptor chaining") \ @@ -237,7 +248,8 @@ map_guest_mem (vhost_user_intf_t * vui, uword addr, u32 * hint) r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x88); r = _mm_shuffle_epi8 (r, _mm_set_epi64x (0, 0x0e060c040a020800)); - i = __builtin_ctzll (_mm_movemask_epi8 (r)); + i = __builtin_ctzll (_mm_movemask_epi8 (r) | + (1 << VHOST_MEMORY_MAX_NREGIONS)); if (i < vui->nregions) { @@ -245,7 +257,76 @@ map_guest_mem (vhost_user_intf_t * vui, uword addr, u32 * hint) return (void *) (vui->region_mmap_addr[i] + addr - vui->regions[i].guest_phys_addr); } +#elif __aarch64__ && __ARM_NEON + uint64x2_t al, ah, rl, rh, r; + uint32_t u32 = 0; + + al = vdupq_n_u64 (addr + 1); + ah = vdupq_n_u64 (addr); + + /*First Iteration */ + rl = vld1q_u64 (&vui->region_guest_addr_lo[0]); + rl = vcgtq_u64 (al, rl); + rh = vld1q_u64 (&vui->region_guest_addr_hi[0]); + rh = vcgtq_u64 (rh, ah); + r = vandq_u64 (rl, rh); + u32 |= (vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1); + u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 1); + if (u32) + { + i = __builtin_ctzll (u32); + goto vhost_map_guest_mem_done; + } + + /*Second Iteration */ + rl = vld1q_u64 (&vui->region_guest_addr_lo[2]); + rl = vcgtq_u64 (al, rl); + rh = vld1q_u64 (&vui->region_guest_addr_hi[2]); + rh = vcgtq_u64 (rh, ah); + r = vandq_u64 (rl, rh); + u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1) << 2); + u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 3); + + if (u32) + { + i = __builtin_ctzll (u32); + goto vhost_map_guest_mem_done; + } + + /*Third Iteration */ + rl = vld1q_u64 (&vui->region_guest_addr_lo[4]); + rl = vcgtq_u64 (al, rl); + rh = vld1q_u64 (&vui->region_guest_addr_hi[4]); + rh = vcgtq_u64 (rh, ah); + r = vandq_u64 (rl, rh); + u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1) << 4); + u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 5); + + if (u32) + { + i = __builtin_ctzll (u32); + goto vhost_map_guest_mem_done; + } + + /*Fourth Iteration */ + rl = vld1q_u64 (&vui->region_guest_addr_lo[6]); + rl = vcgtq_u64 (al, rl); + rh = vld1q_u64 (&vui->region_guest_addr_hi[6]); + rh = vcgtq_u64 (rh, ah); + r = vandq_u64 (rl, rh); + u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1) << 6); + u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 7); + + i = __builtin_ctzll (u32 | (1 << VHOST_MEMORY_MAX_NREGIONS)); + +vhost_map_guest_mem_done: + if (i < vui->nregions) + { + *hint = i; + return (void *) (vui->region_mmap_addr[i] + addr - + vui->regions[i].guest_phys_addr); + } #else for (i = 0; i < vui->nregions; i++) { @@ -295,14 +376,14 @@ unmap_all_mem_regions (vhost_user_intf_t * vui) int i, r; for (i = 0; i < vui->nregions; i++) { - if (vui->region_mmap_addr[i] != (void *) -1) + if (vui->region_mmap_addr[i] != MAP_FAILED) { long page_sz = get_huge_page_size (vui->region_mmap_fd[i]); ssize_t map_sz = (vui->regions[i].memory_size + vui->regions[i].mmap_offset + - page_sz) & ~(page_sz - 1); + page_sz - 1) & ~(page_sz - 1); r = munmap (vui->region_mmap_addr[i] - vui->regions[i].mmap_offset, @@ -312,7 +393,7 @@ unmap_all_mem_regions (vhost_user_intf_t * vui) ("unmap memory region %d addr 0x%lx len 0x%lx page_sz 0x%x", i, vui->region_mmap_addr[i], map_sz, page_sz); - vui->region_mmap_addr[i] = (void *) -1; + vui->region_mmap_addr[i] = MAP_FAILED; if (r == -1) { @@ -330,7 +411,7 @@ vhost_user_tx_thread_placement 
(vhost_user_intf_t * vui) { //Let's try to assign one queue to each thread u32 qid = 0; - u32 cpu_index = 0; + u32 thread_index = 0; vui->use_tx_spinlock = 0; while (1) { @@ -340,118 +421,92 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) if (!rxvq->started || !rxvq->enabled) continue; - vui->per_cpu_tx_qid[cpu_index] = qid; - cpu_index++; - if (cpu_index == vlib_get_thread_main ()->n_vlib_mains) + vui->per_cpu_tx_qid[thread_index] = qid; + thread_index++; + if (thread_index == vlib_get_thread_main ()->n_vlib_mains) return; } //We need to loop, meaning the spinlock has to be used vui->use_tx_spinlock = 1; - if (cpu_index == 0) + if (thread_index == 0) { //Could not find a single valid one - for (cpu_index = 0; - cpu_index < vlib_get_thread_main ()->n_vlib_mains; cpu_index++) + for (thread_index = 0; + thread_index < vlib_get_thread_main ()->n_vlib_mains; + thread_index++) { - vui->per_cpu_tx_qid[cpu_index] = 0; + vui->per_cpu_tx_qid[thread_index] = 0; } return; } } } +/** + * @brief Unassign existing interface/queue to thread mappings and re-assign + * new interface/queue to thread mappings + */ static void vhost_user_rx_thread_placement () { vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui; - vhost_cpu_t *vhc; - u32 *workers = 0; - - //Let's list all workers cpu indexes - u32 i; - for (i = vum->input_cpu_first_index; - i < vum->input_cpu_first_index + vum->input_cpu_count; i++) - { - vlib_node_set_state (vlib_mains ? vlib_mains[i] : &vlib_global_main, - vhost_user_input_node.index, - VLIB_NODE_STATE_DISABLED); - vec_add1 (workers, i); - } + vhost_user_vring_t *txvq; + vnet_main_t *vnm = vnet_get_main (); + u32 qid; + int rv; + u16 *queue; - vec_foreach (vhc, vum->cpus) - { - vec_reset_length (vhc->rx_queues); - } + // Scrap all existing mappings for all interfaces/queues + /* *INDENT-OFF* */ + pool_foreach (vui, vum->vhost_user_interfaces, { + vec_foreach (queue, vui->rx_queues) + { + rv = vnet_hw_interface_unassign_rx_thread (vnm, vui->hw_if_index, + *queue); + if (rv) + clib_warning ("Warning: unable to unassign interface %d, " + "queue %d: rc=%d", vui->hw_if_index, *queue, rv); + } + vec_reset_length (vui->rx_queues); + }); + /* *INDENT-ON* */ - i = 0; - vhost_iface_and_queue_t iaq; + // Create the rx_queues for all interfaces /* *INDENT-OFF* */ pool_foreach (vui, vum->vhost_user_interfaces, { - u32 *vui_workers = vec_len (vui->workers) ? vui->workers : workers; - u32 qid; for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++) { - vhost_user_vring_t *txvq = - &vui->vrings[VHOST_VRING_IDX_TX (qid)]; - if (!txvq->started) - continue; - - i %= vec_len (vui_workers); - u32 cpu_index = vui_workers[i]; - i++; - vhc = &vum->cpus[cpu_index]; - - iaq.qid = qid; - iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; - vec_add1 (vhc->rx_queues, iaq); - vlib_node_set_state (vlib_mains ? 
vlib_mains[cpu_index] : - &vlib_global_main, vhost_user_input_node.index, - VLIB_NODE_STATE_POLLING); + txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; + if (txvq->started) + { + if (txvq->mode == VNET_HW_INTERFACE_RX_MODE_UNKNOWN) + /* Set polling as the default */ + txvq->mode = VNET_HW_INTERFACE_RX_MODE_POLLING; + vec_add1 (vui->rx_queues, qid); + } } }); /* *INDENT-ON* */ -} -static int -vhost_user_thread_placement (u32 sw_if_index, u32 worker_thread_index, u8 del) -{ - vhost_user_main_t *vum = &vhost_user_main; - vhost_user_intf_t *vui; - vnet_hw_interface_t *hw; - - if (worker_thread_index < vum->input_cpu_first_index || - worker_thread_index >= - vum->input_cpu_first_index + vum->input_cpu_count) - return -1; - - if (!(hw = vnet_get_sup_hw_interface (vnet_get_main (), sw_if_index))) - return -2; - - vui = pool_elt_at_index (vum->vhost_user_interfaces, hw->dev_instance); - u32 found = ~0, *w; - vec_foreach (w, vui->workers) - { - if (*w == worker_thread_index) - { - found = w - vui->workers; - break; - } - } - - if (del) - { - if (found == ~0) - return -3; - vec_del1 (vui->workers, found); - } - else if (found == ~0) - { - vec_add1 (vui->workers, worker_thread_index); - } - - vhost_user_rx_thread_placement (); - return 0; + // Assign new mappings for all interfaces/queues + /* *INDENT-OFF* */ + pool_foreach (vui, vum->vhost_user_interfaces, { + vnet_hw_interface_set_input_node (vnm, vui->hw_if_index, + vhost_user_input_node.index); + vec_foreach (queue, vui->rx_queues) + { + vnet_hw_interface_assign_rx_thread (vnm, vui->hw_if_index, *queue, + ~0); + txvq = &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; + rv = vnet_hw_interface_set_rx_mode (vnm, vui->hw_if_index, *queue, + txvq->mode); + if (rv) + clib_warning ("Warning: unable to set rx mode for interface %d, " + "queue %d: rc=%d", vui->hw_if_index, *queue, rv); + } + }); + /* *INDENT-ON* */ } /** @brief Returns whether at least one TX and one RX vring are enabled */ @@ -485,17 +540,35 @@ vhost_user_update_iface_state (vhost_user_intf_t * vui) vhost_user_tx_thread_placement (vui); } +static void +vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) +{ + u32 qid; + vnet_main_t *vnm = vnet_get_main (); + + qid = ifq & 0xff; + if ((qid & 1) == 0) + /* Only care about the odd number, or TX, virtqueue */ + return; + + if (vhost_user_intf_ready (vui)) + // qid >> 1 is to convert virtqueue number to vring queue index + vnet_device_input_set_interrupt_pending (vnm, vui->hw_if_index, qid >> 1); +} + static clib_error_t * -vhost_user_callfd_read_ready (unix_file_t * uf) +vhost_user_callfd_read_ready (clib_file_t * uf) { __attribute__ ((unused)) int n; u8 buff[8]; + n = read (uf->file_descriptor, ((char *) &buff), 8); + return 0; } static clib_error_t * -vhost_user_kickfd_read_ready (unix_file_t * uf) +vhost_user_kickfd_read_ready (clib_file_t * uf) { __attribute__ ((unused)) int n; u8 buff[8]; @@ -503,13 +576,19 @@ vhost_user_kickfd_read_ready (unix_file_t * uf) pool_elt_at_index (vhost_user_main.vhost_user_interfaces, uf->private_data >> 8); u32 qid = uf->private_data & 0xff; + n = read (uf->file_descriptor, ((char *) &buff), 8); DBG_SOCK ("if %d KICK queue %d", uf->private_data >> 8, qid); + if (!vui->vrings[qid].started || + (vhost_user_intf_ready (vui) != vui->is_up)) + { + vlib_worker_thread_barrier_sync (vlib_get_main ()); + vui->vrings[qid].started = 1; + vhost_user_update_iface_state (vui); + vlib_worker_thread_barrier_release (vlib_get_main ()); + } - vlib_worker_thread_barrier_sync (vlib_get_main ()); - 
vui->vrings[qid].started = 1; - vhost_user_update_iface_state (vui); - vlib_worker_thread_barrier_release (vlib_get_main ()); + vhost_user_set_interrupt_pending (vui, uf->private_data); return 0; } @@ -570,20 +649,23 @@ vhost_user_vring_close (vhost_user_intf_t * vui, u32 qid) vhost_user_vring_t *vring = &vui->vrings[qid]; if (vring->kickfd_idx != ~0) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vring->kickfd_idx); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vring->kickfd_idx = ~0; } if (vring->callfd_idx != ~0) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vring->callfd_idx); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vring->callfd_idx = ~0; } if (vring->errfd != -1) - close (vring->errfd); + { + close (vring->errfd); + vring->errfd = -1; + } vhost_user_vring_init (vui, qid); } @@ -595,10 +677,10 @@ vhost_user_if_disconnect (vhost_user_intf_t * vui) vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0); - if (vui->unix_file_index != ~0) + if (vui->clib_file_index != ~0) { - unix_file_del (&unix_main, unix_main.file_pool + vui->unix_file_index); - vui->unix_file_index = ~0; + clib_file_del (&file_main, file_main.file_pool + vui->clib_file_index); + vui->clib_file_index = ~0; } vui->is_up = 0; @@ -622,7 +704,7 @@ vhost_user_log_dirty_pages_2 (vhost_user_intf_t * vui, } if (is_host_address) { - addr = (u64) map_user_mem (vui, (uword) addr); + addr = pointer_to_uword (map_user_mem (vui, (uword) addr)); } if (PREDICT_FALSE ((addr + len - 1) / VHOST_LOG_PAGE / 8 >= vui->log_size)) { @@ -652,7 +734,7 @@ vhost_user_log_dirty_pages (vhost_user_intf_t * vui, u64 addr, u64 len) } static clib_error_t * -vhost_user_socket_read (unix_file_t * uf) +vhost_user_socket_read (clib_file_t * uf) { int n, i; int fd, number_of_fds = 0; @@ -664,7 +746,7 @@ vhost_user_socket_read (unix_file_t * uf) vhost_user_intf_t *vui; struct cmsghdr *cmsg; u8 q; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; vnet_main_t *vnm = vnet_get_main (); vui = pool_elt_at_index (vum->vhost_user_interfaces, uf->private_data); @@ -817,10 +899,10 @@ vhost_user_socket_read (unix_file_t * uf) long page_sz = get_huge_page_size (fds[i]); - /* align size to 2M page */ + /* align size to page */ ssize_t map_sz = (vui->regions[i].memory_size + vui->regions[i].mmap_offset + - page_sz) & ~(page_sz - 1); + page_sz - 1) & ~(page_sz - 1); vui->region_mmap_addr[i] = mmap (0, map_sz, PROT_READ | PROT_WRITE, MAP_SHARED, fds[i], 0); @@ -840,8 +922,9 @@ vhost_user_socket_read (unix_file_t * uf) } vui->region_mmap_addr[i] += vui->regions[i].mmap_offset; vui->region_mmap_fd[i] = fds[i]; + + vui->nregions++; } - vui->nregions = msg.memory.nregions; break; case VHOST_USER_SET_VRING_NUM: @@ -852,7 +935,7 @@ vhost_user_socket_read (unix_file_t * uf) (msg.state.num == 0) || /* it cannot be zero */ ((msg.state.num - 1) & msg.state.num)) /* must be power of 2 */ goto close_socket; - vui->vrings[msg.state.index].qsz = msg.state.num; + vui->vrings[msg.state.index].qsz_mask = msg.state.num - 1; break; case VHOST_USER_SET_VRING_ADDR: @@ -917,7 +1000,7 @@ vhost_user_socket_read (unix_file_t * uf) break; case VHOST_USER_SET_VRING_CALL: - DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_CALL u64 %d", + DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_CALL %d", vui->hw_if_index, msg.u64); q = (u8) (msg.u64 & 0xFF); @@ -925,13 +1008,13 @@ vhost_user_socket_read 
(unix_file_t * uf) /* if there is old fd, delete and close it */ if (vui->vrings[q].callfd_idx != ~0) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vui->vrings[q].callfd_idx); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vui->vrings[q].callfd_idx = ~0; } - if (!(msg.u64 & 0x100)) + if (!(msg.u64 & VHOST_USER_VRING_NOFD_MASK)) { if (number_of_fds != 1) { @@ -943,27 +1026,27 @@ vhost_user_socket_read (unix_file_t * uf) template.file_descriptor = fds[0]; template.private_data = ((vui - vhost_user_main.vhost_user_interfaces) << 8) + q; - vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template); + vui->vrings[q].callfd_idx = clib_file_add (&file_main, &template); } else vui->vrings[q].callfd_idx = ~0; break; case VHOST_USER_SET_VRING_KICK: - DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_KICK u64 %d", + DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_KICK %d", vui->hw_if_index, msg.u64); q = (u8) (msg.u64 & 0xFF); if (vui->vrings[q].kickfd_idx != ~0) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vui->vrings[q].kickfd_idx); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vui->vrings[q].kickfd_idx = ~0; } - if (!(msg.u64 & 0x100)) + if (!(msg.u64 & VHOST_USER_VRING_NOFD_MASK)) { if (number_of_fds != 1) { @@ -976,7 +1059,7 @@ vhost_user_socket_read (unix_file_t * uf) template.private_data = (((uword) (vui - vhost_user_main.vhost_user_interfaces)) << 8) + q; - vui->vrings[q].kickfd_idx = unix_file_add (&unix_main, &template); + vui->vrings[q].kickfd_idx = clib_file_add (&file_main, &template); } else { @@ -988,7 +1071,7 @@ vhost_user_socket_read (unix_file_t * uf) break; case VHOST_USER_SET_VRING_ERR: - DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_ERR u64 %d", + DBG_SOCK ("if %d msg VHOST_USER_SET_VRING_ERR %d", vui->hw_if_index, msg.u64); q = (u8) (msg.u64 & 0xFF); @@ -996,7 +1079,7 @@ vhost_user_socket_read (unix_file_t * uf) if (vui->vrings[q].errfd != -1) close (vui->vrings[q].errfd); - if (!(msg.u64 & 0x100)) + if (!(msg.u64 & VHOST_USER_VRING_NOFD_MASK)) { if (number_of_fds != 1) goto close_socket; @@ -1016,9 +1099,6 @@ vhost_user_socket_read (unix_file_t * uf) break; case VHOST_USER_GET_VRING_BASE: - DBG_SOCK ("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d", - vui->hw_if_index, msg.state.index, msg.state.num); - if (msg.state.index >= VHOST_VRING_MAX_N) { DBG_SOCK ("invalid vring index VHOST_USER_GET_VRING_BASE:" @@ -1026,12 +1106,18 @@ vhost_user_socket_read (unix_file_t * uf) goto close_socket; } - /* Spec says: Client must [...] stop ring upon receiving VHOST_USER_GET_VRING_BASE. */ - vhost_user_vring_close (vui, msg.state.index); - + /* + * Copy last_avail_idx from the vring before closing it because + * closing the vring also initializes the vring last_avail_idx + */ msg.state.num = vui->vrings[msg.state.index].last_avail_idx; msg.flags |= 4; msg.size = sizeof (msg.state); + + /* Spec says: Client must [...] stop ring upon receiving VHOST_USER_GET_VRING_BASE. 
*/ + vhost_user_vring_close (vui, msg.state.index); + DBG_SOCK ("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d", + vui->hw_if_index, msg.state.index, msg.state.num); break; case VHOST_USER_NONE: @@ -1060,10 +1146,10 @@ vhost_user_socket_read (unix_file_t * uf) } fd = fds[0]; - /* align size to 2M page */ + /* align size to page */ long page_sz = get_huge_page_size (fd); ssize_t map_sz = - (msg.log.size + msg.log.offset + page_sz) & ~(page_sz - 1); + (msg.log.size + msg.log.offset + page_sz - 1) & ~(page_sz - 1); vui->log_base_addr = mmap (0, map_sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); @@ -1093,28 +1179,30 @@ vhost_user_socket_read (unix_file_t * uf) break; case VHOST_USER_GET_PROTOCOL_FEATURES: - DBG_SOCK ("if %d msg VHOST_USER_GET_PROTOCOL_FEATURES", - vui->hw_if_index); - msg.flags |= 4; msg.u64 = (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | (1 << VHOST_USER_PROTOCOL_F_MQ); msg.size = sizeof (msg.u64); + DBG_SOCK + ("if %d msg VHOST_USER_GET_PROTOCOL_FEATURES - reply 0x%016llx", + vui->hw_if_index, msg.u64); break; case VHOST_USER_SET_PROTOCOL_FEATURES: - DBG_SOCK ("if %d msg VHOST_USER_SET_PROTOCOL_FEATURES features 0x%lx", - vui->hw_if_index, msg.u64); + DBG_SOCK + ("if %d msg VHOST_USER_SET_PROTOCOL_FEATURES features 0x%016llx", + vui->hw_if_index, msg.u64); vui->protocol_features = msg.u64; break; case VHOST_USER_GET_QUEUE_NUM: - DBG_SOCK ("if %d msg VHOST_USER_GET_QUEUE_NUM", vui->hw_if_index); msg.flags |= 4; msg.u64 = VHOST_VRING_MAX_N; msg.size = sizeof (msg.u64); + DBG_SOCK ("if %d msg VHOST_USER_GET_QUEUE_NUM - reply %d", + vui->hw_if_index, msg.u64); break; case VHOST_USER_SET_VRING_ENABLE: @@ -1161,7 +1249,7 @@ close_socket: } static clib_error_t * -vhost_user_socket_error (unix_file_t * uf) +vhost_user_socket_error (clib_file_t * uf) { vlib_main_t *vm = vlib_get_main (); vhost_user_main_t *vum = &vhost_user_main; @@ -1177,11 +1265,11 @@ vhost_user_socket_error (unix_file_t * uf) } static clib_error_t * -vhost_user_socksvr_accept_ready (unix_file_t * uf) +vhost_user_socksvr_accept_ready (clib_file_t * uf) { int client_fd, client_len; struct sockaddr_un client; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui; @@ -1195,12 +1283,20 @@ vhost_user_socksvr_accept_ready (unix_file_t * uf) if (client_fd < 0) return clib_error_return_unix (0, "accept"); - DBG_SOCK ("New client socket for vhost interface %d", vui->sw_if_index); + if (vui->clib_file_index != ~0) + { + DBG_SOCK ("Close client socket for vhost interface %d, fd %d", + vui->sw_if_index, UNIX_GET_FD (vui->clib_file_index)); + clib_file_del (&file_main, file_main.file_pool + vui->clib_file_index); + } + + DBG_SOCK ("New client socket for vhost interface %d, fd %d", + vui->sw_if_index, client_fd); template.read_function = vhost_user_socket_read; template.error_function = vhost_user_socket_error; template.file_descriptor = client_fd; template.private_data = vui - vhost_user_main.vhost_user_interfaces; - vui->unix_file_index = unix_file_add (&unix_main, &template); + vui->clib_file_index = clib_file_add (&file_main, &template); return 0; } @@ -1210,8 +1306,6 @@ vhost_user_init (vlib_main_t * vm) clib_error_t *error; vhost_user_main_t *vum = &vhost_user_main; vlib_thread_main_t *tm = vlib_get_thread_main (); - vlib_thread_registration_t *tr; - uword *p; error = vlib_call_init_function (vm, ip4_init); if (error) @@ -1230,33 +1324,14 @@ vhost_user_init (vlib_main_t * vm) cpu->rx_buffers_len = 0; } - /* find out which cpus will be used for 
input */ - vum->input_cpu_first_index = 0; - vum->input_cpu_count = 1; - p = hash_get_mem (tm->thread_registrations_by_name, "workers"); - tr = p ? (vlib_thread_registration_t *) p[0] : 0; - - if (tr && tr->count > 0) - { - vum->input_cpu_first_index = tr->first_index; - vum->input_cpu_count = tr->count; - } - vum->random = random_default_seed (); - return 0; -} - -VLIB_INIT_FUNCTION (vhost_user_init); + mhash_init_c_string (&vum->if_index_by_sock_name, sizeof (uword)); -static clib_error_t * -vhost_user_exit (vlib_main_t * vm) -{ - /* TODO cleanup */ return 0; } -VLIB_MAIN_LOOP_EXIT_FUNCTION (vhost_user_exit); +VLIB_INIT_FUNCTION (vhost_user_init); static u8 * format_vhost_trace (u8 * s, va_list * va) @@ -1271,7 +1346,7 @@ format_vhost_trace (u8 * s, va_list * va) vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, vui->sw_if_index); - uword indent = format_get_indent (s); + u32 indent = format_get_indent (s); s = format (s, "%U %U queue %d\n", format_white_space, indent, format_vnet_sw_interface_name, vnm, sw, t->qid); @@ -1302,9 +1377,8 @@ vhost_user_rx_trace (vhost_trace_t * t, vlib_buffer_t * b, vhost_user_vring_t * txvq) { vhost_user_main_t *vum = &vhost_user_main; - u32 qsz_mask = txvq->qsz - 1; u32 last_avail_idx = txvq->last_avail_idx; - u32 desc_current = txvq->avail->ring[last_avail_idx & qsz_mask]; + u32 desc_current = txvq->avail->ring[last_avail_idx & txvq->qsz_mask]; vring_desc_t *hdr_desc = 0; virtio_net_hdr_mrg_rxbuf_t *hdr; u32 hint = 0; @@ -1349,9 +1423,16 @@ vhost_user_send_call (vlib_main_t * vm, vhost_user_vring_t * vq) vhost_user_main_t *vum = &vhost_user_main; u64 x = 1; int fd = UNIX_GET_FD (vq->callfd_idx); - int rv __attribute__ ((unused)); - /* TODO: pay attention to rv */ + int rv; + rv = write (fd, &x, sizeof (x)); + if (rv <= 0) + { + clib_unix_warning + ("Error: Could not write to unix socket for callfd %d", fd); + return; + } + vq->n_since_last_int = 0; vq->int_deadline = vlib_time_now (vm) + vum->coalesce_time; } @@ -1416,19 +1497,19 @@ vhost_user_rx_discard_packet (vlib_main_t * vm, */ u32 discarded_packets = 0; u32 avail_idx = txvq->avail->idx; - u16 qsz_mask = txvq->qsz - 1; while (discarded_packets != discard_max) { if (avail_idx == txvq->last_avail_idx) goto out; u16 desc_chain_head = - txvq->avail->ring[txvq->last_avail_idx & qsz_mask]; + txvq->avail->ring[txvq->last_avail_idx & txvq->qsz_mask]; txvq->last_avail_idx++; - txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head; - txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0; + txvq->used->ring[txvq->last_used_idx & txvq->qsz_mask].id = + desc_chain_head; + txvq->used->ring[txvq->last_used_idx & txvq->qsz_mask].len = 0; vhost_user_log_dirty_ring (vui, txvq, - ring[txvq->last_used_idx & qsz_mask]); + ring[txvq->last_used_idx & txvq->qsz_mask]); txvq->last_used_idx++; discarded_packets++; } @@ -1459,13 +1540,15 @@ vhost_user_input_rewind_buffers (vlib_main_t * vm, b_current->current_length = 0; b_current->flags = 0; } + cpu->rx_buffers_len++; } static u32 vhost_user_if_input (vlib_main_t * vm, vhost_user_main_t * vum, vhost_user_intf_t * vui, - u16 qid, vlib_node_runtime_t * node) + u16 qid, vlib_node_runtime_t * node, + vnet_hw_interface_rx_mode mode) { vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; u16 n_rx_packets = 0; @@ -1474,9 +1557,8 @@ vhost_user_if_input (vlib_main_t * vm, u32 n_left_to_next, *to_next; u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; u32 n_trace = vlib_get_trace_count (vm, node); - u16 qsz_mask; u32 map_hint = 0; - u16 
cpu_index = os_get_cpu_number ();
+  u16 thread_index = vlib_get_thread_index ();
   u16 copy_len = 0;

   {
@@ -1491,6 +1573,26 @@ vhost_user_if_input (vlib_main_t * vm,
       vhost_user_send_call (vm, rxvq);
   }

+  /*
+   * Adaptive mode is optimized to reduce interrupts.
+   * If the scheduler switches the input node to polling due
+   * to a burst of traffic, we tell the driver no interrupt.
+   * When the traffic subsides, the scheduler switches the node back to
+   * interrupt mode, and we must tell the driver we want interrupts again.
+   */
+  if (PREDICT_FALSE (mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE))
+    {
+      if ((node->flags &
+	   VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) ||
+	  !(node->flags &
+	    VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))
+	/* Tell driver we want notification */
+	txvq->used->flags = 0;
+      else
+	/* Tell driver we don't want notification */
+	txvq->used->flags = VRING_USED_F_NO_NOTIFY;
+    }
+
   if (PREDICT_FALSE (txvq->avail->flags & 0xFFFE))
     return 0;

@@ -1514,7 +1616,7 @@ vhost_user_if_input (vlib_main_t * vm,
       return 0;
     }

-  if (PREDICT_FALSE (n_left == txvq->qsz))
+  if (PREDICT_FALSE (n_left == (txvq->qsz_mask + 1)))
     {
       /*
        * Informational error logging when VPP is not
@@ -1524,8 +1626,6 @@
	 VHOST_USER_INPUT_FUNC_ERROR_FULL_RX_QUEUE, 1);
     }

-  qsz_mask = txvq->qsz - 1;
-
   if (n_left > VLIB_FRAME_SIZE)
     n_left = VLIB_FRAME_SIZE;
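
The qsz-to-qsz_mask conversion above relies on the VHOST_USER_SET_VRING_NUM check earlier in the patch that the ring size is a power of two, so ring slots are selected with a bitwise AND on free-running 16-bit indices instead of a modulo. A minimal standalone sketch of the idiom (helper names are illustrative, not from this patch):

#include <stdint.h>

/* The ring size must be a power of two so that size - 1 is an all-ones
 * mask; index & qsz_mask is then equal to index % (qsz_mask + 1). */
static inline uint16_t
ring_slot (uint16_t index, uint16_t qsz_mask)
{
  return index & qsz_mask;
}

/* Free-running 16-bit indices wrap naturally, so the number of
 * descriptors the driver made available is plain modular subtraction;
 * no masking is needed until the ring is actually dereferenced. */
static inline uint16_t
ring_n_avail (uint16_t avail_idx, uint16_t last_avail_idx)
{
  return (uint16_t) (avail_idx - last_avail_idx);
}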
@@ -1534,33 +1634,35 @@
   /*
    * For small packets (<2kB), we will not need more than one vlib buffer
    * per packet. In case packets are bigger, we will just yield at some point
    * in the loop and come back later. This is not an issue, as for big packets
    * the processing cost really comes from the memory copy.
+   * The assumption is that big packets will fit in 40 buffers.
    */
-  if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len < n_left + 1))
+  if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len < n_left + 1 ||
+		     vum->cpus[thread_index].rx_buffers_len < 40))
     {
-      u32 curr_len = vum->cpus[cpu_index].rx_buffers_len;
-      vum->cpus[cpu_index].rx_buffers_len +=
+      u32 curr_len = vum->cpus[thread_index].rx_buffers_len;
+      vum->cpus[thread_index].rx_buffers_len +=
	vlib_buffer_alloc_from_free_list (vm,
-					  vum->cpus[cpu_index].rx_buffers +
+					  vum->cpus[thread_index].rx_buffers +
					  curr_len,
					  VHOST_USER_RX_BUFFERS_N - curr_len,
					  VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);

       if (PREDICT_FALSE
-	  (vum->cpus[cpu_index].rx_buffers_len <
+	  (vum->cpus[thread_index].rx_buffers_len <
	   VHOST_USER_RX_BUFFER_STARVATION))
	{
	  /* In case of buffer starvation, discard some packets from the queue
	   * and log the event.
	   * We keep doing best effort for the remaining packets. */
-	  u32 flush = (n_left + 1 > vum->cpus[cpu_index].rx_buffers_len) ?
-	    n_left + 1 - vum->cpus[cpu_index].rx_buffers_len : 1;
+	  u32 flush = (n_left + 1 > vum->cpus[thread_index].rx_buffers_len) ?
+	    n_left + 1 - vum->cpus[thread_index].rx_buffers_len : 1;
	  flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush);

	  n_left -= flush;
	  vlib_increment_simple_counter (vnet_main.
					 interface_main.sw_if_counters +
					 VNET_INTERFACE_COUNTER_DROP,
-					 os_get_cpu_number (),
+					 vlib_get_thread_index (),
					 vui->sw_if_index, flush);

	  vlib_error_count (vm, vhost_user_input_node.index,
@@ -1580,7 +1682,7 @@ vhost_user_if_input (vlib_main_t * vm,
       u32 desc_data_offset;
       vring_desc_t *desc_table = txvq->desc;

-      if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len <= 1))
+      if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len <= 1))
	{
	  /* Not enough rx_buffers
	   * Note: We yield on 1 so we don't need to do an additional
	   * check for the next buffer prefetch.
	   */
	  n_left = 0;
	  break;
	}

-      desc_current = txvq->avail->ring[txvq->last_avail_idx & qsz_mask];
-      vum->cpus[cpu_index].rx_buffers_len--;
-      bi_current = (vum->cpus[cpu_index].rx_buffers)
-	[vum->cpus[cpu_index].rx_buffers_len];
+      desc_current =
+	txvq->avail->ring[txvq->last_avail_idx & txvq->qsz_mask];
+      vum->cpus[thread_index].rx_buffers_len--;
+      bi_current = (vum->cpus[thread_index].rx_buffers)
+	[vum->cpus[thread_index].rx_buffers_len];
       b_head = b_current = vlib_get_buffer (vm, bi_current);
       to_next[0] = bi_current;	//We do that now so we can forget about bi_current
       to_next++;
       n_left_to_next--;

       vlib_prefetch_buffer_with_index (vm,
-				       (vum->cpus[cpu_index].rx_buffers)
-				       [vum->cpus[cpu_index].
+				       (vum->
+					cpus[thread_index].rx_buffers)
+				       [vum->cpus[thread_index].
					rx_buffers_len - 1], LOAD);

       /* Just preset the used descriptor id and length for later */
-      txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_current;
-      txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0;
+      txvq->used->ring[txvq->last_used_idx & txvq->qsz_mask].id =
+	desc_current;
+      txvq->used->ring[txvq->last_used_idx & txvq->qsz_mask].len = 0;
       vhost_user_log_dirty_ring (vui, txvq,
-				 ring[txvq->last_used_idx & qsz_mask]);
+				 ring[txvq->last_used_idx &
+				      txvq->qsz_mask]);

       /* The buffer should already be initialized */
       b_head->total_length_not_including_first_buffer = 0;
@@ -1636,7 +1742,8 @@ vhost_user_if_input (vlib_main_t * vm,
	     desc_current = 0;
	     if (PREDICT_FALSE (desc_table == 0))
	       {
-		 //FIXME: Handle error by shutdown the queue
+		 vlib_error_count (vm, node->node_index,
+				   VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1);
		 goto out;
	       }
	   }
@@ -1675,8 +1782,12 @@ vhost_user_if_input (vlib_main_t * vm,
	      (b_current->current_length == VLIB_BUFFER_DATA_SIZE))
	    {
	      if (PREDICT_FALSE
-		  (vum->cpus[cpu_index].rx_buffers_len == 0))
+		  (vum->cpus[thread_index].rx_buffers_len == 0))
		{
+		  /* Cancel speculation */
+		  to_next--;
+		  n_left_to_next++;
+
		  /*
		   * Check whether there are any buffers left.
		   * If not, just rewind the used buffers and stop.
		   * Note: Scheduled copies are not cancelled. This is
		   * not an issue as they must be overwritten by something
		   * else,
		   * but valid.
*/ vhost_user_input_rewind_buffers (vm, - &vum->cpus[cpu_index], + &vum->cpus + [thread_index], b_head); n_left = 0; goto stop; } /* Get next output */ - vum->cpus[cpu_index].rx_buffers_len--; + vum->cpus[thread_index].rx_buffers_len--; u32 bi_next = - (vum->cpus[cpu_index].rx_buffers)[vum->cpus - [cpu_index].rx_buffers_len]; + (vum->cpus[thread_index].rx_buffers)[vum->cpus + [thread_index].rx_buffers_len]; b_current->next_buffer = bi_next; b_current->flags |= VLIB_BUFFER_NEXT_PRESENT; bi_current = bi_next; @@ -1703,13 +1815,14 @@ vhost_user_if_input (vlib_main_t * vm, } /* Prepare a copy order executed later for the data */ - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; u32 desc_data_l = desc_table[desc_current].len - desc_data_offset; cpy->len = VLIB_BUFFER_DATA_SIZE - b_current->current_length; cpy->len = (cpy->len > desc_data_l) ? desc_data_l : cpy->len; - cpy->dst = (uword) vlib_buffer_get_current (b_current); + cpy->dst = (uword) (vlib_buffer_get_current (b_current) + + b_current->current_length); cpy->src = desc_table[desc_current].addr + desc_data_offset; desc_data_offset += cpy->len; @@ -1742,7 +1855,7 @@ vhost_user_if_input (vlib_main_t * vm, /* redirect if feature path enabled */ vnet_feature_start_device_input_x1 (vui->sw_if_index, &next0, - b_head, 0); + b_head); u32 bi = to_next[-1]; //Cannot use to_next[-1] in the macro vlib_validate_buffer_enqueue_x1 (vm, node, next_index, @@ -1760,16 +1873,11 @@ vhost_user_if_input (vlib_main_t * vm, if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) { if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { - clib_warning - ("Memory mapping error on interface hw_if_index=%d " - "(Shutting down - Switch interface down and up to restart)", - vui->hw_if_index); - vui->admin_up = 0; - copy_len = 0; - break; + vlib_error_count (vm, node->node_index, + VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); } copy_len = 0; @@ -1785,13 +1893,11 @@ vhost_user_if_input (vlib_main_t * vm, /* Do the memory copies */ if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { - clib_warning ("Memory mapping error on interface hw_if_index=%d " - "(Shutting down - Switch interface down and up to restart)", - vui->hw_if_index); - vui->admin_up = 0; + vlib_error_count (vm, node->node_index, + VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1); } /* give buffers back to driver */ @@ -1800,7 +1906,8 @@ vhost_user_if_input (vlib_main_t * vm, vhost_user_log_dirty_ring (vui, txvq, idx); /* interrupt (call) handling */ - if ((txvq->callfd_idx != ~0) && !(txvq->avail->flags & 1)) + if ((txvq->callfd_idx != ~0) && + !(txvq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { txvq->n_since_last_int += n_rx_packets; @@ -1812,7 +1919,9 @@ vhost_user_if_input (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), vui->sw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), vui->sw_if_index, n_rx_packets, n_rx_bytes); + + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -1823,15 +1932,21 @@ vhost_user_input (vlib_main_t * vm, { vhost_user_main_t *vum = &vhost_user_main; uword n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); - + 
vhost_user_intf_t *vui; + vnet_device_input_runtime_t *rt = + (vnet_device_input_runtime_t *) node->runtime_data; + vnet_device_and_queue_t *dq; - vhost_iface_and_queue_t *vhiq; - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + vec_foreach (dq, rt->devices_and_queues) { - vhost_user_intf_t *vui = - &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; - n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node); + if (clib_smp_swap (&dq->interrupt_pending, 0) || + (node->state == VLIB_NODE_STATE_POLLING)) + { + vui = + pool_elt_at_index (vum->vhost_user_interfaces, dq->dev_instance); + n_rx_packets = vhost_user_if_input (vm, vum, vui, dq->queue_id, node, + dq->mode); + } } return n_rx_packets; @@ -1864,9 +1979,8 @@ vhost_user_tx_trace (vhost_trace_t * t, vlib_buffer_t * b, vhost_user_vring_t * rxvq) { vhost_user_main_t *vum = &vhost_user_main; - u32 qsz_mask = rxvq->qsz - 1; u32 last_avail_idx = rxvq->last_avail_idx; - u32 desc_current = rxvq->avail->ring[last_avail_idx & qsz_mask]; + u32 desc_current = rxvq->avail->ring[last_avail_idx & rxvq->qsz_mask]; vring_desc_t *hdr_desc = 0; u32 hint = 0; @@ -1954,9 +2068,8 @@ vhost_user_tx (vlib_main_t * vm, pool_elt_at_index (vum->vhost_user_interfaces, rd->dev_instance); u32 qid = ~0; vhost_user_vring_t *rxvq; - u16 qsz_mask; u8 error; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 map_hint = 0; u8 retry = 8; u16 copy_len; @@ -1976,13 +2089,11 @@ vhost_user_tx (vlib_main_t * vm, qid = VHOST_VRING_IDX_RX (*vec_elt_at_index - (vui->per_cpu_tx_qid, os_get_cpu_number ())); + (vui->per_cpu_tx_qid, thread_index)); rxvq = &vui->vrings[qid]; if (PREDICT_FALSE (vui->use_tx_spinlock)) vhost_user_vring_lock (vui, qid); - qsz_mask = rxvq->qsz - 1; /* qsz is always power of 2 */ - retry: error = VHOST_USER_TX_FUNC_ERROR_NONE; tx_headers_len = 0; @@ -2003,10 +2114,10 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace = + vum->cpus[thread_index].current_trace = vlib_add_trace (vm, node, b0, - sizeof (*vum->cpus[cpu_index].current_trace)); - vhost_user_tx_trace (vum->cpus[cpu_index].current_trace, + sizeof (*vum->cpus[thread_index].current_trace)); + vhost_user_tx_trace (vum->cpus[thread_index].current_trace, vui, qid / 2, b0, rxvq); } @@ -2018,7 +2129,7 @@ retry: desc_table = rxvq->desc; desc_head = desc_index = - rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask]; + rxvq->avail->ring[rxvq->last_avail_idx & rxvq->qsz_mask]; /* Go deeper in case of indirect descriptor * I don't know of any driver providing indirect for RX. 
*/ @@ -2048,14 +2159,14 @@ retry: { // Get a header from the header array virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len]; + &vum->cpus[thread_index].tx_headers[tx_headers_len]; tx_headers_len++; hdr->hdr.flags = 0; hdr->hdr.gso_type = 0; hdr->num_buffers = 1; //This is local, no need to check // Prepare a copy order executed later for the header - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = vui->virtio_net_hdr_sz; cpy->dst = buffer_map_addr; @@ -2080,16 +2191,16 @@ retry: else if (vui->virtio_net_hdr_sz == 12) //MRG is available { virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + &vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; //Move from available to used buffer - rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id = + rxvq->used->ring[rxvq->last_used_idx & rxvq->qsz_mask].id = desc_head; - rxvq->used->ring[rxvq->last_used_idx & qsz_mask].len = + rxvq->used->ring[rxvq->last_used_idx & rxvq->qsz_mask].len = desc_len; vhost_user_log_dirty_ring (vui, rxvq, ring[rxvq->last_used_idx & - qsz_mask]); + rxvq->qsz_mask]); rxvq->last_avail_idx++; rxvq->last_used_idx++; @@ -2108,7 +2219,7 @@ retry: desc_table = rxvq->desc; desc_head = desc_index = - rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask]; + rxvq->avail->ring[rxvq->last_avail_idx & rxvq->qsz_mask]; if (PREDICT_FALSE (rxvq->desc[desc_head].flags & VIRTQ_DESC_F_INDIRECT)) { @@ -2142,7 +2253,7 @@ retry: } { - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = bytes_left; cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len; @@ -2176,33 +2287,52 @@ retry: } //Move from available to used ring - rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id = desc_head; - rxvq->used->ring[rxvq->last_used_idx & qsz_mask].len = desc_len; + rxvq->used->ring[rxvq->last_used_idx & rxvq->qsz_mask].id = desc_head; + rxvq->used->ring[rxvq->last_used_idx & rxvq->qsz_mask].len = desc_len; vhost_user_log_dirty_ring (vui, rxvq, - ring[rxvq->last_used_idx & qsz_mask]); + ring[rxvq->last_used_idx & rxvq->qsz_mask]); rxvq->last_avail_idx++; rxvq->last_used_idx++; if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace->hdr = - vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + vum->cpus[thread_index].current_trace->hdr = + vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; } n_left--; //At the end for error counting when 'goto done' is invoked + + /* + * Do the copy periodically to prevent + * vum->cpus[thread_index].copy array overflow and corrupt memory + */ + if (PREDICT_FALSE (copy_len >= VHOST_USER_TX_COPY_THRESHOLD)) + { + if (PREDICT_FALSE + (vhost_user_tx_copy (vui, vum->cpus[thread_index].copy, + copy_len, &map_hint))) + { + vlib_error_count (vm, node->node_index, + VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1); + } + copy_len = 0; + + /* give buffers back to driver */ + CLIB_MEMORY_BARRIER (); + rxvq->used->idx = rxvq->last_used_idx; + vhost_user_log_dirty_ring (vui, rxvq, idx); + } buffers++; } done: //Do the memory copies if (PREDICT_FALSE - (vhost_user_tx_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_tx_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { - clib_warning ("Memory mapping error on interface hw_if_index=%d " - "(Shutting down - Switch interface down and up to restart)", - vui->hw_if_index); - 
vui->admin_up = 0; + vlib_error_count (vm, node->node_index, + VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1); } CLIB_MEMORY_BARRIER (); @@ -2228,7 +2358,8 @@ done: } /* interrupt (call) handling */ - if ((rxvq->callfd_idx != ~0) && !(rxvq->avail->flags & 1)) + if ((rxvq->callfd_idx != ~0) && + !(rxvq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { rxvq->n_since_last_int += frame->n_vectors - n_left; @@ -2245,13 +2376,173 @@ done3: vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), vui->sw_if_index, n_left); + thread_index, vui->sw_if_index, n_left); } vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); return frame->n_vectors; } +static uword +vhost_user_send_interrupt_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vhost_user_intf_t *vui; + f64 timeout = 3153600000.0 /* 100 years */ ; + uword event_type, *event_data = 0; + vhost_user_main_t *vum = &vhost_user_main; + u16 *queue; + f64 now, poll_time_remaining; + f64 next_timeout; + u8 stop_timer = 0; + + while (1) + { + poll_time_remaining = + vlib_process_wait_for_event_or_clock (vm, timeout); + event_type = vlib_process_get_events (vm, &event_data); + vec_reset_length (event_data); + + /* + * Use the remaining timeout if it is less than coalesce time to avoid + * resetting the existing timer in the middle of expiration + */ + timeout = poll_time_remaining; + if (vlib_process_suspend_time_is_zero (timeout) || + (timeout > vum->coalesce_time)) + timeout = vum->coalesce_time; + + now = vlib_time_now (vm); + switch (event_type) + { + case VHOST_USER_EVENT_STOP_TIMER: + stop_timer = 1; + break; + + case VHOST_USER_EVENT_START_TIMER: + stop_timer = 0; + if (!vlib_process_suspend_time_is_zero (poll_time_remaining)) + break; + /* fall through */ + + case ~0: + /* *INDENT-OFF* */ + pool_foreach (vui, vum->vhost_user_interfaces, { + next_timeout = timeout; + vec_foreach (queue, vui->rx_queues) + { + vhost_user_vring_t *rxvq = + &vui->vrings[VHOST_VRING_IDX_RX (*queue)]; + vhost_user_vring_t *txvq = + &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; + + if (txvq->n_since_last_int) + { + if (now >= txvq->int_deadline) + vhost_user_send_call (vm, txvq); + else + next_timeout = txvq->int_deadline - now; + } + + if (rxvq->n_since_last_int) + { + if (now >= rxvq->int_deadline) + vhost_user_send_call (vm, rxvq); + else + next_timeout = rxvq->int_deadline - now; + } + + if ((next_timeout < timeout) && (next_timeout > 0.0)) + timeout = next_timeout; + } + }); + /* *INDENT-ON* */ + break; + + default: + clib_warning ("BUG: unhandled event type %d", event_type); + break; + } + /* No less than 1 millisecond */ + if (timeout < 1e-3) + timeout = 1e-3; + if (stop_timer) + timeout = 3153600000.0; + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vhost_user_send_interrupt_node,static) = { + .function = vhost_user_send_interrupt_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "vhost-user-send-interrupt-process", +}; +/* *INDENT-ON* */ + +static clib_error_t * +vhost_user_interface_rx_mode_change (vnet_main_t * vnm, u32 hw_if_index, + u32 qid, vnet_hw_interface_rx_mode mode) +{ + vlib_main_t *vm = vnm->vlib_main; + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + vhost_user_main_t *vum = &vhost_user_main; + vhost_user_intf_t *vui = + pool_elt_at_index (vum->vhost_user_interfaces, hif->dev_instance); + vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; + + if ((mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || 
+ (mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE)) + { + if (txvq->kickfd_idx == ~0) + { + // We cannot support interrupt mode if the driver opts out + return clib_error_return (0, "Driver does not support interrupt"); + } + if (txvq->mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + { + vum->ifq_count++; + // Start the timer if this is the first encounter on interrupt + // interface/queue + if ((vum->ifq_count == 1) && + (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) + vlib_process_signal_event (vm, + vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_START_TIMER, 0); + } + } + else if (mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + { + if (((txvq->mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || + (txvq->mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE)) && + vum->ifq_count) + { + vum->ifq_count--; + // Stop the timer if there is no more interrupt interface/queue + if ((vum->ifq_count == 0) && + (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) + vlib_process_signal_event (vm, + vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_STOP_TIMER, 0); + } + } + + txvq->mode = mode; + if (mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + txvq->used->flags = VRING_USED_F_NO_NOTIFY; + else if ((mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE) || + (mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT)) + txvq->used->flags = 0; + else + { + clib_warning ("BUG: unhandled mode %d changed for if %d queue %d", mode, + hw_if_index, qid); + return clib_error_return (0, "unsupported"); + } + + return 0; +} + static clib_error_t * vhost_user_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) @@ -2264,7 +2555,7 @@ vhost_user_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, vui->admin_up = is_up; - if (is_up) + if (is_up && vui->is_up) vnet_hw_interface_set_flags (vnm, vui->hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP); @@ -2280,6 +2571,7 @@ VNET_DEVICE_CLASS (vhost_user_dev_class,static) = { .format_device_name = format_vhost_user_interface_name, .name_renumber = vhost_user_name_renumber, .admin_up_down_function = vhost_user_interface_admin_up_down, + .rx_mode_change_function = vhost_user_interface_rx_mode_change, .format_tx_trace = format_vhost_trace, }; @@ -2295,18 +2587,15 @@ vhost_user_process (vlib_main_t * vm, vhost_user_intf_t *vui; struct sockaddr_un sun; int sockfd; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; f64 timeout = 3153600000.0 /* 100 years */ ; uword *event_data = 0; - sockfd = socket (AF_UNIX, SOCK_STREAM, 0); + sockfd = -1; sun.sun_family = AF_UNIX; template.read_function = vhost_user_socket_read; template.error_function = vhost_user_socket_error; - if (sockfd < 0) - return 0; - while (1) { vlib_process_wait_for_event_or_clock (vm, timeout); @@ -2319,26 +2608,48 @@ vhost_user_process (vlib_main_t * vm, pool_foreach (vui, vum->vhost_user_interfaces, { if (vui->unix_server_index == ~0) { //Nothing to do for server sockets - if (vui->unix_file_index == ~0) + if (vui->clib_file_index == ~0) { + if ((sockfd < 0) && + ((sockfd = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)) + { + /* + * 1st time error or new error for this interface, + * spit out the message and record the error + */ + if (!vui->sock_errno || (vui->sock_errno != errno)) + { + clib_unix_warning + ("Error: Could not open unix socket for %s", + vui->sock_filename); + vui->sock_errno = errno; + } + continue; + } + /* try to connect */ strncpy (sun.sun_path, (char *) vui->sock_filename, sizeof (sun.sun_path) - 1); + /* Avoid hanging VPP if the other end does not accept */ + if (fcntl(sockfd, 
F_SETFL, O_NONBLOCK) < 0) + clib_unix_warning ("fcntl"); + if (connect (sockfd, (struct sockaddr *) &sun, sizeof (struct sockaddr_un)) == 0) { + /* Set the socket to blocking as it was before */ + if (fcntl(sockfd, F_SETFL, 0) < 0) + clib_unix_warning ("fcntl2"); + vui->sock_errno = 0; template.file_descriptor = sockfd; template.private_data = vui - vhost_user_main.vhost_user_interfaces; - vui->unix_file_index = unix_file_add (&unix_main, &template); + vui->clib_file_index = clib_file_add (&file_main, &template); - //Re-open for next connect - if ((sockfd = socket (AF_UNIX, SOCK_STREAM, 0)) < 0) { - clib_warning("Critical: Could not open unix socket"); - return 0; - } + /* This sockfd is considered consumed */ + sockfd = -1; } else { @@ -2350,7 +2661,7 @@ vhost_user_process (vlib_main_t * vm, /* check if socket is alive */ int error = 0; socklen_t len = sizeof (error); - int fd = UNIX_GET_FD(vui->unix_file_index); + int fd = UNIX_GET_FD(vui->clib_file_index); int retval = getsockopt (fd, SOL_SOCKET, SO_ERROR, &error, &len); @@ -2382,20 +2693,30 @@ VLIB_REGISTER_NODE (vhost_user_process_node,static) = { static void vhost_user_term_if (vhost_user_intf_t * vui) { - // Delete configured thread pinning - vec_reset_length (vui->workers); + int q; + vhost_user_main_t *vum = &vhost_user_main; + // disconnect interface sockets vhost_user_if_disconnect (vui); vhost_user_update_iface_state (vui); + for (q = 0; q < VHOST_VRING_MAX_N; q++) + { + clib_mem_free ((void *) vui->vring_locks[q]); + } + if (vui->unix_server_index != ~0) { //Close server socket - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vui->unix_server_index); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vui->unix_server_index = ~0; + unlink (vui->sock_filename); } + + mhash_unset (&vum->if_index_by_sock_name, vui->sock_filename, + &vui->if_index); } int @@ -2405,6 +2726,7 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) vhost_user_intf_t *vui; int rv = 0; vnet_hw_interface_t *hwif; + u16 *queue; if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) || hwif->dev_class_index != vhost_user_dev_class.index) @@ -2415,12 +2737,31 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) vui = pool_elt_at_index (vum->vhost_user_interfaces, hwif->dev_instance); + vec_foreach (queue, vui->rx_queues) + { + vhost_user_vring_t *txvq; + + txvq = &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; + if ((vum->ifq_count > 0) && + ((txvq->mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || + (txvq->mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE))) + { + vum->ifq_count--; + // Stop the timer if there is no more interrupt interface/queue + if ((vum->ifq_count == 0) && + (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) + { + vlib_process_signal_event (vm, + vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_STOP_TIMER, 0); + break; + } + } + } + // Disable and reset interface vhost_user_term_if (vui); - // Back to pool - pool_put (vum->vhost_user_interfaces, vui); - // Reset renumbered iface if (hwif->dev_instance < vec_len (vum->show_dev_instance_by_real_dev_instance)) @@ -2428,9 +2769,32 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) // Delete ethernet interface ethernet_delete_interface (vnm, vui->hw_if_index); + + // Back to pool + pool_put (vum->vhost_user_interfaces, vui); + return rv; } +static clib_error_t * +vhost_user_exit (vlib_main_t * vm) +{ + vnet_main_t *vnm = 
vnet_get_main (); + vhost_user_main_t *vum = &vhost_user_main; + vhost_user_intf_t *vui; + + vlib_worker_thread_barrier_sync (vlib_get_main ()); + /* *INDENT-OFF* */ + pool_foreach (vui, vum->vhost_user_interfaces, { + vhost_user_delete_if (vnm, vm, vui->sw_if_index); + }); + /* *INDENT-ON* */ + vlib_worker_thread_barrier_release (vlib_get_main ()); + return 0; +} + +VLIB_MAIN_LOOP_EXIT_FUNCTION (vhost_user_exit); + /** * Open server unix socket on specified sock_filename. */ @@ -2520,16 +2884,19 @@ vhost_user_vui_init (vnet_main_t * vnm, u64 feature_mask, u32 * sw_if_index) { vnet_sw_interface_t *sw; - sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index); int q; + vhost_user_main_t *vum = &vhost_user_main; + vnet_hw_interface_t *hw; + hw = vnet_get_hw_interface (vnm, vui->hw_if_index); + sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index); if (server_sock_fd != -1) { - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; template.read_function = vhost_user_socksvr_accept_ready; template.file_descriptor = server_sock_fd; - template.private_data = vui - vhost_user_main.vhost_user_interfaces; //hw index - vui->unix_server_index = unix_file_add (&unix_main, &template); + template.private_data = vui - vum->vhost_user_interfaces; //hw index + vui->unix_server_index = clib_file_add (&file_main, &template); } else { @@ -2542,12 +2909,16 @@ vhost_user_vui_init (vnet_main_t * vnm, vui->sock_errno = 0; vui->is_up = 0; vui->feature_mask = feature_mask; - vui->unix_file_index = ~0; + vui->clib_file_index = ~0; vui->log_base_addr = 0; + vui->if_index = vui - vum->vhost_user_interfaces; + mhash_set_mem (&vum->if_index_by_sock_name, vui->sock_filename, + &vui->if_index, 0); for (q = 0; q < VHOST_VRING_MAX_N; q++) vhost_user_vring_init (vui, q); + hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE; vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0); if (sw_if_index) @@ -2577,12 +2948,25 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_idx = ~0; int rv = 0; int server_sock_fd = -1; + vhost_user_main_t *vum = &vhost_user_main; + uword *if_index; if (sock_filename == NULL || !(strlen (sock_filename) > 0)) { return VNET_API_ERROR_INVALID_ARGUMENT; } + if_index = mhash_get (&vum->if_index_by_sock_name, (void *) sock_filename); + if (if_index) + { + if (sw_if_index) + { + vui = &vum->vhost_user_interfaces[*if_index]; + *sw_if_index = vui->sw_if_index; + } + return VNET_API_ERROR_IF_ALREADY_EXISTS; + } + if (is_server) { if ((rv = @@ -2606,6 +2990,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, // Process node must connect vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0); + return rv; } @@ -2622,13 +3007,25 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, int server_sock_fd = -1; int rv = 0; vnet_hw_interface_t *hwif; + uword *if_index; if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) || hwif->dev_class_index != vhost_user_dev_class.index) return VNET_API_ERROR_INVALID_SW_IF_INDEX; + if (sock_filename == NULL || !(strlen (sock_filename) > 0)) + return VNET_API_ERROR_INVALID_ARGUMENT; + vui = vec_elt_at_index (vum->vhost_user_interfaces, hwif->dev_instance); + /* + * Disallow changing the interface to have the same path name + * as other interface + */ + if_index = mhash_get (&vum->if_index_by_sock_name, (void *) sock_filename); + if (if_index && (*if_index != vui->if_index)) + return VNET_API_ERROR_IF_ALREADY_EXISTS; + // First try to open server socket if (is_server) if ((rv = vhost_user_init_server_sock 
(sock_filename, @@ -2644,6 +3041,7 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, // Process node must connect vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0); + return rv; } @@ -2661,6 +3059,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm, u32 custom_dev_instance = ~0; u8 hwaddr[6]; u8 *hw = NULL; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -2683,10 +3082,12 @@ vhost_user_connect_command_fn (vlib_main_t * vm, renumber = 1; } else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); vnet_main_t *vnm = vnet_get_main (); @@ -2695,14 +3096,18 @@ vhost_user_connect_command_fn (vlib_main_t * vm, is_server, &sw_if_index, feature_mask, renumber, custom_dev_instance, hw))) { - vec_free (sock_filename); - return clib_error_return (0, "vhost_user_create_if returned %d", rv); + error = clib_error_return (0, "vhost_user_create_if returned %d", rv); + goto done; } - vec_free (sock_filename); vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (), sw_if_index); - return 0; + +done: + vec_free (sock_filename); + unformat_free (line_input); + + return error; } clib_error_t * @@ -2713,6 +3118,7 @@ vhost_user_delete_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; u32 sw_if_index = ~0; vnet_main_t *vnm = vnet_get_main (); + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -2730,15 +3136,25 @@ vhost_user_delete_command_fn (vlib_main_t * vm, vnet_get_sup_hw_interface (vnm, sw_if_index); if (hwif == NULL || vhost_user_dev_class.index != hwif->dev_class_index) - return clib_error_return (0, "Not a vhost interface"); + { + error = clib_error_return (0, "Not a vhost interface"); + goto done; + } } else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); + vhost_user_delete_if (vnm, vm, sw_if_index); - return 0; + +done: + unformat_free (line_input); + + return error; } int @@ -2772,6 +3188,7 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, vuid->virtio_net_hdr_sz = vui->virtio_net_hdr_sz; vuid->features = vui->features; vuid->num_regions = vui->nregions; + vuid->is_server = vui->unix_server_index != ~0; vuid->sock_errno = vui->sock_errno; strncpy ((char *) vuid->sock_filename, (char *) vui->sock_filename, ARRAY_LEN (vuid->sock_filename) - 1); @@ -2802,10 +3219,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, vhost_user_intf_t *vui; u32 hw_if_index, *hw_if_indices = 0; vnet_hw_interface_t *hi; - vhost_cpu_t *vhc; - vhost_iface_and_queue_t *vhiq; + u16 *queue; u32 ci; - int i, j, q; int show_descr = 0; struct feat_struct @@ -2858,6 +3273,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "Virtio vhost-user interfaces"); vlib_cli_output (vm, "Global:\n coalesce frames %d time %e", vum->coalesce_frames, vum->coalesce_time); + vlib_cli_output (vm, " number of rx virtqueues in interrupt mode: %d", + vum->ifq_count); for (i = 0; i < vec_len (hw_if_indices); i++) { @@ -2900,14 +3317,20 @@ show_vhost_user_command_fn (vlib_main_t * vm, strerror (vui->sock_errno)); vlib_cli_output (vm, " rx 
placement: "); - vec_foreach (vhc, vum->cpus) + + vec_foreach (queue, vui->rx_queues) { - vec_foreach (vhiq, vhc->rx_queues) - { - if (vhiq->vhost_iface_index == vui - vum->vhost_user_interfaces) - vlib_cli_output (vm, " thread %d on vring %d\n", - vhc - vum->cpus, VHOST_VRING_IDX_TX (vhiq->qid)); - } + vnet_main_t *vnm = vnet_get_main (); + uword thread_index; + vnet_hw_interface_rx_mode mode; + + thread_index = vnet_get_device_input_thread_index (vnm, + vui->hw_if_index, + *queue); + vnet_hw_interface_get_rx_mode (vnm, vui->hw_if_index, *queue, &mode); + vlib_cli_output (vm, " thread %d on vring %d, %U\n", + thread_index, VHOST_VRING_IDX_TX (*queue), + format_vnet_hw_interface_rx_mode, mode); } vlib_cli_output (vm, " tx placement: %s\n", @@ -2952,7 +3375,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, vlib_cli_output (vm, " qsz %d last_avail_idx %d last_used_idx %d\n", - vui->vrings[q].qsz, vui->vrings[q].last_avail_idx, + vui->vrings[q].qsz_mask + 1, + vui->vrings[q].last_avail_idx, vui->vrings[q].last_used_idx); if (vui->vrings[q].avail && vui->vrings[q].used) @@ -2975,7 +3399,7 @@ show_vhost_user_command_fn (vlib_main_t * vm, " id addr len flags next user_addr\n"); vlib_cli_output (vm, " ===== ================== ===== ====== ===== ==================\n"); - for (j = 0; j < vui->vrings[q].qsz; j++) + for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++) { u32 mem_hint = 0; vlib_cli_output (vm, @@ -3009,16 +3433,23 @@ done: * * There are several parameters associated with a vHost interface: * - * - socket - Name of the linux socket used by QEMU/VM and - * VPP to manage the vHost interface. If socket does not already exist, VPP will - * create the socket. + * - socket - Name of the linux socket used by hypervisor + * and VPP to manage the vHost interface. If in 'server' mode, VPP will + * create the socket if it does not already exist. If in 'client' mode, + * hypervisor will create the socket if it does not already exist. The VPP code + * is indifferent to the file location. However, if SELinux is enabled, then the + * socket needs to be created in '/var/run/vpp/'. * - * - server - Optional flag to indicate that VPP should be the server for the - * linux socket. If not provided, VPP will be the client. + * - server - Optional flag to indicate that VPP should be the server for + * the linux socket. If not provided, VPP will be the client. In 'server' + * mode, the VM can be reset without tearing down the vHost Interface. In + * 'client' mode, VPP can be reset without bringing down the VM and + * tearing down the vHost Interface. * * - feature-mask - Optional virtio/vhost feature set negotiated at - * startup. By default, all supported features will be advertised. Otherwise, - * provide the set of features desired. + * startup. This is intended for degugging only. It is recommended that this + * parameter not be used except by experienced users. By default, all supported + * features will be advertised. Otherwise, provide the set of features desired. 
@@ -3038,12 +3469,12 @@ done:
 *
 * @cliexpar
 * Example of how to create a vhost interface with VPP as the client and all features enabled:
- * @cliexstart{create vhost-user socket /tmp/vhost1.sock}
+ * @cliexstart{create vhost-user socket /var/run/vpp/vhost1.sock}
 * VirtualEthernet0/0/0
 * @cliexend
 * Example of how to create a vhost interface with VPP as the server and with just
 * multiple queues enabled:
- * @cliexstart{create vhost-user socket /tmp/vhost2.sock server feature-mask 0x40400000}
+ * @cliexstart{create vhost-user socket /var/run/vpp/vhost2.sock server feature-mask 0x40400000}
 * VirtualEthernet0/0/1
 * @cliexend
 * Once the vHost interface is created, enable the interface using:
@@ -3052,14 +3483,15 @@ done:
 /* *INDENT-OFF* */
 VLIB_CLI_COMMAND (vhost_user_connect_command, static) = {
     .path = "create vhost-user",
-    .short_help = "create vhost-user socket <socket-filename> [server] [feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>]",
+    .short_help = "create vhost-user socket <socket-filename> [server] "
+    "[feature-mask <hex>] [hwaddr <mac-addr>] [renumber <dev_instance>] ",
     .function = vhost_user_connect_command_fn,
 };
 /* *INDENT-ON* */
 
 /*?
 * Delete a vHost User interface using the interface name or the
- * software interface index. Use the 'show interfaces'
+ * software interface index. Use the 'show interface'
 * command to determine the software interface index. On deletion,
 * the linux socket will not be deleted.
 *
@@ -3101,7 +3533,7 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = {
 *   VHOST_USER_PROTOCOL_F_MQ (0)
 *   VHOST_USER_PROTOCOL_F_LOG_SHMFD (1)
 *
- *  socket filename /tmp/vhost1.sock type client errno "Success"
+ *  socket filename /var/run/vpp/vhost1.sock type client errno "Success"
 *
 * rx placement:
 *   thread 1 on vring 1
@@ -3217,6 +3649,64 @@ VLIB_CLI_COMMAND (show_vhost_user_command, static) = {
 };
 /* *INDENT-ON* */
 
+clib_error_t *
+debug_vhost_user_command_fn (vlib_main_t * vm,
+                            unformat_input_t * input,
+                            vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  clib_error_t *error = NULL;
+  vhost_user_main_t *vum = &vhost_user_main;
+  u8 onoff = 0;
+  u8 input_found = 0;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return clib_error_return (0, "missing argument");
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (input_found)
+       {
+         error = clib_error_return (0, "unknown input `%U'",
+                                    format_unformat_error, line_input);
+         goto done;
+       }
+
+      if (unformat (line_input, "on"))
+       {
+         input_found = 1;
+         onoff = 1;
+       }
+      else if (unformat (line_input, "off"))
+       {
+         input_found = 1;
+         onoff = 0;
+       }
+      else
+       {
+         error = clib_error_return (0, "unknown input `%U'",
+                                    format_unformat_error, line_input);
+         goto done;
+       }
+    }
+
+  vum->debug = onoff;
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (debug_vhost_user_command, static) = {
+    .path = "debug vhost-user",
+    .short_help = "debug vhost-user <on | off>",
+    .function = debug_vhost_user_command_fn,
+};
+/* *INDENT-ON* */
+
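A possible CLI session exercising the new toggle (illustrative only; the vpp# prompt and the resulting interface name are assumptions, while the commands themselves come from this patch):

    vpp# debug vhost-user on
    vpp# create vhost-user socket /var/run/vpp/vhost1.sock server
    VirtualEthernet0/0/0
    vpp# debug vhost-user off

Because vum->debug is checked at runtime, socket-level tracing can be turned on against a live system without rebuilding, unlike the old compile-time switch.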
 static clib_error_t *
 vhost_user_config (vlib_main_t * vm, unformat_input_t * input)
 {
@@ -3255,61 +3745,6 @@ vhost_user_unmap_all (void)
     }
 }
 
-static clib_error_t *
-vhost_thread_command_fn (vlib_main_t * vm,
-                        unformat_input_t * input, vlib_cli_command_t * cmd)
-{
-  unformat_input_t _line_input, *line_input = &_line_input;
-  u32 worker_thread_index;
-  u32 sw_if_index;
-  u8 del = 0;
-  int rv;
-
-  /* Get a line of input. */
-  if (!unformat_user (input, unformat_line_input, line_input))
-    return 0;
-
-  if (!unformat
-      (line_input, "%U %d", unformat_vnet_sw_interface, vnet_get_main (),
-       &sw_if_index, &worker_thread_index))
-    {
-      unformat_free (line_input);
-      return clib_error_return (0, "unknown input `%U'",
-                               format_unformat_error, input);
-    }
-
-  if (unformat (line_input, "del"))
-    del = 1;
-
-  if ((rv =
-       vhost_user_thread_placement (sw_if_index, worker_thread_index, del)))
-    return clib_error_return (0, "vhost_user_thread_placement returned %d",
-                             rv);
-  return 0;
-}
-
-
-/*?
- * This command is used to move the RX processing for the given
- * interfaces to the provided thread. If the 'del' option is used,
- * the forced thread assignment is removed and the thread assigment is
- * reassigned automatically. Use 'show vhost-user <interface>'
- * to see the thread assignment.
- *
- * @cliexpar
- * Example of how to move the RX processing for a given interface to a given thread:
- * @cliexcmd{vhost thread VirtualEthernet0/0/0 1}
- * Example of how to remove the forced thread assignment for a given interface:
- * @cliexcmd{vhost thread VirtualEthernet0/0/0 1 del}
-?*/
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (vhost_user_thread_command, static) = {
-    .path = "vhost thread",
-    .short_help = "vhost thread <iface> <worker-index> [del]",
-    .function = vhost_thread_command_fn,
-};
-/* *INDENT-ON* */
-
 /*
  * fd.io coding-style-patch-verification: ON
  *