X-Git-Url: https://gerrit.fd.io/r/gitweb?a=blobdiff_plain;f=src%2Fvnet%2Fdevices%2Fvirtio%2Fvhost_user.c;h=cd37d4c59f8b19640f26826535b64a2292a88d1c;hb=2c77ae484;hp=5c552f9be27cd77a13e0658a5cba92e9db35fda8;hpb=545866b5b3115b7de114bdb7883f5ece59b702d4;p=vpp.git diff --git a/src/vnet/devices/virtio/vhost_user.c b/src/vnet/devices/virtio/vhost_user.c index 5c552f9be27..cd37d4c59f8 100644 --- a/src/vnet/devices/virtio/vhost_user.c +++ b/src/vnet/devices/virtio/vhost_user.c @@ -33,11 +33,11 @@ #include #include -#include - #include #include #include +#include +#include #include #include @@ -107,50 +107,55 @@ unmap_all_mem_regions (vhost_user_intf_t * vui) } vui->nregions = 0; - for (q = 0; q < VHOST_VRING_MAX_N; q++) - { - vq = &vui->vrings[q]; - vq->avail = 0; - vq->used = 0; - vq->desc = 0; - } + FOR_ALL_VHOST_RX_TXQ (q, vui) + { + vq = &vui->vrings[q]; + vq->avail = 0; + vq->used = 0; + vq->desc = 0; + } } static_always_inline void -vhost_user_tx_thread_placement (vhost_user_intf_t * vui) +vhost_user_tx_thread_placement (vhost_user_intf_t *vui, u32 qid) { - //Let's try to assign one queue to each thread - u32 qid; - u32 thread_index = 0; + vnet_main_t *vnm = vnet_get_main (); + vhost_user_vring_t *rxvq = &vui->vrings[qid]; + u32 q = qid >> 1, rxvq_count; - vui->use_tx_spinlock = 0; - while (1) + ASSERT ((qid & 1) == 0); + if (!rxvq->started || !rxvq->enabled) + return; + + rxvq_count = (qid >> 1) + 1; + if (rxvq->queue_index == ~0) { - for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++) - { - vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)]; - if (!rxvq->started || !rxvq->enabled) - continue; - - vui->per_cpu_tx_qid[thread_index] = qid; - thread_index++; - if (thread_index == vlib_get_thread_main ()->n_vlib_mains) - return; - } - //We need to loop, meaning the spinlock has to be used - vui->use_tx_spinlock = 1; - if (thread_index == 0) - { - //Could not find a single valid one - for (thread_index = 0; - thread_index < vlib_get_thread_main 
()->n_vlib_mains; - thread_index++) - { - vui->per_cpu_tx_qid[thread_index] = 0; - } - return; - } + rxvq->queue_index = + vnet_hw_if_register_tx_queue (vnm, vui->hw_if_index, q); + rxvq->qid = q; } + + FOR_ALL_VHOST_RXQ (q, vui) + { + vhost_user_vring_t *rxvq = &vui->vrings[q]; + u32 qi = rxvq->queue_index; + + if (rxvq->queue_index == ~0) + break; + for (u32 i = 0; i < vlib_get_n_threads (); i++) + vnet_hw_if_tx_queue_unassign_thread (vnm, qi, i); + } + + for (u32 i = 0; i < vlib_get_n_threads (); i++) + { + vhost_user_vring_t *rxvq = + &vui->vrings[VHOST_VRING_IDX_RX (i % rxvq_count)]; + u32 qi = rxvq->queue_index; + + vnet_hw_if_tx_queue_assign_thread (vnm, qi, i); + } + + vnet_hw_if_update_runtime_data (vnm, vui->hw_if_index); } /** @@ -164,20 +169,34 @@ vhost_user_rx_thread_placement (vhost_user_intf_t * vui, u32 qid) vnet_main_t *vnm = vnet_get_main (); int rv; u32 q = qid >> 1; + vhost_user_main_t *vum = &vhost_user_main; ASSERT ((qid & 1) == 1); // should be odd // Assign new queue mappings for the interface - vnet_hw_interface_set_input_node (vnm, vui->hw_if_index, - vhost_user_input_node.index); - vnet_hw_interface_assign_rx_thread (vnm, vui->hw_if_index, q, ~0); - if (txvq->mode == VNET_HW_INTERFACE_RX_MODE_UNKNOWN) + if (txvq->queue_index != ~0) + return; + vnet_hw_if_set_input_node (vnm, vui->hw_if_index, + vhost_user_input_node.index); + txvq->queue_index = vnet_hw_if_register_rx_queue (vnm, vui->hw_if_index, q, + VNET_HW_IF_RXQ_THREAD_ANY); + txvq->thread_index = + vnet_hw_if_get_rx_queue_thread_index (vnm, txvq->queue_index); + + if (txvq->mode == VNET_HW_IF_RX_MODE_UNKNOWN) /* Set polling as the default */ - txvq->mode = VNET_HW_INTERFACE_RX_MODE_POLLING; + txvq->mode = VNET_HW_IF_RX_MODE_POLLING; + if (txvq->mode == VNET_HW_IF_RX_MODE_POLLING) + { + vhost_cpu_t *cpu = vec_elt_at_index (vum->cpus, txvq->thread_index); + /* Keep a polling queue count for each thread */ + cpu->polling_q_count++; + } txvq->qid = q; - rv = 
vnet_hw_interface_set_rx_mode (vnm, vui->hw_if_index, q, txvq->mode); + rv = vnet_hw_if_set_rx_queue_mode (vnm, txvq->queue_index, txvq->mode); if (rv) vu_log_warn (vui, "unable to set rx mode for interface %d, " "queue %d: rc=%d", vui->hw_if_index, q, rv); + vnet_hw_if_update_runtime_data (vnm, vui->hw_if_index); } /** @brief Returns whether at least one TX and one RX vring are enabled */ @@ -186,7 +205,7 @@ vhost_user_intf_ready (vhost_user_intf_t * vui) { int i, found[2] = { }; //RX + TX - for (i = 0; i < VHOST_VRING_MAX_N; i++) + for (i = 0; i < vui->num_qid; i++) if (vui->vrings[i].started && vui->vrings[i].enabled) found[i & 1] = 1; @@ -210,22 +229,6 @@ vhost_user_update_iface_state (vhost_user_intf_t * vui) } } -static void -vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) -{ - u32 qid; - vnet_main_t *vnm = vnet_get_main (); - - qid = ifq & 0xff; - if ((qid & 1) == 0) - /* Only care about the odd number, or TX, virtqueue */ - return; - - if (vhost_user_intf_ready (vui)) - // qid >> 1 is to convert virtqueue number to vring queue index - vnet_device_input_set_interrupt_pending (vnm, vui->hw_if_index, qid >> 1); -} - static clib_error_t * vhost_user_callfd_read_ready (clib_file_t * uf) { @@ -242,37 +245,49 @@ vhost_user_thread_placement (vhost_user_intf_t * vui, u32 qid) { if (qid & 1) // RX is odd, TX is even { - if (vui->vrings[qid].qid == -1) + if (vui->vrings[qid].queue_index == ~0) vhost_user_rx_thread_placement (vui, qid); } else - vhost_user_tx_thread_placement (vui); + vhost_user_tx_thread_placement (vui, qid); } static clib_error_t * vhost_user_kickfd_read_ready (clib_file_t * uf) { - __attribute__ ((unused)) int n; + __attribute__ ((unused)) ssize_t n; u8 buff[8]; + vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui = - pool_elt_at_index (vhost_user_main.vhost_user_interfaces, - uf->private_data >> 8); + pool_elt_at_index (vum->vhost_user_interfaces, uf->private_data >> 8); u32 qid = uf->private_data & 0xff; + u32 
is_txq = qid & 1; + vhost_user_vring_t *vq = &vui->vrings[qid]; + vnet_main_t *vnm = vnet_get_main (); - n = read (uf->file_descriptor, ((char *) &buff), 8); - vu_log_debug (vui, "if %d KICK queue %d", uf->private_data >> 8, qid); - if (!vui->vrings[qid].started || - (vhost_user_intf_ready (vui) != vui->is_ready)) + n = read (uf->file_descriptor, buff, 8); + if (vq->started == 0) { - if (vui->vrings[qid].started == 0) - { - vui->vrings[qid].started = 1; - vhost_user_thread_placement (vui, qid); - vhost_user_update_iface_state (vui); - } + vq->started = 1; + vhost_user_thread_placement (vui, qid); + vhost_user_update_iface_state (vui); + if (is_txq) + vnet_hw_if_set_rx_queue_file_index (vnm, vq->queue_index, + vq->kickfd_idx); + } + + if (is_txq && (vq->mode != VNET_HW_IF_RX_MODE_POLLING) && + vhost_user_intf_ready (vui)) + { + vhost_cpu_t *cpu = vec_elt_at_index (vum->cpus, vq->thread_index); + /* + * If the thread has more than 1 queue and the other queue is in polling + * mode, there is no need to trigger an interrupt + */ + if (cpu->polling_q_count == 0) + vnet_hw_if_rx_queue_set_int_pending (vnm, vq->queue_index); } - vhost_user_set_interrupt_pending (vui, uf->private_data); return 0; } @@ -280,11 +295,17 @@ static_always_inline void vhost_user_vring_init (vhost_user_intf_t * vui, u32 qid) { vhost_user_vring_t *vring = &vui->vrings[qid]; + clib_memset (vring, 0, sizeof (*vring)); vring->kickfd_idx = ~0; vring->callfd_idx = ~0; vring->errfd = -1; vring->qid = -1; + vring->queue_index = ~0; + vring->thread_index = ~0; + vring->mode = VNET_HW_IF_RX_MODE_POLLING; + + clib_spinlock_init (&vring->vring_lock); /* * We have a bug with some qemu 2.5, and this may be a fix. @@ -324,11 +345,18 @@ vhost_user_vring_close (vhost_user_intf_t * vui, u32 qid) vring->errfd = -1; } - // save the qid so that we don't need to unassign and assign_rx_thread - // when the interface comes back up. They are expensive calls. 
+ clib_spinlock_free (&vring->vring_lock); + + // save the needed information in vrings prior to being wiped out u16 q = vui->vrings[qid].qid; + u32 queue_index = vui->vrings[qid].queue_index; + u32 mode = vui->vrings[qid].mode; + u32 thread_index = vui->vrings[qid].thread_index; vhost_user_vring_init (vui, qid); vui->vrings[qid].qid = q; + vui->vrings[qid].queue_index = queue_index; + vui->vrings[qid].mode = mode; + vui->vrings[qid].thread_index = thread_index; } static_always_inline void @@ -347,13 +375,38 @@ vhost_user_if_disconnect (vhost_user_intf_t * vui) vui->is_ready = 0; - for (q = 0; q < VHOST_VRING_MAX_N; q++) - vhost_user_vring_close (vui, q); + FOR_ALL_VHOST_RX_TXQ (q, vui) { vhost_user_vring_close (vui, q); } unmap_all_mem_regions (vui); vu_log_debug (vui, "interface ifindex %d disconnected", vui->sw_if_index); } +void +vhost_user_set_operation_mode (vhost_user_intf_t *vui, + vhost_user_vring_t *txvq) +{ + if (vhost_user_is_packed_ring_supported (vui)) + { + if (txvq->used_event) + { + if (txvq->mode == VNET_HW_IF_RX_MODE_POLLING) + txvq->used_event->flags = VRING_EVENT_F_DISABLE; + else + txvq->used_event->flags = 0; + } + } + else + { + if (txvq->used) + { + if (txvq->mode == VNET_HW_IF_RX_MODE_POLLING) + txvq->used->flags = VRING_USED_F_NO_NOTIFY; + else + txvq->used->flags = 0; + } + } +} + static clib_error_t * vhost_user_socket_read (clib_file_t * uf) { @@ -453,16 +506,24 @@ vhost_user_socket_read (clib_file_t * uf) { case VHOST_USER_GET_FEATURES: msg.flags |= 4; - msg.u64 = (1ULL << FEAT_VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << FEAT_VIRTIO_NET_F_CTRL_VQ) | - (1ULL << FEAT_VIRTIO_F_ANY_LAYOUT) | - (1ULL << FEAT_VIRTIO_F_INDIRECT_DESC) | - (1ULL << FEAT_VHOST_F_LOG_ALL) | - (1ULL << FEAT_VIRTIO_NET_F_GUEST_ANNOUNCE) | - (1ULL << FEAT_VIRTIO_NET_F_MQ) | - (1ULL << FEAT_VHOST_USER_F_PROTOCOL_FEATURES) | - (1ULL << FEAT_VIRTIO_F_VERSION_1); + msg.u64 = VIRTIO_FEATURE (VIRTIO_NET_F_MRG_RXBUF) | + VIRTIO_FEATURE (VIRTIO_NET_F_CTRL_VQ) | + VIRTIO_FEATURE 
(VIRTIO_F_ANY_LAYOUT) | + VIRTIO_FEATURE (VIRTIO_RING_F_INDIRECT_DESC) | + VIRTIO_FEATURE (VHOST_F_LOG_ALL) | + VIRTIO_FEATURE (VIRTIO_NET_F_GUEST_ANNOUNCE) | + VIRTIO_FEATURE (VIRTIO_NET_F_MQ) | + VIRTIO_FEATURE (VHOST_USER_F_PROTOCOL_FEATURES) | + VIRTIO_FEATURE (VIRTIO_F_VERSION_1); msg.u64 &= vui->feature_mask; + + if (vui->enable_event_idx) + msg.u64 |= VIRTIO_FEATURE (VIRTIO_RING_F_EVENT_IDX); + if (vui->enable_gso) + msg.u64 |= FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + if (vui->enable_packed) + msg.u64 |= VIRTIO_FEATURE (VIRTIO_F_RING_PACKED); + msg.size = sizeof (msg.u64); vu_log_debug (vui, "if %d msg VHOST_USER_GET_FEATURES - reply " "0x%016llx", vui->hw_if_index, msg.u64); @@ -482,16 +543,30 @@ vhost_user_socket_read (clib_file_t * uf) vui->features = msg.u64; if (vui->features & - ((1 << FEAT_VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << FEAT_VIRTIO_F_VERSION_1))) + (VIRTIO_FEATURE (VIRTIO_NET_F_MRG_RXBUF) | + VIRTIO_FEATURE (VIRTIO_F_VERSION_1))) vui->virtio_net_hdr_sz = 12; else vui->virtio_net_hdr_sz = 10; vui->is_any_layout = - (vui->features & (1 << FEAT_VIRTIO_F_ANY_LAYOUT)) ? 1 : 0; + (vui->features & VIRTIO_FEATURE (VIRTIO_F_ANY_LAYOUT)) ? 
1 : 0; ASSERT (vui->virtio_net_hdr_sz < VLIB_BUFFER_PRE_DATA_SIZE); + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, vui->hw_if_index); + if (vui->enable_gso && + ((vui->features & FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS) + == FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS)) + { + hw->caps |= (VNET_HW_INTERFACE_CAP_SUPPORTS_TCP_GSO | + VNET_HW_INTERFACE_CAP_SUPPORTS_TX_TCP_CKSUM | + VNET_HW_INTERFACE_CAP_SUPPORTS_TX_UDP_CKSUM); + } + else + { + hw->caps &= ~(VNET_HW_INTERFACE_CAP_SUPPORTS_TCP_GSO | + VNET_HW_INTERFACE_CAP_SUPPORTS_L4_TX_CKSUM); + } vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0); vui->is_ready = 0; vhost_user_update_iface_state (vui); @@ -557,6 +632,24 @@ vhost_user_socket_read (clib_file_t * uf) vui->nregions++; } + + /* + * Re-compute desc, used, and avail descriptor table if vring address + * is set. + */ + FOR_ALL_VHOST_RX_TXQ (q, vui) + { + if (vui->vrings[q].desc_user_addr && vui->vrings[q].used_user_addr && + vui->vrings[q].avail_user_addr) + { + vui->vrings[q].desc = + map_user_mem (vui, vui->vrings[q].desc_user_addr); + vui->vrings[q].used = + map_user_mem (vui, vui->vrings[q].used_user_addr); + vui->vrings[q].avail = + map_user_mem (vui, vui->vrings[q].avail_user_addr); + } + } vlib_worker_thread_barrier_release (vm); break; @@ -566,8 +659,14 @@ vhost_user_socket_read (clib_file_t * uf) if ((msg.state.num > 32768) || /* maximum ring size is 32768 */ (msg.state.num == 0) || /* it cannot be zero */ - ((msg.state.num - 1) & msg.state.num)) /* must be power of 2 */ - goto close_socket; + ((msg.state.num - 1) & msg.state.num) || /* must be power of 2 */ + (msg.state.index >= vui->num_qid)) + { + vu_log_debug (vui, "invalid VHOST_USER_SET_VRING_NUM: msg.state.num" + " %d, msg.state.index %d, curruent max q %d", + msg.state.num, msg.state.index, vui->num_qid); + goto close_socket; + } vui->vrings[msg.state.index].qsz_mask = msg.state.num - 1; break; @@ -575,10 +674,10 @@ vhost_user_socket_read (clib_file_t * uf) 
vu_log_debug (vui, "if %d msg VHOST_USER_SET_VRING_ADDR idx %d", vui->hw_if_index, msg.state.index); - if (msg.state.index >= VHOST_VRING_MAX_N) + if (msg.state.index >= vui->num_qid) { vu_log_debug (vui, "invalid vring index VHOST_USER_SET_VRING_ADDR:" - " %d >= %d", msg.state.index, VHOST_VRING_MAX_N); + " %u >= %u", msg.state.index, vui->num_qid); goto close_socket; } @@ -600,6 +699,10 @@ vhost_user_socket_read (clib_file_t * uf) goto close_socket; } + vui->vrings[msg.state.index].desc_user_addr = msg.addr.desc_user_addr; + vui->vrings[msg.state.index].used_user_addr = msg.addr.used_user_addr; + vui->vrings[msg.state.index].avail_user_addr = msg.addr.avail_user_addr; + vlib_worker_thread_barrier_sync (vm); vui->vrings[msg.state.index].desc = desc; vui->vrings[msg.state.index].used = used; @@ -611,15 +714,17 @@ vhost_user_socket_read (clib_file_t * uf) /* Spec says: If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized in an enabled state. */ - if (!(vui->features & (1 << FEAT_VHOST_USER_F_PROTOCOL_FEATURES))) + if (!(vui->features & VIRTIO_FEATURE (VHOST_USER_F_PROTOCOL_FEATURES))) vui->vrings[msg.state.index].enabled = 1; vui->vrings[msg.state.index].last_used_idx = vui->vrings[msg.state.index].last_avail_idx = vui->vrings[msg.state.index].used->idx; + vui->vrings[msg.state.index].last_kick = + vui->vrings[msg.state.index].last_used_idx; - /* tell driver that we don't want interrupts */ - vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; + /* tell driver that we want interrupts or not */ + vhost_user_set_operation_mode (vui, &vui->vrings[msg.state.index]); vlib_worker_thread_barrier_release (vm); vhost_user_update_iface_state (vui); break; @@ -638,14 +743,49 @@ vhost_user_socket_read (clib_file_t * uf) vui->hw_if_index, msg.u64); q = (u8) (msg.u64 & 0xFF); - - /* if there is old fd, delete and close it */ - if (vui->vrings[q].callfd_idx != ~0) + if (vui->num_qid > q) { - clib_file_t *uf = pool_elt_at_index 
(file_main.file_pool, - vui->vrings[q].callfd_idx); - clib_file_del (&file_main, uf); - vui->vrings[q].callfd_idx = ~0; + /* if there is old fd, delete and close it */ + if (vui->vrings[q].callfd_idx != ~0) + { + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, + vui->vrings[q].callfd_idx); + clib_file_del (&file_main, uf); + vui->vrings[q].callfd_idx = ~0; + } + } + else if (vec_len (vui->vrings) > q) + { + /* grow vrings by pair (RX + TX) */ + vui->num_qid = (q & 1) ? (q + 1) : (q + 2); + } + else + { + u32 i, new_max_q, old_max_q = vec_len (vui->vrings); + + /* + * Double the array size if it is less than 64 entries. + * Slow down thereafter. + */ + if (vec_len (vui->vrings) < (VHOST_VRING_INIT_MQ_PAIR_SZ << 3)) + new_max_q = vec_len (vui->vrings) << 1; + else + new_max_q = vec_len (vui->vrings) + + (VHOST_VRING_INIT_MQ_PAIR_SZ << 2); + if (new_max_q > (VHOST_VRING_MAX_MQ_PAIR_SZ << 1)) + new_max_q = (VHOST_VRING_MAX_MQ_PAIR_SZ << 1); + + /* sync with the worker threads, vrings may move due to realloc */ + vlib_worker_thread_barrier_sync (vm); + vec_validate_aligned (vui->vrings, new_max_q - 1, + CLIB_CACHE_LINE_BYTES); + vlib_worker_thread_barrier_release (vm); + + for (i = old_max_q; i < vec_len (vui->vrings); i++) + vhost_user_vring_init (vui, i); + + /* grow vrings by pair (RX + TX) */ + vui->num_qid = (q & 1) ? 
(q + 1) : (q + 2); } if (!(msg.u64 & VHOST_USER_VRING_NOFD_MASK)) @@ -660,6 +800,7 @@ vhost_user_socket_read (clib_file_t * uf) template.file_descriptor = fds[0]; template.private_data = ((vui - vhost_user_main.vhost_user_interfaces) << 8) + q; + template.description = format (0, "vhost user"); vui->vrings[q].callfd_idx = clib_file_add (&file_main, &template); } else @@ -671,6 +812,12 @@ vhost_user_socket_read (clib_file_t * uf) vui->hw_if_index, msg.u64); q = (u8) (msg.u64 & 0xFF); + if (q >= vui->num_qid) + { + vu_log_debug (vui, "invalid vring index VHOST_USER_SET_VRING_KICK:" + " %u >= %u", q, vui->num_qid); + goto close_socket; + } if (vui->vrings[q].kickfd_idx != ~0) { @@ -710,6 +857,12 @@ vhost_user_socket_read (clib_file_t * uf) vui->hw_if_index, msg.u64); q = (u8) (msg.u64 & 0xFF); + if (q >= vui->num_qid) + { + vu_log_debug (vui, "invalid vring index VHOST_USER_SET_VRING_ERR:" + " %u >= %u", q, vui->num_qid); + goto close_socket; + } if (vui->vrings[q].errfd != -1) close (vui->vrings[q].errfd); @@ -726,18 +879,63 @@ vhost_user_socket_read (clib_file_t * uf) break; case VHOST_USER_SET_VRING_BASE: - vu_log_debug (vui, "if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d", + vu_log_debug (vui, + "if %d msg VHOST_USER_SET_VRING_BASE idx %d num 0x%x", vui->hw_if_index, msg.state.index, msg.state.num); + if (msg.state.index >= vui->num_qid) + { + vu_log_debug (vui, "invalid vring index VHOST_USER_SET_VRING_ADDR:" + " %u >= %u", msg.state.index, vui->num_qid); + goto close_socket; + } vlib_worker_thread_barrier_sync (vm); vui->vrings[msg.state.index].last_avail_idx = msg.state.num; + if (vhost_user_is_packed_ring_supported (vui)) + { + /* + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | last avail idx | | last used idx | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * ^ ^ + * | | + * avail wrap counter used wrap counter + */ + /* 
last avail idx at bit 0-14. */ + vui->vrings[msg.state.index].last_avail_idx = + msg.state.num & 0x7fff; + /* avail wrap counter at bit 15 */ + vui->vrings[msg.state.index].avail_wrap_counter = + ! !(msg.state.num & (1 << 15)); + + /* + * Although last_used_idx is passed in the upper 16 bits in qemu + * implementation, in practice, last_avail_idx and last_used_idx are + * usually the same. As a result, DPDK does not bother to pass us + * last_used_idx. The spec is not clear on thex coding. I figured it + * out by reading the qemu code. So let's just read last_avail_idx + * and set last_used_idx equals to last_avail_idx. + */ + vui->vrings[msg.state.index].last_used_idx = + vui->vrings[msg.state.index].last_avail_idx; + vui->vrings[msg.state.index].last_kick = + vui->vrings[msg.state.index].last_used_idx; + vui->vrings[msg.state.index].used_wrap_counter = + vui->vrings[msg.state.index].avail_wrap_counter; + + if (vui->vrings[msg.state.index].avail_wrap_counter == 1) + vui->vrings[msg.state.index].avail_wrap_counter = + VRING_DESC_F_AVAIL; + } vlib_worker_thread_barrier_release (vm); break; case VHOST_USER_GET_VRING_BASE: - if (msg.state.index >= VHOST_VRING_MAX_N) + if (msg.state.index >= vui->num_qid) { vu_log_debug (vui, "invalid vring index VHOST_USER_GET_VRING_BASE:" - " %d >= %d", msg.state.index, VHOST_VRING_MAX_N); + " %u >= %u", msg.state.index, vui->num_qid); goto close_socket; } @@ -748,6 +946,15 @@ vhost_user_socket_read (clib_file_t * uf) * closing the vring also initializes the vring last_avail_idx */ msg.state.num = vui->vrings[msg.state.index].last_avail_idx; + if (vhost_user_is_packed_ring_supported (vui)) + { + msg.state.num = + (vui->vrings[msg.state.index].last_avail_idx & 0x7fff) | + (! !vui->vrings[msg.state.index].avail_wrap_counter << 15); + msg.state.num |= + ((vui->vrings[msg.state.index].last_used_idx & 0x7fff) | + (! 
!vui->vrings[msg.state.index].used_wrap_counter << 15)) << 16; + } msg.flags |= 4; msg.size = sizeof (msg.state); @@ -757,7 +964,8 @@ vhost_user_socket_read (clib_file_t * uf) */ vhost_user_vring_close (vui, msg.state.index); vlib_worker_thread_barrier_release (vm); - vu_log_debug (vui, "if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d", + vu_log_debug (vui, + "if %d msg VHOST_USER_GET_VRING_BASE idx %d num 0x%x", vui->hw_if_index, msg.state.index, msg.state.num); n = send (uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0); @@ -855,7 +1063,7 @@ vhost_user_socket_read (clib_file_t * uf) case VHOST_USER_GET_QUEUE_NUM: msg.flags |= 4; - msg.u64 = VHOST_VRING_MAX_N; + msg.u64 = VHOST_VRING_MAX_MQ_PAIR_SZ; msg.size = sizeof (msg.u64); vu_log_debug (vui, "if %d msg VHOST_USER_GET_QUEUE_NUM - reply %d", vui->hw_if_index, msg.u64); @@ -872,10 +1080,10 @@ vhost_user_socket_read (clib_file_t * uf) vu_log_debug (vui, "if %d VHOST_USER_SET_VRING_ENABLE: %s queue %d", vui->hw_if_index, msg.state.num ? 
"enable" : "disable", msg.state.index); - if (msg.state.index >= VHOST_VRING_MAX_N) + if (msg.state.index >= vui->num_qid) { vu_log_debug (vui, "invalid vring idx VHOST_USER_SET_VRING_ENABLE:" - " %d >= %d", msg.state.index, VHOST_VRING_MAX_N); + " %u >= %u", msg.state.index, vui->num_qid); goto close_socket; } @@ -947,7 +1155,9 @@ vhost_user_socksvr_accept_ready (clib_file_t * uf) template.error_function = vhost_user_socket_error; template.file_descriptor = client_fd; template.private_data = vui - vhost_user_main.vhost_user_interfaces; + template.description = format (0, "vhost interface %d", vui->sw_if_index); vui->clib_file_index = clib_file_add (&file_main, &template); + vui->num_qid = 2; return 0; } @@ -1024,41 +1234,33 @@ vhost_user_send_interrupt_process (vlib_main_t * vm, case VHOST_USER_EVENT_START_TIMER: stop_timer = 0; + timeout = 1e-3; if (!vlib_process_suspend_time_is_zero (poll_time_remaining)) break; /* fall through */ case ~0: /* *INDENT-OFF* */ - pool_foreach (vui, vum->vhost_user_interfaces, { + pool_foreach (vui, vum->vhost_user_interfaces) { next_timeout = timeout; - for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid += 2) - { - vhost_user_vring_t *rxvq = &vui->vrings[qid]; - vhost_user_vring_t *txvq = &vui->vrings[qid + 1]; - - if (txvq->qid == -1) - continue; - if (txvq->n_since_last_int) - { - if (now >= txvq->int_deadline) - vhost_user_send_call (vm, txvq); - else - next_timeout = txvq->int_deadline - now; - } - - if (rxvq->n_since_last_int) - { - if (now >= rxvq->int_deadline) - vhost_user_send_call (vm, rxvq); - else - next_timeout = rxvq->int_deadline - now; - } - - if ((next_timeout < timeout) && (next_timeout > 0.0)) - timeout = next_timeout; - } - }); + FOR_ALL_VHOST_RX_TXQ (qid, vui) + { + vhost_user_vring_t *vq = &vui->vrings[qid]; + + if (vq->started == 0) + continue; + if (vq->n_since_last_int) + { + if (now >= vq->int_deadline) + vhost_user_send_call (vm, vui, vq); + else + next_timeout = vq->int_deadline - now; + } + + if 
((next_timeout < timeout) && (next_timeout > 0.0)) + timeout = next_timeout; + } + } /* *INDENT-ON* */ break; @@ -1109,7 +1311,7 @@ vhost_user_process (vlib_main_t * vm, timeout = 3.0; /* *INDENT-OFF* */ - pool_foreach (vui, vum->vhost_user_interfaces, { + pool_foreach (vui, vum->vhost_user_interfaces) { if (vui->unix_server_index == ~0) { //Nothing to do for server sockets if (vui->clib_file_index == ~0) @@ -1134,6 +1336,7 @@ vhost_user_process (vlib_main_t * vm, /* try to connect */ strncpy (sun.sun_path, (char *) vui->sock_filename, sizeof (sun.sun_path) - 1); + sun.sun_path[sizeof (sun.sun_path) - 1] = 0; /* Avoid hanging VPP if the other end does not accept */ if (fcntl(sockfd, F_SETFL, O_NONBLOCK) < 0) @@ -1150,7 +1353,9 @@ vhost_user_process (vlib_main_t * vm, template.file_descriptor = sockfd; template.private_data = vui - vhost_user_main.vhost_user_interfaces; + template.description = format (0, "vhost user process"); vui->clib_file_index = clib_file_add (&file_main, &template); + vui->num_qid = 2; /* This sockfd is considered consumed */ sockfd = -1; @@ -1176,7 +1381,7 @@ vhost_user_process (vlib_main_t * vm, } } } - }); + } /* *INDENT-ON* */ } return 0; @@ -1202,30 +1407,13 @@ vhost_user_term_if (vhost_user_intf_t * vui) // disconnect interface sockets vhost_user_if_disconnect (vui); + vhost_user_update_gso_interface_count (vui, 0 /* delete */ ); vhost_user_update_iface_state (vui); - for (q = 0; q < VHOST_VRING_MAX_N; q++) - { - // Remove existing queue mapping for the interface - if (q & 1) - { - int rv; - vnet_main_t *vnm = vnet_get_main (); - vhost_user_vring_t *txvq = &vui->vrings[q]; - - if (txvq->qid != -1) - { - rv = vnet_hw_interface_unassign_rx_thread (vnm, - vui->hw_if_index, - q >> 1); - if (rv) - vu_log_warn (vui, "unable to unassign interface %d, " - "queue %d: rc=%d", vui->hw_if_index, q >> 1, rv); - } - } - - clib_mem_free ((void *) vui->vring_locks[q]); - } + FOR_ALL_VHOST_RX_TXQ (q, vui) + { + clib_spinlock_free 
(&vui->vrings[q].vring_lock); + } if (vui->unix_server_index != ~0) { @@ -1250,8 +1438,10 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) vnet_hw_interface_t *hwif; u16 qid; - if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) || - hwif->dev_class_index != vhost_user_device_class.index) + if (! + (hwif = + vnet_get_sup_hw_interface_api_visible_or_null (vnm, sw_if_index)) + || hwif->dev_class_index != vhost_user_device_class.index) return VNET_API_ERROR_INVALID_SW_IF_INDEX; vui = pool_elt_at_index (vum->vhost_user_interfaces, hwif->dev_instance); @@ -1259,28 +1449,33 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) vu_log_debug (vui, "Deleting vhost-user interface %s (instance %d)", hwif->name, hwif->dev_instance); - for (qid = 1; qid < VHOST_VRING_MAX_N / 2; qid += 2) - { - vhost_user_vring_t *txvq = &vui->vrings[qid]; + FOR_ALL_VHOST_TXQ (qid, vui) + { + vhost_user_vring_t *txvq = &vui->vrings[qid]; - if (txvq->qid == -1) - continue; - if ((vum->ifq_count > 0) && - ((txvq->mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || - (txvq->mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE))) - { - vum->ifq_count--; - // Stop the timer if there is no more interrupt interface/queue - if ((vum->ifq_count == 0) && - (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) - { - vlib_process_signal_event (vm, - vhost_user_send_interrupt_node.index, - VHOST_USER_EVENT_STOP_TIMER, 0); - break; - } - } - } + if ((txvq->mode == VNET_HW_IF_RX_MODE_POLLING) && + (txvq->thread_index != ~0)) + { + vhost_cpu_t *cpu = vec_elt_at_index (vum->cpus, txvq->thread_index); + ASSERT (cpu->polling_q_count != 0); + cpu->polling_q_count--; + } + + if ((vum->ifq_count > 0) && + ((txvq->mode == VNET_HW_IF_RX_MODE_INTERRUPT) || + (txvq->mode == VNET_HW_IF_RX_MODE_ADAPTIVE))) + { + vum->ifq_count--; + // Stop the timer if there is no more interrupt interface/queue + if (vum->ifq_count == 0) + { + vlib_process_signal_event (vm, + 
vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_STOP_TIMER, 0); + break; + } + } + } // Disable and reset interface vhost_user_term_if (vui); @@ -1293,6 +1488,9 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) // Delete ethernet interface ethernet_delete_interface (vnm, vui->hw_if_index); + // free vrings + vec_free (vui->vrings); + // Back to pool pool_put (vum->vhost_user_interfaces, vui); @@ -1308,9 +1506,9 @@ vhost_user_exit (vlib_main_t * vm) vlib_worker_thread_barrier_sync (vlib_get_main ()); /* *INDENT-OFF* */ - pool_foreach (vui, vum->vhost_user_interfaces, { + pool_foreach (vui, vum->vhost_user_interfaces) { vhost_user_delete_if (vnm, vm, vui->sw_if_index); - }); + } /* *INDENT-ON* */ vlib_worker_thread_barrier_release (vlib_get_main ()); return 0; @@ -1362,17 +1560,18 @@ error: * Create ethernet interface for vhost user interface. */ static void -vhost_user_create_ethernet (vnet_main_t * vnm, vlib_main_t * vm, - vhost_user_intf_t * vui, u8 * hwaddress) +vhost_user_create_ethernet (vnet_main_t *vnm, vlib_main_t *vm, + vhost_user_intf_t *vui, + vhost_user_create_if_args_t *args) { vhost_user_main_t *vum = &vhost_user_main; u8 hwaddr[6]; clib_error_t *error; /* create hw and sw interface */ - if (hwaddress) + if (args->use_custom_mac) { - clib_memcpy (hwaddr, hwaddress, 6); + clib_memcpy (hwaddr, args->hwaddr, 6); } else { @@ -1397,11 +1596,9 @@ vhost_user_create_ethernet (vnet_main_t * vnm, vlib_main_t * vm, * Initialize vui with specified attributes */ static void -vhost_user_vui_init (vnet_main_t * vnm, - vhost_user_intf_t * vui, - int server_sock_fd, - const char *sock_filename, - u64 feature_mask, u32 * sw_if_index) +vhost_user_vui_init (vnet_main_t * vnm, vhost_user_intf_t * vui, + int server_sock_fd, vhost_user_create_if_args_t * args, + u32 * sw_if_index) { vnet_sw_interface_t *sw; int q; @@ -1416,6 +1613,7 @@ vhost_user_vui_init (vnet_main_t * vnm, template.read_function = vhost_user_socksvr_accept_ready; 
template.file_descriptor = server_sock_fd; template.private_data = vui - vum->vhost_user_interfaces; //hw index + template.description = format (0, "vhost user %d", sw); vui->unix_server_index = clib_file_add (&file_main, &template); } else @@ -1424,45 +1622,52 @@ vhost_user_vui_init (vnet_main_t * vnm, } vui->sw_if_index = sw->sw_if_index; - strncpy (vui->sock_filename, sock_filename, + strncpy (vui->sock_filename, args->sock_filename, ARRAY_LEN (vui->sock_filename) - 1); vui->sock_errno = 0; vui->is_ready = 0; - vui->feature_mask = feature_mask; + vui->feature_mask = args->feature_mask; vui->clib_file_index = ~0; vui->log_base_addr = 0; vui->if_index = vui - vum->vhost_user_interfaces; + vui->enable_gso = args->enable_gso; + vui->enable_event_idx = args->enable_event_idx; + vui->enable_packed = args->enable_packed; + /* + * enable_gso takes precedence over configurable feature mask if there + * is a clash. + * if feature mask disables gso, but enable_gso is configured, + * then gso is enable + * if feature mask enables gso, but enable_gso is not configured, + * then gso is enable + * + * if gso is enable via feature mask, it must enable both host and guest + * gso feature mask, we don't support one sided GSO or partial GSO. 
+ */ + if ((vui->enable_gso == 0) && + ((args->feature_mask & FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS) + == (FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS))) + vui->enable_gso = 1; + vhost_user_update_gso_interface_count (vui, 1 /* add */ ); mhash_set_mem (&vum->if_index_by_sock_name, vui->sock_filename, &vui->if_index, 0); - for (q = 0; q < VHOST_VRING_MAX_N; q++) + vec_validate_aligned (vui->vrings, (VHOST_VRING_INIT_MQ_PAIR_SZ << 1) - 1, + CLIB_CACHE_LINE_BYTES); + vui->num_qid = 2; + for (q = 0; q < vec_len (vui->vrings); q++) vhost_user_vring_init (vui, q); - hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE; + hw->caps |= VNET_HW_INTERFACE_CAP_SUPPORTS_INT_MODE; vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0); if (sw_if_index) *sw_if_index = vui->sw_if_index; - - for (q = 0; q < VHOST_VRING_MAX_N; q++) - { - vui->vring_locks[q] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, - CLIB_CACHE_LINE_BYTES); - clib_memset ((void *) vui->vring_locks[q], 0, CLIB_CACHE_LINE_BYTES); - } - - vec_validate (vui->per_cpu_tx_qid, - vlib_get_thread_main ()->n_vlib_mains - 1); - vhost_user_tx_thread_placement (vui); } int vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, - const char *sock_filename, - u8 is_server, - u32 * sw_if_index, - u64 feature_mask, - u8 renumber, u32 custom_dev_instance, u8 * hwaddr) + vhost_user_create_if_args_t * args) { vhost_user_intf_t *vui = NULL; u32 sw_if_idx = ~0; @@ -1471,26 +1676,25 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, vhost_user_main_t *vum = &vhost_user_main; uword *if_index; - if (sock_filename == NULL || !(strlen (sock_filename) > 0)) + if (args->sock_filename == NULL || !(strlen (args->sock_filename) > 0)) { return VNET_API_ERROR_INVALID_ARGUMENT; } - if_index = mhash_get (&vum->if_index_by_sock_name, (void *) sock_filename); + if_index = mhash_get (&vum->if_index_by_sock_name, + (void *) args->sock_filename); if (if_index) { - if (sw_if_index) - { - vui = 
&vum->vhost_user_interfaces[*if_index]; - *sw_if_index = vui->sw_if_index; - } + vui = &vum->vhost_user_interfaces[*if_index]; + args->sw_if_index = vui->sw_if_index; return VNET_API_ERROR_IF_ALREADY_EXISTS; } - if (is_server) + if (args->is_server) { if ((rv = - vhost_user_init_server_sock (sock_filename, &server_sock_fd)) != 0) + vhost_user_init_server_sock (args->sock_filename, + &server_sock_fd)) != 0) { return rv; } @@ -1499,19 +1703,17 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, /* Protect the uninitialized vui from being dispatched by rx/tx */ vlib_worker_thread_barrier_sync (vm); pool_get (vhost_user_main.vhost_user_interfaces, vui); - vhost_user_create_ethernet (vnm, vm, vui, hwaddr); + vhost_user_create_ethernet (vnm, vm, vui, args); vlib_worker_thread_barrier_release (vm); - vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename, - feature_mask, &sw_if_idx); + vhost_user_vui_init (vnm, vui, server_sock_fd, args, &sw_if_idx); vnet_sw_interface_set_mtu (vnm, vui->sw_if_index, 9000); vhost_user_rx_thread_placement (vui, 1); - if (renumber) - vnet_interface_name_renumber (sw_if_idx, custom_dev_instance); + if (args->renumber) + vnet_interface_name_renumber (sw_if_idx, args->custom_dev_instance); - if (sw_if_index) - *sw_if_index = sw_if_idx; + args->sw_if_index = sw_if_idx; // Process node must connect vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0); @@ -1521,10 +1723,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, int vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, - const char *sock_filename, - u8 is_server, - u32 sw_if_index, - u64 feature_mask, u8 renumber, u32 custom_dev_instance) + vhost_user_create_if_args_t * args) { vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui = NULL; @@ -1534,11 +1733,12 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, vnet_hw_interface_t *hwif; uword *if_index; - if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) || - 
hwif->dev_class_index != vhost_user_device_class.index) + if (!(hwif = vnet_get_sup_hw_interface_api_visible_or_null (vnm, + args->sw_if_index)) + || hwif->dev_class_index != vhost_user_device_class.index) return VNET_API_ERROR_INVALID_SW_IF_INDEX; - if (sock_filename == NULL || !(strlen (sock_filename) > 0)) + if (args->sock_filename == NULL || !(strlen (args->sock_filename) > 0)) return VNET_API_ERROR_INVALID_ARGUMENT; vui = vec_elt_at_index (vum->vhost_user_interfaces, hwif->dev_instance); @@ -1547,22 +1747,22 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, * Disallow changing the interface to have the same path name * as other interface */ - if_index = mhash_get (&vum->if_index_by_sock_name, (void *) sock_filename); + if_index = mhash_get (&vum->if_index_by_sock_name, + (void *) args->sock_filename); if (if_index && (*if_index != vui->if_index)) return VNET_API_ERROR_IF_ALREADY_EXISTS; // First try to open server socket - if (is_server) - if ((rv = vhost_user_init_server_sock (sock_filename, + if (args->is_server) + if ((rv = vhost_user_init_server_sock (args->sock_filename, &server_sock_fd)) != 0) return rv; vhost_user_term_if (vui); - vhost_user_vui_init (vnm, vui, server_sock_fd, - sock_filename, feature_mask, &sw_if_idx); + vhost_user_vui_init (vnm, vui, server_sock_fd, args, &sw_if_idx); - if (renumber) - vnet_interface_name_renumber (sw_if_idx, custom_dev_instance); + if (args->renumber) + vnet_interface_name_renumber (sw_if_idx, args->custom_dev_instance); // Process node must connect vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0); @@ -1575,37 +1775,46 @@ vhost_user_connect_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { + vnet_main_t *vnm = vnet_get_main (); unformat_input_t _line_input, *line_input = &_line_input; - u8 *sock_filename = NULL; - u32 sw_if_index; - u8 is_server = 0; - u64 feature_mask = (u64) ~ (0ULL); - u8 renumber = 0; - u32 custom_dev_instance = ~0; - u8 hwaddr[6]; 
- u8 *hw = NULL; clib_error_t *error = NULL; + vhost_user_create_if_args_t args = { 0 }; + int rv; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) return 0; + args.feature_mask = (u64) ~ (0ULL); + args.custom_dev_instance = ~0; + /* GSO feature is disable by default */ + args.feature_mask &= ~FEATURE_VIRTIO_NET_F_HOST_GUEST_TSO_FEATURE_BITS; + /* packed-ring feature is disable by default */ + args.feature_mask &= ~VIRTIO_FEATURE (VIRTIO_F_RING_PACKED); + /* event_idx feature is disable by default */ + args.feature_mask &= ~VIRTIO_FEATURE (VIRTIO_RING_F_EVENT_IDX); + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { - if (unformat (line_input, "socket %s", &sock_filename)) + if (unformat (line_input, "socket %s", &args.sock_filename)) ; else if (unformat (line_input, "server")) - is_server = 1; - else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask)) + args.is_server = 1; + else if (unformat (line_input, "gso")) + args.enable_gso = 1; + else if (unformat (line_input, "packed")) + args.enable_packed = 1; + else if (unformat (line_input, "event-idx")) + args.enable_event_idx = 1; + else if (unformat (line_input, "feature-mask 0x%llx", + &args.feature_mask)) ; - else - if (unformat - (line_input, "hwaddr %U", unformat_ethernet_address, hwaddr)) - hw = hwaddr; - else if (unformat (line_input, "renumber %d", &custom_dev_instance)) - { - renumber = 1; - } + else if (unformat (line_input, "hwaddr %U", unformat_ethernet_address, + args.hwaddr)) + args.use_custom_mac = 1; + else if (unformat (line_input, "renumber %d", + &args.custom_dev_instance)) + args.renumber = 1; else { error = clib_error_return (0, "unknown input `%U'", @@ -1614,22 +1823,17 @@ vhost_user_connect_command_fn (vlib_main_t * vm, } } - vnet_main_t *vnm = vnet_get_main (); - - int rv; - if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename, - is_server, &sw_if_index, feature_mask, - renumber, custom_dev_instance, hw))) + if 
((rv = vhost_user_create_if (vnm, vm, &args))) { error = clib_error_return (0, "vhost_user_create_if returned %d", rv); goto done; } - vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (), - sw_if_index); + vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnm, + args.sw_if_index); done: - vec_free (sock_filename); + vec_free (args.sock_filename); unformat_free (line_input); return error; @@ -1658,7 +1862,7 @@ vhost_user_delete_command_fn (vlib_main_t * vm, &sw_if_index)) { vnet_hw_interface_t *hwif = - vnet_get_sup_hw_interface (vnm, sw_if_index); + vnet_get_sup_hw_interface_api_visible_or_null (vnm, sw_if_index); if (hwif == NULL || vhost_user_device_class.index != hwif->dev_class_index) { @@ -1693,15 +1897,13 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, vhost_user_intf_details_t *vuid = NULL; u32 *hw_if_indices = 0; vnet_hw_interface_t *hi; - u8 *s = NULL; int i; if (!out_vuids) return -1; - pool_foreach (vui, vum->vhost_user_interfaces, - vec_add1 (hw_if_indices, vui->hw_if_index); - ); + pool_foreach (vui, vum->vhost_user_interfaces) + vec_add1 (hw_if_indices, vui->hw_if_index); for (i = 0; i < vec_len (hw_if_indices); i++) { @@ -1715,17 +1917,13 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, vuid->num_regions = vui->nregions; vuid->is_server = vui->unix_server_index != ~0; vuid->sock_errno = vui->sock_errno; - strncpy ((char *) vuid->sock_filename, (char *) vui->sock_filename, - sizeof (vuid->sock_filename)); - vuid->sock_filename[ARRAY_LEN (vuid->sock_filename) - 1] = '\0'; - s = format (s, "%v%c", hi->name, 0); - - strncpy ((char *) vuid->if_name, (char *) s, - ARRAY_LEN (vuid->if_name) - 1); - _vec_len (s) = 0; + snprintf ((char *) vuid->sock_filename, sizeof (vuid->sock_filename), + "%s", vui->sock_filename); + memcpy_s (vuid->if_name, sizeof (vuid->if_name), hi->name, + clib_min (vec_len (hi->name), sizeof (vuid->if_name) - 1)); + vuid->if_name[sizeof (vuid->if_name) - 1] = 0; } - vec_free 
(s); vec_free (hw_if_indices); *out_vuids = r_vuids; @@ -1733,6 +1931,195 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, return rv; } +static u8 * +format_vhost_user_desc (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + vring_desc_t *desc_table = va_arg (*args, vring_desc_t *); + int idx = va_arg (*args, int); + u32 *mem_hint = va_arg (*args, u32 *); + + s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len, + desc_table[idx].flags, desc_table[idx].next, + pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr, + mem_hint))); + return s; +} + +static void +vhost_user_show_fds (vlib_main_t * vm, vhost_user_vring_t * vq) +{ + int kickfd = UNIX_GET_FD (vq->kickfd_idx); + int callfd = UNIX_GET_FD (vq->callfd_idx); + + vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n", kickfd, callfd, + vq->errfd); +} + +static void +vhost_user_show_desc (vlib_main_t * vm, vhost_user_intf_t * vui, int q, + int show_descr, int show_verbose) +{ + int j; + u32 mem_hint = 0; + u32 idx; + u32 n_entries; + vring_desc_t *desc_table; + vhost_user_vring_t *vq = &vui->vrings[q]; + + if (vq->avail && vq->used) + vlib_cli_output (vm, + " avail.flags %x avail event idx %u avail.idx %d " + "used.flags %x used event idx %u used.idx %d\n", + vq->avail->flags, vhost_user_avail_event_idx (vq), + vq->avail->idx, vq->used->flags, + vhost_user_used_event_idx (vq), vq->used->idx); + + vhost_user_show_fds (vm, vq); + + if (show_descr) + { + vlib_cli_output (vm, "\n descriptor table:\n"); + vlib_cli_output (vm, + " slot addr len flags next " + "user_addr\n"); + vlib_cli_output (vm, + " ===== ================== ===== ====== ===== " + "==================\n"); + for (j = 0; j < vq->qsz_mask + 1; j++) + { + desc_table = vq->desc; + vlib_cli_output (vm, "%U", format_vhost_user_desc, + " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", vui, + desc_table, j, &mem_hint); + if (show_verbose && 
(desc_table[j].flags & VRING_DESC_F_INDIRECT)) + { + n_entries = desc_table[j].len / sizeof (vring_desc_t); + desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint); + if (desc_table) + { + for (idx = 0; idx < clib_min (20, n_entries); idx++) + { + vlib_cli_output + (vm, "%U", format_vhost_user_desc, + "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, idx, &mem_hint); + } + if (n_entries >= 20) + vlib_cli_output (vm, "Skip displaying entries 20...%u\n", + n_entries); + } + } + } + } +} + +static u8 * +format_vhost_user_packed_desc (u8 * s, va_list * args) +{ + char *fmt = va_arg (*args, char *); + vhost_user_intf_t *vui = va_arg (*args, vhost_user_intf_t *); + vring_packed_desc_t *desc_table = va_arg (*args, vring_packed_desc_t *); + int idx = va_arg (*args, int); + u32 *mem_hint = va_arg (*args, u32 *); + + s = format (s, fmt, idx, desc_table[idx].addr, desc_table[idx].len, + desc_table[idx].flags, desc_table[idx].id, + pointer_to_uword (map_guest_mem (vui, desc_table[idx].addr, + mem_hint))); + return s; +} + +static u8 * +format_vhost_user_event_idx_flags (u8 * s, va_list * args) +{ + u32 flags = va_arg (*args, u32); + typedef struct + { + u8 value; + char *str; + } event_idx_flags; + static event_idx_flags event_idx_array[] = { +#define _(s,v) { .str = #s, .value = v, }, + foreach_virtio_event_idx_flags +#undef _ + }; + u32 num_entries = sizeof (event_idx_array) / sizeof (event_idx_flags); + + if (flags < num_entries) + s = format (s, "%s", event_idx_array[flags].str); + else + s = format (s, "%u", flags); + return s; +} + +static void +vhost_user_show_desc_packed (vlib_main_t * vm, vhost_user_intf_t * vui, int q, + int show_descr, int show_verbose) +{ + int j; + u32 mem_hint = 0; + u32 idx; + u32 n_entries; + vring_packed_desc_t *desc_table; + vhost_user_vring_t *vq = &vui->vrings[q]; + u16 off_wrap, event_idx; + + off_wrap = vq->avail_event->off_wrap; + event_idx = off_wrap & 0x7fff; + vlib_cli_output (vm, " avail_event.flags %U 
avail_event.off_wrap %u " + "avail event idx %u\n", format_vhost_user_event_idx_flags, + (u32) vq->avail_event->flags, off_wrap, event_idx); + + off_wrap = vq->used_event->off_wrap; + event_idx = off_wrap & 0x7fff; + vlib_cli_output (vm, " used_event.flags %U used_event.off_wrap %u " + "used event idx %u\n", format_vhost_user_event_idx_flags, + (u32) vq->used_event->flags, off_wrap, event_idx); + + vlib_cli_output (vm, " avail wrap counter %u, used wrap counter %u\n", + vq->avail_wrap_counter, vq->used_wrap_counter); + + vhost_user_show_fds (vm, vq); + + if (show_descr) + { + vlib_cli_output (vm, "\n descriptor table:\n"); + vlib_cli_output (vm, + " slot addr len flags id " + "user_addr\n"); + vlib_cli_output (vm, + " ===== ================== ===== ====== ===== " + "==================\n"); + for (j = 0; j < vq->qsz_mask + 1; j++) + { + desc_table = vq->packed_desc; + vlib_cli_output (vm, "%U", format_vhost_user_packed_desc, + " %-5u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, j, &mem_hint); + if (show_verbose && (desc_table[j].flags & VRING_DESC_F_INDIRECT)) + { + n_entries = desc_table[j].len >> 4; + desc_table = map_guest_mem (vui, desc_table[j].addr, &mem_hint); + if (desc_table) + { + for (idx = 0; idx < clib_min (20, n_entries); idx++) + { + vlib_cli_output + (vm, "%U", format_vhost_user_packed_desc, + "> %-4u 0x%016lx %-5u 0x%04x %-5u 0x%016lx\n", vui, + desc_table, idx, &mem_hint); + } + if (n_entries >= 20) + vlib_cli_output (vm, "Skip displaying entries 20...%u\n", + n_entries); + } + } + } + } +} + clib_error_t * show_vhost_user_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -1745,9 +2132,9 @@ show_vhost_user_command_fn (vlib_main_t * vm, u32 hw_if_index, *hw_if_indices = 0; vnet_hw_interface_t *hi; u16 qid; - u32 ci; int i, j, q; int show_descr = 0; + int show_verbose = 0; struct feat_struct { u8 bit; @@ -1757,7 +2144,7 @@ show_vhost_user_command_fn (vlib_main_t * vm, static struct feat_struct feat_array[] = { #define _(s,b) { 
.str = #s, .bit = b, }, - foreach_virtio_net_feature + foreach_virtio_net_features #undef _ {.str = NULL} }; @@ -1789,6 +2176,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, } else if (unformat (input, "descriptors") || unformat (input, "desc")) show_descr = 1; + else if (unformat (input, "verbose")) + show_verbose = 1; else { error = clib_error_return (0, "unknown input `%U'", @@ -1798,15 +2187,21 @@ show_vhost_user_command_fn (vlib_main_t * vm, } if (vec_len (hw_if_indices) == 0) { - pool_foreach (vui, vum->vhost_user_interfaces, - vec_add1 (hw_if_indices, vui->hw_if_index); - ); + pool_foreach (vui, vum->vhost_user_interfaces) + vec_add1 (hw_if_indices, vui->hw_if_index); } vlib_cli_output (vm, "Virtio vhost-user interfaces"); vlib_cli_output (vm, "Global:\n coalesce frames %d time %e", vum->coalesce_frames, vum->coalesce_time); - vlib_cli_output (vm, " number of rx virtqueues in interrupt mode: %d", + vlib_cli_output (vm, " Number of rx virtqueues in interrupt mode: %d", vum->ifq_count); + vlib_cli_output (vm, " Number of GSO interfaces: %d", vum->gso_count); + for (u32 tid = 0; tid <= vlib_num_workers (); tid++) + { + vhost_cpu_t *cpu = vec_elt_at_index (vum->cpus, tid); + vlib_cli_output (vm, " Thread %u: Polling queue count %u", tid, + cpu->polling_q_count); + } for (i = 0; i < vec_len (hw_if_indices); i++) { @@ -1815,6 +2210,13 @@ show_vhost_user_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "Interface: %U (ifindex %d)", format_vnet_hw_if_index_name, vnm, hw_if_indices[i], hw_if_indices[i]); + vlib_cli_output (vm, " Number of qids %u", vui->num_qid); + if (vui->enable_gso) + vlib_cli_output (vm, " GSO enable"); + if (vui->enable_packed) + vlib_cli_output (vm, " Packed ring enable"); + if (vui->enable_event_idx) + vlib_cli_output (vm, " Event index enable"); vlib_cli_output (vm, "virtio_net_hdr_sz %d\n" " features mask (0x%llx): \n" @@ -1851,32 +2253,31 @@ show_vhost_user_command_fn (vlib_main_t * vm, vlib_cli_output (vm, " rx placement: "); - for 
(qid = 1; qid < VHOST_VRING_MAX_N / 2; qid += 2) - { - vnet_main_t *vnm = vnet_get_main (); - uword thread_index; - vnet_hw_interface_rx_mode mode; - vhost_user_vring_t *txvq = &vui->vrings[qid]; - - if (txvq->qid == -1) - continue; - thread_index = - vnet_get_device_input_thread_index (vnm, vui->hw_if_index, - qid >> 1); - vnet_hw_interface_get_rx_mode (vnm, vui->hw_if_index, qid >> 1, - &mode); - vlib_cli_output (vm, " thread %d on vring %d, %U\n", - thread_index, qid, - format_vnet_hw_interface_rx_mode, mode); - } + FOR_ALL_VHOST_TXQ (qid, vui) + { + vhost_user_vring_t *txvq = &vui->vrings[qid]; + + if (txvq->qid == -1) + continue; + vlib_cli_output (vm, " thread %d on vring %d, %U\n", + txvq->thread_index, qid, format_vnet_hw_if_rx_mode, + txvq->mode); + } - vlib_cli_output (vm, " tx placement: %s\n", - vui->use_tx_spinlock ? "spin-lock" : "lock-free"); + vlib_cli_output (vm, " tx placement\n"); - vec_foreach_index (ci, vui->per_cpu_tx_qid) + FOR_ALL_VHOST_RXQ (qid, vui) { - vlib_cli_output (vm, " thread %d on vring %d\n", ci, - VHOST_VRING_IDX_RX (vui->per_cpu_tx_qid[ci])); + vhost_user_vring_t *rxvq = &vui->vrings[qid]; + vnet_hw_if_tx_queue_t *txq; + + if (rxvq->queue_index == ~0) + continue; + txq = vnet_hw_if_get_tx_queue (vnm, rxvq->queue_index); + if (txq->threads) + vlib_cli_output (vm, " threads %U on vring %u: %s\n", + format_bitmap_list, txq->threads, qid, + txq->shared_queue ? "spin-lock" : "lock-free"); } vlib_cli_output (vm, "\n"); @@ -1901,57 +2302,29 @@ show_vhost_user_command_fn (vlib_main_t * vm, vui->regions[j].mmap_offset, pointer_to_uword (vui->region_mmap_addr[j])); } - for (q = 0; q < VHOST_VRING_MAX_N; q++) - { - if (!vui->vrings[q].started) - continue; - - vlib_cli_output (vm, "\n Virtqueue %d (%s%s)\n", q, - (q & 1) ? "RX" : "TX", - vui->vrings[q].enabled ? 
"" : " disabled"); - - vlib_cli_output (vm, - " qsz %d last_avail_idx %d last_used_idx %d\n", - vui->vrings[q].qsz_mask + 1, - vui->vrings[q].last_avail_idx, - vui->vrings[q].last_used_idx); - - if (vui->vrings[q].avail && vui->vrings[q].used) - vlib_cli_output (vm, - " avail.flags %x avail.idx %d used.flags %x used.idx %d\n", - vui->vrings[q].avail->flags, - vui->vrings[q].avail->idx, - vui->vrings[q].used->flags, - vui->vrings[q].used->idx); - - int kickfd = UNIX_GET_FD (vui->vrings[q].kickfd_idx); - int callfd = UNIX_GET_FD (vui->vrings[q].callfd_idx); - vlib_cli_output (vm, " kickfd %d callfd %d errfd %d\n", - kickfd, callfd, vui->vrings[q].errfd); - - if (show_descr) - { - vlib_cli_output (vm, "\n descriptor table:\n"); - vlib_cli_output (vm, - " id addr len flags next user_addr\n"); - vlib_cli_output (vm, - " ===== ================== ===== ====== ===== ==================\n"); - for (j = 0; j < vui->vrings[q].qsz_mask + 1; j++) - { - u32 mem_hint = 0; - vlib_cli_output (vm, - " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n", - j, vui->vrings[q].desc[j].addr, - vui->vrings[q].desc[j].len, - vui->vrings[q].desc[j].flags, - vui->vrings[q].desc[j].next, - pointer_to_uword (map_guest_mem - (vui, - vui->vrings[q].desc[j]. - addr, &mem_hint))); - } - } - } + FOR_ALL_VHOST_RX_TXQ (q, vui) + { + if (!vui->vrings[q].started) + continue; + + vlib_cli_output (vm, "\n Virtqueue %d (%s%s)\n", q, + (q & 1) ? "RX" : "TX", + vui->vrings[q].enabled ? "" : " disabled"); + vlib_cli_output (vm, " global %s queue index %u\n", + (q & 1) ? 
"RX" : "TX", vui->vrings[q].queue_index); + + vlib_cli_output ( + vm, + " qsz %d last_avail_idx %d last_used_idx %d" + " last_kick %u\n", + vui->vrings[q].qsz_mask + 1, vui->vrings[q].last_avail_idx, + vui->vrings[q].last_used_idx, vui->vrings[q].last_kick); + + if (vhost_user_is_packed_ring_supported (vui)) + vhost_user_show_desc_packed (vm, vui, q, show_descr, show_verbose); + else + vhost_user_show_desc (vm, vui, q, show_descr, show_verbose); + } vlib_cli_output (vm, "\n"); } done: @@ -1970,23 +2343,25 @@ done: * * There are several parameters associated with a vHost interface: * - * - socket - Name of the linux socket used by hypervisor - * and VPP to manage the vHost interface. If in 'server' mode, VPP will - * create the socket if it does not already exist. If in 'client' mode, - * hypervisor will create the socket if it does not already exist. The VPP code - * is indifferent to the file location. However, if SELinux is enabled, then the - * socket needs to be created in '/var/run/vpp/'. + * - socket - Name of the linux socket used by + * hypervisor and VPP to manage the vHost interface. If in server + * mode, VPP will create the socket if it does not already exist. If in + * client mode, hypervisor will create the socket if it does not + * already exist. The VPP code is indifferent to the file location. However, + * if SELinux is enabled, then the socket needs to be created in + * /var/run/vpp/. * - * - server - Optional flag to indicate that VPP should be the server for - * the linux socket. If not provided, VPP will be the client. In 'server' - * mode, the VM can be reset without tearing down the vHost Interface. In - * 'client' mode, VPP can be reset without bringing down the VM and - * tearing down the vHost Interface. + * - server - Optional flag to indicate that VPP should be the server + * for the linux socket. If not provided, VPP will be the client. In + * server mode, the VM can be reset without tearing down the vHost + * Interface. 
In client mode, VPP can be reset without bringing down + * the VM and tearing down the vHost Interface. * - * - feature-mask - Optional virtio/vhost feature set negotiated at - * startup. This is intended for degugging only. It is recommended that this - * parameter not be used except by experienced users. By default, all supported - * features will be advertised. Otherwise, provide the set of features desired. + * - feature-mask - Optional virtio/vhost feature set negotiated + * at startup. This is intended for debugging only. It is recommended + * that this parameter not be used except by experienced users. By default, + * all supported features will be advertised. Otherwise, provide the set of + * features desired. * - 0x000008000 (15) - VIRTIO_NET_F_MRG_RXBUF * - 0x000020000 (17) - VIRTIO_NET_F_CTRL_VQ * - 0x000200000 (21) - VIRTIO_NET_F_GUEST_ANNOUNCE @@ -2000,18 +2375,21 @@ done: * - hwaddr - Optional ethernet address, can be in either * X:X:X:X:X:X unix or X.X.X cisco format. * - * - renumber - Optional parameter which allows the instance - * in the name to be specified. If instance already exists, name will be used - * anyway and multiple instances will have the same name. Use with caution. + * - renumber - Optional parameter which allows the + * instance in the name to be specified. If instance already exists, name + * will be used anyway and multiple instances will have the same name. Use + * with caution. 
* * @cliexpar - * Example of how to create a vhost interface with VPP as the client and all features enabled: + * Example of how to create a vhost interface with VPP as the client and all + * features enabled: * @cliexstart{create vhost-user socket /var/run/vpp/vhost1.sock} * VirtualEthernet0/0/0 * @cliexend - * Example of how to create a vhost interface with VPP as the server and with just - * multiple queues enabled: - * @cliexstart{create vhost-user socket /var/run/vpp/vhost2.sock server feature-mask 0x40400000} + * Example of how to create a vhost interface with VPP as the server and with + * just multiple queues enabled: + * @cliexstart{create vhost-user socket /var/run/vpp/vhost2.sock server + * feature-mask 0x40400000} * VirtualEthernet0/0/1 * @cliexend * Once the vHost interface is created, enable the interface using: @@ -2021,7 +2399,8 @@ done: VLIB_CLI_COMMAND (vhost_user_connect_command, static) = { .path = "create vhost-user", .short_help = "create vhost-user socket [server] " - "[feature-mask ] [hwaddr ] [renumber ] ", + "[feature-mask ] [hwaddr ] [renumber ] [gso] " + "[packed] [event-idx]", .function = vhost_user_connect_command_fn, .is_mp_safe = 1, }; @@ -2048,9 +2427,9 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = { /*? * Display the attributes of a single vHost User interface (provide interface - * name), multiple vHost User interfaces (provide a list of interface names seperated - * by spaces) or all Vhost User interfaces (omit an interface name to display all - * vHost interfaces). + * name), multiple vHost User interfaces (provide a list of interface names + * separated by spaces) or all Vhost User interfaces (omit an interface name + * to display all vHost interfaces). 
* * @cliexpar * @parblock @@ -2084,10 +2463,10 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = { * thread 2 on vring 0 * * Memory regions (total 2) - * region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr - * ====== ===== ================== ================== ================== ================== ================== - * 0 60 0x0000000000000000 0x00000000000a0000 0x00002aaaaac00000 0x0000000000000000 0x00002aab2b400000 - * 1 61 0x00000000000c0000 0x000000003ff40000 0x00002aaaaacc0000 0x00000000000c0000 0x00002aababcc0000 + * region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr + * ====== == =============== =========== ============== =========== ========== + * 0 60 0x00000000 0x000a0000 0xaac00000 0x00000000 0x2b400000 + * 1 61 0x000c0000 0x3ff40000 0xaacc0000 0x000c0000 0xabcc0000 * * Virtqueue 0 (TX) * qsz 256 last_avail_idx 0 last_used_idx 0 @@ -2131,8 +2510,9 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = { * * @cliexend * - * The optional 'descriptors' parameter will display the same output as - * the previous example but will include the descriptor table for each queue. + * The optional 'descriptors' parameter will display the same output + * as the previous example but will include the descriptor table for each + * queue. 
* The output is truncated below: * @cliexstart{show vhost-user VirtualEthernet0/0/0 descriptors} * Virtio vhost-user interfaces @@ -2182,7 +2562,8 @@ VLIB_CLI_COMMAND (vhost_user_delete_command, static) = { /* *INDENT-OFF* */ VLIB_CLI_COMMAND (show_vhost_user_command, static) = { .path = "show vhost-user", - .short_help = "show vhost-user [ [ [..]]] [descriptors]", + .short_help = "show vhost-user [ [ [..]]] " + "[[descriptors] [verbose]]", .function = show_vhost_user_command_fn, }; /* *INDENT-ON* */ @@ -2220,9 +2601,8 @@ vhost_user_unmap_all (void) if (vum->dont_dump_vhost_user_memory) { - pool_foreach (vui, vum->vhost_user_interfaces, - unmap_all_mem_regions (vui); - ); + pool_foreach (vui, vum->vhost_user_interfaces) + unmap_all_mem_regions (vui); } }