From 4667c229a05458eb7558cf927a44df3d5f585c11 Mon Sep 17 00:00:00 2001 From: Yoann Desmouceaux Date: Wed, 24 Feb 2016 22:51:00 +0100 Subject: [PATCH 1/1] Add live migration support to (non-dpdk) vhost-user driver MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch adds live migration support to vhost interfaces, by supporting the VHOST_F_LOG_ALL feature. When qemu starts a migration, it will negotiate this feature, and provide a fd for a "dirty log" shared mem space. This log is a bitmap representing pages in the device memory. Whenever we touch memory pointed to by a "desc" vring, or modify a "used" vring, we log the corresponding page in the bitmap. This allows qemu to send the dirty page to the destination host. See https://github.com/qemu/qemu/blob/master/docs/specs/vhost-user.txt, § "Live migration" for more details. In addition to this, this code provides support for the VHOST_USER_F_PROTOCOL_FEATURES feature, and for the VHOST_USER_{GET,SET}_PROTOCOL_FEATURES and VHOST_USER_SET_VRING_ENABLE messages, required for live migration. 
Change-Id: I7577efce8bd67653218f4291af1d651de451e552 Signed-off-by: Yoann Desmouceaux --- vnet/vnet/devices/virtio/vhost-user.c | 126 ++++++++++++++++++++++++++++++++-- vnet/vnet/devices/virtio/vhost-user.h | 26 +++++-- 2 files changed, 144 insertions(+), 8 deletions(-) diff --git a/vnet/vnet/devices/virtio/vhost-user.c b/vnet/vnet/devices/virtio/vhost-user.c index df09db87e57..945f03a1eea 100644 --- a/vnet/vnet/devices/virtio/vhost-user.c +++ b/vnet/vnet/devices/virtio/vhost-user.c @@ -230,12 +230,38 @@ static inline void vhost_user_if_disconnect(vhost_user_intf_t * vui) vui->vrings[q].desc = NULL; vui->vrings[q].avail = NULL; vui->vrings[q].used = NULL; + vui->vrings[q].log_guest_addr = 0; } unmap_all_mem_regions(vui); DBG_SOCK("interface ifindex %d disconnected", vui->sw_if_index); } +#define VHOST_LOG_PAGE 0x1000 +always_inline void vhost_user_log_dirty_pages(vhost_user_intf_t * vui, + u64 addr, u64 len) +{ + if (PREDICT_TRUE(vui->log_base_addr == 0 + || !(vui->features & (1 << FEAT_VHOST_F_LOG_ALL)))) { + return; + } + if (PREDICT_FALSE((addr + len - 1) / VHOST_LOG_PAGE / 8 >= vui->log_size)) { + DBG_SOCK("vhost_user_log_dirty_pages(): out of range\n"); + return; + } + + CLIB_MEMORY_BARRIER(); + u64 page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + ((u8*)vui->log_base_addr)[page / 8] |= 1 << page % 8; + page++; + } +} + +#define vhost_user_log_dirty_ring(vui, vq, member) \ + vhost_user_log_dirty_pages(vui, vq->log_guest_addr + offsetof(vring_used_t, member), \ + sizeof(vq->used->member)) + static clib_error_t * vhost_user_socket_read (unix_file_t * uf) { int n, i; @@ -313,7 +339,10 @@ static clib_error_t * vhost_user_socket_read (unix_file_t * uf) msg.flags |= 4; msg.u64 = (1 << FEAT_VIRTIO_NET_F_MRG_RXBUF) | - (1 << FEAT_VIRTIO_F_ANY_LAYOUT); + (1 << FEAT_VIRTIO_F_ANY_LAYOUT) | + (1 << FEAT_VHOST_F_LOG_ALL) | + (1 << FEAT_VIRTIO_NET_F_GUEST_ANNOUNCE) | + (1 << FEAT_VHOST_USER_F_PROTOCOL_FEATURES); msg.u64 &= vui->feature_mask; 
msg.size = sizeof(msg.u64); @@ -324,6 +353,7 @@ static clib_error_t * vhost_user_socket_read (unix_file_t * uf) vui->hw_if_index, msg.u64); vui->features = msg.u64; + if (vui->features & (1 << FEAT_VIRTIO_NET_F_MRG_RXBUF)) vui->virtio_net_hdr_sz = 12; else @@ -339,6 +369,7 @@ static clib_error_t * vhost_user_socket_read (unix_file_t * uf) vui->vrings[q].desc = 0; vui->vrings[q].avail = 0; vui->vrings[q].used = 0; + vui->vrings[q].log_guest_addr = 0; } DBG_SOCK("interface %d disconnected", vui->sw_if_index); @@ -419,6 +450,15 @@ static clib_error_t * vhost_user_socket_read (unix_file_t * uf) goto close_socket; } + vui->vrings[msg.state.index].log_guest_addr = msg.addr.log_guest_addr; + + /* Spec says: If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, + the ring is initialized in an enabled state. */ + + if (!(vui->features & (1 << FEAT_VHOST_USER_F_PROTOCOL_FEATURES))) { + vui->vrings[msg.state.index].enabled = 1; + } + vui->vrings[msg.state.index].last_used_idx = vui->vrings[msg.state.index].used->idx; @@ -509,7 +549,10 @@ static clib_error_t * vhost_user_socket_read (unix_file_t * uf) DBG_SOCK("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d", vui->hw_if_index, msg.state.index, msg.state.num); - msg.state.num = vui->vrings[msg.state.index].last_used_idx; + /* Spec says: Client must [...] stop ring upon receiving VHOST_USER_GET_VRING_BASE. 
*/ + vui->vrings[msg.state.index].enabled = 0; + + msg.state.num = vui->vrings[msg.state.index].last_avail_idx; msg.flags |= 4; msg.size = sizeof(msg.state); break; @@ -521,10 +564,45 @@ static clib_error_t * vhost_user_socket_read (unix_file_t * uf) break; case VHOST_USER_SET_LOG_BASE: + { DBG_SOCK("if %d msg VHOST_USER_SET_LOG_BASE", vui->hw_if_index); + if (msg.size != sizeof(msg.log)) { + DBG_SOCK("invalid msg size for VHOST_USER_SET_LOG_BASE: %d instead of %d", + msg.size, sizeof(msg.log)); + goto close_socket; + } + + if (!(vui->protocol_features & (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD))) { + DBG_SOCK("VHOST_USER_PROTOCOL_F_LOG_SHMFD not set but VHOST_USER_SET_LOG_BASE received"); + goto close_socket; + } + + fd = fds[0]; + /* align size to 2M page */ + long page_sz = get_huge_page_size(fd); + ssize_t map_sz = (msg.log.size + msg.log.offset + page_sz) & ~(page_sz - 1); + + vui->log_base_addr = mmap(0, map_sz, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + + DBG_SOCK("map log region addr 0 len 0x%lx off 0x%lx fd %d mapped 0x%lx", + map_sz, msg.log.offset, fd, vui->log_base_addr); + + if (vui->log_base_addr == MAP_FAILED) { + clib_warning("failed to map memory. 
errno is %d", errno); + goto close_socket; + } + + vui->log_base_addr += msg.log.offset; + vui->log_size = msg.log.size; + + msg.flags |= 4; + msg.size = sizeof(msg.u64); + break; + } case VHOST_USER_SET_LOG_FD: DBG_SOCK("if %d msg VHOST_USER_SET_LOG_FD", @@ -532,6 +610,28 @@ static clib_error_t * vhost_user_socket_read (unix_file_t * uf) break; + case VHOST_USER_GET_PROTOCOL_FEATURES: + DBG_SOCK("if %d msg VHOST_USER_GET_PROTOCOL_FEATURES", vui->hw_if_index); + + msg.flags |= 4; + msg.u64 = (1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD); + msg.size = sizeof(msg.u64); + break; + + case VHOST_USER_SET_PROTOCOL_FEATURES: + DBG_SOCK("if %d msg VHOST_USER_SET_PROTOCOL_FEATURES features 0x%lx", + vui->hw_if_index, msg.u64); + + vui->protocol_features = msg.u64; + + break; + + case VHOST_USER_SET_VRING_ENABLE: + DBG_SOCK("if %d VHOST_USER_SET_VRING_ENABLE, enable: %d", + vui->hw_if_index, msg.state.num); + vui->vrings[msg.state.index].enabled = msg.state.num; + break; + default: DBG_SOCK("unknown vhost-user message %d received. closing socket", msg.request); @@ -750,6 +850,7 @@ static inline void vhost_user_send_call(vlib_main_t * vm, vhost_user_vring_t * v vq->int_deadline = vlib_time_now(vm) + vum->coalesce_time; } + static u32 vhost_user_if_input ( vlib_main_t * vm, vhost_user_main_t * vum, vhost_user_intf_t * vui, @@ -770,7 +871,7 @@ static u32 vhost_user_if_input ( vlib_main_t * vm, vec_reset_length (vui->d_trace_buffers); /* no descriptor ptr - bail out */ - if (PREDICT_FALSE(!txvq->desc || !txvq->avail)) + if (PREDICT_FALSE(!txvq->desc || !txvq->avail || !txvq->enabled)) return 0; /* do we have pending intterupts ? 
*/ @@ -799,6 +900,7 @@ static u32 vhost_user_if_input ( vlib_main_t * vm, txvq->last_avail_idx = txvq->last_used_idx = txvq->avail->idx; CLIB_MEMORY_BARRIER(); txvq->used->idx = txvq->last_used_idx; + vhost_user_log_dirty_ring(vui, txvq, idx); vhost_user_send_call(vm, txvq); return 0; } @@ -849,6 +951,7 @@ static u32 vhost_user_if_input ( vlib_main_t * vm, txvq->last_avail_idx++; txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head; txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0; + vhost_user_log_dirty_ring(vui, txvq, ring[txvq->last_used_idx & qsz_mask]); txvq->last_used_idx++; flush--; } @@ -914,6 +1017,7 @@ static u32 vhost_user_if_input ( vlib_main_t * vm, txvq->last_avail_idx++; txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head; txvq->used->ring[txvq->last_used_idx & qsz_mask].len = 0; + vhost_user_log_dirty_ring(vui, txvq, ring[txvq->last_used_idx & qsz_mask]); txvq->last_used_idx++; if(PREDICT_FALSE(b_head->current_length < 14 && @@ -957,6 +1061,7 @@ static u32 vhost_user_if_input ( vlib_main_t * vm, /* give buffers back to driver */ CLIB_MEMORY_BARRIER(); txvq->used->idx = txvq->last_used_idx; + vhost_user_log_dirty_ring(vui, txvq, idx); if (PREDICT_FALSE (vec_len (vui->d_trace_buffers) > 0)) { @@ -1052,7 +1157,7 @@ vhost_user_intfc_tx (vlib_main_t * vm, if (PREDICT_FALSE(!vui->is_up)) goto done2; - if (PREDICT_FALSE(!rxvq->desc || !rxvq->avail || vui->sock_errno != 0)) { + if (PREDICT_FALSE(!rxvq->desc || !rxvq->avail || vui->sock_errno != 0 || !rxvq->enabled)) { error = VHOST_USER_TX_FUNC_ERROR_NOT_READY; goto done2; } @@ -1111,6 +1216,8 @@ vhost_user_intfc_tx (vlib_main_t * vm, hdr->hdr.flags = 0; hdr->hdr.gso_type = 0; + vhost_user_log_dirty_pages(vui, rxvq->desc[desc_current].addr, vui->virtio_net_hdr_sz); + if (vui->virtio_net_hdr_sz == 12) hdr->num_buffers = 1; @@ -1149,6 +1256,7 @@ vhost_user_intfc_tx (vlib_main_t * vm, //Move from available to used buffer rxvq->used->ring[used_index & qsz_mask].id = 
desc_chain_head; rxvq->used->ring[used_index & qsz_mask].len = desc_len; + vhost_user_log_dirty_ring(vui, rxvq, ring[used_index & qsz_mask]); rxvq->last_avail_idx++; used_index++; hdr->num_buffers++; @@ -1182,6 +1290,7 @@ vhost_user_intfc_tx (vlib_main_t * vm, u16 bytes_to_copy = bytes_left > (rxvq->desc[desc_current].len - offset) ? (rxvq->desc[desc_current].len - offset) : bytes_left; rte_memcpy(buffer_addr, vlib_buffer_get_current (current_b0) + current_b0->current_length - bytes_left, bytes_to_copy); + vhost_user_log_dirty_pages(vui, rxvq->desc[desc_current].addr + offset, bytes_to_copy); bytes_left -= bytes_to_copy; offset += bytes_to_copy; buffer_addr += bytes_to_copy; @@ -1191,6 +1300,8 @@ vhost_user_intfc_tx (vlib_main_t * vm, //Move from available to used ring rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head; rxvq->used->ring[used_index & qsz_mask].len = desc_len; + vhost_user_log_dirty_ring(vui, rxvq, ring[used_index & qsz_mask]); + rxvq->last_avail_idx++; used_index++; } @@ -1198,6 +1309,7 @@ vhost_user_intfc_tx (vlib_main_t * vm, done: CLIB_MEMORY_BARRIER(); rxvq->used->idx = used_index; + vhost_user_log_dirty_ring(vui, rxvq, idx); /* interrupt (call) handling */ if((rxvq->callfd > 0) && !(rxvq->avail->flags & 1)) { @@ -1473,6 +1585,7 @@ static void vhost_user_vui_init(vnet_main_t * vnm, vnet_sw_interface_t * sw; sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index); vlib_thread_main_t * tm = vlib_get_thread_main(); + int q; vui->unix_fd = sockfd; vui->sw_if_index = sw->sw_if_index; @@ -1484,6 +1597,11 @@ static void vhost_user_vui_init(vnet_main_t * vnm, vui->feature_mask = feature_mask; vui->active = 1; vui->unix_file_index = ~0; + vui->log_base_addr = 0; + + for (q = 0; q < 2; q++) { + vui->vrings[q].enabled = 0; + } vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0); diff --git a/vnet/vnet/devices/virtio/vhost-user.h b/vnet/vnet/devices/virtio/vhost-user.h index bf3f6d5352b..83dbf3e3b22 100644 --- 
a/vnet/vnet/devices/virtio/vhost-user.h +++ b/vnet/vnet/devices/virtio/vhost-user.h @@ -26,9 +26,11 @@ #define VIRTQ_DESC_F_NEXT 1 #define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 + #if RTE_VERSION >= RTE_VERSION_NUM(2, 2, 0, 0) #define VHOST_USER_F_PROTOCOL_FEATURES 30 -#define VHOST_USER_PROTOCOL_F_MQ 0 #define VHOST_USER_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_MQ) /* If multiqueue is provided by host, then we suppport it. */ @@ -40,7 +42,11 @@ #define foreach_virtio_net_feature \ _ (VIRTIO_NET_F_MRG_RXBUF, 15) \ - _ (VIRTIO_F_ANY_LAYOUT, 27) + _ (VIRTIO_F_ANY_LAYOUT, 27) \ + _ (VHOST_F_LOG_ALL, 26) \ + _ (VIRTIO_NET_F_GUEST_ANNOUNCE, 21) \ + _ (VHOST_USER_F_PROTOCOL_FEATURES, 30) + typedef enum { #define _(f,n) FEAT_##f = (n), @@ -80,6 +86,11 @@ typedef struct vhost_vring_addr { u64 desc_user_addr, used_user_addr, avail_user_addr, log_guest_addr; } vhost_vring_addr_t; +typedef struct vhost_user_log { + u64 size; + u64 offset; +} vhost_user_log_t; + typedef enum vhost_user_req { VHOST_USER_NONE = 0, VHOST_USER_GET_FEATURES = 1, @@ -96,12 +107,12 @@ typedef enum vhost_user_req { VHOST_USER_SET_VRING_KICK = 12, VHOST_USER_SET_VRING_CALL = 13, VHOST_USER_SET_VRING_ERR = 14, -#if RTE_VERSION >= RTE_VERSION_NUM(2, 2, 0, 0) VHOST_USER_GET_PROTOCOL_FEATURES = 15, VHOST_USER_SET_PROTOCOL_FEATURES = 16, +#if RTE_VERSION >= RTE_VERSION_NUM(2, 2, 0, 0) VHOST_USER_GET_QUEUE_NUM = 17, - VHOST_USER_SET_VRING_ENABLE = 18, #endif + VHOST_USER_SET_VRING_ENABLE = 18, VHOST_USER_MAX } vhost_user_req_t; @@ -151,6 +162,7 @@ typedef struct vhost_user_msg { vhost_vring_state_t state; vhost_vring_addr_t addr; vhost_user_memory_t memory; + vhost_user_log_t log; }; } __attribute ((packed)) vhost_user_msg_t; @@ -161,9 +173,11 @@ typedef struct { vring_desc_t *desc; vring_avail_t *avail; vring_used_t *used; + u64 log_guest_addr; int callfd; int kickfd; int errfd; + u32 enabled; u32 callfd_idx; u32 
n_since_last_int; f64 int_deadline; @@ -186,6 +200,7 @@ typedef struct { u32 nregions; u64 features; u64 feature_mask; + u64 protocol_features; u32 num_vrings; vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS]; void * region_mmap_addr[VHOST_MEMORY_MAX_NREGIONS]; @@ -194,6 +209,9 @@ typedef struct { int virtio_net_hdr_sz; int is_any_layout; u32 * d_trace_buffers; + + void * log_base_addr; + u64 log_size; } vhost_user_intf_t; typedef struct { -- 2.16.6