/*
 *------------------------------------------------------------------
 * Copyright (c) 2014-2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <fcntl.h>		/* for open */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>		/* for iovec */
#include <netinet/in.h>

#include <linux/if_arp.h>
#include <linux/if_tun.h>

#include <vlib/vlib.h>
#include <vlib/unix/unix.h>

#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/devices.h>
#include <vnet/feature/feature.h>
#include <vnet/ip/ip_psh_cksum.h>

#include <vhost/vhost_user.h>
#include <vhost/vhost_user_inline.h>

#include <vnet/gso/hdr_offset_parser.h>

/*
 * On the transmit side, we keep processing the buffers from vlib in the while
 * loop and prepare the copy order to be executed later. However, the static
 * array in which we keep the copy order is limited to VHOST_USER_COPY_ARRAY_N
 * entries. In order to not corrupt memory, we have to do the copy when the
 * static array reaches the copy threshold. We subtract 40 in case the code
 * goes into the inner loop for a maximum 64K frame, which may require more
 * array entries. We subtract 200 because our default buffer size is 2048
 * and the default desc len is likely 1536. While it takes fewer than 40
 * vlib buffers to hold a jumbo frame, it may take twice as many descriptors
 * for the same jumbo frame. Use 200 for the extra headroom.
 */
#define VHOST_USER_TX_COPY_THRESHOLD (VHOST_USER_COPY_ARRAY_N - 200)

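/*
 * Worked example (a sketch, assuming VHOST_USER_COPY_ARRAY_N is 4096 as
 * defined in vhost_user.h): the pending copies are flushed once copy_len
 * reaches 4096 - 200 = 3896 entries, which leaves 200 spare slots for the
 * descriptors a single large packet may still queue before the threshold
 * is checked again.
 */
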
extern vnet_device_class_t vhost_user_device_class;

#define foreach_vhost_user_tx_func_error		\
  _(NONE, "no error")					\
  _(NOT_READY, "vhost vring not ready")			\
  _(DOWN, "vhost interface is down")			\
  _(PKT_DROP_NOBUF, "tx packet drops (no available descriptors)") \
  _(PKT_DROP_NOMRG, "tx packet drops (cannot merge descriptors)") \
  _(MMAP_FAIL, "mmap failure")				\
  _(INDIRECT_OVERFLOW, "indirect descriptor table overflow")

typedef enum
{
#define _(f,s) VHOST_USER_TX_FUNC_ERROR_##f,
  foreach_vhost_user_tx_func_error
#undef _
    VHOST_USER_TX_FUNC_N_ERROR,
} vhost_user_tx_func_error_t;

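/*
 * Note: the strings below feed tx_function_error_strings in the device
 * class registration at the bottom of this file, so each enum member
 * becomes a named per-interface TX error counter.
 */
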
static __clib_unused char *vhost_user_tx_func_error_strings[] = {
#define _(n,s) s,
  foreach_vhost_user_tx_func_error
#undef _
};

static __clib_unused u8 *
format_vhost_user_interface_name (u8 * s, va_list * args)
{
  u32 i = va_arg (*args, u32);
  u32 show_dev_instance = ~0;
  vhost_user_main_t *vum = &vhost_user_main;

  if (i < vec_len (vum->show_dev_instance_by_real_dev_instance))
    show_dev_instance = vum->show_dev_instance_by_real_dev_instance[i];

  if (show_dev_instance != ~0)
    i = show_dev_instance;

  s = format (s, "VirtualEthernet0/0/%d", i);
  return s;
}

static __clib_unused int
vhost_user_name_renumber (vnet_hw_interface_t * hi, u32 new_dev_instance)
{
  // FIXME: check if the new dev instance is already used
  vhost_user_main_t *vum = &vhost_user_main;
  vhost_user_intf_t *vui = pool_elt_at_index (vum->vhost_user_interfaces,
					      hi->dev_instance);

  vec_validate_init_empty (vum->show_dev_instance_by_real_dev_instance,
			   hi->dev_instance, ~0);

  vum->show_dev_instance_by_real_dev_instance[hi->dev_instance] =
    new_dev_instance;

  vu_log_debug (vui, "renumbered vhost-user interface dev_instance %d to %d",
		hi->dev_instance, new_dev_instance);

  return 0;
}

static_always_inline void
vhost_user_tx_trace (vhost_trace_t * t,
		     vhost_user_intf_t * vui, u16 qid,
		     vlib_buffer_t * b, vhost_user_vring_t * rxvq)
{
  vhost_user_main_t *vum = &vhost_user_main;
  u32 last_avail_idx = rxvq->last_avail_idx;
  u32 desc_current = rxvq->avail->ring[last_avail_idx & rxvq->qsz_mask];
  vnet_virtio_vring_desc_t *hdr_desc = 0;
  u32 hint = 0;

  clib_memset (t, 0, sizeof (*t));
  t->device_index = vui - vum->vhost_user_interfaces;
  t->qid = qid;

  hdr_desc = &rxvq->desc[desc_current];
  if (rxvq->desc[desc_current].flags & VRING_DESC_F_INDIRECT)
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT;
      /* Header is the first here */
      hdr_desc = map_guest_mem (vui, rxvq->desc[desc_current].addr, &hint);
    }
  if (rxvq->desc[desc_current].flags & VRING_DESC_F_NEXT)
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED;
    }
  if (!(rxvq->desc[desc_current].flags & VRING_DESC_F_NEXT) &&
      !(rxvq->desc[desc_current].flags & VRING_DESC_F_INDIRECT))
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC;
    }

  t->first_desc_len = hdr_desc ? hdr_desc->len : 0;
}

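/*
 * Execute the copy batch queued by the TX path below. The main loop is
 * software-pipelined: the destinations of entries [2] and [3] are mapped
 * one iteration ahead of the memcpy of entries [0] and [1], hiding part of
 * the guest-memory translation cost. Any mapping failure returns 1 so the
 * caller can count an MMAP_FAIL error.
 */
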
static_always_inline u32
vhost_user_tx_copy (vhost_user_intf_t * vui, vhost_copy_t * cpy,
		    u16 copy_len, u32 * map_hint)
{
  void *dst0, *dst1, *dst2, *dst3;

  if (PREDICT_TRUE (copy_len >= 4))
    {
      if (PREDICT_FALSE (!(dst2 = map_guest_mem (vui, cpy[0].dst, map_hint))))
	return 1;
      if (PREDICT_FALSE (!(dst3 = map_guest_mem (vui, cpy[1].dst, map_hint))))
	return 1;
      while (PREDICT_TRUE (copy_len >= 4))
	{
	  dst0 = dst2;
	  dst1 = dst3;

	  if (PREDICT_FALSE
	      (!(dst2 = map_guest_mem (vui, cpy[2].dst, map_hint))))
	    return 1;
	  if (PREDICT_FALSE
	      (!(dst3 = map_guest_mem (vui, cpy[3].dst, map_hint))))
	    return 1;

	  clib_prefetch_load ((void *) cpy[2].src);
	  clib_prefetch_load ((void *) cpy[3].src);

	  clib_memcpy_fast (dst0, (void *) cpy[0].src, cpy[0].len);
	  clib_memcpy_fast (dst1, (void *) cpy[1].src, cpy[1].len);

	  vhost_user_log_dirty_pages_2 (vui, cpy[0].dst, cpy[0].len, 1);
	  vhost_user_log_dirty_pages_2 (vui, cpy[1].dst, cpy[1].len, 1);
	  copy_len -= 2;
	  cpy += 2;
	}
    }
  while (copy_len)
    {
      if (PREDICT_FALSE (!(dst0 = map_guest_mem (vui, cpy->dst, map_hint))))
	return 1;
      clib_memcpy_fast (dst0, (void *) cpy->src, cpy->len);
      vhost_user_log_dirty_pages_2 (vui, cpy->dst, cpy->len, 1);
      copy_len -= 1;
      cpy += 1;
    }
  return 0;
}

static_always_inline void
vhost_user_handle_tx_offload (vhost_user_intf_t *vui, vlib_buffer_t *b,
			      vnet_virtio_net_hdr_t *hdr)
{
  generic_header_offset_t gho = { 0 };
  int is_ip4 = b->flags & VNET_BUFFER_F_IS_IP4;
  int is_ip6 = b->flags & VNET_BUFFER_F_IS_IP6;
  vnet_buffer_oflags_t oflags = vnet_buffer (b)->oflags;
  u16 psh_cksum = 0;
  ip4_header_t *ip4 = 0;
  ip6_header_t *ip6 = 0;

  ASSERT (!(is_ip4 && is_ip6));
  vnet_generic_header_offset_parser (b, &gho, 1 /* l2 */ , is_ip4, is_ip6);
  if (oflags & VNET_BUFFER_OFFLOAD_F_IP_CKSUM)
    {
      ip4 =
	(ip4_header_t *) (vlib_buffer_get_current (b) + gho.l3_hdr_offset);
      ip4->checksum = ip4_header_checksum (ip4);
      psh_cksum = ip4_pseudo_header_cksum (ip4);
    }
  else
    {
      ip6 = (ip6_header_t *) (vlib_buffer_get_current (b) + gho.l3_hdr_offset);
      psh_cksum = ip6_pseudo_header_cksum (ip6);
    }

  /* checksum offload */
  if (oflags & VNET_BUFFER_OFFLOAD_F_UDP_CKSUM)
    {
      udp_header_t *udp =
	(udp_header_t *) (vlib_buffer_get_current (b) + gho.l4_hdr_offset);
      udp->checksum = psh_cksum;
      hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
      hdr->csum_start = gho.l4_hdr_offset;
      hdr->csum_offset = offsetof (udp_header_t, checksum);
    }
  else if (oflags & VNET_BUFFER_OFFLOAD_F_TCP_CKSUM)
    {
      tcp_header_t *tcp =
	(tcp_header_t *) (vlib_buffer_get_current (b) + gho.l4_hdr_offset);
      tcp->checksum = psh_cksum;
      hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
      hdr->csum_start = gho.l4_hdr_offset;
      hdr->csum_offset = offsetof (tcp_header_t, checksum);
    }

  if (b->flags & VNET_BUFFER_F_GSO)
    {
      if (oflags & VNET_BUFFER_OFFLOAD_F_TCP_CKSUM)
	{
	  if (is_ip4 &&
	      (vui->features & VIRTIO_FEATURE (VIRTIO_NET_F_GUEST_TSO4)))
	    {
	      hdr->gso_size = vnet_buffer2 (b)->gso_size;
	      hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
	    }
	  else if (is_ip6 &&
		   (vui->features & VIRTIO_FEATURE (VIRTIO_NET_F_GUEST_TSO6)))
	    {
	      hdr->gso_size = vnet_buffer2 (b)->gso_size;
	      hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
	    }
	}
      else if ((vui->features & VIRTIO_FEATURE (VIRTIO_NET_F_GUEST_UFO)) &&
	       (oflags & VNET_BUFFER_OFFLOAD_F_UDP_CKSUM))
	{
	  hdr->gso_size = vnet_buffer2 (b)->gso_size;
	  hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
	}
    }
}

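/*
 * Example with illustrative numbers: for TCP over IPv4 behind a 14-byte
 * ethernet header and a 20-byte IP header, l4_hdr_offset is 34, so the
 * guest checksums from byte 34 onward and stores the result at byte
 * 34 + offsetof (tcp_header_t, checksum) = 34 + 16 = 50. As virtio
 * requires, the L4 checksum field was pre-seeded above with the
 * pseudo-header sum.
 */
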
static_always_inline void
vhost_user_mark_desc_available (vlib_main_t * vm, vhost_user_intf_t * vui,
				vhost_user_vring_t * rxvq,
				u16 * n_descs_processed, u8 chained,
				vlib_frame_t * frame, u32 n_left)
{
  u16 desc_idx, flags;
  vnet_virtio_vring_packed_desc_t *desc_table = rxvq->packed_desc;
  u16 last_used_idx = rxvq->last_used_idx;

  if (PREDICT_FALSE (*n_descs_processed == 0))
    return;

  if (rxvq->used_wrap_counter)
    flags = desc_table[last_used_idx & rxvq->qsz_mask].flags |
      (VRING_DESC_F_AVAIL | VRING_DESC_F_USED);
  else
    flags = desc_table[last_used_idx & rxvq->qsz_mask].flags &
      ~(VRING_DESC_F_AVAIL | VRING_DESC_F_USED);

  vhost_user_advance_last_used_idx (rxvq);

  for (desc_idx = 1; desc_idx < *n_descs_processed; desc_idx++)
    {
      if (rxvq->used_wrap_counter)
	desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags |=
	  (VRING_DESC_F_AVAIL | VRING_DESC_F_USED);
      else
	desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags &=
	  ~(VRING_DESC_F_AVAIL | VRING_DESC_F_USED);
      vhost_user_advance_last_used_idx (rxvq);
    }

  /*
   * The head descriptor's flags are written back last, after all the other
   * descriptors in the batch are marked, so the guest never observes a
   * partially returned batch.
   */
  desc_table[last_used_idx & rxvq->qsz_mask].flags = flags;

  *n_descs_processed = 0;

  if (chained)
    {
      vnet_virtio_vring_packed_desc_t *desc_table = rxvq->packed_desc;

      while (desc_table[rxvq->last_used_idx & rxvq->qsz_mask].flags &
	     VRING_DESC_F_NEXT)
	vhost_user_advance_last_used_idx (rxvq);

      /* Advance past the current chained table entries */
      vhost_user_advance_last_used_idx (rxvq);
    }

  /* interrupt (call) handling */
  if ((rxvq->callfd_idx != ~0) &&
      (rxvq->avail_event->flags != VRING_EVENT_F_DISABLE))
    {
      vhost_user_main_t *vum = &vhost_user_main;

      rxvq->n_since_last_int += frame->n_vectors - n_left;
      if (rxvq->n_since_last_int > vum->coalesce_frames)
	vhost_user_send_call (vm, vui, rxvq);
    }
}

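/*
 * Interrupt coalescing note: n_since_last_int accumulates packets enqueued
 * since the guest was last notified; the call eventfd is only kicked once
 * the count exceeds vum->coalesce_frames (a timer-driven flush lives in
 * vhost_user_send_interrupt_node, signaled from the rx-mode handler below).
 */
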
static_always_inline void
vhost_user_tx_trace_packed (vhost_trace_t * t, vhost_user_intf_t * vui,
			    u16 qid, vlib_buffer_t * b,
			    vhost_user_vring_t * rxvq)
{
  vhost_user_main_t *vum = &vhost_user_main;
  u32 last_avail_idx = rxvq->last_avail_idx;
  u32 desc_current = last_avail_idx & rxvq->qsz_mask;
  vnet_virtio_vring_packed_desc_t *hdr_desc = 0;
  u32 hint = 0;

  clib_memset (t, 0, sizeof (*t));
  t->device_index = vui - vum->vhost_user_interfaces;
  t->qid = qid;

  hdr_desc = &rxvq->packed_desc[desc_current];
  if (rxvq->packed_desc[desc_current].flags & VRING_DESC_F_INDIRECT)
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT;
      /* Header is the first here */
      hdr_desc = map_guest_mem (vui, rxvq->packed_desc[desc_current].addr,
				&hint);
    }
  if (rxvq->packed_desc[desc_current].flags & VRING_DESC_F_NEXT)
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED;
    }
  if (!(rxvq->packed_desc[desc_current].flags & VRING_DESC_F_NEXT) &&
      !(rxvq->packed_desc[desc_current].flags & VRING_DESC_F_INDIRECT))
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC;
    }

  t->first_desc_len = hdr_desc ? hdr_desc->len : 0;
}

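/*
 * Packed-ring TX path. For each vlib buffer: claim ring descriptors
 * (walking indirect tables and descriptor chains as needed), queue header
 * and payload copies into cpu->copy, and periodically flush the batch with
 * vhost_user_tx_copy () before returning descriptors to the guest via
 * vhost_user_mark_desc_available ().
 */
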
static_always_inline uword
vhost_user_device_class_packed (vlib_main_t *vm, vlib_node_runtime_t *node,
				vlib_frame_t *frame, vhost_user_intf_t *vui,
				vhost_user_vring_t *rxvq)
{
  u32 *buffers = vlib_frame_vector_args (frame);
  u32 n_left = frame->n_vectors;
  vhost_user_main_t *vum = &vhost_user_main;
  u32 qid = rxvq->qid;
  u8 error;
  u32 thread_index = vm->thread_index;
  vhost_cpu_t *cpu = &vum->cpus[thread_index];
  u32 map_hint = 0;
  u8 retry = 8;
  u16 copy_len;
  u16 tx_headers_len;
  vnet_virtio_vring_packed_desc_t *desc_table;
  u32 or_flags;
  u16 desc_head, desc_index, desc_len;
  u16 n_descs_processed;
  u8 indirect, chained;

retry:
  error = VHOST_USER_TX_FUNC_ERROR_NONE;
  tx_headers_len = 0;
  copy_len = 0;
  n_descs_processed = 0;

  while (n_left > 0)
    {
      vlib_buffer_t *b0, *current_b0;
      uword buffer_map_addr;
      u32 buffer_len;
      u16 bytes_left;
      u32 total_desc_len = 0;
      u16 n_entries = 0;

      indirect = 0;
      chained = 0;
      if (PREDICT_TRUE (n_left > 1))
	vlib_prefetch_buffer_with_index (vm, buffers[1], LOAD);

      b0 = vlib_get_buffer (vm, buffers[0]);
      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
	{
	  cpu->current_trace = vlib_add_trace (vm, node, b0,
					       sizeof (*cpu->current_trace));
	  vhost_user_tx_trace_packed (cpu->current_trace, vui, qid / 2, b0,
				      rxvq);
	}

      desc_table = rxvq->packed_desc;
      desc_head = desc_index = rxvq->last_avail_idx & rxvq->qsz_mask;
      if (PREDICT_FALSE (!vhost_user_packed_desc_available (rxvq, desc_head)))
	{
	  error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
	  goto done;
	}
      /*
       * Go deeper in case of indirect descriptor.
       * To test it, turn off mrg_rxbuf.
       */
      if (desc_table[desc_head].flags & VRING_DESC_F_INDIRECT)
	{
	  indirect = 1;
	  if (PREDICT_FALSE (desc_table[desc_head].len <
			     sizeof (vnet_virtio_vring_packed_desc_t)))
	    {
	      error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW;
	      goto done;
	    }
	  n_entries = desc_table[desc_head].len >> 4;
	  desc_table = map_guest_mem (vui, desc_table[desc_index].addr,
				      &map_hint);
	  if (PREDICT_FALSE (desc_table == 0))
	    {
	      error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL;
	      goto done;
	    }
	  desc_index = 0;
	}
      else if (rxvq->packed_desc[desc_head].flags & VRING_DESC_F_NEXT)
	chained = 1;

      desc_len = vui->virtio_net_hdr_sz;
      buffer_map_addr = desc_table[desc_index].addr;
      buffer_len = desc_table[desc_index].len;

      /* Get a header from the header array */
      vnet_virtio_net_hdr_mrg_rxbuf_t *hdr = &cpu->tx_headers[tx_headers_len];
      tx_headers_len++;
      hdr->hdr.flags = 0;
      hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
      hdr->num_buffers = 1;

      or_flags = (b0->flags & VNET_BUFFER_F_OFFLOAD);

      /* Guest supports csum offload and buffer requires checksum offload? */
      if (or_flags &&
	  (vui->features & VIRTIO_FEATURE (VIRTIO_NET_F_GUEST_CSUM)))
	vhost_user_handle_tx_offload (vui, b0, &hdr->hdr);

      /* Prepare a copy order executed later for the header */
      ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N);
      vhost_copy_t *cpy = &cpu->copy[copy_len];
      copy_len++;
      cpy->len = vui->virtio_net_hdr_sz;
      cpy->dst = buffer_map_addr;
      cpy->src = (uword) hdr;

      buffer_map_addr += vui->virtio_net_hdr_sz;
      buffer_len -= vui->virtio_net_hdr_sz;
      bytes_left = b0->current_length;
      current_b0 = b0;
      while (1)
	{
	  if (buffer_len == 0)
	    {
	      /* Get a new output descriptor */
	      if (chained)
		{
		  /*
		   * Next one is chained.
		   * Test it with both indirect and mrg_rxbuf off
		   */
		  if (PREDICT_FALSE (!(desc_table[desc_index].flags &
				       VRING_DESC_F_NEXT)))
		    {
		      /*
		       * Last descriptor in chain.
		       * Dequeue queued descriptors for this packet
		       */
		      vhost_user_dequeue_chained_descs (rxvq,
							&n_descs_processed);
		      error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
		      goto done;
		    }
		  vhost_user_advance_last_avail_idx (rxvq);
		  desc_index = rxvq->last_avail_idx & rxvq->qsz_mask;
		  n_descs_processed++;
		  buffer_map_addr = desc_table[desc_index].addr;
		  buffer_len = desc_table[desc_index].len;
		  total_desc_len += desc_len;
		  desc_len = 0;
		}
	      else if (indirect)
		{
		  /*
		   * Indirect table
		   * Test it with mrg_rxbuf off
		   */
		  if (PREDICT_TRUE (n_entries > 0))
		    n_entries--;
		  else
		    {
		      /* Dequeue queued descriptors for this packet */
		      vhost_user_dequeue_chained_descs (rxvq,
							&n_descs_processed);
		      error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW;
		      goto done;
		    }
		  total_desc_len += desc_len;
		  desc_index = (desc_index + 1) & rxvq->qsz_mask;
		  buffer_map_addr = desc_table[desc_index].addr;
		  buffer_len = desc_table[desc_index].len;
		  desc_len = 0;
		}
	      else if (vui->virtio_net_hdr_sz == 12)
		{
		  /*
		   * MRG is available
		   * This is the default setting for the guest VM
		   */
		  vnet_virtio_net_hdr_mrg_rxbuf_t *hdr =
		    &cpu->tx_headers[tx_headers_len - 1];

		  desc_table[desc_index].len = desc_len;
		  vhost_user_advance_last_avail_idx (rxvq);
		  desc_head = desc_index =
		    rxvq->last_avail_idx & rxvq->qsz_mask;
		  hdr->num_buffers++;
		  n_descs_processed++;
		  desc_len = 0;

		  if (PREDICT_FALSE (!vhost_user_packed_desc_available
				     (rxvq, desc_index)))
		    {
		      /* Dequeue queued descriptors for this packet */
		      vhost_user_dequeue_descs (rxvq, hdr,
						&n_descs_processed);
		      error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
		      goto done;
		    }

		  buffer_map_addr = desc_table[desc_index].addr;
		  buffer_len = desc_table[desc_index].len;
		}
	      else
		{
		  error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOMRG;
		  goto done;
		}
	    }

	  ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N);
	  vhost_copy_t *cpy = &cpu->copy[copy_len];
	  copy_len++;
	  cpy->len = bytes_left;
	  cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len;
	  cpy->dst = buffer_map_addr;
	  cpy->src = (uword) vlib_buffer_get_current (current_b0) +
	    current_b0->current_length - bytes_left;

	  bytes_left -= cpy->len;
	  buffer_len -= cpy->len;
	  buffer_map_addr += cpy->len;
	  desc_len += cpy->len;

	  clib_prefetch_load (&rxvq->packed_desc);

	  /* Check if vlib buffer has more data. If not, get more or break */
	  if (PREDICT_TRUE (!bytes_left))
	    {
	      if (PREDICT_FALSE
		  (current_b0->flags & VLIB_BUFFER_NEXT_PRESENT))
		{
		  current_b0 = vlib_get_buffer (vm, current_b0->next_buffer);
		  bytes_left = current_b0->current_length;
		}
	      else
		{
		  /* End of packet */
		  break;
		}
	    }
	}

      /* Move from available to used ring */
      total_desc_len += desc_len;
      rxvq->packed_desc[desc_head].len = total_desc_len;

      vhost_user_advance_last_avail_table_idx (vui, rxvq, chained);
      n_descs_processed++;

      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
	cpu->current_trace->hdr = cpu->tx_headers[tx_headers_len - 1];

      n_left--;

      /*
       * Do the copy periodically to prevent
       * cpu->copy array overflow and memory corruption
       */
      if (PREDICT_FALSE (copy_len >= VHOST_USER_TX_COPY_THRESHOLD) || chained)
	{
	  if (PREDICT_FALSE (vhost_user_tx_copy (vui, cpu->copy, copy_len,
						 &map_hint)))
	    vlib_error_count (vm, node->node_index,
			      VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
	  copy_len = 0;

	  /* give buffers back to driver */
	  vhost_user_mark_desc_available (vm, vui, rxvq, &n_descs_processed,
					  chained, frame, n_left);
	}

      buffers++;
    }

done:
  if (PREDICT_TRUE (copy_len))
    {
      if (PREDICT_FALSE (vhost_user_tx_copy (vui, cpu->copy, copy_len,
					     &map_hint)))
	vlib_error_count (vm, node->node_index,
			  VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);

      vhost_user_mark_desc_available (vm, vui, rxvq, &n_descs_processed,
				      chained, frame, n_left);
    }

  /*
   * When n_left is set, error is always set to something too.
   * In case the error is due to a lack of remaining buffers, we go back up
   * and retry.
   * The idea is that it is better to waste some time on packets that have
   * already been processed than to drop them and fetch more fresh packets
   * with a good likelihood that they will be dropped too.
   * This technique also gives the VM driver more time to pick up packets.
   * In case the traffic flows from physical to virtual interfaces, this
   * technique will end up leveraging the physical NIC buffers to absorb
   * the VM's CPU jitter.
   */
  if (n_left && (error == VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF) && retry)
    {
      retry--;
      goto retry;
    }

  clib_spinlock_unlock (&rxvq->vring_lock);

  if (PREDICT_FALSE (n_left && error != VHOST_USER_TX_FUNC_ERROR_NONE))
    {
      vlib_error_count (vm, node->node_index, error, n_left);
      vlib_increment_simple_counter
	(vnet_main.interface_main.sw_if_counters +
	 VNET_INTERFACE_COUNTER_DROP, thread_index, vui->sw_if_index, n_left);
    }

  vlib_buffer_free (vm, vlib_frame_vector_args (frame), frame->n_vectors);
  return frame->n_vectors;
}

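/*
 * Device-class TX entry point. Validates interface and vring state, takes
 * the vring lock for shared queues, and hands off to the packed-ring
 * routine above when packed ring was negotiated; otherwise the split-ring
 * loop below runs.
 */
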
VNET_DEVICE_CLASS_TX_FN (vhost_user_device_class) (vlib_main_t * vm,
						   vlib_node_runtime_t *
						   node, vlib_frame_t * frame)
{
  u32 *buffers = vlib_frame_vector_args (frame);
  u32 n_left = frame->n_vectors;
  vhost_user_main_t *vum = &vhost_user_main;
  vnet_interface_output_runtime_t *rd = (void *) node->runtime_data;
  vhost_user_intf_t *vui =
    pool_elt_at_index (vum->vhost_user_interfaces, rd->dev_instance);
  u32 qid;
  vhost_user_vring_t *rxvq;
  u8 error;
  u32 thread_index = vm->thread_index;
  vhost_cpu_t *cpu = &vum->cpus[thread_index];
  u32 map_hint = 0;
  u8 retry = 8;
  u16 copy_len;
  u16 tx_headers_len;
  u32 or_flags;
  vnet_hw_if_tx_frame_t *tf = vlib_frame_scalar_args (frame);

  if (PREDICT_FALSE (!vui->admin_up))
    {
      error = VHOST_USER_TX_FUNC_ERROR_DOWN;
      goto done3;
    }

  if (PREDICT_FALSE (!vui->is_ready))
    {
      error = VHOST_USER_TX_FUNC_ERROR_NOT_READY;
      goto done3;
    }

  qid = VHOST_VRING_IDX_RX (tf->queue_id);
  rxvq = &vui->vrings[qid];
  ASSERT (tf->queue_id == rxvq->qid);

  if (PREDICT_FALSE (rxvq->avail == 0))
    {
      error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL;
      goto done3;
    }
  if (tf->shared_queue)
    clib_spinlock_lock (&rxvq->vring_lock);

  if (vhost_user_is_packed_ring_supported (vui))
    return (vhost_user_device_class_packed (vm, node, frame, vui, rxvq));

retry:
  error = VHOST_USER_TX_FUNC_ERROR_NONE;
  tx_headers_len = 0;
  copy_len = 0;

  while (n_left > 0)
    {
      vlib_buffer_t *b0, *current_b0;
      u16 desc_head, desc_index, desc_len;
      vnet_virtio_vring_desc_t *desc_table;
      uword buffer_map_addr;
      u32 buffer_len;
      u16 bytes_left;

      if (PREDICT_TRUE (n_left > 1))
	vlib_prefetch_buffer_with_index (vm, buffers[1], LOAD);

      b0 = vlib_get_buffer (vm, buffers[0]);

      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
	{
	  cpu->current_trace = vlib_add_trace (vm, node, b0,
					       sizeof (*cpu->current_trace));
	  vhost_user_tx_trace (cpu->current_trace, vui, qid / 2, b0, rxvq);
	}

      if (PREDICT_FALSE (rxvq->last_avail_idx == rxvq->avail->idx))
	{
	  error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
	  goto done;
	}

      desc_table = rxvq->desc;
      desc_head = desc_index =
	rxvq->avail->ring[rxvq->last_avail_idx & rxvq->qsz_mask];

      /* Go deeper in case of indirect descriptor.
       * To date, no driver is known to provide indirect descriptors for RX. */
      if (PREDICT_FALSE (rxvq->desc[desc_head].flags & VRING_DESC_F_INDIRECT))
	{
	  if (PREDICT_FALSE (rxvq->desc[desc_head].len <
			     sizeof (vnet_virtio_vring_desc_t)))
	    {
	      error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW;
	      goto done;
	    }
	  if (PREDICT_FALSE
	      (!(desc_table =
		 map_guest_mem (vui, rxvq->desc[desc_index].addr,
				&map_hint))))
	    {
	      error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL;
	      goto done;
	    }
	  desc_index = 0;
	}

      desc_len = vui->virtio_net_hdr_sz;
      buffer_map_addr = desc_table[desc_index].addr;
      buffer_len = desc_table[desc_index].len;

      {
	// Get a header from the header array
	vnet_virtio_net_hdr_mrg_rxbuf_t *hdr =
	  &cpu->tx_headers[tx_headers_len];
	tx_headers_len++;
	hdr->hdr.flags = 0;
	hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
	hdr->num_buffers = 1;	// This is local, no need to check

	or_flags = (b0->flags & VNET_BUFFER_F_OFFLOAD);

	/* Guest supports csum offload and buffer requires checksum offload? */
	if (or_flags
	    && (vui->features & VIRTIO_FEATURE (VIRTIO_NET_F_GUEST_CSUM)))
	  vhost_user_handle_tx_offload (vui, b0, &hdr->hdr);

	// Prepare a copy order executed later for the header
	ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N);
	vhost_copy_t *cpy = &cpu->copy[copy_len];
	copy_len++;
	cpy->len = vui->virtio_net_hdr_sz;
	cpy->dst = buffer_map_addr;
	cpy->src = (uword) hdr;
      }

      buffer_map_addr += vui->virtio_net_hdr_sz;
      buffer_len -= vui->virtio_net_hdr_sz;
      bytes_left = b0->current_length;
      current_b0 = b0;
      while (1)
	{
	  if (buffer_len == 0)
	    {			// Get a new output descriptor
	      if (desc_table[desc_index].flags & VRING_DESC_F_NEXT)
		{
		  // Next one is chained
		  desc_index = desc_table[desc_index].next;
		  buffer_map_addr = desc_table[desc_index].addr;
		  buffer_len = desc_table[desc_index].len;
		}
	      else if (vui->virtio_net_hdr_sz == 12)	// MRG is available
		{
		  vnet_virtio_net_hdr_mrg_rxbuf_t *hdr =
		    &cpu->tx_headers[tx_headers_len - 1];

		  // Move from available to used buffer
		  rxvq->used->ring[rxvq->last_used_idx & rxvq->qsz_mask].id =
		    desc_head;
		  rxvq->used->ring[rxvq->last_used_idx & rxvq->qsz_mask].len =
		    desc_len;
		  vhost_user_log_dirty_ring (vui, rxvq,
					     ring[rxvq->last_used_idx &
						  rxvq->qsz_mask]);

		  rxvq->last_avail_idx++;
		  rxvq->last_used_idx++;
		  hdr->num_buffers++;
		  desc_len = 0;

		  if (PREDICT_FALSE
		      (rxvq->last_avail_idx == rxvq->avail->idx))
		    {
		      // Dequeue queued descriptors for this packet
		      rxvq->last_used_idx -= hdr->num_buffers - 1;
		      rxvq->last_avail_idx -= hdr->num_buffers - 1;
		      error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF;
		      goto done;
		    }

		  desc_table = rxvq->desc;
		  desc_head = desc_index =
		    rxvq->avail->ring[rxvq->last_avail_idx & rxvq->qsz_mask];
		  if (PREDICT_FALSE
		      (rxvq->desc[desc_head].flags & VRING_DESC_F_INDIRECT))
		    {
		      // It is seriously unlikely that a driver will put an
		      // indirect descriptor after a non-indirect one.
		      if (PREDICT_FALSE (rxvq->desc[desc_head].len <
					 sizeof (vnet_virtio_vring_desc_t)))
			{
			  error = VHOST_USER_TX_FUNC_ERROR_INDIRECT_OVERFLOW;
			  goto done;
			}
		      if (PREDICT_FALSE
			  (!(desc_table =
			     map_guest_mem (vui,
					    rxvq->desc[desc_index].addr,
					    &map_hint))))
			{
			  error = VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL;
			  goto done;
			}
		      desc_index = 0;
		    }
		  buffer_map_addr = desc_table[desc_index].addr;
		  buffer_len = desc_table[desc_index].len;
		}
	      else
		{
		  error = VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOMRG;
		  goto done;
		}
	    }

	  {
	    ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N);
	    vhost_copy_t *cpy = &cpu->copy[copy_len];
	    copy_len++;
	    cpy->len = bytes_left;
	    cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len;
	    cpy->dst = buffer_map_addr;
	    cpy->src = (uword) vlib_buffer_get_current (current_b0) +
	      current_b0->current_length - bytes_left;

	    bytes_left -= cpy->len;
	    buffer_len -= cpy->len;
	    buffer_map_addr += cpy->len;
	    desc_len += cpy->len;

	    clib_prefetch_load (&rxvq->desc);
	  }

	  // Check if vlib buffer has more data. If not, get more or break.
	  if (PREDICT_TRUE (!bytes_left))
	    {
	      if (PREDICT_FALSE
		  (current_b0->flags & VLIB_BUFFER_NEXT_PRESENT))
		{
		  current_b0 = vlib_get_buffer (vm, current_b0->next_buffer);
		  bytes_left = current_b0->current_length;
		}
	      else
		{
		  // End of packet
		  break;
		}
	    }
	}

      // Move from available to used ring
      rxvq->used->ring[rxvq->last_used_idx & rxvq->qsz_mask].id = desc_head;
      rxvq->used->ring[rxvq->last_used_idx & rxvq->qsz_mask].len = desc_len;
      vhost_user_log_dirty_ring (vui, rxvq,
				 ring[rxvq->last_used_idx & rxvq->qsz_mask]);

      rxvq->last_avail_idx++;
      rxvq->last_used_idx++;

      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
	{
	  cpu->current_trace->hdr = cpu->tx_headers[tx_headers_len - 1];
	}

      n_left--;		// At the end for error counting when 'goto done' is invoked

      /*
       * Do the copy periodically to prevent
       * cpu->copy array overflow and memory corruption
       */
      if (PREDICT_FALSE (copy_len >= VHOST_USER_TX_COPY_THRESHOLD))
	{
	  if (PREDICT_FALSE (vhost_user_tx_copy (vui, cpu->copy, copy_len,
						 &map_hint)))
	    {
	      vlib_error_count (vm, node->node_index,
				VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
	    }
	  copy_len = 0;

	  /* give buffers back to driver */
	  CLIB_MEMORY_BARRIER ();
	  rxvq->used->idx = rxvq->last_used_idx;
	  vhost_user_log_dirty_ring (vui, rxvq, idx);
	}

      buffers++;
    }

done:
  // Do the memory copies
  if (PREDICT_FALSE (vhost_user_tx_copy (vui, cpu->copy, copy_len,
					 &map_hint)))
    {
      vlib_error_count (vm, node->node_index,
			VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
    }

  CLIB_MEMORY_BARRIER ();
  rxvq->used->idx = rxvq->last_used_idx;
  vhost_user_log_dirty_ring (vui, rxvq, idx);

  /*
   * When n_left is set, error is always set to something too.
   * In case the error is due to a lack of remaining buffers, we go back up
   * and retry.
   * The idea is that it is better to waste some time on packets that have
   * already been processed than to drop them and fetch more fresh packets
   * with a good likelihood that they will be dropped too.
   * This technique also gives the VM driver more time to pick up packets.
   * In case the traffic flows from physical to virtual interfaces, this
   * technique will end up leveraging the physical NIC buffers to absorb
   * the VM's CPU jitter.
   */
  if (n_left && (error == VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF) && retry)
    {
      retry--;
      goto retry;
    }

  /* interrupt (call) handling */
  if ((rxvq->callfd_idx != ~0) &&
      !(rxvq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
    {
      rxvq->n_since_last_int += frame->n_vectors - n_left;

      if (rxvq->n_since_last_int > vum->coalesce_frames)
	vhost_user_send_call (vm, vui, rxvq);
    }

  clib_spinlock_unlock (&rxvq->vring_lock);

done3:
  if (PREDICT_FALSE (n_left && error != VHOST_USER_TX_FUNC_ERROR_NONE))
    {
      vlib_error_count (vm, node->node_index, error, n_left);
      vlib_increment_simple_counter
	(vnet_main.interface_main.sw_if_counters
	 + VNET_INTERFACE_COUNTER_DROP,
	 thread_index, vui->sw_if_index, n_left);
    }

  vlib_buffer_free (vm, vlib_frame_vector_args (frame), frame->n_vectors);
  return frame->n_vectors;
}

static __clib_unused clib_error_t *
vhost_user_interface_rx_mode_change (vnet_main_t * vnm, u32 hw_if_index,
				     u32 qid, vnet_hw_if_rx_mode mode)
{
  vlib_main_t *vm = vnm->vlib_main;
  vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
  vhost_user_main_t *vum = &vhost_user_main;
  vhost_user_intf_t *vui =
    pool_elt_at_index (vum->vhost_user_interfaces, hif->dev_instance);
  vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)];
  vhost_cpu_t *cpu;

  if (mode == txvq->mode)
    return 0;

  if ((mode != VNET_HW_IF_RX_MODE_POLLING) &&
      (mode != VNET_HW_IF_RX_MODE_ADAPTIVE) &&
      (mode != VNET_HW_IF_RX_MODE_INTERRUPT))
    {
      vu_log_err (vui, "unhandled mode %d changed for if %d queue %d", mode,
		  hw_if_index, qid);
      return clib_error_return (0, "unsupported");
    }

  if (txvq->thread_index == ~0)
    return clib_error_return (0, "Queue initialization is not finished yet");

  cpu = vec_elt_at_index (vum->cpus, txvq->thread_index);
  if ((mode == VNET_HW_IF_RX_MODE_INTERRUPT) ||
      (mode == VNET_HW_IF_RX_MODE_ADAPTIVE))
    {
      if (txvq->kickfd_idx == ~0)
	{
	  // We cannot support interrupt mode if the driver opts out
	  return clib_error_return (0, "Driver does not support interrupt");
	}
      if (txvq->mode == VNET_HW_IF_RX_MODE_POLLING)
	{
	  ASSERT (cpu->polling_q_count != 0);
	  if (cpu->polling_q_count)
	    cpu->polling_q_count--;
	  vum->ifq_count++;
	  // Start the timer if this is the first encounter on interrupt
	  // interface/queue
	  if ((vum->ifq_count == 1) &&
	      ((vum->coalesce_time > 0.0) || (vum->coalesce_frames > 0)))
	    vlib_process_signal_event (vm,
				       vhost_user_send_interrupt_node.index,
				       VHOST_USER_EVENT_START_TIMER, 0);
	}
    }
  else if (mode == VNET_HW_IF_RX_MODE_POLLING)
    {
      if (((txvq->mode == VNET_HW_IF_RX_MODE_INTERRUPT) ||
	   (txvq->mode == VNET_HW_IF_RX_MODE_ADAPTIVE)) && vum->ifq_count)
	{
	  cpu->polling_q_count++;
	  vum->ifq_count--;
	  // Stop the timer if there is no more interrupt interface/queue
	  if (vum->ifq_count == 0)
	    vlib_process_signal_event (vm,
				       vhost_user_send_interrupt_node.index,
				       VHOST_USER_EVENT_STOP_TIMER, 0);
	}
    }

  txvq->mode = mode;
  vhost_user_set_operation_mode (vui, txvq);

  return 0;
}

static __clib_unused clib_error_t *
vhost_user_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index,
				    u32 flags)
{
  vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index);
  vhost_user_main_t *vum = &vhost_user_main;
  vhost_user_intf_t *vui =
    pool_elt_at_index (vum->vhost_user_interfaces, hif->dev_instance);
  u8 link_old, link_new;

  link_old = vui_is_link_up (vui);

  vui->admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;

  link_new = vui_is_link_up (vui);

  if (link_old != link_new)
    vnet_hw_interface_set_flags (vnm, vui->hw_if_index, link_new ?
				 VNET_HW_INTERFACE_FLAG_LINK_UP : 0);

  return /* no error */ 0;
}

VNET_DEVICE_CLASS (vhost_user_device_class) = {
  .name = "vhost-user",
  .tx_function_n_errors = VHOST_USER_TX_FUNC_N_ERROR,
  .tx_function_error_strings = vhost_user_tx_func_error_strings,
  .format_device_name = format_vhost_user_interface_name,
  .name_renumber = vhost_user_name_renumber,
  .admin_up_down_function = vhost_user_interface_admin_up_down,
  .rx_mode_change_function = vhost_user_interface_rx_mode_change,
  .format_tx_trace = format_vhost_trace,
};

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */