#include <rte_udp.h>
#include <rte_sctp.h>
#include <rte_arp.h>
+#include <rte_spinlock.h>
#include "vhost.h"
#define MAX_PKT_BURST 32
#define VHOST_LOG_PAGE 4096
+/*
+ * Atomically set bit `nr` in the single byte at `addr`.
+ *
+ * The type-generic __sync_fetch_and_or() derives its access width from
+ * the pointee type, so exactly one byte is read-modify-written. Do not
+ * use the "_8" variant here: the numeric suffix of the __sync builtins
+ * is a size in bytes, so "_8" performs a 64-bit access that reaches
+ * seven bytes past *addr.
+ *
+ * nr is expected to be in the range 0..7 (the caller in this file
+ * passes page % 8).
+ */
+static inline void __attribute__((always_inline))
+vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
+{
+	__sync_fetch_and_or(addr, (uint8_t)(1U << nr));
+}
+
+/*
+ * Record `page` in the bitmap at log_base: one bit per page, eight
+ * pages per byte. The bit is set through vhost_set_bit() instead of a
+ * plain read-modify-write.
+ */
static inline void __attribute__((always_inline))
vhost_log_page(uint8_t *log_base, uint64_t page)
{
- log_base[page / 8] |= 1 << (page % 8);
+	vhost_set_bit(page & 7, log_base + (page >> 3));
}
static inline void __attribute__((always_inline))
static void
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
- if (m_buf->ol_flags & PKT_TX_L4_MASK) {
+ uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
+
+ if (m_buf->ol_flags & PKT_TX_TCP_SEG)
+ csum_l4 |= PKT_TX_TCP_CKSUM;
+
+ if (csum_l4) {
net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
- switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
+ switch (csum_l4) {
case PKT_TX_TCP_CKSUM:
net_hdr->csum_offset = (offsetof(struct tcp_hdr,
cksum));
}
}
+ /* IP checksum verification cannot be bypassed, so calculate it here */
+ if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
+ struct ipv4_hdr *ipv4_hdr;
+
+ ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
+ m_buf->l2_len);
+ ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
+ }
+
if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
if (m_buf->ol_flags & PKT_TX_IPV4)
net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
struct vring_desc *desc;
uint64_t desc_addr;
struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+ /* Counter used to bail out of a looping descriptor chain */
+ uint16_t nr_desc = 1;
desc = &descs[desc_idx];
desc_addr = gpa_to_vva(dev, desc->addr);
/* Room in vring buffer is not enough */
return -1;
}
- if (unlikely(desc->next >= size))
+ if (unlikely(desc->next >= size || ++nr_desc > size))
return -1;
desc = &descs[desc->next];
}
vq = dev->virtqueue[queue_id];
+
+ rte_spinlock_lock(&vq->access_lock);
+
if (unlikely(vq->enabled == 0))
- return 0;
+ goto out_access_unlock;
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
start_idx = vq->last_used_idx;
count = RTE_MIN(count, free_entries);
count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
if (count == 0)
- return 0;
+ goto out_access_unlock;
LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
dev->vid, start_idx, start_idx + count);
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
+
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
return count;
}
}
vq = dev->virtqueue[queue_id];
+
+ rte_spinlock_lock(&vq->access_lock);
+
if (unlikely(vq->enabled == 0))
- return 0;
+ goto out_access_unlock;
count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
if (count == 0)
- return 0;
+ goto out_access_unlock;
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
eventfd_write(vq->callfd, (eventfd_t)1);
}
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
return pkt_idx;
}
virtio_net_with_host_offload(struct virtio_net *dev)
{
if (dev->features &
- (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_ECN |
- VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 |
- VIRTIO_NET_F_HOST_UFO))
+ ((1ULL << VIRTIO_NET_F_CSUM) |
+ (1ULL << VIRTIO_NET_F_HOST_ECN) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) |
+ (1ULL << VIRTIO_NET_F_HOST_UFO)))
return true;
return false;
default:
m->l3_len = 0;
*l4_proto = 0;
+ *l4_hdr = NULL;
break;
}
}
}
}
- if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
case VIRTIO_NET_HDR_GSO_TCPV4:
case VIRTIO_NET_HDR_GSO_TCPV6:
desc->addr + desc_offset, cpy_len)))) {
cur->data_len = cpy_len;
cur->data_off = 0;
- cur->buf_addr = (void *)(uintptr_t)desc_addr;
+ cur->buf_addr = (void *)(uintptr_t)(desc_addr
+ + desc_offset);
cur->buf_physaddr = hpa;
/*
"allocate memory for mbuf.\n");
return -1;
}
+ if (unlikely(dev->dequeue_zero_copy))
+ rte_mbuf_refcnt_update(cur, 1);
prev->next = cur;
prev->data_len = mbuf_offset;
return true;
}
+/*
+ * Walk the mbuf chain starting at `m` and point each segment's
+ * buf_addr / buf_physaddr back at the data area that follows the
+ * segment's own header and private area.
+ *
+ * NOTE(review): called before rte_pktmbuf_free() on the zero-copy
+ * dequeue path, presumably to undo the redirection of buf_addr to
+ * guest memory - confirm against the zero-copy setup code.
+ */
+static inline void __attribute__((always_inline))
+restore_mbuf(struct rte_mbuf *m)
+{
+	struct rte_mbuf *seg;
+
+	for (seg = m; seg != NULL; seg = seg->next) {
+		/* the data buffer starts right after the mbuf struct and priv data */
+		uint32_t hdr_len = sizeof(struct rte_mbuf) +
+			rte_pktmbuf_priv_size(seg->pool);
+
+		seg->buf_addr = (char *)seg + hdr_len;
+		seg->buf_physaddr = rte_mempool_virt2phy(NULL, seg) + hdr_len;
+	}
+}
+
uint16_t
rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
}
vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
+
+ if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
return 0;
+ if (unlikely(vq->enabled == 0))
+ goto out_access_unlock;
+
if (unlikely(dev->dequeue_zero_copy)) {
struct zcopy_mbuf *zmbuf, *next;
int nr_updated = 0;
nr_updated += 1;
TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+ restore_mbuf(zmbuf->mbuf);
rte_pktmbuf_free(zmbuf->mbuf);
put_zmbuf(zmbuf);
vq->nr_zmbuf -= 1;
 * array, so that it looks as if the guest actually sent such a packet.
*
* Check user_send_rarp() for more information.
+ *
+ * broadcast_rarp shares a cacheline in the virtio_net structure
+ * with some fields that are accessed during enqueue and
+ * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
+ * result in false sharing between enqueue and dequeue.
+ *
+ * Prevent unnecessary false sharing by reading broadcast_rarp first
+ * and only performing cmpset if the read indicates it is likely to
+ * be set.
*/
- if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
- &dev->broadcast_rarp.cnt, 1, 0))) {
+
+ if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
+ rte_atomic16_cmpset((volatile uint16_t *)
+ &dev->broadcast_rarp.cnt, 1, 0))) {
+
rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
if (rarp_mbuf == NULL) {
RTE_LOG(ERR, VHOST_DATA,
"Failed to allocate memory for mbuf.\n");
- return 0;
+ goto out_access_unlock;
}
if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
free_entries = *((volatile uint16_t *)&vq->avail->idx) -
vq->last_avail_idx;
if (free_entries == 0)
- goto out;
+ goto out_access_unlock;
LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
update_used_idx(dev, vq, i);
}
-out:
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
if (unlikely(rarp_mbuf != NULL)) {
/*
* Inject it to the head of "pkts" array, so that switch's mac