Discussion:
[RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.
(too old to reply)
x***@intel.com
2010-04-09 09:37:43 UTC
Permalink
From: Xin Xiaohui <***@intel.com>

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui <***@intel.com>
Signed-off-by: Zhao Yu <***@gmail.com>
Reviewed-by: Jeff Dike <***@linux.intel.com>
---

memory leak fixed,
kconfig made,
do_unbind() made,
mp_chr_ioctl() cleaned up and
some other cleanups made

by Jeff Dike <***@linux.intel.com>

drivers/vhost/Kconfig | 5 +
drivers/vhost/Makefile | 2 +
drivers/vhost/mpassthru.c | 1264 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/mpassthru.h | 29 +
4 files changed, 1300 insertions(+), 0 deletions(-)
create mode 100644 drivers/vhost/mpassthru.c
create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..ee32a3b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,8 @@ config VHOST_NET
To compile this driver as a module, choose M here: the module will
be called vhost_net.

+config VHOST_PASSTHRU
+ tristate "Zerocopy network driver (EXPERIMENTAL)"
+ depends on VHOST_NET
+ ---help---
+ zerocopy network I/O support
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..3f79c79 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
obj-$(CONFIG_VHOST_NET) += vhost_net.o
vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..86d2525
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1264 @@
+/*
+ * MPASSTHRU - Mediate passthrough device.
+ * Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME "mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/aio.h>
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/crc32.h>
+#include <linux/nsproxy.h>
+#include <linux/uaccess.h>
+#include <linux/virtio_net.h>
+#include <linux/mpassthru.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+
+#include "vhost.h"
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+ u16 offset;
+ u16 size;
+};
+
+struct page_ctor {
+ struct list_head readq;
+ int w_len;
+ int r_len;
+ spinlock_t read_lock;
+ struct kmem_cache *cache;
+ /* record the locked pages */
+ int lock_pages;
+ struct rlimit o_rlim;
+ struct net_device *dev;
+ struct mpassthru_port port;
+};
+
+struct page_info {
+ void *ctrl;
+ struct list_head list;
+ int header;
+ /* indicate the actual length of bytes
+ * send/recv in the user space buffers
+ */
+ int total;
+ int offset;
+ struct page *pages[MAX_SKB_FRAGS+1];
+ struct skb_frag_struct frag[MAX_SKB_FRAGS+1];
+ struct sk_buff *skb;
+ struct page_ctor *ctor;
+
+ /* The pointer relayed to skb, to indicate
+ * it's a user space allocated skb or kernel
+ */
+ struct skb_user_page user;
+ struct skb_shared_info ushinfo;
+
+#define INFO_READ 0
+#define INFO_WRITE 1
+ unsigned flags;
+ unsigned pnum;
+
+ /* It's meaningful for receive, means
+ * the max length allowed
+ */
+ size_t len;
+
+ /* The fields after that is for backend
+ * driver, now for vhost-net.
+ */
+
+ struct kiocb *iocb;
+ unsigned int desc_pos;
+ unsigned int log;
+ struct iovec hdr[VHOST_NET_MAX_SG];
+ struct iovec iov[VHOST_NET_MAX_SG];
+};
+
+struct mp_struct {
+ struct mp_file *mfile;
+ struct net_device *dev;
+ struct page_ctor *ctor;
+ struct socket socket;
+
+#ifdef MPASSTHRU_DEBUG
+ int debug;
+#endif
+};
+
+struct mp_file {
+ atomic_t count;
+ struct mp_struct *mp;
+ struct net *net;
+};
+
+struct mp_sock {
+ struct sock sk;
+ struct mp_struct *mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+ int ret = 0;
+
+ rtnl_lock();
+ ret = dev_change_flags(dev, flags);
+ rtnl_unlock();
+
+ if (ret < 0)
+ printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+ return ret;
+}
+
+/* The main function to allocate user space buffers */
+static struct skb_user_page *page_ctor(struct mpassthru_port *port,
+ struct sk_buff *skb, int npages)
+{
+ int i;
+ unsigned long flags;
+ struct page_ctor *ctor;
+ struct page_info *info = NULL;
+
+ ctor = container_of(port, struct page_ctor, port);
+
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq, struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ if (!info)
+ return NULL;
+
+ for (i = 0; i < info->pnum; i++) {
+ get_page(info->pages[i]);
+ info->frag[i].page = info->pages[i];
+ info->frag[i].page_offset = i ? 0 : info->offset;
+ info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+ port->data_len;
+ }
+ info->skb = skb;
+ info->user.frags = info->frag;
+ info->user.ushinfo = &info->ushinfo;
+ return &info->user;
+}
+
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+ struct page_info *info = (struct page_info *)(iocb->private);
+ int i;
+
+ if (info->flags == INFO_READ) {
+ for (i = 0; i < info->pnum; i++) {
+ if (info->pages[i]) {
+ set_page_dirty_lock(info->pages[i]);
+ put_page(info->pages[i]);
+ }
+ }
+ skb_shinfo(info->skb)->destructor_arg = &info->user;
+ info->skb->destructor = NULL;
+ kfree_skb(info->skb);
+ }
+ /* Decrement the number of locked pages */
+ info->ctor->lock_pages -= info->pnum;
+ kmem_cache_free(info->ctor->cache, info);
+
+ return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+ struct kiocb *iocb = NULL;
+
+ iocb = info->iocb;
+ if (!iocb)
+ return iocb;
+ iocb->ki_flags = 0;
+ iocb->ki_users = 1;
+ iocb->ki_key = 0;
+ iocb->ki_ctx = NULL;
+ iocb->ki_cancel = NULL;
+ iocb->ki_retry = NULL;
+ iocb->ki_iovec = NULL;
+ iocb->ki_eventfd = NULL;
+ iocb->private = (void *)info;
+ iocb->ki_pos = info->desc_pos;
+ iocb->ki_nbytes = size;
+ iocb->ki_user_data = info->log;
+ iocb->ki_dtor = mp_ki_dtor;
+ return iocb;
+}
+
+/* A helper to clean the skb before the kfree_skb() */
+
+static void page_dtor_prepare(struct page_info *info)
+{
+ if (info->flags == INFO_READ)
+ if (info->skb)
+ info->skb->head = NULL;
+}
+
+/* The callback to destruct the user space buffers or skb */
+static void page_dtor(struct skb_user_page *user)
+{
+ struct page_info *info;
+ struct page_ctor *ctor;
+ struct sock *sk;
+ struct sk_buff *skb;
+ struct kiocb *iocb = NULL;
+ struct vhost_virtqueue *vq = NULL;
+ unsigned long flags;
+ int i;
+
+ if (!user)
+ return;
+ info = container_of(user, struct page_info, user);
+ if (!info)
+ return;
+ ctor = info->ctor;
+ skb = info->skb;
+
+ page_dtor_prepare(info);
+
+ /* If the info->total is 0, make it to be reused */
+ if (!info->total) {
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ list_add(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ return;
+ }
+
+ if (info->flags == INFO_READ)
+ return;
+
+ /* For transmit, we should wait for the DMA finish by hardware.
+ * Queue the notifier to wake up the backend driver
+ */
+ vq = (struct vhost_virtqueue *)info->ctrl;
+ iocb = create_iocb(info, info->total);
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ list_add_tail(&iocb->ki_list, &vq->notifier);
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+
+ sk = ctor->port.sock->sk;
+ sk->sk_write_space(sk);
+
+ return;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+ int rc;
+ struct page_ctor *ctor;
+ struct net_device *dev = mp->dev;
+
+ /* locked by mp_mutex */
+ if (rcu_dereference(mp->ctor))
+ return -EBUSY;
+
+ ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
+ if (!ctor)
+ return -ENOMEM;
+ rc = netdev_mp_port_prep(dev, &ctor->port);
+ if (rc)
+ goto fail;
+
+ ctor->cache = kmem_cache_create("skb_page_info",
+ sizeof(struct page_info), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+
+ if (!ctor->cache)
+ goto cache_fail;
+
+ INIT_LIST_HEAD(&ctor->readq);
+ spin_lock_init(&ctor->read_lock);
+
+ ctor->w_len = 0;
+ ctor->r_len = 0;
+
+ dev_hold(dev);
+ ctor->dev = dev;
+ ctor->port.ctor = page_ctor;
+ ctor->port.sock = &mp->socket;
+ ctor->lock_pages = 0;
+ rc = netdev_mp_port_attach(dev, &ctor->port);
+ if (rc)
+ goto fail;
+
+ /* locked by mp_mutex */
+ rcu_assign_pointer(mp->ctor, ctor);
+
+ /* XXX:Need we do set_offload here ? */
+
+ return 0;
+
+fail:
+ kmem_cache_destroy(ctor->cache);
+cache_fail:
+ kfree(ctor);
+ dev_put(dev);
+
+ return rc;
+}
+
+struct page_info *info_dequeue(struct page_ctor *ctor)
+{
+ unsigned long flags;
+ struct page_info *info = NULL;
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq,
+ struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ return info;
+}
+
+static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
+ unsigned long cur, unsigned long max)
+{
+ struct rlimit new_rlim, *old_rlim;
+ int retval;
+
+ if (resource != RLIMIT_MEMLOCK)
+ return -EINVAL;
+ new_rlim.rlim_cur = cur;
+ new_rlim.rlim_max = max;
+
+ old_rlim = current->signal->rlim + resource;
+
+ /* remember the old rlimit value when backend enabled */
+ ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
+ ctor->o_rlim.rlim_max = old_rlim->rlim_max;
+
+ if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+ !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ retval = security_task_setrlimit(resource, &new_rlim);
+ if (retval)
+ return retval;
+
+ task_lock(current->group_leader);
+ *old_rlim = new_rlim;
+ task_unlock(current->group_leader);
+ return 0;
+}
+
+static int page_ctor_detach(struct mp_struct *mp)
+{
+ struct page_ctor *ctor;
+ struct page_info *info;
+ struct vhost_virtqueue *vq = NULL;
+ struct kiocb *iocb = NULL;
+ int i;
+ unsigned long flags;
+
+ /* locked by mp_mutex */
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -ENODEV;
+
+ while ((info = info_dequeue(ctor))) {
+ for (i = 0; i < info->pnum; i++)
+ if (info->pages[i])
+ put_page(info->pages[i]);
+ vq = (struct vhost_virtqueue *)(info->ctrl);
+ iocb = create_iocb(info, 0);
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ list_add_tail(&iocb->ki_list, &vq->notifier);
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+
+ kmem_cache_free(ctor->cache, info);
+ }
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ ctor->o_rlim.rlim_cur,
+ ctor->o_rlim.rlim_max);
+ kmem_cache_destroy(ctor->cache);
+ netdev_mp_port_detach(ctor->dev);
+ dev_put(ctor->dev);
+
+ /* locked by mp_mutex */
+ rcu_assign_pointer(mp->ctor, NULL);
+ synchronize_rcu();
+
+ kfree(ctor);
+ return 0;
+}
+
+/* For small user space buffers transmit, we don't need to call
+ * get_user_pages().
+ */
+static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
+ struct kiocb *iocb, int total)
+{
+ struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
+
+ if (!info)
+ return NULL;
+ info->total = total;
+ info->user.dtor = page_dtor;
+ info->ctor = ctor;
+ info->flags = INFO_WRITE;
+ info->iocb = iocb;
+ return info;
+}
+
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the user space address.
+ */
+static struct page_info *alloc_page_info(struct page_ctor *ctor,
+ struct kiocb *iocb, struct iovec *iov,
+ int count, struct frag *frags,
+ int npages, int total)
+{
+ int rc;
+ int i, j, n = 0;
+ int len;
+ unsigned long base, lock_limit;
+ struct page_info *info = NULL;
+
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+
+ if (ctor->lock_pages + count > lock_limit) {
+ printk(KERN_INFO "exceed the locked memory rlimit %d!",
+ lock_limit);
+ return NULL;
+ }
+
+ info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
+
+ if (!info)
+ return NULL;
+
+ for (i = j = 0; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base;
+ len = iov[i].iov_len;
+
+ if (!len)
+ continue;
+ n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+ rc = get_user_pages_fast(base, n, npages ? 1 : 0,
+ &info->pages[j]);
+ if (rc != n)
+ goto failed;
+
+ while (n--) {
+ frags[j].offset = base & ~PAGE_MASK;
+ frags[j].size = min_t(int, len,
+ PAGE_SIZE - frags[j].offset);
+ len -= frags[j].size;
+ base += frags[j].size;
+ j++;
+ }
+ }
+
+#ifdef CONFIG_HIGHMEM
+ if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
+ for (i = 0; i < j; i++) {
+ if (PageHighMem(info->pages[i]))
+ goto failed;
+ }
+ }
+#endif
+
+ info->total = total;
+ info->user.dtor = page_dtor;
+ info->ctor = ctor;
+ info->pnum = j;
+ info->iocb = iocb;
+ if (!npages)
+ info->flags = INFO_WRITE;
+ if (info->flags == INFO_READ) {
+ info->user.start = (u8 *)(((unsigned long)
+ (pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
+ frags[0].offset) - NET_IP_ALIGN - NET_SKB_PAD);
+ info->user.size = iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD;
+ }
+ /* increment the number of locked pages */
+ ctor->lock_pages += j;
+ return info;
+
+failed:
+ for (i = 0; i < j; i++)
+ put_page(info->pages[i]);
+
+ kmem_cache_free(ctor->cache, info);
+
+ return NULL;
+}
+
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
+ struct iovec *iov = m->msg_iov;
+ struct page_info *info = NULL;
+ struct frag frags[MAX_SKB_FRAGS];
+ struct sk_buff *skb;
+ int count = m->msg_iovlen;
+ int total = 0, header, n, i, len, rc;
+ unsigned long base;
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -ENODEV;
+
+ total = iov_length(iov, count);
+
+ if (total < ETH_HLEN)
+ return -EINVAL;
+
+ if (total <= COPY_THRESHOLD)
+ goto copy;
+
+ n = 0;
+ for (i = 0; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base;
+ len = iov[i].iov_len;
+ if (!len)
+ continue;
+ n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (n > MAX_SKB_FRAGS)
+ return -EINVAL;
+ }
+
+copy:
+ header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
+
+ skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC);
+ if (!skb)
+ goto drop;
+
+ skb_reserve(skb, NET_IP_ALIGN);
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ memcpy_fromiovec(skb->data, iov, header);
+ skb_put(skb, header);
+ skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
+
+ if (header == total) {
+ rc = total;
+ info = alloc_small_page_info(ctor, iocb, total);
+ } else {
+ info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total);
+ if (info)
+ for (i = 0; info->pages[i]; i++) {
+ skb_add_rx_frag(skb, i, info->pages[i],
+ frags[i].offset, frags[i].size);
+ info->pages[i] = NULL;
+ }
+ }
+ if (info != NULL) {
+ info->desc_pos = iocb->ki_pos;
+ info->ctrl = vq;
+ info->total = total;
+ info->skb = skb;
+ skb_shinfo(skb)->destructor_arg = &info->user;
+ skb->dev = mp->dev;
+ dev_queue_xmit(skb);
+ return 0;
+ }
+drop:
+ kfree_skb(skb);
+ if (info) {
+ for (i = 0; info->pages[i]; i++)
+ put_page(info->pages[i]);
+ kmem_cache_free(info->ctor->cache, info);
+ }
+ mp->dev->stats.tx_dropped++;
+ return -ENOMEM;
+}
+
+
+static void mp_recvmsg_notify(struct vhost_virtqueue *vq)
+{
+ struct socket *sock = vq->private_data;
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor = NULL;
+ struct sk_buff *skb = NULL;
+ struct page_info *info = NULL;
+ struct ethhdr *eth;
+ struct kiocb *iocb = NULL;
+ int len, i;
+ unsigned long flags;
+
+ struct virtio_net_hdr hdr = {
+ .flags = 0,
+ .gso_type = VIRTIO_NET_HDR_GSO_NONE
+ };
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return;
+
+ while ((skb = skb_dequeue(&sock->sk->sk_receive_queue)) != NULL) {
+ if (skb_shinfo(skb)->destructor_arg) {
+ info = container_of(skb_shinfo(skb)->destructor_arg,
+ struct page_info, user);
+ info->skb = skb;
+ if (skb->len > info->len) {
+ mp->dev->stats.rx_dropped++;
+ DBG(KERN_INFO "Discarded truncated rx packet: "
+ " len %d > %zd\n", skb->len, info->len);
+ info->total = skb->len;
+ goto clean;
+ } else {
+ int i;
+ struct skb_shared_info *gshinfo =
+ (struct skb_shared_info *)(&info->ushinfo);
+ struct skb_shared_info *hshinfo =
+ skb_shinfo(skb);
+
+ if (gshinfo->nr_frags < hshinfo->nr_frags)
+ goto clean;
+ eth = eth_hdr(skb);
+ skb_push(skb, ETH_HLEN);
+
+ hdr.hdr_len = skb_headlen(skb);
+ info->total = skb->len;
+
+ for (i = 0; i < gshinfo->nr_frags; i++)
+ gshinfo->frags[i].size = 0;
+ for (i = 0; i < hshinfo->nr_frags; i++)
+ gshinfo->frags[i].size =
+ hshinfo->frags[i].size;
+ memcpy(skb_shinfo(skb), &info->ushinfo,
+ sizeof(struct skb_shared_info));
+ }
+ } else {
+ /* The skb composed with kernel buffers
+ * in case user space buffers are not sufficent.
+ * The case should be rare.
+ */
+ unsigned long flags;
+ int i;
+ struct skb_shared_info *gshinfo = NULL;
+
+ info = NULL;
+
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq,
+ struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ if (!info) {
+ DBG(KERN_INFO "No user buffer avaliable %p\n",
+ skb);
+ skb_queue_head(&sock->sk->sk_receive_queue,
+ skb);
+ break;
+ }
+ info->skb = skb;
+ /* compute the guest skb frags info */
+ gshinfo = (struct skb_shared_info *)(info->user.start +
+ SKB_DATA_ALIGN(info->user.size));
+
+ if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
+ goto clean;
+
+ eth = eth_hdr(skb);
+ skb_push(skb, ETH_HLEN);
+ info->total = skb->len;
+
+ for (i = 0; i < gshinfo->nr_frags; i++)
+ gshinfo->frags[i].size = 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ gshinfo->frags[i].size =
+ skb_shinfo(skb)->frags[i].size;
+ hdr.hdr_len = min_t(int, skb->len,
+ info->iov[1].iov_len);
+ skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
+ }
+
+ len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
+ sizeof hdr);
+ if (len) {
+ DBG(KERN_INFO
+ "Unable to write vnet_hdr at addr %p: %d\n",
+ info->hdr->iov_base, len);
+ goto clean;
+ }
+ iocb = create_iocb(info, skb->len + sizeof(hdr));
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ list_add_tail(&iocb->ki_list, &vq->notifier);
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+ continue;
+
+clean:
+ kfree_skb(skb);
+ for (i = 0; info->pages[i]; i++)
+ put_page(info->pages[i]);
+ kmem_cache_free(ctor->cache, info);
+ }
+ return;
+}
+
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len,
+ int flags)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
+ struct iovec *iov = m->msg_iov;
+ int count = m->msg_iovlen;
+ int npages, payload;
+ struct page_info *info;
+ struct frag frags[MAX_SKB_FRAGS];
+ unsigned long base;
+ int i, len;
+ unsigned long flag;
+
+ if (!(flags & MSG_DONTWAIT))
+ return -EINVAL;
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -EINVAL;
+
+ /* Error detections in case invalid user space buffer */
+ if (count > 2 && iov[1].iov_len < ctor->port.hdr_len &&
+ mp->dev->features & NETIF_F_SG) {
+ return -EINVAL;
+ }
+
+ npages = ctor->port.npages;
+ payload = ctor->port.data_len;
+
+ /* If KVM guest virtio-net FE driver use SG feature */
+ if (count > 2) {
+ for (i = 2; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
+ len = iov[i].iov_len;
+ if (npages == 1)
+ len = min_t(int, len, PAGE_SIZE - base);
+ else if (base)
+ break;
+ payload -= len;
+ if (payload <= 0)
+ goto proceed;
+ if (npages == 1 || (len & ~PAGE_MASK))
+ break;
+ }
+ }
+
+ if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
+ - NET_SKB_PAD - NET_IP_ALIGN) >= 0)
+ goto proceed;
+
+ return -EINVAL;
+
+proceed:
+ /* skip the virtnet head */
+ iov++;
+ count--;
+
+ /* Translate address to kernel */
+ info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
+ if (!info)
+ return -ENOMEM;
+ info->len = total_len;
+ info->hdr[0].iov_base = vq->hdr[0].iov_base;
+ info->hdr[0].iov_len = vq->hdr[0].iov_len;
+ info->offset = frags[0].offset;
+ info->desc_pos = iocb->ki_pos;
+ info->log = iocb->ki_user_data;
+ info->ctrl = vq;
+
+ iov--;
+ count++;
+
+ memcpy(info->iov, vq->iov, sizeof(struct iovec) * count);
+
+ spin_lock_irqsave(&ctor->read_lock, flag);
+ list_add_tail(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+ if (!vq->receiver) {
+ vq->receiver = mp_recvmsg_notify;
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ vq->num * 4096,
+ vq->num * 4096);
+ }
+
+ return 0;
+}
+
+static void __mp_detach(struct mp_struct *mp)
+{
+ mp->mfile = NULL;
+
+ mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+ page_ctor_detach(mp);
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+ /* Drop the extra count on the net device */
+ dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+ mutex_lock(&mp_mutex);
+ __mp_detach(mp);
+ mutex_unlock(&mp_mutex);
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+ if (atomic_dec_and_test(&mfile->count))
+ mp_detach(mfile->mp);
+}
+
+static int mp_release(struct socket *sock)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct mp_file *mfile = mp->mfile;
+
+ mp_put(mfile);
+ sock_put(mp->socket.sk);
+ put_net(mfile->net);
+
+ return 0;
+}
+
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+ .sendmsg = mp_sendmsg,
+ .recvmsg = mp_recvmsg,
+ .release = mp_release,
+};
+
+static struct proto mp_proto = {
+ .name = "mp",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct mp_sock),
+};
+
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+ struct mp_file *mfile;
+ cycle_kernel_lock();
+ DBG1(KERN_INFO "mp: mp_chr_open\n");
+
+ mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
+ if (!mfile)
+ return -ENOMEM;
+ atomic_set(&mfile->count, 0);
+ mfile->mp = NULL;
+ mfile->net = get_net(current->nsproxy->net_ns);
+ file->private_data = mfile;
+ return 0;
+}
+
+
+static struct mp_struct *mp_get(struct mp_file *mfile)
+{
+ struct mp_struct *mp = NULL;
+ if (atomic_inc_not_zero(&mfile->count))
+ mp = mfile->mp;
+
+ return mp;
+}
+
+
+static int mp_attach(struct mp_struct *mp, struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+ int err;
+
+ netif_tx_lock_bh(mp->dev);
+
+ err = -EINVAL;
+
+ if (mfile->mp)
+ goto out;
+
+ err = -EBUSY;
+ if (mp->mfile)
+ goto out;
+
+ err = 0;
+ mfile->mp = mp;
+ mp->mfile = mfile;
+ mp->socket.file = file;
+ dev_hold(mp->dev);
+ sock_hold(mp->socket.sk);
+ atomic_inc(&mfile->count);
+
+out:
+ netif_tx_unlock_bh(mp->dev);
+ return err;
+}
+
+static void mp_sock_destruct(struct sock *sk)
+{
+ struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+ kfree(mp);
+}
+
+static int do_unbind(struct mp_file *mfile)
+{
+ struct mp_struct *mp = mp_get(mfile);
+
+ if (!mp)
+ return -EINVAL;
+
+ mp_detach(mp);
+ sock_put(mp->socket.sk);
+ mp_put(mfile);
+ return 0;
+}
+
+static void mp_sock_data_ready(struct sock *sk, int len)
+{
+ if (sk_has_sleeper(sk))
+ wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+ if (sk_has_sleeper(sk))
+ wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+ struct net_device *dev;
+ void __user* argp = (void __user *)arg;
+ struct ifreq ifr;
+ struct sock *sk;
+ int ret;
+
+ ret = -EINVAL;
+
+ switch (cmd) {
+ case MPASSTHRU_BINDDEV:
+ ret = -EFAULT;
+ if (copy_from_user(&ifr, argp, sizeof ifr))
+ break;
+
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+ ret = -EBUSY;
+
+ if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
+ break;
+
+ ret = -ENODEV;
+ dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+ if (!dev)
+ break;
+
+ mutex_lock(&mp_mutex);
+
+ ret = -EBUSY;
+ mp = mfile->mp;
+ if (mp)
+ goto err_dev_put;
+
+ mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+ if (!mp) {
+ ret = -ENOMEM;
+ goto err_dev_put;
+ }
+ mp->dev = dev;
+ ret = -ENOMEM;
+
+ sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
+ if (!sk)
+ goto err_free_mp;
+
+ init_waitqueue_head(&mp->socket.wait);
+ mp->socket.ops = &mp_socket_ops;
+ sock_init_data(&mp->socket, sk);
+ sk->sk_sndbuf = INT_MAX;
+ container_of(sk, struct mp_sock, sk)->mp = mp;
+
+ sk->sk_destruct = mp_sock_destruct;
+ sk->sk_data_ready = mp_sock_data_ready;
+ sk->sk_write_space = mp_sock_write_space;
+
+ ret = mp_attach(mp, file);
+ if (ret < 0)
+ goto err_free_sk;
+
+ ret = page_ctor_attach(mp);
+ if (ret < 0)
+ goto err_free_sk;
+
+ ifr.ifr_flags |= IFF_MPASSTHRU_EXCL;
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+out:
+ mutex_unlock(&mp_mutex);
+ break;
+err_free_sk:
+ sk_free(sk);
+err_free_mp:
+ kfree(mp);
+err_dev_put:
+ dev_put(dev);
+ goto out;
+
+ case MPASSTHRU_UNBINDDEV:
+ ret = do_unbind(mfile);
+ break;
+
+ default:
+ break;
+ }
+ return ret;
+}
+
+static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp = mp_get(mfile);
+ struct sock *sk;
+ unsigned int mask = 0;
+
+ if (!mp)
+ return POLLERR;
+
+ sk = mp->socket.sk;
+
+ poll_wait(file, &mp->socket.wait, wait);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sock_writeable(sk) ||
+ (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+ sock_writeable(sk)))
+ mask |= POLLOUT | POLLWRNORM;
+
+ if (mp->dev->reg_state != NETREG_REGISTERED)
+ mask = POLLERR;
+
+ mp_put(mfile);
+ return mask;
+}
+
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct mp_struct *mp = mp_get(file->private_data);
+ struct sock *sk = mp->socket.sk;
+ struct sk_buff *skb;
+ int len, err;
+ ssize_t result;
+
+ if (!mp)
+ return -EBADFD;
+
+ /* currently, async is not supported.
+ * but we may support real async aio from user application,
+ * maybe qemu virtio-net backend.
+ */
+ if (!is_sync_kiocb(iocb))
+ return -EFAULT;
+
+ len = iov_length(iov, count);
+
+ if (unlikely(len) < ETH_HLEN)
+ return -EINVAL;
+
+ skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+ file->f_flags & O_NONBLOCK, &err);
+
+ if (!skb)
+ return -EFAULT;
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, len);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+ kfree_skb(skb);
+ return -EAGAIN;
+ }
+
+ skb->protocol = eth_type_trans(skb, mp->dev);
+ skb->dev = mp->dev;
+
+ dev_queue_xmit(skb);
+
+ mp_put(file->private_data);
+ return result;
+}
+
+static int mp_chr_close(struct inode *inode, struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+
+ /*
+ * Ignore return value since an error only means there was nothing to
+ * do
+ */
+ do_unbind(mfile);
+
+ put_net(mfile->net);
+ kfree(mfile);
+
+ return 0;
+}
+
+static const struct file_operations mp_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .write = do_sync_write,
+ .aio_write = mp_chr_aio_write,
+ .poll = mp_chr_poll,
+ .unlocked_ioctl = mp_chr_ioctl,
+ .open = mp_chr_open,
+ .release = mp_chr_close,
+};
+
+static struct miscdevice mp_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mp",
+ .nodename = "net/mp",
+ .fops = &mp_fops,
+};
+
+static int mp_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct mpassthru_port *port;
+ struct mp_struct *mp = NULL;
+ struct socket *sock = NULL;
+
+ port = dev->mp_port;
+ if (port == NULL)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ sock = dev->mp_port->sock;
+ mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ do_unbind(mp->mfile);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mp_notifier_block __read_mostly = {
+ .notifier_call = mp_device_event,
+};
+
+static int mp_init(void)
+{
+ int ret = 0;
+
+ ret = misc_register(&mp_miscdev);
+ if (ret)
+ printk(KERN_ERR "mp: Can't register misc device\n");
+ else {
+ printk(KERN_INFO "Registering mp misc device - minor = %d\n",
+ mp_miscdev.minor);
+ register_netdevice_notifier(&mp_notifier_block);
+ }
+ return ret;
+}
+
+void mp_cleanup(void)
+{
+ unregister_netdevice_notifier(&mp_notifier_block);
+ misc_deregister(&mp_miscdev);
+}
+
+/* Get an underlying socket object from mp file. Returns error unless file is
+ * attached to a device. The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *mp_get_socket(struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+
+ if (file->f_op != &mp_fops)
+ return ERR_PTR(-EINVAL);
+ mp = mp_get(mfile);
+ if (!mp)
+ return ERR_PTR(-EBADFD);
+ mp_put(mfile);
+ return &mp->socket;
+}
+EXPORT_SYMBOL_GPL(mp_get_socket);
+
+module_init(mp_init);
+module_exit(mp_cleanup);
+MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 0000000..2be21c5
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,29 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int)
+
+/* MPASSTHRU ifc flags */
+#define IFF_MPASSTHRU 0x0001
+#define IFF_MPASSTHRU_EXCL 0x0002
+
+#ifdef __KERNEL__
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct socket *mp_get_socket(struct file *);
+#else
+#include <linux/err.h>
+#include <linux/errno.h>
+struct file;
+struct socket;
+static inline struct socket *mp_get_socket(struct file *f)
+{
+ return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_VHOST_PASSTHRU */
+#endif /* __KERNEL__ */
+#endif /* __MPASSTHRU_H */
--
1.5.4.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
x***@intel.com
2010-04-09 09:37:45 UTC
Permalink
From: Xin Xiaohui <***@intel.com>

The patch let host NIC driver to receive user space skb,
then the driver has chance to directly DMA to guest user
space buffers thru single ethX interface.

Signed-off-by: Xin Xiaohui <***@intel.com>
Signed-off-by: Zhao Yu <***@gmail.com>
Reviewed-by: Jeff Dike <***@linux.intel.com>
---

alloc_skb() is cleanup by Jeff Dike <***@linux.intel.com>

include/linux/netdevice.h | 69 ++++++++++++++++++++++++++++++++++++++++-
include/linux/skbuff.h | 30 ++++++++++++++++--
net/core/dev.c | 63 ++++++++++++++++++++++++++++++++++++++
net/core/skbuff.c | 74 ++++++++++++++++++++++++++++++++++++++++----
4 files changed, 224 insertions(+), 12 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c1..ba48eb0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -485,6 +485,17 @@ struct netdev_queue {
unsigned long tx_dropped;
} ____cacheline_aligned_in_smp;

+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct mpassthru_port {
+ int hdr_len;
+ int data_len;
+ int npages;
+ unsigned flags;
+ struct socket *sock;
+ struct skb_user_page *(*ctor)(struct mpassthru_port *,
+ struct sk_buff *, int);
+};
+#endif

/*
* This structure defines the management hooks for network devices.
@@ -636,6 +647,10 @@ struct net_device_ops {
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
u16 xid);
#endif
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ int (*ndo_mp_port_prep)(struct net_device *dev,
+ struct mpassthru_port *port);
+#endif
};

/*
@@ -891,7 +906,8 @@ struct net_device
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port *garp_port;
-
+ /* mpassthru */
+ struct mpassthru_port *mp_port;
/* class/net/name entry */
struct device dev;
/* space for optional statistics and wireless sysfs groups */
@@ -2013,6 +2029,55 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
return 0;
return dev->ethtool_ops->get_flags(dev);
}
-#endif /* __KERNEL__ */

+/* To support zero-copy between user space application and NIC driver,
+ * we'd better ask NIC driver for the capability it can provide, especially
+ * for packet split mode, now we only ask for the header size, and the
+ * payload once a descriptor may carry.
+ */
+
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+static inline int netdev_mp_port_prep(struct net_device *dev,
+ struct mpassthru_port *port)
+{
+ int rc;
+ int npages, data_len;
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /* needed by packet split */
+ if (ops->ndo_mp_port_prep) {
+ rc = ops->ndo_mp_port_prep(dev, port);
+ if (rc)
+ return rc;
+ } else {
+ /* If the NIC driver did not report this,
+ * then we try to use it as igb driver.
+ */
+ port->hdr_len = 128;
+ port->data_len = 2048;
+ port->npages = 1;
+ }
+
+ if (port->hdr_len <= 0)
+ goto err;
+
+ npages = port->npages;
+ data_len = port->data_len;
+ if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+ (data_len < PAGE_SIZE * (npages - 1) ||
+ data_len > PAGE_SIZE * npages))
+ goto err;
+
+ return 0;
+err:
+ dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+ return -EINVAL;
+}
+
+extern int netdev_mp_port_attach(struct net_device *dev,
+ struct mpassthru_port *port);
+extern void netdev_mp_port_detach(struct net_device *dev);
+#endif /* CONFIG_VHOST_PASSTHRU */
+#endif /* __KERNEL__ */
#endif /* _LINUX_NETDEVICE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index df7b23a..e59fa57 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -209,6 +209,13 @@ struct skb_shared_info {
void * destructor_arg;
};

+struct skb_user_page {
+ u8 *start;
+ int size;
+ struct skb_frag_struct *frags;
+ struct skb_shared_info *ushinfo;
+ void (*dtor)(struct skb_user_page *);
+};
/* We divide dataref into two halves. The higher 16 bits hold references
* to the payload part of skb->data. The lower 16 bits hold references to
* the entire skb->data. A clone of a headerless skb holds the length of
@@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
extern void consume_skb(struct sk_buff *skb);
extern void __kfree_skb(struct sk_buff *skb);
extern struct sk_buff *__alloc_skb(unsigned int size,
- gfp_t priority, int fclone, int node);
+ gfp_t priority, int fclone,
+ int node, struct net_device *dev);
static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
{
- return __alloc_skb(size, priority, 0, -1);
+ return __alloc_skb(size, priority, 0, -1, NULL);
}

static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
gfp_t priority)
{
- return __alloc_skb(size, priority, 1, -1);
+ return __alloc_skb(size, priority, 1, -1, NULL);
}

extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1509,6 +1517,22 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
__free_page(page);
}

+extern struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+ struct sk_buff *skb, int npages);
+
+static inline struct skb_user_page *netdev_alloc_user_page(
+ struct net_device *dev,
+ struct sk_buff *skb, unsigned int size)
+{
+ struct skb_user_page *user;
+ int npages = (size < PAGE_SIZE) ? 1 : (size / PAGE_SIZE);
+
+ user = netdev_alloc_user_pages(dev, skb, npages);
+ if (likely(user))
+ return user;
+ return NULL;
+}
+
/**
* skb_clone_writable - is the header of a clone writable
* @skb: buffer to check
diff --git a/net/core/dev.c b/net/core/dev.c
index b8f74cf..b50bdcb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2265,6 +2265,61 @@ void netif_nit_deliver(struct sk_buff *skb)
rcu_read_unlock();
}

+/* Add a hook to intercept zero-copy packets, and insert it
+ * to the socket queue specially.
+ */
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+int netdev_mp_port_attach(struct net_device *dev,
+ struct mpassthru_port *port)
+{
+ /* locked by mp_mutex */
+ if (rcu_dereference(dev->mp_port))
+ return -EBUSY;
+
+ rcu_assign_pointer(dev->mp_port, port);
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_mp_port_attach);
+
+void netdev_mp_port_detach(struct net_device *dev)
+{
+ /* locked by mp_mutex */
+ if (!rcu_dereference(dev->mp_port))
+ return;
+
+ rcu_assign_pointer(dev->mp_port, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(netdev_mp_port_detach);
+
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+ struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+ struct mpassthru_port *mp_port = NULL;
+ struct sock *sk = NULL;
+
+ if (skb->dev)
+ mp_port = skb->dev->mp_port;
+ if (!mp_port)
+ return skb;
+
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ sk = mp_port->sock->sk;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk, skb->len);
+
+ return NULL;
+}
+#else
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev) (skb)
+#endif
+
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
@@ -2342,6 +2397,9 @@ int netif_receive_skb(struct sk_buff *skb)
goto out;
ncls:
#endif
+ skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto out;

skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
@@ -2455,6 +2513,11 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
if (skb_is_gso(skb) || skb_has_frags(skb))
goto normal;

+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (skb->dev && skb->dev->mp_port)
+ goto normal;
+#endif
+
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 80a9616..e684898 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -170,13 +170,15 @@ EXPORT_SYMBOL(skb_under_panic);
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone, int node)
+ int fclone, int node, struct net_device *dev)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
-
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ struct skb_user_page *user = NULL;
+#endif
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;

/* Get the HEAD */
@@ -185,8 +187,26 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
goto out;

size = SKB_DATA_ALIGN(size);
- data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
- gfp_mask, node);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (!dev || !dev->mp_port) { /* Legacy alloc func */
+#endif
+ data = kmalloc_node_track_caller(
+ size + sizeof(struct skb_shared_info),
+ gfp_mask, node);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ } else { /* Allocation may from page constructor of device */
+ user = netdev_alloc_user_page(dev, skb, size);
+ if (!user) {
+ data = kmalloc_node_track_caller(
+ size + sizeof(struct skb_shared_info),
+ gfp_mask, node);
+ printk(KERN_INFO "can't alloc user buffer.\n");
+ } else {
+ data = user->start;
+ size = SKB_DATA_ALIGN(user->size);
+ }
+ }
+#endif
if (!data)
goto nodata;

@@ -208,6 +228,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->mac_header = ~0U;
#endif

+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (user)
+ memcpy(user->ushinfo, skb_shinfo(skb),
+ sizeof(struct skb_shared_info));
+#endif
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
@@ -231,6 +256,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,

child->fclone = SKB_FCLONE_UNAVAILABLE;
}
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ shinfo->destructor_arg = user;
+#endif
+
out:
return skb;
nodata:
@@ -259,7 +288,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
struct sk_buff *skb;

- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
@@ -278,6 +307,24 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
}
EXPORT_SYMBOL(__netdev_alloc_page);

+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+ struct sk_buff *skb, int npages)
+{
+ struct mpassthru_port *ctor;
+ struct skb_user_page *user = NULL;
+
+ ctor = rcu_dereference(dev->mp_port);
+ if (!ctor)
+ goto out;
+ BUG_ON(npages > ctor->npages);
+ user = ctor->ctor(ctor, skb, npages);
+out:
+ return user;
+}
+EXPORT_SYMBOL(netdev_alloc_user_pages);
+#endif
+
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size)
{
@@ -338,6 +385,10 @@ static void skb_clone_fraglist(struct sk_buff *skb)

static void skb_release_data(struct sk_buff *skb)
{
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ struct skb_user_page *user = skb_shinfo(skb)->destructor_arg;
+#endif
+
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
&skb_shinfo(skb)->dataref)) {
@@ -349,7 +400,10 @@ static void skb_release_data(struct sk_buff *skb)

if (skb_has_frags(skb))
skb_drop_fraglist(skb);
-
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (skb->dev && skb->dev->mp_port && user && user->dtor)
+ user->dtor(user);
+#endif
kfree(skb->head);
}
}
@@ -503,8 +557,14 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
if (skb_shared(skb) || skb_cloned(skb))
return 0;

- skb_release_head_state(skb);
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+ if (skb->dev && skb->dev->mp_port)
+ return 0;
+#endif
+
shinfo = skb_shinfo(skb);
+
+ skb_release_head_state(skb);
atomic_set(&shinfo->dataref, 1);
shinfo->nr_frags = 0;
shinfo->gso_size = 0;
--
1.5.4.4
x***@intel.com
2010-04-09 09:37:44 UTC
Permalink
From: Xin Xiaohui <***@intel.com>

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui <***@intel.com>
---
drivers/vhost/net.c | 203 +++++++++++++++++++++++++++++++++++++++++++++++--
drivers/vhost/vhost.c | 115 ++++++++++++++++------------
drivers/vhost/vhost.h | 15 ++++
3 files changed, 278 insertions(+), 55 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..d3fb3fc 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>
+#include <linux/aio.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
+#include <linux/mpassthru.h>

#include <net/sock.h>

@@ -47,6 +49,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+ struct kmem_cache *cache;
/* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
@@ -91,11 +94,100 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
}

+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ if (!list_empty(&vq->notifier)) {
+ iocb = list_first_entry(&vq->notifier,
+ struct kiocb, ki_list);
+ list_del(&iocb->ki_list);
+ }
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+ return iocb;
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ struct vhost_log *vq_log = NULL;
+ int rx_total_len = 0;
+ unsigned int head, log, in, out;
+ int size;
+
+ if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+ return;
+
+ if (vq->receiver)
+ vq->receiver(vq);
+
+ vq_log = unlikely(vhost_has_feature(
+ &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, iocb->ki_nbytes);
+ log = (int)iocb->ki_user_data;
+ size = iocb->ki_nbytes;
+ head = iocb->ki_pos;
+ rx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
+ kmem_cache_free(net->cache, iocb);
+
+ /* when log is enabled, recomputing the log info is needed,
+ * since these buffers are in async queue, and may not get
+ * the log info before.
+ */
+ if (unlikely(vq_log)) {
+ if (!log)
+ __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in, vq_log,
+ &log, head);
+ vhost_log_write(vq, vq_log, log, size);
+ }
+ if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ int tx_total_len = 0;
+
+ if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+ return;
+
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, 0);
+ tx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
+
+ kmem_cache_free(net->cache, iocb);
+ if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+ struct kiocb *iocb = NULL;
unsigned head, out, in, s;
struct msghdr msg = {
.msg_name = NULL,
@@ -124,6 +216,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;

+ handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -151,6 +245,15 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+ iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+ if (!iocb)
+ break;
+ iocb->ki_pos = head;
+ iocb->private = (void *)vq;
+ }
+
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
@@ -160,12 +263,18 @@ static void handle_tx(struct vhost_net *net)
break;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
- err = sock->ops->sendmsg(NULL, sock, &msg, len);
+ err = sock->ops->sendmsg(iocb, sock, &msg, len);
if (unlikely(err < 0)) {
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
tx_poll_start(net, sock);
break;
}
+
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ continue;
+
if (err != len)
pr_err("Truncated TX packet: "
" len %d != %zd\n", err, len);
@@ -177,6 +286,8 @@ static void handle_tx(struct vhost_net *net)
}
}

+ handle_async_tx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}
@@ -186,6 +297,7 @@ static void handle_tx(struct vhost_net *net)
static void handle_rx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
unsigned head, out, in, log, s;
struct vhost_log *vq_log;
struct msghdr msg = {
@@ -206,7 +318,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq->private_data);
- if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+ if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+ vq->link_state == VHOST_VQ_LINK_SYNC))
return;

use_mm(net->dev.mm);
@@ -214,9 +327,17 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;

+ /* In async cases, when write log is enabled, in case the submitted
+ * buffers did not get log info before the log enabling, so we'd
+ * better recompute the log info when needed. We do this in
+ * handle_async_rx_events_notify().
+ */
+
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;

+ handle_async_rx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -245,6 +366,14 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq->iov, in);
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+ iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+ if (!iocb)
+ break;
+ iocb->private = vq;
+ iocb->ki_pos = head;
+ iocb->ki_user_data = log;
+ }
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for RX: "
@@ -252,13 +381,20 @@ static void handle_rx(struct vhost_net *net)
iov_length(vq->hdr, s), hdr_size);
break;
}
- err = sock->ops->recvmsg(NULL, sock, &msg,
+
+ err = sock->ops->recvmsg(iocb, sock, &msg,
len, MSG_DONTWAIT | MSG_TRUNC);
/* TODO: Check specific error and bomb out unless EAGAIN? */
if (err < 0) {
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
break;
}
+
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ continue;
+
/* TODO: Should check and handle checksum. */
if (err > len) {
pr_err("Discarded truncated rx packet: "
@@ -284,10 +420,13 @@ static void handle_rx(struct vhost_net *net)
}
}

+ handle_async_rx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}

+
static void handle_tx_kick(struct work_struct *work)
{
struct vhost_virtqueue *vq;
@@ -338,6 +477,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+ n->cache = NULL;
return 0;
}

@@ -398,6 +538,18 @@ static void vhost_net_flush(struct vhost_net *n)
vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}

+static void vhost_async_cleanup(struct vhost_net *n)
+{
+ /* clean the notifier */
+ struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
+ if (n->cache) {
+ while ((iocb = notify_dequeue(vq)) != NULL)
+ kmem_cache_free(n->cache, iocb);
+ kmem_cache_destroy(n->cache);
+ }
+}
+
static int vhost_net_release(struct inode *inode, struct file *f)
{
struct vhost_net *n = f->private_data;
@@ -414,6 +566,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
+ vhost_async_cleanup(n);
kfree(n);
return 0;
}
@@ -462,7 +615,19 @@ static struct socket *get_tun_socket(int fd)
return sock;
}

-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+ struct file *file = fget(fd);
+ struct socket *sock;
+ if (!file)
+ return ERR_PTR(-EBADF);
+ sock = mp_get_socket(file);
+ if (IS_ERR(sock))
+ fput(file);
+ return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
{
struct socket *sock;
if (fd == -1)
@@ -473,9 +638,31 @@ static struct socket *get_socket(int fd)
sock = get_tun_socket(fd);
if (!IS_ERR(sock))
return sock;
+ sock = get_mp_socket(fd);
+ if (!IS_ERR(sock)) {
+ vq->link_state = VHOST_VQ_LINK_ASYNC;
+ return sock;
+ }
return ERR_PTR(-ENOTSOCK);
}

+static void vhost_init_link_state(struct vhost_net *n, int index)
+{
+ struct vhost_virtqueue *vq = n->vqs + index;
+
+ WARN_ON(!mutex_is_locked(&vq->mutex));
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+ vq->receiver = NULL;
+ INIT_LIST_HEAD(&vq->notifier);
+ spin_lock_init(&vq->notify_lock);
+ if (!n->cache) {
+ n->cache = kmem_cache_create("vhost_kiocb",
+ sizeof(struct kiocb), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ }
+ }
+}
+
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
struct socket *sock, *oldsock;
@@ -493,12 +680,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
}
vq = n->vqs + index;
mutex_lock(&vq->mutex);
- sock = get_socket(fd);
+ vq->link_state = VHOST_VQ_LINK_SYNC;
+ sock = get_socket(vq, fd);
if (IS_ERR(sock)) {
r = PTR_ERR(sock);
goto err;
}

+ vhost_init_link_state(n, index);
+
/* start polling new socket */
oldsock = vq->private_data;
if (sock == oldsock)
@@ -507,8 +697,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, sock);
vhost_net_enable_vq(n, vq);
- mutex_unlock(&vq->mutex);
done:
+ mutex_unlock(&vq->mutex);
mutex_unlock(&n->dev.mutex);
if (oldsock) {
vhost_net_flush_vq(n, index);
@@ -516,6 +706,7 @@ done:
}
return r;
err:
+ mutex_unlock(&vq->mutex);
mutex_unlock(&n->dev.mutex);
return r;
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
return 0;
}

-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head)
{
struct vring_desc desc;
- unsigned int i, head, found = 0;
- u16 last_avail_idx;
+ unsigned int i = head, found = 0;
int ret;

- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
- if (get_user(vq->avail_idx, &vq->avail->idx)) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return vq->num;
- }
-
- if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return vq->num;
- }
-
- /* If there's nothing new since last we looked, return invalid. */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
-
- /* Only get avail ring entries after they have been exposed by guest. */
- rmb();
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return vq->num;
- }
-
- /* If their number is silly, that's an error. */
- if (head >= vq->num) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return vq->num;
- }
-
/* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
if (unlikely(log))
*log_num = 0;

- i = head;
do {
unsigned iov_count = *in_num + *out_num;
if (i >= vq->num) {
@@ -833,8 +788,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
*out_num += ret;
}
} while ((i = next_desc(&desc)) != -1);
+ return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ struct vring_desc desc;
+ unsigned int i, head, found = 0;
+ u16 last_avail_idx;
+ unsigned int ret;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vq->last_avail_idx;
+ if (get_user(vq->avail_idx, &vq->avail->idx)) {
+ vq_err(vq, "Failed to access avail idx at %p\n",
+ &vq->avail->idx);
+ return vq->num;
+ }
+
+ if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+ vq_err(vq, "Guest moved used index from %u to %u",
+ last_avail_idx, vq->avail_idx);
+ return vq->num;
+ }
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->avail_idx == last_avail_idx)
+ return vq->num;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ rmb();
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+ vq_err(vq, "Failed to read head: idx %d address %p\n",
+ last_avail_idx,
+ &vq->avail->ring[last_avail_idx % vq->num]);
+ return vq->num;
+ }
+
+ /* If their number is silly, that's an error. */
+ if (head >= vq->num) {
+ vq_err(vq, "Guest says index %u > %u is available",
+ head, vq->num);
+ return vq->num;
+ }
+
+ ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+ out_num, in_num,
+ log, log_num, head);

/* On success, increment avail index. */
+ if (ret == vq->num)
+ return ret;
vq->last_avail_idx++;
return head;
}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d1f0453..a74a6d4 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,11 @@ struct vhost_log {
u64 len;
};

+enum vhost_vq_link_state {
+ VHOST_VQ_LINK_SYNC = 0,
+ VHOST_VQ_LINK_ASYNC = 1,
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -96,6 +101,11 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log log[VHOST_NET_MAX_SG];
+ /*Differiate async socket for 0-copy from normal*/
+ enum vhost_vq_link_state link_state;
+ struct list_head notifier;
+ spinlock_t notify_lock;
+ void (*receiver)(struct vhost_virtqueue *);
};

struct vhost_dev {
@@ -122,6 +132,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head);
void vhost_discard_vq_desc(struct vhost_virtqueue *);

int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
--
1.5.4.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Arnd Bergmann
2010-04-14 14:55:21 UTC
Permalink
Post by x***@intel.com
Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.
Sorry for taking so long before finding the time to look
at your code in more detail.

It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?

I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?
Post by x***@intel.com
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..86d2525
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1264 @@
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
This should probably just use the existing dev_dbg/pr_debug infrastructure.
Post by x***@intel.com
[... skipping buffer management code for now]
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len)
+{
[...]
This function looks like we should be able to easily include it into
macvtap and get zero-copy transmits without introducing the new
user-level interface.
Post by x***@intel.com
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len,
+ int flags)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
It smells like a layering violation to look at the iocb->private field
from a lower-level driver. I would have hoped that it's possible to implement
this without having this driver know about the higher-level vhost driver
internals. Can you explain why this is needed?
Post by x***@intel.com
+ spin_lock_irqsave(&ctor->read_lock, flag);
+ list_add_tail(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+ if (!vq->receiver) {
+ vq->receiver = mp_recvmsg_notify;
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ vq->num * 4096,
+ vq->num * 4096);
+ }
+
+ return 0;
+}
Not sure what I'm missing, but who calls the vq->receiver? This seems
to be neither in the upstream version of vhost nor introduced by your
patch.
Post by x***@intel.com
+static void __mp_detach(struct mp_struct *mp)
+{
+ mp->mfile = NULL;
+
+ mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+ page_ctor_detach(mp);
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+ /* Drop the extra count on the net device */
+ dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+ mutex_lock(&mp_mutex);
+ __mp_detach(mp);
+ mutex_unlock(&mp_mutex);
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+ if (atomic_dec_and_test(&mfile->count))
+ mp_detach(mfile->mp);
+}
+
+static int mp_release(struct socket *sock)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct mp_file *mfile = mp->mfile;
+
+ mp_put(mfile);
+ sock_put(mp->socket.sk);
+ put_net(mfile->net);
+
+ return 0;
+}
Doesn't this prevent the underlying interface from going away while the chardev
is open? You also have logic to handle that case, so why do you keep the extra
reference on the netdev?
Post by x***@intel.com
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+ .sendmsg = mp_sendmsg,
+ .recvmsg = mp_recvmsg,
+ .release = mp_release,
+};
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+ struct mp_file *mfile;
+ cycle_kernel_lock();
I don't think you really want to use the BKL here, just kill that line.
Post by x***@intel.com
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+ struct net_device *dev;
+ void __user* argp = (void __user *)arg;
+ struct ifreq ifr;
+ struct sock *sk;
+ int ret;
+
+ ret = -EINVAL;
+
+ switch (cmd) {
+ ret = -EFAULT;
+ if (copy_from_user(&ifr, argp, sizeof ifr))
+ break;
This is broken for 32 bit compat mode ioctls, because struct ifreq
is different between 32 and 64 bit systems. Since you are only
using the device name anyway, a fixed length string or just the
interface index would be simpler and work better.
Post by x***@intel.com
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+ ret = -EBUSY;
+
+ if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
+ break;
Your current use of the IFF_MPASSTHRU* flags does not seem to make
any sense whatsoever. You check that this flag is never set, but set
it later yourself and then ignore all flags.
Post by x***@intel.com
+ ret = -ENODEV;
+ dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+ if (!dev)
+ break;
There is no permission checking on who can access what device, which
seems a bit simplistic. Any user that has access to the mpassthru device
seems to be able to bind to any network interface in the namespace.
This is one point where the macvtap model seems more appropriate, it
separates the permissions for creating logical interfaces and using them.
Post by x***@intel.com
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct mp_struct *mp = mp_get(file->private_data);
+ struct sock *sk = mp->socket.sk;
+ struct sk_buff *skb;
+ int len, err;
+ ssize_t result;
Can you explain what this function is even there for? AFAICT, vhost-net
doesn't call it, the interface is incompatible with the existing
tap interface, and you don't provide a read function.
Post by x***@intel.com
diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 0000000..2be21c5
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,29 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int)
These definitions are slightly wrong, because you pass more than just an 'int'.
Post by x***@intel.com
+/* MPASSTHRU ifc flags */
+#define IFF_MPASSTHRU 0x0001
+#define IFF_MPASSTHRU_EXCL 0x0002
As mentioned above, these flags don't make any sense with your current code.

Arnd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin
2010-04-14 15:26:15 UTC
Permalink
Post by Arnd Bergmann
Post by x***@intel.com
Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.
Sorry for taking so long before finding the time to look
at your code in more detail.
It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?
Hmm, I have not noticed a lot of duplication.
BTW macvtap also duplicates tun code, it might be
a good idea for tun to export some functionality.
Post by Arnd Bergmann
I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?
If macvtap would get zero copy tx and rx, maybe not. But
it's not immediately obvious whether zero-copy support
for macvtap might work, though, especially for zero copy rx.
The approach with mpassthru is much simpler in that
it takes complete control of the device.
Post by Arnd Bergmann
Post by x***@intel.com
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..86d2525
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1264 @@
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
This should probably just use the existing dev_dbg/pr_debug infrastructure.
Post by x***@intel.com
[... skipping buffer management code for now]
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len)
+{
[...]
This function looks like we should be able to easily include it into
macvtap and get zero-copy transmits without introducing the new
user-level interface.
Post by x***@intel.com
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len,
+ int flags)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
It smells like a layering violation to look at the iocb->private field
from a lower-level driver. I would have hoped that it's possible to implement
this without having this driver know about the higher-level vhost driver
internals.
I agree.
Post by Arnd Bergmann
Can you explain why this is needed?
Post by x***@intel.com
+ spin_lock_irqsave(&ctor->read_lock, flag);
+ list_add_tail(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+ if (!vq->receiver) {
+ vq->receiver = mp_recvmsg_notify;
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ vq->num * 4096,
+ vq->num * 4096);
+ }
+
+ return 0;
+}
Not sure what I'm missing, but who calls the vq->receiver? This seems
to be neither in the upstream version of vhost nor introduced by your
patch.
Post by x***@intel.com
+static void __mp_detach(struct mp_struct *mp)
+{
+ mp->mfile = NULL;
+
+ mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+ page_ctor_detach(mp);
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+ /* Drop the extra count on the net device */
+ dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+ mutex_lock(&mp_mutex);
+ __mp_detach(mp);
+ mutex_unlock(&mp_mutex);
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+ if (atomic_dec_and_test(&mfile->count))
+ mp_detach(mfile->mp);
+}
+
+static int mp_release(struct socket *sock)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct mp_file *mfile = mp->mfile;
+
+ mp_put(mfile);
+ sock_put(mp->socket.sk);
+ put_net(mfile->net);
+
+ return 0;
+}
Doesn't this prevent the underlying interface from going away while the chardev
is open? You also have logic to handle that case, so why do you keep the extra
reference on the netdev?
Post by x***@intel.com
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+ .sendmsg = mp_sendmsg,
+ .recvmsg = mp_recvmsg,
+ .release = mp_release,
+};
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+ struct mp_file *mfile;
+ cycle_kernel_lock();
I don't think you really want to use the BKL here, just kill that line.
Post by x***@intel.com
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+ struct net_device *dev;
+ void __user* argp = (void __user *)arg;
+ struct ifreq ifr;
+ struct sock *sk;
+ int ret;
+
+ ret = -EINVAL;
+
+ switch (cmd) {
+ ret = -EFAULT;
+ if (copy_from_user(&ifr, argp, sizeof ifr))
+ break;
This is broken for 32 bit compat mode ioctls, because struct ifreq
is different between 32 and 64 bit systems. Since you are only
using the device name anyway, a fixed length string or just the
interface index would be simpler and work better.
Post by x***@intel.com
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+ ret = -EBUSY;
+
+ if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
+ break;
Your current use of the IFF_MPASSTHRU* flags does not seem to make
any sense whatsoever. You check that this flag is never set, but set
it later yourself and then ignore all flags.
Post by x***@intel.com
+ ret = -ENODEV;
+ dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+ if (!dev)
+ break;
There is no permission checking on who can access what device, which
seems a bit simplistic. Any user that has access to the mpassthru device
seems to be able to bind to any network interface in the namespace.
This is one point where the macvtap model seems more appropriate, it
separates the permissions for creating logical interfaces and using them.
Post by x***@intel.com
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct mp_struct *mp = mp_get(file->private_data);
+ struct sock *sk = mp->socket.sk;
+ struct sk_buff *skb;
+ int len, err;
+ ssize_t result;
Can you explain what this function is even there for? AFAICT, vhost-net
doesn't call it, the interface is incompatible with the existing
tap interface, and you don't provide a read function.
qemu needs the ability to inject raw packets into device
from userspace, bypassing vhost/virtio (for live migration).
Post by Arnd Bergmann
Post by x***@intel.com
diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 0000000..2be21c5
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,29 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int)
These definitions are slightly wrong, because you pass more than just an 'int'.
Post by x***@intel.com
+/* MPASSTHRU ifc flags */
+#define IFF_MPASSTHRU 0x0001
+#define IFF_MPASSTHRU_EXCL 0x0002
As mentioned above, these flags don't make any sense with your current code.
Arnd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Arnd Bergmann
2010-04-14 15:57:54 UTC
Permalink
Post by Michael S. Tsirkin
Post by Arnd Bergmann
It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?
Hmm, I have not noticed a lot of duplication.
The code is indeed quite distinct, but the idea of adding another
character device to pass into vhost for direct device access is.
Post by Michael S. Tsirkin
BTW macvtap also duplicates tun code, it might be
a good idea for tun to export some functionality.
Yes, that's something I plan to look into.
Post by Michael S. Tsirkin
Post by Arnd Bergmann
I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?
If macvtap would get zero copy tx and rx, maybe not. But
it's not immediately obvious whether zero-copy support
for macvtap might work, though, especially for zero copy rx.
The approach with mpassthru is much simpler in that
it takes complete control of the device.
As far as I can tell, the most significant limitation of mpassthru
is that there can only ever be a single guest on a physical NIC.

Given that limitation, I believe we can do the same on macvtap,
and simply disable zero-copy RX when you want to use more than one
guest, or both guest and host on the same NIC.

The logical next step here would be to allow VMDq and similar
technologies to separate out the RX traffic in the hardware.
We don't have a configuration interface for that yet, but
since this is logically the same as macvlan, I think we should
use the same interfaces for both, essentially treating VMDq
as a hardware acceleration for macvlan. We can probably handle
it in similar ways to how we handle hardware support for vlan.

At that stage, macvtap would be the logical interface for
connecting a VMDq (hardware macvlan) device to a guest!
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by x***@intel.com
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct mp_struct *mp = mp_get(file->private_data);
+ struct sock *sk = mp->socket.sk;
+ struct sk_buff *skb;
+ int len, err;
+ ssize_t result;
Can you explain what this function is even there for? AFAICT, vhost-net
doesn't call it, the interface is incompatible with the existing
tap interface, and you don't provide a read function.
qemu needs the ability to inject raw packets into device
from userspace, bypassing vhost/virtio (for live migration).
Ok, but since there is only a write callback and no read, it won't
actually be able to do this with the current code, right?

Moreover, it seems weird to have a new type of interface here that
duplicates tap/macvtap with less functionality. Coming back
to your original comment, this means that while mpassthru is currently
not duplicating the actual code from macvtap, it would need to do
exactly that to get the qemu interface right!

Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin
2010-04-14 16:16:10 UTC
Permalink
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?
Hmm, I have not noticed a lot of duplication.
The code is indeed quite distinct, but the idea of adding another
character device to pass into vhost for direct device access is.
All backends besides tap seem to do this, btw :)
Post by Arnd Bergmann
Post by Michael S. Tsirkin
BTW macvtap also duplicates tun code, it might be
a good idea for tun to export some functionality.
Yes, that's something I plan to look into.
Post by Michael S. Tsirkin
Post by Arnd Bergmann
I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?
If macvtap would get zero copy tx and rx, maybe not. But
it's not immediately obvious whether zero-copy support
for macvtap might work, though, especially for zero copy rx.
The approach with mpassthru is much simpler in that
it takes complete control of the device.
As far as I can tell, the most significant limitation of mpassthru
is that there can only ever be a single guest on a physical NIC.
Given that limitation, I believe we can do the same on macvtap,
and simply disable zero-copy RX when you want to use more than one
guest, or both guest and host on the same NIC.
The logical next step here would be to allow VMDq and similar
technologies to separate out the RX traffic in the hardware.
We don't have a configuration interface for that yet, but
since this is logically the same as macvlan, I think we should
use the same interfaces for both, essentially treating VMDq
as a hardware acceleration for macvlan. We can probably handle
it in similar ways to how we handle hardware support for vlan.
At that stage, macvtap would be the logical interface for
connecting a VMDq (hardware macvlan) device to a guest!
I won't object to that but ... code walks.
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by x***@intel.com
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct mp_struct *mp = mp_get(file->private_data);
+ struct sock *sk = mp->socket.sk;
+ struct sk_buff *skb;
+ int len, err;
+ ssize_t result;
Can you explain what this function is even there for? AFAICT, vhost-net
doesn't call it, the interface is incompatible with the existing
tap interface, and you don't provide a read function.
qemu needs the ability to inject raw packets into device
from userspace, bypassing vhost/virtio (for live migration).
Ok, but since there is only a write callback and no read, it won't
actually be able to do this with the current code, right?
I think it'll work as is, with vhost qemu only ever writes,
never reads from device. We'll also never need GSO etc
which is a large part of what tap does (and macvtap will
have to do).
Post by Arnd Bergmann
Moreover, it seems weird to have a new type of interface here that
duplicates tap/macvtap with less functionality. Coming back
to your original comment, this means that while mpassthru is currently
not duplicating the actual code from macvtap, it would need to do
exactly that to get the qemu interface right!
Arnd
I don't think so, see above. anyway, both can reuse tun.c :)
--
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Arnd Bergmann
2010-04-14 16:35:57 UTC
Permalink
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by Michael S. Tsirkin
qemu needs the ability to inject raw packets into device
from userspace, bypassing vhost/virtio (for live migration).
Ok, but since there is only a write callback and no read, it won't
actually be able to do this with the current code, right?
I think it'll work as is, with vhost qemu only ever writes,
never reads from device. We'll also never need GSO etc
which is a large part of what tap does (and macvtap will
have to do).
Ah, I see. I didn't realize that qemu needs to write to the
device even if vhost is used. But for the case of migration to
another machine without vhost, wouldn't qemu also need to read?
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Moreover, it seems weird to have a new type of interface here that
duplicates tap/macvtap with less functionality. Coming back
to your original comment, this means that while mpassthru is currently
not duplicating the actual code from macvtap, it would need to do
exactly that to get the qemu interface right!
I don't think so, see above. anyway, both can reuse tun.c :)
There is one significant difference between macvtap/mpassthru and
tun/tap in that the directions are reversed. While macvtap and
mpassthru forward data from write into dev_queue_xmit and from
skb_receive into read, tun/tap forwards data from write into
skb_receive and from start_xmit into read.

Also, I'm not really objecting to duplicating code between
macvtap and mpassthru, as the implementation can always be merged.
My main objection is instead to having two different _user_interfaces_
for doing the same thing.

Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin
2010-04-14 20:31:42 UTC
Permalink
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by Michael S. Tsirkin
qemu needs the ability to inject raw packets into device
from userspace, bypassing vhost/virtio (for live migration).
Ok, but since there is only a write callback and no read, it won't
actually be able to do this with the current code, right?
I think it'll work as is, with vhost qemu only ever writes,
never reads from device. We'll also never need GSO etc
which is a large part of what tap does (and macvtap will
have to do).
Ah, I see. I didn't realize that qemu needs to write to the
device even if vhost is used. But for the case of migration to
another machine without vhost, wouldn't qemu also need to read?
Not that I know. Why?
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Moreover, it seems weird to have a new type of interface here that
duplicates tap/macvtap with less functionality. Coming back
to your original comment, this means that while mpassthru is currently
not duplicating the actual code from macvtap, it would need to do
exactly that to get the qemu interface right!
I don't think so, see above. anyway, both can reuse tun.c :)
There is one significant difference between macvtap/mpassthru and
tun/tap in that the directions are reversed. While macvtap and
mpassthru forward data from write into dev_queue_xmit and from
skb_receive into read, tun/tap forwards data from write into
skb_receive and from start_xmit into read.
Also, I'm not really objecting to duplicating code between
macvtap and mpassthru, as the implementation can always be merged.
My main objection is instead to having two different _user_interfaces_
for doing the same thing.
Arnd
They *could* do the same thing :)
--
MST
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Arnd Bergmann
2010-04-14 20:39:49 UTC
Permalink
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by Michael S. Tsirkin
qemu needs the ability to inject raw packets into device
from userspace, bypassing vhost/virtio (for live migration).
Ok, but since there is only a write callback and no read, it won't
actually be able to do this with the current code, right?
I think it'll work as is, with vhost qemu only ever writes,
never reads from device. We'll also never need GSO etc
which is a large part of what tap does (and macvtap will
have to do).
Ah, I see. I didn't realize that qemu needs to write to the
device even if vhost is used. But for the case of migration to
another machine without vhost, wouldn't qemu also need to read?
Not that I know. Why?
Well, if the guest not only wants to send data but also receive
frames coming from other machines, they need to get from the kernel
into qemu, and the only way I can see for doing that is to read
from this device if there is no vhost support around on the new
machine.

Maybe we're talking about different things here.

Arnd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin
2010-04-14 20:40:03 UTC
Permalink
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by Michael S. Tsirkin
qemu needs the ability to inject raw packets into device
from userspace, bypassing vhost/virtio (for live migration).
Ok, but since there is only a write callback and no read, it won't
actually be able to do this with the current code, right?
I think it'll work as is, with vhost qemu only ever writes,
never reads from device. We'll also never need GSO etc
which is a large part of what tap does (and macvtap will
have to do).
Ah, I see. I didn't realize that qemu needs to write to the
device even if vhost is used. But for the case of migration to
another machine without vhost, wouldn't qemu also need to read?
Not that I know. Why?
Well, if the guest not only wants to send data but also receive
frames coming from other machines, they need to get from the kernel
into qemu, and the only way I can see for doing that is to read
from this device if there is no vhost support around on the new
machine.
Maybe we're talking about different things here.
Arnd
mpassthrough is currently useless without vhost.
If the new machine has no vhost, it can't use mpassthrough :)
--
MST
Arnd Bergmann
2010-04-14 20:52:43 UTC
Permalink
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Well, if the guest not only wants to send data but also receive
frames coming from other machines, they need to get from the kernel
into qemu, and the only way I can see for doing that is to read
from this device if there is no vhost support around on the new
machine.
Maybe we're talking about different things here.
mpassthrough is currently useless without vhost.
If the new machine has no vhost, it can't use mpassthrough :)
Ok. Is that a planned feature though? vhost is currently limited
to guests with a virtio-net driver and even if you extend it to other
guest emulations, it will probably always be a subset of the qemu
supported drivers, but it may be useful to support zero-copy on other
drivers as well.

Arnd
Xin, Xiaohui
2010-04-15 09:01:10 UTC
Permalink
Arnd,
Post by Arnd Bergmann
Post by x***@intel.com
Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.
Sorry for taking so long before finding the time to look
at your code in more detail.
It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?
I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?
I have not looked into your macvtap code in detail before.
Does the two interface exactly the same? We just want to create a simple
way to do zero-copy. Now it can only support vhost, but in future
we also want it to support directly read/write operations from user space too.

Basically, compared to the interface, I'm more worried about the modification
to net core we have made to implement zero-copy now. If this hardest part
can be done, then any user space interface modifications or integrations are
more easily to be done after that.
Post by Arnd Bergmann
Post by x***@intel.com
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..86d2525
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1264 @@
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
This should probably just use the existing dev_dbg/pr_debug infrastructure.
Thanks. Will try that.
Post by Arnd Bergmann
[... skipping buffer management code for now]
Post by x***@intel.com
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len)
+{
[...]
This function looks like we should be able to easily include it into
macvtap and get zero-copy transmits without introducing the new
user-level interface.
Post by x***@intel.com
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len,
+ int flags)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
It smells like a layering violation to look at the iocb->private field
from a lower-level driver. I would have hoped that it's possible to implement
this without having this driver know about the higher-level vhost driver
internals. Can you explain why this is needed?
I don't like this too, but since the kiocb is maintained by vhost with a list_head.
And mp device is responsible to collect the kiocb into the list_head,
We need something known by vhost/mp both.
Post by Arnd Bergmann
Post by x***@intel.com
+ spin_lock_irqsave(&ctor->read_lock, flag);
+ list_add_tail(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+ if (!vq->receiver) {
+ vq->receiver = mp_recvmsg_notify;
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ vq->num * 4096,
+ vq->num * 4096);
+ }
+
+ return 0;
+}
Not sure what I'm missing, but who calls the vq->receiver? This seems
to be neither in the upstream version of vhost nor introduced by your
patch.
See Patch v3 2/3 I have sent out, it is called by handle_rx() in vhost.
Post by Arnd Bergmann
Post by x***@intel.com
+static void __mp_detach(struct mp_struct *mp)
+{
+ mp->mfile = NULL;
+
+ mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+ page_ctor_detach(mp);
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+ /* Drop the extra count on the net device */
+ dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+ mutex_lock(&mp_mutex);
+ __mp_detach(mp);
+ mutex_unlock(&mp_mutex);
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+ if (atomic_dec_and_test(&mfile->count))
+ mp_detach(mfile->mp);
+}
+
+static int mp_release(struct socket *sock)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct mp_file *mfile = mp->mfile;
+
+ mp_put(mfile);
+ sock_put(mp->socket.sk);
+ put_net(mfile->net);
+
+ return 0;
+}
Doesn't this prevent the underlying interface from going away while the chardev
is open? You also have logic to handle that case, so why do you keep the extra
reference on the netdev?
Let me think.
Post by Arnd Bergmann
Post by x***@intel.com
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+ .sendmsg = mp_sendmsg,
+ .recvmsg = mp_recvmsg,
+ .release = mp_release,
+};
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+ struct mp_file *mfile;
+ cycle_kernel_lock();
I don't think you really want to use the BKL here, just kill that line.
Post by x***@intel.com
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+ struct net_device *dev;
+ void __user* argp = (void __user *)arg;
+ struct ifreq ifr;
+ struct sock *sk;
+ int ret;
+
+ ret = -EINVAL;
+
+ switch (cmd) {
+ ret = -EFAULT;
+ if (copy_from_user(&ifr, argp, sizeof ifr))
+ break;
This is broken for 32 bit compat mode ioctls, because struct ifreq
is different between 32 and 64 bit systems. Since you are only
using the device name anyway, a fixed length string or just the
interface index would be simpler and work better.
Thanks, will look into this.
Post by Arnd Bergmann
Post by x***@intel.com
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+ ret = -EBUSY;
+
+ if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
+ break;
Your current use of the IFF_MPASSTHRU* flags does not seem to make
any sense whatsoever. You check that this flag is never set, but set
it later yourself and then ignore all flags.
Using that flag is tried to prevent if another one wants to bind the same device
Again. But I will see if it really ignore all other flags.
Post by Arnd Bergmann
Post by x***@intel.com
+ ret = -ENODEV;
+ dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+ if (!dev)
+ break;
There is no permission checking on who can access what device, which
seems a bit simplistic. Any user that has access to the mpassthru device
seems to be able to bind to any network interface in the namespace.
This is one point where the macvtap model seems more appropriate, it
separates the permissions for creating logical interfaces and using them.
Yes, that's a problem I have not addressed yet.
Post by Arnd Bergmann
Post by x***@intel.com
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct mp_struct *mp = mp_get(file->private_data);
+ struct sock *sk = mp->socket.sk;
+ struct sk_buff *skb;
+ int len, err;
+ ssize_t result;
Can you explain what this function is even there for? AFAICT, vhost-net
doesn't call it, the interface is incompatible with the existing
tap interface, and you don't provide a read function.
I saw Michael have given the answer already.
Post by Arnd Bergmann
Post by x***@intel.com
diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 0000000..2be21c5
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,29 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int)
These definitions are slightly wrong, because you pass more than just an 'int'.
Thanks. I wrote them too quickly. :-(
Post by Arnd Bergmann
Post by x***@intel.com
+/* MPASSTHRU ifc flags */
+#define IFF_MPASSTHRU 0x0001
+#define IFF_MPASSTHRU_EXCL 0x0002
As mentioned above, these flags don't make any sense with your current code.
I used them try to prevent the one who want to bind the same device again.
Arnd
Michael S. Tsirkin
2010-04-15 09:03:24 UTC
Permalink
Post by Xin, Xiaohui
Post by Arnd Bergmann
It smells like a layering violation to look at the iocb->private field
from a lower-level driver. I would have hoped that it's possible to implement
this without having this driver know about the higher-level vhost driver
internals. Can you explain why this is needed?
I don't like this too, but since the kiocb is maintained by vhost with a list_head.
And mp device is responsible to collect the kiocb into the list_head,
We need something known by vhost/mp both.
Can't vhost supply a kiocb completion callback that will handle the list?
--
MST
x***@intel.com
2010-04-22 08:24:18 UTC
Permalink
From: Xin Xiaohui <***@intel.com>

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui <***@intel.com>
Signed-off-by: Zhao Yu <***@gmail.com>
Reviewed-by: Jeff Dike <***@linux.intel.com>
---

Michael,
Thanks. I have updated the patch with your suggestion.
It looks much clean now. Please have a review.

Thanks
Xiaohui

drivers/vhost/Kconfig | 10 +
drivers/vhost/Makefile | 2 +
drivers/vhost/mpassthru.c | 1239 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/mpassthru.h | 29 +
4 files changed, 1280 insertions(+), 0 deletions(-)
create mode 100644 drivers/vhost/mpassthru.c
create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..91806b1 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
To compile this driver as a module, choose M here: the module will
be called vhost_net.

+config MEDIATE_PASSTHRU
+ tristate "mediate passthru network driver (EXPERIMENTAL)"
+ depends on VHOST_NET
+ ---help---
+ zerocopy network I/O support, we call it as mediate passthru to
+ be distiguish with hardare passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
obj-$(CONFIG_VHOST_NET) += vhost_net.o
vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..cc99b14
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1239 @@
+/*
+ * MPASSTHRU - Mediate passthrough device.
+ * Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME "mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/aio.h>
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/crc32.h>
+#include <linux/nsproxy.h>
+#include <linux/uaccess.h>
+#include <linux/virtio_net.h>
+#include <linux/mpassthru.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+ u16 offset;
+ u16 size;
+};
+
+struct page_ctor {
+ struct list_head readq;
+ int w_len;
+ int r_len;
+ spinlock_t read_lock;
+ struct kmem_cache *cache;
+ /* record the locked pages */
+ int lock_pages;
+ struct rlimit o_rlim;
+ struct net_device *dev;
+ struct mpassthru_port port;
+};
+
+struct page_info {
+ struct list_head list;
+ int header;
+ /* indicate the actual length of bytes
+ * send/recv in the user space buffers
+ */
+ int total;
+ int offset;
+ struct page *pages[MAX_SKB_FRAGS+1];
+ struct skb_frag_struct frag[MAX_SKB_FRAGS+1];
+ struct sk_buff *skb;
+ struct page_ctor *ctor;
+
+ /* The pointer relayed to skb, to indicate
+ * it's a user space allocated skb or kernel
+ */
+ struct skb_user_page user;
+ struct skb_shared_info ushinfo;
+
+#define INFO_READ 0
+#define INFO_WRITE 1
+ unsigned flags;
+ unsigned pnum;
+
+ /* It's meaningful for receive, means
+ * the max length allowed
+ */
+ size_t len;
+
+ /* The fields after that is for backend
+ * driver, now for vhost-net.
+ */
+
+ struct kiocb *iocb;
+ unsigned int desc_pos;
+ unsigned int log;
+ struct iovec hdr[MAX_SKB_FRAGS + 2];
+ struct iovec iov[MAX_SKB_FRAGS + 2];
+};
+
+struct mp_struct {
+ struct mp_file *mfile;
+ struct net_device *dev;
+ struct page_ctor *ctor;
+ struct socket socket;
+
+#ifdef MPASSTHRU_DEBUG
+ int debug;
+#endif
+};
+
+struct mp_file {
+ atomic_t count;
+ struct mp_struct *mp;
+ struct net *net;
+};
+
+struct mp_sock {
+ struct sock sk;
+ struct mp_struct *mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+ int ret = 0;
+
+ rtnl_lock();
+ ret = dev_change_flags(dev, flags);
+ rtnl_unlock();
+
+ if (ret < 0)
+ printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+ return ret;
+}
+
+/* The main function to allocate user space buffers */
+static struct skb_user_page *page_ctor(struct mpassthru_port *port,
+ struct sk_buff *skb, int npages)
+{
+ int i;
+ unsigned long flags;
+ struct page_ctor *ctor;
+ struct page_info *info = NULL;
+
+ ctor = container_of(port, struct page_ctor, port);
+
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq, struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ if (!info)
+ return NULL;
+
+ for (i = 0; i < info->pnum; i++) {
+ get_page(info->pages[i]);
+ info->frag[i].page = info->pages[i];
+ info->frag[i].page_offset = i ? 0 : info->offset;
+ info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+ port->data_len;
+ }
+ info->skb = skb;
+ info->user.frags = info->frag;
+ info->user.ushinfo = &info->ushinfo;
+ return &info->user;
+}
+
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+ struct page_info *info = (struct page_info *)(iocb->private);
+ int i;
+
+ if (info->flags == INFO_READ) {
+ for (i = 0; i < info->pnum; i++) {
+ if (info->pages[i]) {
+ set_page_dirty_lock(info->pages[i]);
+ put_page(info->pages[i]);
+ }
+ }
+ skb_shinfo(info->skb)->destructor_arg = &info->user;
+ info->skb->destructor = NULL;
+ kfree_skb(info->skb);
+ }
+ /* Decrement the number of locked pages */
+ info->ctor->lock_pages -= info->pnum;
+ kmem_cache_free(info->ctor->cache, info);
+
+ return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+ struct kiocb *iocb = NULL;
+
+ iocb = info->iocb;
+ if (!iocb)
+ return iocb;
+ iocb->ki_flags = 0;
+ iocb->ki_users = 1;
+ iocb->ki_key = 0;
+ iocb->ki_ctx = NULL;
+ iocb->ki_cancel = NULL;
+ iocb->ki_retry = NULL;
+ iocb->ki_iovec = NULL;
+ iocb->ki_eventfd = NULL;
+ iocb->ki_pos = info->desc_pos;
+ iocb->ki_nbytes = size;
+ iocb->ki_user_data = info->log;
+ iocb->ki_dtor(iocb);
+ iocb->private = (void *)info;
+ iocb->ki_dtor = mp_ki_dtor;
+
+ return iocb;
+}
+
+/* The callback to destruct the user space buffers or skb */
+static void page_dtor(struct skb_user_page *user)
+{
+ struct page_info *info;
+ struct page_ctor *ctor;
+ struct sock *sk;
+ struct sk_buff *skb;
+ struct kiocb *iocb = NULL;
+ unsigned long flags;
+ int i;
+
+ if (!user)
+ return;
+ info = container_of(user, struct page_info, user);
+ if (!info)
+ return;
+ ctor = info->ctor;
+ skb = info->skb;
+
+ if ((info->flags == INFO_READ) && info->skb)
+ info->skb->head = NULL;
+
+ /* If the info->total is 0, make it to be reused */
+ if (!info->total) {
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ list_add(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ return;
+ }
+
+ if (info->flags == INFO_READ)
+ return;
+
+ /* For transmit, we should wait for the DMA finish by hardware.
+ * Queue the notifier to wake up the backend driver
+ */
+
+ iocb = create_iocb(info, info->total);
+
+ sk = ctor->port.sock->sk;
+ sk->sk_write_space(sk);
+
+ return;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+ int rc;
+ struct page_ctor *ctor;
+ struct net_device *dev = mp->dev;
+
+ /* locked by mp_mutex */
+ if (rcu_dereference(mp->ctor))
+ return -EBUSY;
+
+ ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
+ if (!ctor)
+ return -ENOMEM;
+ rc = netdev_mp_port_prep(dev, &ctor->port);
+ if (rc)
+ goto fail;
+
+ ctor->cache = kmem_cache_create("skb_page_info",
+ sizeof(struct page_info), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+
+ if (!ctor->cache)
+ goto cache_fail;
+
+ INIT_LIST_HEAD(&ctor->readq);
+ spin_lock_init(&ctor->read_lock);
+
+ ctor->w_len = 0;
+ ctor->r_len = 0;
+
+ dev_hold(dev);
+ ctor->dev = dev;
+ ctor->port.ctor = page_ctor;
+ ctor->port.sock = &mp->socket;
+ ctor->lock_pages = 0;
+ rc = netdev_mp_port_attach(dev, &ctor->port);
+ if (rc)
+ goto fail;
+
+ /* locked by mp_mutex */
+ rcu_assign_pointer(mp->ctor, ctor);
+
+ /* XXX:Need we do set_offload here ? */
+
+ return 0;
+
+fail:
+ kmem_cache_destroy(ctor->cache);
+cache_fail:
+ kfree(ctor);
+ dev_put(dev);
+
+ return rc;
+}
+
+struct page_info *info_dequeue(struct page_ctor *ctor)
+{
+ unsigned long flags;
+ struct page_info *info = NULL;
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq,
+ struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ return info;
+}
+
+static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
+ unsigned long cur, unsigned long max)
+{
+ struct rlimit new_rlim, *old_rlim;
+ int retval;
+
+ if (resource != RLIMIT_MEMLOCK)
+ return -EINVAL;
+ new_rlim.rlim_cur = cur;
+ new_rlim.rlim_max = max;
+
+ old_rlim = current->signal->rlim + resource;
+
+ /* remember the old rlimit value when backend enabled */
+ ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
+ ctor->o_rlim.rlim_max = old_rlim->rlim_max;
+
+ if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+ !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ retval = security_task_setrlimit(resource, &new_rlim);
+ if (retval)
+ return retval;
+
+ task_lock(current->group_leader);
+ *old_rlim = new_rlim;
+ task_unlock(current->group_leader);
+ return 0;
+}
+
+static int page_ctor_detach(struct mp_struct *mp)
+{
+ struct page_ctor *ctor;
+ struct page_info *info;
+ struct kiocb *iocb = NULL;
+ int i;
+ unsigned long flags;
+
+ /* locked by mp_mutex */
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -ENODEV;
+
+ while ((info = info_dequeue(ctor))) {
+ for (i = 0; i < info->pnum; i++)
+ if (info->pages[i])
+ put_page(info->pages[i]);
+ iocb = create_iocb(info, 0);
+ kmem_cache_free(ctor->cache, info);
+ }
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ ctor->o_rlim.rlim_cur,
+ ctor->o_rlim.rlim_max);
+ kmem_cache_destroy(ctor->cache);
+ netdev_mp_port_detach(ctor->dev);
+ dev_put(ctor->dev);
+
+ /* locked by mp_mutex */
+ rcu_assign_pointer(mp->ctor, NULL);
+ synchronize_rcu();
+
+ kfree(ctor);
+ return 0;
+}
+
+/* For small user space buffers transmit, we don't need to call
+ * get_user_pages().
+ */
+static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
+ struct kiocb *iocb, int total)
+{
+ struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
+
+ if (!info)
+ return NULL;
+ info->total = total;
+ info->user.dtor = page_dtor;
+ info->ctor = ctor;
+ info->flags = INFO_WRITE;
+ info->iocb = iocb;
+ return info;
+}
+
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the user space address.
+ */
+static struct page_info *alloc_page_info(struct page_ctor *ctor,
+ struct kiocb *iocb, struct iovec *iov,
+ int count, struct frag *frags,
+ int npages, int total)
+{
+ int rc;
+ int i, j, n = 0;
+ int len;
+ unsigned long base, lock_limit;
+ struct page_info *info = NULL;
+
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+
+ if (ctor->lock_pages + count > lock_limit) {
+ printk(KERN_INFO "exceed the locked memory rlimit %d!",
+ lock_limit);
+ return NULL;
+ }
+
+ info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
+
+ if (!info)
+ return NULL;
+
+ for (i = j = 0; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base;
+ len = iov[i].iov_len;
+
+ if (!len)
+ continue;
+ n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+ rc = get_user_pages_fast(base, n, npages ? 1 : 0,
+ &info->pages[j]);
+ if (rc != n)
+ goto failed;
+
+ while (n--) {
+ frags[j].offset = base & ~PAGE_MASK;
+ frags[j].size = min_t(int, len,
+ PAGE_SIZE - frags[j].offset);
+ len -= frags[j].size;
+ base += frags[j].size;
+ j++;
+ }
+ }
+
+#ifdef CONFIG_HIGHMEM
+ if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
+ for (i = 0; i < j; i++) {
+ if (PageHighMem(info->pages[i]))
+ goto failed;
+ }
+ }
+#endif
+
+ info->total = total;
+ info->user.dtor = page_dtor;
+ info->ctor = ctor;
+ info->pnum = j;
+ info->iocb = iocb;
+ if (!npages)
+ info->flags = INFO_WRITE;
+ if (info->flags == INFO_READ) {
+ info->user.start = (u8 *)(((unsigned long)
+ (pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
+ frags[0].offset));
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ info->user.size = SKB_DATA_ALIGN(
+ iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD);
+#else
+ info->user.size = SKB_DATA_ALIGN(
+ iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD) -
+ NET_IP_ALIGN - NET_SKB_PAD;
+#endif
+ }
+ /* increment the number of locked pages */
+ ctor->lock_pages += j;
+ return info;
+
+failed:
+ for (i = 0; i < j; i++)
+ put_page(info->pages[i]);
+
+ kmem_cache_free(ctor->cache, info);
+
+ return NULL;
+}
+
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct iovec *iov = m->msg_iov;
+ struct page_info *info = NULL;
+ struct frag frags[MAX_SKB_FRAGS];
+ struct sk_buff *skb;
+ int count = m->msg_iovlen;
+ int total = 0, header, n, i, len, rc;
+ unsigned long base;
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -ENODEV;
+
+ total = iov_length(iov, count);
+
+ if (total < ETH_HLEN)
+ return -EINVAL;
+
+ if (total <= COPY_THRESHOLD)
+ goto copy;
+
+ n = 0;
+ for (i = 0; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base;
+ len = iov[i].iov_len;
+ if (!len)
+ continue;
+ n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (n > MAX_SKB_FRAGS)
+ return -EINVAL;
+ }
+
+copy:
+ header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
+
+ skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC);
+ if (!skb)
+ goto drop;
+
+ skb_reserve(skb, NET_IP_ALIGN);
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ memcpy_fromiovec(skb->data, iov, header);
+ skb_put(skb, header);
+ skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
+
+ if (header == total) {
+ rc = total;
+ info = alloc_small_page_info(ctor, iocb, total);
+ } else {
+ info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total);
+ if (info)
+ for (i = 0; info->pages[i]; i++) {
+ skb_add_rx_frag(skb, i, info->pages[i],
+ frags[i].offset, frags[i].size);
+ info->pages[i] = NULL;
+ }
+ }
+ if (info != NULL) {
+ info->desc_pos = iocb->ki_pos;
+ info->total = total;
+ info->skb = skb;
+ skb_shinfo(skb)->destructor_arg = &info->user;
+ skb->dev = mp->dev;
+ dev_queue_xmit(skb);
+ return 0;
+ }
+drop:
+ kfree_skb(skb);
+ if (info) {
+ for (i = 0; info->pages[i]; i++)
+ put_page(info->pages[i]);
+ kmem_cache_free(info->ctor->cache, info);
+ }
+ mp->dev->stats.tx_dropped++;
+ return -ENOMEM;
+}
+
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len,
+ int flags)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct iovec *iov = m->msg_iov;
+ int count = m->msg_iovlen;
+ int npages, payload;
+ struct page_info *info;
+ struct frag frags[MAX_SKB_FRAGS];
+ unsigned long base;
+ int i, len;
+ unsigned long flag;
+
+ if (!(flags & MSG_DONTWAIT))
+ return -EINVAL;
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -EINVAL;
+
+ /* Error detections in case invalid user space buffer */
+ if (count > 2 && iov[1].iov_len < ctor->port.hdr_len &&
+ mp->dev->features & NETIF_F_SG) {
+ return -EINVAL;
+ }
+
+ npages = ctor->port.npages;
+ payload = ctor->port.data_len;
+
+ /* If KVM guest virtio-net FE driver use SG feature */
+ if (count > 2) {
+ for (i = 2; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
+ len = iov[i].iov_len;
+ if (npages == 1)
+ len = min_t(int, len, PAGE_SIZE - base);
+ else if (base)
+ break;
+ payload -= len;
+ if (payload <= 0)
+ goto proceed;
+ if (npages == 1 || (len & ~PAGE_MASK))
+ break;
+ }
+ }
+
+ if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
+ - NET_SKB_PAD - NET_IP_ALIGN) >= 0)
+ goto proceed;
+
+ return -EINVAL;
+
+proceed:
+ /* skip the virtnet head */
+ iov++;
+ count--;
+
+ if (!ctor->lock_pages)
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ (((1UL << 32) -1) & iocb->ki_user_data) * 4096,
+ (((1UL << 32) -1) & iocb->ki_user_data) * 4096);
+
+ /* Translate address to kernel */
+ info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
+ if (!info)
+ return -ENOMEM;
+ info->len = total_len;
+ info->hdr[0].iov_base = iocb->ki_iovec[0].iov_base;
+ info->hdr[0].iov_len = iocb->ki_iovec[0].iov_len;
+ info->offset = frags[0].offset;
+ info->desc_pos = iocb->ki_pos;
+ info->log = iocb->ki_user_data;
+
+ iov--;
+ count++;
+
+ memcpy(info->iov, iov, sizeof(struct iovec) * count);
+
+ spin_lock_irqsave(&ctor->read_lock, flag);
+ list_add_tail(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+ return 0;
+}
+
+static void __mp_detach(struct mp_struct *mp)
+{
+ mp->mfile = NULL;
+
+ mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+ page_ctor_detach(mp);
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+ /* Drop the extra count on the net device */
+ dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+ mutex_lock(&mp_mutex);
+ __mp_detach(mp);
+ mutex_unlock(&mp_mutex);
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+ if (atomic_dec_and_test(&mfile->count))
+ mp_detach(mfile->mp);
+}
+
+static int mp_release(struct socket *sock)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct mp_file *mfile = mp->mfile;
+
+ mp_put(mfile);
+ sock_put(mp->socket.sk);
+ put_net(mfile->net);
+
+ return 0;
+}
+
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+ .sendmsg = mp_sendmsg,
+ .recvmsg = mp_recvmsg,
+ .release = mp_release,
+};
+
+static struct proto mp_proto = {
+ .name = "mp",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct mp_sock),
+};
+
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+ struct mp_file *mfile;
+ cycle_kernel_lock();
+ DBG1(KERN_INFO "mp: mp_chr_open\n");
+
+ mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
+ if (!mfile)
+ return -ENOMEM;
+ atomic_set(&mfile->count, 0);
+ mfile->mp = NULL;
+ mfile->net = get_net(current->nsproxy->net_ns);
+ file->private_data = mfile;
+ return 0;
+}
+
+
+static struct mp_struct *mp_get(struct mp_file *mfile)
+{
+ struct mp_struct *mp = NULL;
+ if (atomic_inc_not_zero(&mfile->count))
+ mp = mfile->mp;
+
+ return mp;
+}
+
+
+static int mp_attach(struct mp_struct *mp, struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+ int err;
+
+ netif_tx_lock_bh(mp->dev);
+
+ err = -EINVAL;
+
+ if (mfile->mp)
+ goto out;
+
+ err = -EBUSY;
+ if (mp->mfile)
+ goto out;
+
+ err = 0;
+ mfile->mp = mp;
+ mp->mfile = mfile;
+ mp->socket.file = file;
+ dev_hold(mp->dev);
+ sock_hold(mp->socket.sk);
+ atomic_inc(&mfile->count);
+
+out:
+ netif_tx_unlock_bh(mp->dev);
+ return err;
+}
+
+static void mp_sock_destruct(struct sock *sk)
+{
+ struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+ kfree(mp);
+}
+
+static int do_unbind(struct mp_file *mfile)
+{
+ struct mp_struct *mp = mp_get(mfile);
+
+ if (!mp)
+ return -EINVAL;
+
+ mp_detach(mp);
+ sock_put(mp->socket.sk);
+ mp_put(mfile);
+ return 0;
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+ if (sk_has_sleeper(sk))
+ wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+ struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor = NULL;
+ struct sk_buff *skb = NULL;
+ struct page_info *info = NULL;
+ struct ethhdr *eth;
+ struct kiocb *iocb = NULL;
+ int len, i;
+ unsigned long flags;
+
+ struct virtio_net_hdr hdr = {
+ .flags = 0,
+ .gso_type = VIRTIO_NET_HDR_GSO_NONE
+ };
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (skb_shinfo(skb)->destructor_arg) {
+ info = container_of(skb_shinfo(skb)->destructor_arg,
+ struct page_info, user);
+ info->skb = skb;
+ if (skb->len > info->len) {
+ mp->dev->stats.rx_dropped++;
+ DBG(KERN_INFO "Discarded truncated rx packet: "
+ " len %d > %zd\n", skb->len, info->len);
+ info->total = skb->len;
+ goto clean;
+ } else {
+ int i;
+ struct skb_shared_info *gshinfo =
+ (struct skb_shared_info *)(&info->ushinfo);
+ struct skb_shared_info *hshinfo =
+ skb_shinfo(skb);
+
+ if (gshinfo->nr_frags < hshinfo->nr_frags)
+ goto clean;
+ eth = eth_hdr(skb);
+ skb_push(skb, ETH_HLEN);
+
+ hdr.hdr_len = skb_headlen(skb);
+ info->total = skb->len;
+
+ for (i = 0; i < gshinfo->nr_frags; i++)
+ gshinfo->frags[i].size = 0;
+ for (i = 0; i < hshinfo->nr_frags; i++)
+ gshinfo->frags[i].size =
+ hshinfo->frags[i].size;
+ memcpy(skb_shinfo(skb), &info->ushinfo,
+ sizeof(struct skb_shared_info));
+ }
+ } else {
+ /* The skb composed with kernel buffers
+ * in case user space buffers are not sufficent.
+ * The case should be rare.
+ */
+ unsigned long flags;
+ int i;
+ struct skb_shared_info *gshinfo = NULL;
+
+ info = NULL;
+
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq,
+ struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ if (!info) {
+ DBG(KERN_INFO "No user buffer avaliable %p\n",
+ skb);
+ skb_queue_head(&sk->sk_receive_queue,
+ skb);
+ break;
+ }
+ info->skb = skb;
+ /* compute the guest skb frags info */
+ gshinfo = (struct skb_shared_info *)(info->user.start +
+ SKB_DATA_ALIGN(info->user.size));
+
+ if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
+ goto clean;
+
+ eth = eth_hdr(skb);
+ skb_push(skb, ETH_HLEN);
+ info->total = skb->len;
+
+ for (i = 0; i < gshinfo->nr_frags; i++)
+ gshinfo->frags[i].size = 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ gshinfo->frags[i].size =
+ skb_shinfo(skb)->frags[i].size;
+ hdr.hdr_len = min_t(int, skb->len,
+ info->iov[1].iov_len);
+ skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
+ }
+
+ len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
+ sizeof hdr);
+ if (len) {
+ DBG(KERN_INFO
+ "Unable to write vnet_hdr at addr %p: %d\n",
+ info->hdr->iov_base, len);
+ goto clean;
+ }
+
+ iocb = create_iocb(info, skb->len + sizeof(hdr));
+ continue;
+
+clean:
+ kfree_skb(skb);
+ for (i = 0; info->pages[i]; i++)
+ put_page(info->pages[i]);
+ kmem_cache_free(ctor->cache, info);
+ }
+ return;
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+ if (sk_has_sleeper(sk))
+ wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+ struct net_device *dev;
+ void __user* argp = (void __user *)arg;
+ struct ifreq ifr;
+ struct sock *sk;
+ int ret;
+
+ ret = -EINVAL;
+
+ switch (cmd) {
+ case MPASSTHRU_BINDDEV:
+ ret = -EFAULT;
+ if (copy_from_user(&ifr, argp, sizeof ifr))
+ break;
+
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+ ret = -EBUSY;
+
+ if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
+ break;
+
+ ret = -ENODEV;
+ dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+ if (!dev)
+ break;
+
+ mutex_lock(&mp_mutex);
+
+ ret = -EBUSY;
+ mp = mfile->mp;
+ if (mp)
+ goto err_dev_put;
+
+ mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+ if (!mp) {
+ ret = -ENOMEM;
+ goto err_dev_put;
+ }
+ mp->dev = dev;
+ ret = -ENOMEM;
+
+ sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
+ if (!sk)
+ goto err_free_mp;
+
+ init_waitqueue_head(&mp->socket.wait);
+ mp->socket.ops = &mp_socket_ops;
+ sock_init_data(&mp->socket, sk);
+ sk->sk_sndbuf = INT_MAX;
+ container_of(sk, struct mp_sock, sk)->mp = mp;
+
+ sk->sk_destruct = mp_sock_destruct;
+ sk->sk_data_ready = mp_sock_data_ready;
+ sk->sk_write_space = mp_sock_write_space;
+ sk->sk_state_change = mp_sock_state_change;
+ ret = mp_attach(mp, file);
+ if (ret < 0)
+ goto err_free_sk;
+
+ ret = page_ctor_attach(mp);
+ if (ret < 0)
+ goto err_free_sk;
+
+ ifr.ifr_flags |= IFF_MPASSTHRU_EXCL;
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+out:
+ mutex_unlock(&mp_mutex);
+ break;
+err_free_sk:
+ sk_free(sk);
+err_free_mp:
+ kfree(mp);
+err_dev_put:
+ dev_put(dev);
+ goto out;
+
+ case MPASSTHRU_UNBINDDEV:
+ ret = do_unbind(mfile);
+ break;
+
+ default:
+ break;
+ }
+ return ret;
+}
+
+static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp = mp_get(mfile);
+ struct sock *sk;
+ unsigned int mask = 0;
+
+ if (!mp)
+ return POLLERR;
+
+ sk = mp->socket.sk;
+
+ poll_wait(file, &mp->socket.wait, wait);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sock_writeable(sk) ||
+ (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+ sock_writeable(sk)))
+ mask |= POLLOUT | POLLWRNORM;
+
+ if (mp->dev->reg_state != NETREG_REGISTERED)
+ mask = POLLERR;
+
+ mp_put(mfile);
+ return mask;
+}
+
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct mp_struct *mp = mp_get(file->private_data);
+ struct sock *sk = mp->socket.sk;
+ struct sk_buff *skb;
+ int len, err;
+ ssize_t result;
+
+ if (!mp)
+ return -EBADFD;
+
+ /* currently, async is not supported.
+ * but we may support real async aio from user application,
+ * maybe qemu virtio-net backend.
+ */
+ if (!is_sync_kiocb(iocb))
+ return -EFAULT;
+
+ len = iov_length(iov, count);
+
+ if (unlikely(len) < ETH_HLEN)
+ return -EINVAL;
+
+ skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+ file->f_flags & O_NONBLOCK, &err);
+
+ if (!skb)
+ return -EFAULT;
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, len);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+ kfree_skb(skb);
+ return -EAGAIN;
+ }
+
+ skb->protocol = eth_type_trans(skb, mp->dev);
+ skb->dev = mp->dev;
+
+ dev_queue_xmit(skb);
+
+ mp_put(file->private_data);
+ return result;
+}
+
+static int mp_chr_close(struct inode *inode, struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+
+ /*
+ * Ignore return value since an error only means there was nothing to
+ * do
+ */
+ do_unbind(mfile);
+
+ put_net(mfile->net);
+ kfree(mfile);
+
+ return 0;
+}
+
+static const struct file_operations mp_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .write = do_sync_write,
+ .aio_write = mp_chr_aio_write,
+ .poll = mp_chr_poll,
+ .unlocked_ioctl = mp_chr_ioctl,
+ .open = mp_chr_open,
+ .release = mp_chr_close,
+};
+
+static struct miscdevice mp_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mp",
+ .nodename = "net/mp",
+ .fops = &mp_fops,
+};
+
+static int mp_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct mpassthru_port *port;
+ struct mp_struct *mp = NULL;
+ struct socket *sock = NULL;
+
+ port = dev->mp_port;
+ if (port == NULL)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ sock = dev->mp_port->sock;
+ mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ do_unbind(mp->mfile);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mp_notifier_block __read_mostly = {
+ .notifier_call = mp_device_event,
+};
+
+static int mp_init(void)
+{
+ int ret = 0;
+
+ ret = misc_register(&mp_miscdev);
+ if (ret)
+ printk(KERN_ERR "mp: Can't register misc device\n");
+ else {
+ printk(KERN_INFO "Registering mp misc device - minor = %d\n",
+ mp_miscdev.minor);
+ register_netdevice_notifier(&mp_notifier_block);
+ }
+ return ret;
+}
+
+void mp_cleanup(void)
+{
+ unregister_netdevice_notifier(&mp_notifier_block);
+ misc_deregister(&mp_miscdev);
+}
+
+/* Get an underlying socket object from mp file. Returns error unless file is
+ * attached to a device. The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *mp_get_socket(struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+
+ if (file->f_op != &mp_fops)
+ return ERR_PTR(-EINVAL);
+ mp = mp_get(mfile);
+ if (!mp)
+ return ERR_PTR(-EBADFD);
+ mp_put(mfile);
+ return &mp->socket;
+}
+EXPORT_SYMBOL_GPL(mp_get_socket);
+
+module_init(mp_init);
+module_exit(mp_cleanup);
+MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 0000000..e3983d3
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,29 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int)
+
+/* MPASSTHRU ifc flags */
+#define IFF_MPASSTHRU 0x0001
+#define IFF_MPASSTHRU_EXCL 0x0002
+
+#ifdef __KERNEL__
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+struct socket *mp_get_socket(struct file *);
+#else
+#include <linux/err.h>
+#include <linux/errno.h>
+struct file;
+struct socket;
+static inline struct socket *mp_get_socket(struct file *f)
+{
+ return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_MEDIATE_PASSTHRU */
+#endif /* __KERNEL__ */
+#endif /* __MPASSTHRU_H */
--
1.5.4.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Xin, Xiaohui
2010-04-22 08:29:57 UTC
Permalink
Michael,
Sorry, it's based on the suggestion to hook an iocb completion callback
to handle the iocb list in vhost-net.

Thanks
Xiaohui

-----Original Message-----
From: Xin, Xiaohui
Sent: Thursday, April 22, 2010 4:24 PM
To: ***@redhat.com
Cc: ***@arndb.de; ***@vger.kernel.org; ***@vger.kernel.org; linux-***@vger.kernel.org; ***@elte.hu; ***@davemloft.net; ***@linux.intel.com; Xin, Xiaohui
Subject: Re:[RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

From: Xin Xiaohui <***@intel.com>

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui <***@intel.com>
Signed-off-by: Zhao Yu <***@gmail.com>
Reviewed-by: Jeff Dike <***@linux.intel.com>
---

Michael,
Thanks. I have updated the patch with your suggestion.
It looks much clean now. Please have a review.

Thanks
Xiaohui

drivers/vhost/Kconfig | 10 +
drivers/vhost/Makefile | 2 +
drivers/vhost/mpassthru.c | 1239 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/mpassthru.h | 29 +
4 files changed, 1280 insertions(+), 0 deletions(-)
create mode 100644 drivers/vhost/mpassthru.c
create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..91806b1 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
To compile this driver as a module, choose M here: the module will
be called vhost_net.

+config MEDIATE_PASSTHRU
+ tristate "mediate passthru network driver (EXPERIMENTAL)"
+ depends on VHOST_NET
+ ---help---
+ zerocopy network I/O support, we call it as mediate passthru to
+ be distiguish with hardare passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
obj-$(CONFIG_VHOST_NET) += vhost_net.o
vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..cc99b14
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1239 @@
+/*
+ * MPASSTHRU - Mediate passthrough device.
+ * Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME "mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/aio.h>
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/crc32.h>
+#include <linux/nsproxy.h>
+#include <linux/uaccess.h>
+#include <linux/virtio_net.h>
+#include <linux/mpassthru.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+ u16 offset;
+ u16 size;
+};
+
+struct page_ctor {
+ struct list_head readq;
+ int w_len;
+ int r_len;
+ spinlock_t read_lock;
+ struct kmem_cache *cache;
+ /* record the locked pages */
+ int lock_pages;
+ struct rlimit o_rlim;
+ struct net_device *dev;
+ struct mpassthru_port port;
+};
+
+struct page_info {
+ struct list_head list;
+ int header;
+ /* indicate the actual length of bytes
+ * send/recv in the user space buffers
+ */
+ int total;
+ int offset;
+ struct page *pages[MAX_SKB_FRAGS+1];
+ struct skb_frag_struct frag[MAX_SKB_FRAGS+1];
+ struct sk_buff *skb;
+ struct page_ctor *ctor;
+
+ /* The pointer relayed to skb, to indicate
+ * it's a user space allocated skb or kernel
+ */
+ struct skb_user_page user;
+ struct skb_shared_info ushinfo;
+
+#define INFO_READ 0
+#define INFO_WRITE 1
+ unsigned flags;
+ unsigned pnum;
+
+ /* It's meaningful for receive, means
+ * the max length allowed
+ */
+ size_t len;
+
+ /* The fields after that is for backend
+ * driver, now for vhost-net.
+ */
+
+ struct kiocb *iocb;
+ unsigned int desc_pos;
+ unsigned int log;
+ struct iovec hdr[MAX_SKB_FRAGS + 2];
+ struct iovec iov[MAX_SKB_FRAGS + 2];
+};
+
+struct mp_struct {
+ struct mp_file *mfile;
+ struct net_device *dev;
+ struct page_ctor *ctor;
+ struct socket socket;
+
+#ifdef MPASSTHRU_DEBUG
+ int debug;
+#endif
+};
+
+struct mp_file {
+ atomic_t count;
+ struct mp_struct *mp;
+ struct net *net;
+};
+
+struct mp_sock {
+ struct sock sk;
+ struct mp_struct *mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+ int ret = 0;
+
+ rtnl_lock();
+ ret = dev_change_flags(dev, flags);
+ rtnl_unlock();
+
+ if (ret < 0)
+ printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+ return ret;
+}
+
+/* The main function to allocate user space buffers */
+static struct skb_user_page *page_ctor(struct mpassthru_port *port,
+ struct sk_buff *skb, int npages)
+{
+ int i;
+ unsigned long flags;
+ struct page_ctor *ctor;
+ struct page_info *info = NULL;
+
+ ctor = container_of(port, struct page_ctor, port);
+
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq, struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ if (!info)
+ return NULL;
+
+ for (i = 0; i < info->pnum; i++) {
+ get_page(info->pages[i]);
+ info->frag[i].page = info->pages[i];
+ info->frag[i].page_offset = i ? 0 : info->offset;
+ info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+ port->data_len;
+ }
+ info->skb = skb;
+ info->user.frags = info->frag;
+ info->user.ushinfo = &info->ushinfo;
+ return &info->user;
+}
+
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+ struct page_info *info = (struct page_info *)(iocb->private);
+ int i;
+
+ if (info->flags == INFO_READ) {
+ for (i = 0; i < info->pnum; i++) {
+ if (info->pages[i]) {
+ set_page_dirty_lock(info->pages[i]);
+ put_page(info->pages[i]);
+ }
+ }
+ skb_shinfo(info->skb)->destructor_arg = &info->user;
+ info->skb->destructor = NULL;
+ kfree_skb(info->skb);
+ }
+ /* Decrement the number of locked pages */
+ info->ctor->lock_pages -= info->pnum;
+ kmem_cache_free(info->ctor->cache, info);
+
+ return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+ struct kiocb *iocb = NULL;
+
+ iocb = info->iocb;
+ if (!iocb)
+ return iocb;
+ iocb->ki_flags = 0;
+ iocb->ki_users = 1;
+ iocb->ki_key = 0;
+ iocb->ki_ctx = NULL;
+ iocb->ki_cancel = NULL;
+ iocb->ki_retry = NULL;
+ iocb->ki_iovec = NULL;
+ iocb->ki_eventfd = NULL;
+ iocb->ki_pos = info->desc_pos;
+ iocb->ki_nbytes = size;
+ iocb->ki_user_data = info->log;
+ iocb->ki_dtor(iocb);
+ iocb->private = (void *)info;
+ iocb->ki_dtor = mp_ki_dtor;
+
+ return iocb;
+}
+
+/* The callback to destruct the user space buffers or skb */
+static void page_dtor(struct skb_user_page *user)
+{
+ struct page_info *info;
+ struct page_ctor *ctor;
+ struct sock *sk;
+ struct sk_buff *skb;
+ struct kiocb *iocb = NULL;
+ unsigned long flags;
+ int i;
+
+ if (!user)
+ return;
+ info = container_of(user, struct page_info, user);
+ if (!info)
+ return;
+ ctor = info->ctor;
+ skb = info->skb;
+
+ if ((info->flags == INFO_READ) && info->skb)
+ info->skb->head = NULL;
+
+ /* If the info->total is 0, make it to be reused */
+ if (!info->total) {
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ list_add(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ return;
+ }
+
+ if (info->flags == INFO_READ)
+ return;
+
+ /* For transmit, we should wait for the DMA finish by hardware.
+ * Queue the notifier to wake up the backend driver
+ */
+
+ iocb = create_iocb(info, info->total);
+
+ sk = ctor->port.sock->sk;
+ sk->sk_write_space(sk);
+
+ return;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+ int rc;
+ struct page_ctor *ctor;
+ struct net_device *dev = mp->dev;
+
+ /* locked by mp_mutex */
+ if (rcu_dereference(mp->ctor))
+ return -EBUSY;
+
+ ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
+ if (!ctor)
+ return -ENOMEM;
+ rc = netdev_mp_port_prep(dev, &ctor->port);
+ if (rc)
+ goto fail;
+
+ ctor->cache = kmem_cache_create("skb_page_info",
+ sizeof(struct page_info), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+
+ if (!ctor->cache)
+ goto cache_fail;
+
+ INIT_LIST_HEAD(&ctor->readq);
+ spin_lock_init(&ctor->read_lock);
+
+ ctor->w_len = 0;
+ ctor->r_len = 0;
+
+ dev_hold(dev);
+ ctor->dev = dev;
+ ctor->port.ctor = page_ctor;
+ ctor->port.sock = &mp->socket;
+ ctor->lock_pages = 0;
+ rc = netdev_mp_port_attach(dev, &ctor->port);
+ if (rc)
+ goto fail;
+
+ /* locked by mp_mutex */
+ rcu_assign_pointer(mp->ctor, ctor);
+
+ /* XXX:Need we do set_offload here ? */
+
+ return 0;
+
+fail:
+ kmem_cache_destroy(ctor->cache);
+cache_fail:
+ kfree(ctor);
+ dev_put(dev);
+
+ return rc;
+}
+
+struct page_info *info_dequeue(struct page_ctor *ctor)
+{
+ unsigned long flags;
+ struct page_info *info = NULL;
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq,
+ struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ return info;
+}
+
+static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
+ unsigned long cur, unsigned long max)
+{
+ struct rlimit new_rlim, *old_rlim;
+ int retval;
+
+ if (resource != RLIMIT_MEMLOCK)
+ return -EINVAL;
+ new_rlim.rlim_cur = cur;
+ new_rlim.rlim_max = max;
+
+ old_rlim = current->signal->rlim + resource;
+
+ /* remember the old rlimit value when backend enabled */
+ ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
+ ctor->o_rlim.rlim_max = old_rlim->rlim_max;
+
+ if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+ !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ retval = security_task_setrlimit(resource, &new_rlim);
+ if (retval)
+ return retval;
+
+ task_lock(current->group_leader);
+ *old_rlim = new_rlim;
+ task_unlock(current->group_leader);
+ return 0;
+}
+
+static int page_ctor_detach(struct mp_struct *mp)
+{
+ struct page_ctor *ctor;
+ struct page_info *info;
+ struct kiocb *iocb = NULL;
+ int i;
+ unsigned long flags;
+
+ /* locked by mp_mutex */
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -ENODEV;
+
+ while ((info = info_dequeue(ctor))) {
+ for (i = 0; i < info->pnum; i++)
+ if (info->pages[i])
+ put_page(info->pages[i]);
+ iocb = create_iocb(info, 0);
+ kmem_cache_free(ctor->cache, info);
+ }
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ ctor->o_rlim.rlim_cur,
+ ctor->o_rlim.rlim_max);
+ kmem_cache_destroy(ctor->cache);
+ netdev_mp_port_detach(ctor->dev);
+ dev_put(ctor->dev);
+
+ /* locked by mp_mutex */
+ rcu_assign_pointer(mp->ctor, NULL);
+ synchronize_rcu();
+
+ kfree(ctor);
+ return 0;
+}
+
+/* For small user space buffers transmit, we don't need to call
+ * get_user_pages().
+ */
+static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
+ struct kiocb *iocb, int total)
+{
+ struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
+
+ if (!info)
+ return NULL;
+ info->total = total;
+ info->user.dtor = page_dtor;
+ info->ctor = ctor;
+ info->flags = INFO_WRITE;
+ info->iocb = iocb;
+ return info;
+}
+
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the user space address.
+ */
+static struct page_info *alloc_page_info(struct page_ctor *ctor,
+ struct kiocb *iocb, struct iovec *iov,
+ int count, struct frag *frags,
+ int npages, int total)
+{
+ int rc;
+ int i, j, n = 0;
+ int len;
+ unsigned long base, lock_limit;
+ struct page_info *info = NULL;
+
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+
+ if (ctor->lock_pages + count > lock_limit) {
+ printk(KERN_INFO "exceed the locked memory rlimit %d!",
+ lock_limit);
+ return NULL;
+ }
+
+ info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
+
+ if (!info)
+ return NULL;
+
+ for (i = j = 0; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base;
+ len = iov[i].iov_len;
+
+ if (!len)
+ continue;
+ n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+ rc = get_user_pages_fast(base, n, npages ? 1 : 0,
+ &info->pages[j]);
+ if (rc != n)
+ goto failed;
+
+ while (n--) {
+ frags[j].offset = base & ~PAGE_MASK;
+ frags[j].size = min_t(int, len,
+ PAGE_SIZE - frags[j].offset);
+ len -= frags[j].size;
+ base += frags[j].size;
+ j++;
+ }
+ }
+
+#ifdef CONFIG_HIGHMEM
+ if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
+ for (i = 0; i < j; i++) {
+ if (PageHighMem(info->pages[i]))
+ goto failed;
+ }
+ }
+#endif
+
+ info->total = total;
+ info->user.dtor = page_dtor;
+ info->ctor = ctor;
+ info->pnum = j;
+ info->iocb = iocb;
+ if (!npages)
+ info->flags = INFO_WRITE;
+ if (info->flags == INFO_READ) {
+ info->user.start = (u8 *)(((unsigned long)
+ (pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
+ frags[0].offset));
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ info->user.size = SKB_DATA_ALIGN(
+ iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD);
+#else
+ info->user.size = SKB_DATA_ALIGN(
+ iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD) -
+ NET_IP_ALIGN - NET_SKB_PAD;
+#endif
+ }
+ /* increment the number of locked pages */
+ ctor->lock_pages += j;
+ return info;
+
+failed:
+ for (i = 0; i < j; i++)
+ put_page(info->pages[i]);
+
+ kmem_cache_free(ctor->cache, info);
+
+ return NULL;
+}
+
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct iovec *iov = m->msg_iov;
+ struct page_info *info = NULL;
+ struct frag frags[MAX_SKB_FRAGS];
+ struct sk_buff *skb;
+ int count = m->msg_iovlen;
+ int total = 0, header, n, i, len, rc;
+ unsigned long base;
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -ENODEV;
+
+ total = iov_length(iov, count);
+
+ if (total < ETH_HLEN)
+ return -EINVAL;
+
+ if (total <= COPY_THRESHOLD)
+ goto copy;
+
+ n = 0;
+ for (i = 0; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base;
+ len = iov[i].iov_len;
+ if (!len)
+ continue;
+ n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (n > MAX_SKB_FRAGS)
+ return -EINVAL;
+ }
+
+copy:
+ header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
+
+ skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC);
+ if (!skb)
+ goto drop;
+
+ skb_reserve(skb, NET_IP_ALIGN);
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ memcpy_fromiovec(skb->data, iov, header);
+ skb_put(skb, header);
+ skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
+
+ if (header == total) {
+ rc = total;
+ info = alloc_small_page_info(ctor, iocb, total);
+ } else {
+ info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total);
+ if (info)
+ for (i = 0; info->pages[i]; i++) {
+ skb_add_rx_frag(skb, i, info->pages[i],
+ frags[i].offset, frags[i].size);
+ info->pages[i] = NULL;
+ }
+ }
+ if (info != NULL) {
+ info->desc_pos = iocb->ki_pos;
+ info->total = total;
+ info->skb = skb;
+ skb_shinfo(skb)->destructor_arg = &info->user;
+ skb->dev = mp->dev;
+ dev_queue_xmit(skb);
+ return 0;
+ }
+drop:
+ kfree_skb(skb);
+ if (info) {
+ for (i = 0; info->pages[i]; i++)
+ put_page(info->pages[i]);
+ kmem_cache_free(info->ctor->cache, info);
+ }
+ mp->dev->stats.tx_dropped++;
+ return -ENOMEM;
+}
+
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *m, size_t total_len,
+ int flags)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor;
+ struct iovec *iov = m->msg_iov;
+ int count = m->msg_iovlen;
+ int npages, payload;
+ struct page_info *info;
+ struct frag frags[MAX_SKB_FRAGS];
+ unsigned long base;
+ int i, len;
+ unsigned long flag;
+
+ if (!(flags & MSG_DONTWAIT))
+ return -EINVAL;
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return -EINVAL;
+
+ /* Error detections in case invalid user space buffer */
+ if (count > 2 && iov[1].iov_len < ctor->port.hdr_len &&
+ mp->dev->features & NETIF_F_SG) {
+ return -EINVAL;
+ }
+
+ npages = ctor->port.npages;
+ payload = ctor->port.data_len;
+
+ /* If KVM guest virtio-net FE driver use SG feature */
+ if (count > 2) {
+ for (i = 2; i < count; i++) {
+ base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
+ len = iov[i].iov_len;
+ if (npages == 1)
+ len = min_t(int, len, PAGE_SIZE - base);
+ else if (base)
+ break;
+ payload -= len;
+ if (payload <= 0)
+ goto proceed;
+ if (npages == 1 || (len & ~PAGE_MASK))
+ break;
+ }
+ }
+
+ if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
+ - NET_SKB_PAD - NET_IP_ALIGN) >= 0)
+ goto proceed;
+
+ return -EINVAL;
+
+proceed:
+ /* skip the virtnet head */
+ iov++;
+ count--;
+
+ if (!ctor->lock_pages)
+ set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+ (((1UL << 32) -1) & iocb->ki_user_data) * 4096,
+ (((1UL << 32) -1) & iocb->ki_user_data) * 4096);
+
+ /* Translate address to kernel */
+ info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
+ if (!info)
+ return -ENOMEM;
+ info->len = total_len;
+ info->hdr[0].iov_base = iocb->ki_iovec[0].iov_base;
+ info->hdr[0].iov_len = iocb->ki_iovec[0].iov_len;
+ info->offset = frags[0].offset;
+ info->desc_pos = iocb->ki_pos;
+ info->log = iocb->ki_user_data;
+
+ iov--;
+ count++;
+
+ memcpy(info->iov, iov, sizeof(struct iovec) * count);
+
+ spin_lock_irqsave(&ctor->read_lock, flag);
+ list_add_tail(&info->list, &ctor->readq);
+ spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+ return 0;
+}
+
+static void __mp_detach(struct mp_struct *mp)
+{
+ mp->mfile = NULL;
+
+ mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+ page_ctor_detach(mp);
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+ /* Drop the extra count on the net device */
+ dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+ mutex_lock(&mp_mutex);
+ __mp_detach(mp);
+ mutex_unlock(&mp_mutex);
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+ if (atomic_dec_and_test(&mfile->count))
+ mp_detach(mfile->mp);
+}
+
+static int mp_release(struct socket *sock)
+{
+ struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ struct mp_file *mfile = mp->mfile;
+
+ mp_put(mfile);
+ sock_put(mp->socket.sk);
+ put_net(mfile->net);
+
+ return 0;
+}
+
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+ .sendmsg = mp_sendmsg,
+ .recvmsg = mp_recvmsg,
+ .release = mp_release,
+};
+
+static struct proto mp_proto = {
+ .name = "mp",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct mp_sock),
+};
+
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+ struct mp_file *mfile;
+ cycle_kernel_lock();
+ DBG1(KERN_INFO "mp: mp_chr_open\n");
+
+ mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
+ if (!mfile)
+ return -ENOMEM;
+ atomic_set(&mfile->count, 0);
+ mfile->mp = NULL;
+ mfile->net = get_net(current->nsproxy->net_ns);
+ file->private_data = mfile;
+ return 0;
+}
+
+
+static struct mp_struct *mp_get(struct mp_file *mfile)
+{
+ struct mp_struct *mp = NULL;
+ if (atomic_inc_not_zero(&mfile->count))
+ mp = mfile->mp;
+
+ return mp;
+}
+
+
+static int mp_attach(struct mp_struct *mp, struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+ int err;
+
+ netif_tx_lock_bh(mp->dev);
+
+ err = -EINVAL;
+
+ if (mfile->mp)
+ goto out;
+
+ err = -EBUSY;
+ if (mp->mfile)
+ goto out;
+
+ err = 0;
+ mfile->mp = mp;
+ mp->mfile = mfile;
+ mp->socket.file = file;
+ dev_hold(mp->dev);
+ sock_hold(mp->socket.sk);
+ atomic_inc(&mfile->count);
+
+out:
+ netif_tx_unlock_bh(mp->dev);
+ return err;
+}
+
+static void mp_sock_destruct(struct sock *sk)
+{
+ struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+ kfree(mp);
+}
+
+static int do_unbind(struct mp_file *mfile)
+{
+ struct mp_struct *mp = mp_get(mfile);
+
+ if (!mp)
+ return -EINVAL;
+
+ mp_detach(mp);
+ sock_put(mp->socket.sk);
+ mp_put(mfile);
+ return 0;
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+ if (sk_has_sleeper(sk))
+ wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+ struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+ struct page_ctor *ctor = NULL;
+ struct sk_buff *skb = NULL;
+ struct page_info *info = NULL;
+ struct ethhdr *eth;
+ struct kiocb *iocb = NULL;
+ int len, i;
+ unsigned long flags;
+
+ struct virtio_net_hdr hdr = {
+ .flags = 0,
+ .gso_type = VIRTIO_NET_HDR_GSO_NONE
+ };
+
+ ctor = rcu_dereference(mp->ctor);
+ if (!ctor)
+ return;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (skb_shinfo(skb)->destructor_arg) {
+ info = container_of(skb_shinfo(skb)->destructor_arg,
+ struct page_info, user);
+ info->skb = skb;
+ if (skb->len > info->len) {
+ mp->dev->stats.rx_dropped++;
+ DBG(KERN_INFO "Discarded truncated rx packet: "
+ " len %d > %zd\n", skb->len, info->len);
+ info->total = skb->len;
+ goto clean;
+ } else {
+ int i;
+ struct skb_shared_info *gshinfo =
+ (struct skb_shared_info *)(&info->ushinfo);
+ struct skb_shared_info *hshinfo =
+ skb_shinfo(skb);
+
+ if (gshinfo->nr_frags < hshinfo->nr_frags)
+ goto clean;
+ eth = eth_hdr(skb);
+ skb_push(skb, ETH_HLEN);
+
+ hdr.hdr_len = skb_headlen(skb);
+ info->total = skb->len;
+
+ for (i = 0; i < gshinfo->nr_frags; i++)
+ gshinfo->frags[i].size = 0;
+ for (i = 0; i < hshinfo->nr_frags; i++)
+ gshinfo->frags[i].size =
+ hshinfo->frags[i].size;
+ memcpy(skb_shinfo(skb), &info->ushinfo,
+ sizeof(struct skb_shared_info));
+ }
+ } else {
+ /* The skb composed with kernel buffers
+ * in case user space buffers are not sufficent.
+ * The case should be rare.
+ */
+ unsigned long flags;
+ int i;
+ struct skb_shared_info *gshinfo = NULL;
+
+ info = NULL;
+
+ spin_lock_irqsave(&ctor->read_lock, flags);
+ if (!list_empty(&ctor->readq)) {
+ info = list_first_entry(&ctor->readq,
+ struct page_info, list);
+ list_del(&info->list);
+ }
+ spin_unlock_irqrestore(&ctor->read_lock, flags);
+ if (!info) {
+ DBG(KERN_INFO "No user buffer avaliable %p\n",
+ skb);
+ skb_queue_head(&sk->sk_receive_queue,
+ skb);
+ break;
+ }
+ info->skb = skb;
+ /* compute the guest skb frags info */
+ gshinfo = (struct skb_shared_info *)(info->user.start +
+ SKB_DATA_ALIGN(info->user.size));
+
+ if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
+ goto clean;
+
+ eth = eth_hdr(skb);
+ skb_push(skb, ETH_HLEN);
+ info->total = skb->len;
+
+ for (i = 0; i < gshinfo->nr_frags; i++)
+ gshinfo->frags[i].size = 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ gshinfo->frags[i].size =
+ skb_shinfo(skb)->frags[i].size;
+ hdr.hdr_len = min_t(int, skb->len,
+ info->iov[1].iov_len);
+ skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
+ }
+
+ len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
+ sizeof hdr);
+ if (len) {
+ DBG(KERN_INFO
+ "Unable to write vnet_hdr at addr %p: %d\n",
+ info->hdr->iov_base, len);
+ goto clean;
+ }
+
+ iocb = create_iocb(info, skb->len + sizeof(hdr));
+ continue;
+
+clean:
+ kfree_skb(skb);
+ for (i = 0; info->pages[i]; i++)
+ put_page(info->pages[i]);
+ kmem_cache_free(ctor->cache, info);
+ }
+ return;
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+ if (sk_has_sleeper(sk))
+ wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+ struct net_device *dev;
+ void __user* argp = (void __user *)arg;
+ struct ifreq ifr;
+ struct sock *sk;
+ int ret;
+
+ ret = -EINVAL;
+
+ switch (cmd) {
+ case MPASSTHRU_BINDDEV:
+ ret = -EFAULT;
+ if (copy_from_user(&ifr, argp, sizeof ifr))
+ break;
+
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+ ret = -EBUSY;
+
+ if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
+ break;
+
+ ret = -ENODEV;
+ dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+ if (!dev)
+ break;
+
+ mutex_lock(&mp_mutex);
+
+ ret = -EBUSY;
+ mp = mfile->mp;
+ if (mp)
+ goto err_dev_put;
+
+ mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+ if (!mp) {
+ ret = -ENOMEM;
+ goto err_dev_put;
+ }
+ mp->dev = dev;
+ ret = -ENOMEM;
+
+ sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
+ if (!sk)
+ goto err_free_mp;
+
+ init_waitqueue_head(&mp->socket.wait);
+ mp->socket.ops = &mp_socket_ops;
+ sock_init_data(&mp->socket, sk);
+ sk->sk_sndbuf = INT_MAX;
+ container_of(sk, struct mp_sock, sk)->mp = mp;
+
+ sk->sk_destruct = mp_sock_destruct;
+ sk->sk_data_ready = mp_sock_data_ready;
+ sk->sk_write_space = mp_sock_write_space;
+ sk->sk_state_change = mp_sock_state_change;
+ ret = mp_attach(mp, file);
+ if (ret < 0)
+ goto err_free_sk;
+
+ ret = page_ctor_attach(mp);
+ if (ret < 0)
+ goto err_free_sk;
+
+ ifr.ifr_flags |= IFF_MPASSTHRU_EXCL;
+ mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+out:
+ mutex_unlock(&mp_mutex);
+ break;
+err_free_sk:
+ sk_free(sk);
+err_free_mp:
+ kfree(mp);
+err_dev_put:
+ dev_put(dev);
+ goto out;
+
+ case MPASSTHRU_UNBINDDEV:
+ ret = do_unbind(mfile);
+ break;
+
+ default:
+ break;
+ }
+ return ret;
+}
+
+static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp = mp_get(mfile);
+ struct sock *sk;
+ unsigned int mask = 0;
+
+ if (!mp)
+ return POLLERR;
+
+ sk = mp->socket.sk;
+
+ poll_wait(file, &mp->socket.wait, wait);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sock_writeable(sk) ||
+ (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+ sock_writeable(sk)))
+ mask |= POLLOUT | POLLWRNORM;
+
+ if (mp->dev->reg_state != NETREG_REGISTERED)
+ mask = POLLERR;
+
+ mp_put(mfile);
+ return mask;
+}
+
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct mp_struct *mp = mp_get(file->private_data);
+ struct sock *sk = mp->socket.sk;
+ struct sk_buff *skb;
+ int len, err;
+ ssize_t result;
+
+ if (!mp)
+ return -EBADFD;
+
+ /* currently, async is not supported.
+ * but we may support real async aio from user application,
+ * maybe qemu virtio-net backend.
+ */
+ if (!is_sync_kiocb(iocb))
+ return -EFAULT;
+
+ len = iov_length(iov, count);
+
+ if (unlikely(len) < ETH_HLEN)
+ return -EINVAL;
+
+ skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+ file->f_flags & O_NONBLOCK, &err);
+
+ if (!skb)
+ return -EFAULT;
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, len);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+ kfree_skb(skb);
+ return -EAGAIN;
+ }
+
+ skb->protocol = eth_type_trans(skb, mp->dev);
+ skb->dev = mp->dev;
+
+ dev_queue_xmit(skb);
+
+ mp_put(file->private_data);
+ return result;
+}
+
+static int mp_chr_close(struct inode *inode, struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+
+ /*
+ * Ignore return value since an error only means there was nothing to
+ * do
+ */
+ do_unbind(mfile);
+
+ put_net(mfile->net);
+ kfree(mfile);
+
+ return 0;
+}
+
+static const struct file_operations mp_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .write = do_sync_write,
+ .aio_write = mp_chr_aio_write,
+ .poll = mp_chr_poll,
+ .unlocked_ioctl = mp_chr_ioctl,
+ .open = mp_chr_open,
+ .release = mp_chr_close,
+};
+
+static struct miscdevice mp_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mp",
+ .nodename = "net/mp",
+ .fops = &mp_fops,
+};
+
+static int mp_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct mpassthru_port *port;
+ struct mp_struct *mp = NULL;
+ struct socket *sock = NULL;
+
+ port = dev->mp_port;
+ if (port == NULL)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ sock = dev->mp_port->sock;
+ mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+ do_unbind(mp->mfile);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mp_notifier_block __read_mostly = {
+ .notifier_call = mp_device_event,
+};
+
+static int mp_init(void)
+{
+ int ret = 0;
+
+ ret = misc_register(&mp_miscdev);
+ if (ret)
+ printk(KERN_ERR "mp: Can't register misc device\n");
+ else {
+ printk(KERN_INFO "Registering mp misc device - minor = %d\n",
+ mp_miscdev.minor);
+ register_netdevice_notifier(&mp_notifier_block);
+ }
+ return ret;
+}
+
+void mp_cleanup(void)
+{
+ unregister_netdevice_notifier(&mp_notifier_block);
+ misc_deregister(&mp_miscdev);
+}
+
+/* Get an underlying socket object from mp file. Returns error unless file is
+ * attached to a device. The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *mp_get_socket(struct file *file)
+{
+ struct mp_file *mfile = file->private_data;
+ struct mp_struct *mp;
+
+ if (file->f_op != &mp_fops)
+ return ERR_PTR(-EINVAL);
+ mp = mp_get(mfile);
+ if (!mp)
+ return ERR_PTR(-EBADFD);
+ mp_put(mfile);
+ return &mp->socket;
+}
+EXPORT_SYMBOL_GPL(mp_get_socket);
+
+module_init(mp_init);
+module_exit(mp_cleanup);
+MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 0000000..e3983d3
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,29 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int)
+
+/* MPASSTHRU ifc flags */
+#define IFF_MPASSTHRU 0x0001
+#define IFF_MPASSTHRU_EXCL 0x0002
+
+#ifdef __KERNEL__
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+struct socket *mp_get_socket(struct file *);
+#else
+#include <linux/err.h>
+#include <linux/errno.h>
+struct file;
+struct socket;
+static inline struct socket *mp_get_socket(struct file *f)
+{
+ return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_MEDIATE_PASSTHRU */
+#endif /* __KERNEL__ */
+#endif /* __MPASSTHRU_H */
--
1.5.4.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
x***@intel.com
2010-04-22 08:37:16 UTC
Permalink
From: Xin Xiaohui <***@intel.com>

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui <***@intel.com>
---

Michael,
Post by Michael S. Tsirkin
Can't vhost supply a kiocb completion callback that will handle the list?
Yes, thanks. And with it I also remove the vq->receiver finally.

Thanks
Xiaohui

drivers/vhost/net.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++--
drivers/vhost/vhost.c | 115 ++++++++++++++-----------
drivers/vhost/vhost.h | 14 +++
3 files changed, 301 insertions(+), 55 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..4a70f66 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>
+#include <linux/aio.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
+#include <linux/mpassthru.h>

#include <net/sock.h>

@@ -47,6 +49,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+ struct kmem_cache *cache;
/* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
@@ -91,11 +94,132 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
}

+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ if (!list_empty(&vq->notifier)) {
+ iocb = list_first_entry(&vq->notifier,
+ struct kiocb, ki_list);
+ list_del(&iocb->ki_list);
+ }
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+ return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+ struct vhost_virtqueue *vq = iocb->private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ list_add_tail(&iocb->ki_list, &vq->notifier);
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
+{
+ struct kiocb *iocb = NULL;
+ struct vhost_log *vq_log = NULL;
+ int rx_total_len = 0;
+ unsigned int head, log, in, out;
+ int size;
+
+ if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+ return;
+
+ if (sock->sk->sk_data_ready)
+ sock->sk->sk_data_ready(sock->sk, 0);
+
+ vq_log = unlikely(vhost_has_feature(
+ &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, iocb->ki_nbytes);
+ log = (int)(iocb->ki_user_data >> 32);
+ size = iocb->ki_nbytes;
+ head = iocb->ki_pos;
+ rx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
+ kmem_cache_free(net->cache, iocb);
+
+ /* when log is enabled, recomputing the log info is needed,
+ * since these buffers are in async queue, and may not get
+ * the log info before.
+ */
+ if (unlikely(vq_log)) {
+ if (!log)
+ __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in, vq_log,
+ &log, head);
+ vhost_log_write(vq, vq_log, log, size);
+ }
+ if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ int tx_total_len = 0;
+
+ if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+ return;
+
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, 0);
+ tx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
+
+ kmem_cache_free(net->cache, iocb);
+ if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static struct kiocb *create_iocb(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ unsigned head, unsigned log)
+{
+ struct kiocb *iocb = NULL;
+
+ if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+ return NULL;
+ iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+ if (!iocb)
+ return NULL;
+ iocb->private = vq;
+ iocb->ki_pos = head;
+ iocb->ki_dtor = handle_iocb;
+ if (vq == &net->dev.vqs[VHOST_NET_VQ_RX]) {
+ iocb->ki_user_data = ((unsigned long)log << 32 | vq->num);
+ iocb->ki_iovec = vq->hdr;
+ }
+ return iocb;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+ struct kiocb *iocb = NULL;
unsigned head, out, in, s;
struct msghdr msg = {
.msg_name = NULL,
@@ -124,6 +248,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;

+ handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -151,6 +277,11 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+ iocb = create_iocb(net, vq, head, 0);
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC && !iocb)
+ break;
+
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
@@ -160,12 +291,18 @@ static void handle_tx(struct vhost_net *net)
break;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
- err = sock->ops->sendmsg(NULL, sock, &msg, len);
+ err = sock->ops->sendmsg(iocb, sock, &msg, len);
if (unlikely(err < 0)) {
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
tx_poll_start(net, sock);
break;
}
+
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ continue;
+
if (err != len)
pr_err("Truncated TX packet: "
" len %d != %zd\n", err, len);
@@ -177,6 +314,8 @@ static void handle_tx(struct vhost_net *net)
}
}

+ handle_async_tx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}
@@ -186,6 +325,7 @@ static void handle_tx(struct vhost_net *net)
static void handle_rx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
unsigned head, out, in, log, s;
struct vhost_log *vq_log;
struct msghdr msg = {
@@ -206,7 +346,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq->private_data);
- if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+ if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+ vq->link_state == VHOST_VQ_LINK_SYNC))
return;

use_mm(net->dev.mm);
@@ -214,9 +355,17 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;

+ /* In async cases, when write log is enabled, in case the submitted
+ * buffers did not get log info before the log enabling, so we'd
+ * better recompute the log info when needed. We do this in
+ * handle_async_rx_events_notify().
+ */
+
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;

+ handle_async_rx_events_notify(net, vq, sock);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -245,6 +394,11 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq->iov, in);
+
+ iocb = create_iocb(net, vq, head, log);
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC && !iocb)
+ break;
+
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for RX: "
@@ -252,13 +406,20 @@ static void handle_rx(struct vhost_net *net)
iov_length(vq->hdr, s), hdr_size);
break;
}
- err = sock->ops->recvmsg(NULL, sock, &msg,
+
+ err = sock->ops->recvmsg(iocb, sock, &msg,
len, MSG_DONTWAIT | MSG_TRUNC);
/* TODO: Check specific error and bomb out unless EAGAIN? */
if (err < 0) {
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
break;
}
+
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ continue;
+
/* TODO: Should check and handle checksum. */
if (err > len) {
pr_err("Discarded truncated rx packet: "
@@ -284,10 +445,13 @@ static void handle_rx(struct vhost_net *net)
}
}

+ handle_async_rx_events_notify(net, vq, sock);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}

+
static void handle_tx_kick(struct work_struct *work)
{
struct vhost_virtqueue *vq;
@@ -338,6 +502,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+ n->cache = NULL;
return 0;
}

@@ -398,6 +563,18 @@ static void vhost_net_flush(struct vhost_net *n)
vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}

+static void vhost_async_cleanup(struct vhost_net *n)
+{
+ /* clean the notifier */
+ struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
+ if (n->cache) {
+ while ((iocb = notify_dequeue(vq)) != NULL)
+ kmem_cache_free(n->cache, iocb);
+ kmem_cache_destroy(n->cache);
+ }
+}
+
static int vhost_net_release(struct inode *inode, struct file *f)
{
struct vhost_net *n = f->private_data;
@@ -414,6 +591,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
+ vhost_async_cleanup(n);
kfree(n);
return 0;
}
@@ -462,7 +640,19 @@ static struct socket *get_tun_socket(int fd)
return sock;
}

-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+ struct file *file = fget(fd);
+ struct socket *sock;
+ if (!file)
+ return ERR_PTR(-EBADF);
+ sock = mp_get_socket(file);
+ if (IS_ERR(sock))
+ fput(file);
+ return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
{
struct socket *sock;
if (fd == -1)
@@ -473,9 +663,30 @@ static struct socket *get_socket(int fd)
sock = get_tun_socket(fd);
if (!IS_ERR(sock))
return sock;
+ sock = get_mp_socket(fd);
+ if (!IS_ERR(sock)) {
+ vq->link_state = VHOST_VQ_LINK_ASYNC;
+ return sock;
+ }
return ERR_PTR(-ENOTSOCK);
}

+static void vhost_init_link_state(struct vhost_net *n, int index)
+{
+ struct vhost_virtqueue *vq = n->vqs + index;
+
+ WARN_ON(!mutex_is_locked(&vq->mutex));
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+ INIT_LIST_HEAD(&vq->notifier);
+ spin_lock_init(&vq->notify_lock);
+ if (!n->cache) {
+ n->cache = kmem_cache_create("vhost_kiocb",
+ sizeof(struct kiocb), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ }
+ }
+}
+
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
struct socket *sock, *oldsock;
@@ -493,12 +704,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
}
vq = n->vqs + index;
mutex_lock(&vq->mutex);
- sock = get_socket(fd);
+ vq->link_state = VHOST_VQ_LINK_SYNC;
+ sock = get_socket(vq, fd);
if (IS_ERR(sock)) {
r = PTR_ERR(sock);
goto err;
}

+ vhost_init_link_state(n, index);
+
/* start polling new socket */
oldsock = vq->private_data;
if (sock == oldsock)
@@ -507,8 +721,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, sock);
vhost_net_enable_vq(n, vq);
- mutex_unlock(&vq->mutex);
done:
+ mutex_unlock(&vq->mutex);
mutex_unlock(&n->dev.mutex);
if (oldsock) {
vhost_net_flush_vq(n, index);
@@ -516,6 +730,7 @@ done:
}
return r;
err:
+ mutex_unlock(&vq->mutex);
mutex_unlock(&n->dev.mutex);
return r;
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
return 0;
}

-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head)
{
struct vring_desc desc;
- unsigned int i, head, found = 0;
- u16 last_avail_idx;
+ unsigned int i = head, found = 0;
int ret;

- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
- if (get_user(vq->avail_idx, &vq->avail->idx)) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return vq->num;
- }
-
- if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return vq->num;
- }
-
- /* If there's nothing new since last we looked, return invalid. */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
-
- /* Only get avail ring entries after they have been exposed by guest. */
- rmb();
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return vq->num;
- }
-
- /* If their number is silly, that's an error. */
- if (head >= vq->num) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return vq->num;
- }
-
/* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
if (unlikely(log))
*log_num = 0;

- i = head;
do {
unsigned iov_count = *in_num + *out_num;
if (i >= vq->num) {
@@ -833,8 +788,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
*out_num += ret;
}
} while ((i = next_desc(&desc)) != -1);
+ return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ struct vring_desc desc;
+ unsigned int i, head, found = 0;
+ u16 last_avail_idx;
+ unsigned int ret;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vq->last_avail_idx;
+ if (get_user(vq->avail_idx, &vq->avail->idx)) {
+ vq_err(vq, "Failed to access avail idx at %p\n",
+ &vq->avail->idx);
+ return vq->num;
+ }
+
+ if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+ vq_err(vq, "Guest moved used index from %u to %u",
+ last_avail_idx, vq->avail_idx);
+ return vq->num;
+ }
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->avail_idx == last_avail_idx)
+ return vq->num;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ rmb();
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+ vq_err(vq, "Failed to read head: idx %d address %p\n",
+ last_avail_idx,
+ &vq->avail->ring[last_avail_idx % vq->num]);
+ return vq->num;
+ }
+
+ /* If their number is silly, that's an error. */
+ if (head >= vq->num) {
+ vq_err(vq, "Guest says index %u > %u is available",
+ head, vq->num);
+ return vq->num;
+ }
+
+ ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+ out_num, in_num,
+ log, log_num, head);

/* On success, increment avail index. */
+ if (ret == vq->num)
+ return ret;
vq->last_avail_idx++;
return head;
}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d1f0453..8b95df8 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,11 @@ struct vhost_log {
u64 len;
};

+enum vhost_vq_link_state {
+ VHOST_VQ_LINK_SYNC = 0,
+ VHOST_VQ_LINK_ASYNC = 1,
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -96,6 +101,10 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log log[VHOST_NET_MAX_SG];
+ /*Differiate async socket for 0-copy from normal*/
+ enum vhost_vq_link_state link_state;
+ struct list_head notifier;
+ spinlock_t notify_lock;
};

struct vhost_dev {
@@ -122,6 +131,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head);
void vhost_discard_vq_desc(struct vhost_virtqueue *);

int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
--
1.5.4.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin
2010-04-22 09:49:52 UTC
Permalink
Post by x***@intel.com
The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.
---
Michael,
Post by Michael S. Tsirkin
Can't vhost supply a kiocb completion callback that will handle the list?
Yes, thanks. And with it I also remove the vq->receiver finally.
Thanks
Xiaohui
Nice progress. I commented on some minor issues below.
Thanks!
Post by x***@intel.com
drivers/vhost/net.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++--
drivers/vhost/vhost.c | 115 ++++++++++++++-----------
drivers/vhost/vhost.h | 14 +++
3 files changed, 301 insertions(+), 55 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..4a70f66 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>
+#include <linux/aio.h>
#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
+#include <linux/mpassthru.h>
#include <net/sock.h>
@@ -47,6 +49,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+ struct kmem_cache *cache;
/* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
@@ -91,11 +94,132 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
}
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ if (!list_empty(&vq->notifier)) {
+ iocb = list_first_entry(&vq->notifier,
+ struct kiocb, ki_list);
+ list_del(&iocb->ki_list);
+ }
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+ return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+ struct vhost_virtqueue *vq = iocb->private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ list_add_tail(&iocb->ki_list, &vq->notifier);
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
checkpatch.pl does not complain about the above?
Post by x***@intel.com
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
continuation lines should start to the right of (.
Post by x***@intel.com
+{
+ struct kiocb *iocb = NULL;
+ struct vhost_log *vq_log = NULL;
+ int rx_total_len = 0;
+ unsigned int head, log, in, out;
+ int size;
+
+ if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+ return;
+
+ if (sock->sk->sk_data_ready)
+ sock->sk->sk_data_ready(sock->sk, 0);
+
+ vq_log = unlikely(vhost_has_feature(
+ &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
split the above line at ?, continuation being to the left of ( looks
ugly.
Post by x***@intel.com
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, iocb->ki_nbytes);
+ log = (int)(iocb->ki_user_data >> 32);
how about we always do the recompute step, and not encode
the log bit in ki_user_data?
Post by x***@intel.com
+ size = iocb->ki_nbytes;
+ head = iocb->ki_pos;
+ rx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
+ kmem_cache_free(net->cache, iocb);
+
+ /* when log is enabled, recomputing the log info is needed,
+ * since these buffers are in async queue, and may not get
+ * the log info before.
+ */
+ if (unlikely(vq_log)) {
+ if (!log)
+ __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in, vq_log,
+ &log, head);
+ vhost_log_write(vq, vq_log, log, size);
+ }
+ if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ int tx_total_len = 0;
+
+ if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+ return;
+
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, 0);
+ tx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
+
+ kmem_cache_free(net->cache, iocb);
+ if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static struct kiocb *create_iocb(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ unsigned head, unsigned log)
+{
+ struct kiocb *iocb = NULL;
+
+ if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+ return NULL;
+ iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+ if (!iocb)
+ return NULL;
+ iocb->private = vq;
+ iocb->ki_pos = head;
+ iocb->ki_dtor = handle_iocb;
+ if (vq == &net->dev.vqs[VHOST_NET_VQ_RX]) {
+ iocb->ki_user_data = ((unsigned long)log << 32 | vq->num);
+ iocb->ki_iovec = vq->hdr;
+ }
+ return iocb;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+ struct kiocb *iocb = NULL;
do we need to init this?
Post by x***@intel.com
unsigned head, out, in, s;
struct msghdr msg = {
.msg_name = NULL,
@@ -124,6 +248,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;
+ handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -151,6 +277,11 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+ iocb = create_iocb(net, vq, head, 0);
For sync case, we can save some cycles by using iocb = NULL.
Post by x***@intel.com
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC && !iocb)
+ break;
+
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
Generally, I would like to reduce the number of places
where we do if (link_state == XXX) in code.
It should be possible to do this by splitting common code
out into functions.
Post by x***@intel.com
@@ -160,12 +291,18 @@ static void handle_tx(struct vhost_net *net)
break;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
- err = sock->ops->sendmsg(NULL, sock, &msg, len);
+ err = sock->ops->sendmsg(iocb, sock, &msg, len);
if (unlikely(err < 0)) {
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
tx_poll_start(net, sock);
break;
}
+
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ continue;
+
if (err != len)
pr_err("Truncated TX packet: "
" len %d != %zd\n", err, len);
@@ -177,6 +314,8 @@ static void handle_tx(struct vhost_net *net)
}
}
+ handle_async_tx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}
@@ -186,6 +325,7 @@ static void handle_tx(struct vhost_net *net)
static void handle_rx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
unsigned head, out, in, log, s;
struct vhost_log *vq_log;
struct msghdr msg = {
@@ -206,7 +346,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq->private_data);
- if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+ if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+ vq->link_state == VHOST_VQ_LINK_SYNC))
return;
use_mm(net->dev.mm);
@@ -214,9 +355,17 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;
+ /* In async cases, when write log is enabled, in case the submitted
+ * buffers did not get log info before the log enabling, so we'd
+ * better recompute the log info when needed. We do this in
+ * handle_async_rx_events_notify().
+ */
+
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
+ handle_async_rx_events_notify(net, vq, sock);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -245,6 +394,11 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq->iov, in);
+
+ iocb = create_iocb(net, vq, head, log);
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC && !iocb)
+ break;
+
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for RX: "
@@ -252,13 +406,20 @@ static void handle_rx(struct vhost_net *net)
iov_length(vq->hdr, s), hdr_size);
break;
}
- err = sock->ops->recvmsg(NULL, sock, &msg,
+
+ err = sock->ops->recvmsg(iocb, sock, &msg,
len, MSG_DONTWAIT | MSG_TRUNC);
/* TODO: Check specific error and bomb out unless EAGAIN? */
if (err < 0) {
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
break;
}
+
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+ continue;
+
/* TODO: Should check and handle checksum. */
if (err > len) {
pr_err("Discarded truncated rx packet: "
@@ -284,10 +445,13 @@ static void handle_rx(struct vhost_net *net)
}
}
+ handle_async_rx_events_notify(net, vq, sock);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}
+
don't do this
Post by x***@intel.com
static void handle_tx_kick(struct work_struct *work)
{
struct vhost_virtqueue *vq;
@@ -338,6 +502,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+ n->cache = NULL;
return 0;
}
@@ -398,6 +563,18 @@ static void vhost_net_flush(struct vhost_net *n)
vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}
+static void vhost_async_cleanup(struct vhost_net *n)
+{
+ /* clean the notifier */
+ struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
+ if (n->cache) {
+ while ((iocb = notify_dequeue(vq)) != NULL)
+ kmem_cache_free(n->cache, iocb);
+ kmem_cache_destroy(n->cache);
+ }
+}
+
static int vhost_net_release(struct inode *inode, struct file *f)
{
struct vhost_net *n = f->private_data;
@@ -414,6 +591,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
+ vhost_async_cleanup(n);
kfree(n);
return 0;
}
@@ -462,7 +640,19 @@ static struct socket *get_tun_socket(int fd)
return sock;
}
-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+ struct file *file = fget(fd);
+ struct socket *sock;
+ if (!file)
+ return ERR_PTR(-EBADF);
+ sock = mp_get_socket(file);
+ if (IS_ERR(sock))
+ fput(file);
+ return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
{
struct socket *sock;
if (fd == -1)
@@ -473,9 +663,30 @@ static struct socket *get_socket(int fd)
sock = get_tun_socket(fd);
if (!IS_ERR(sock))
return sock;
+ sock = get_mp_socket(fd);
+ if (!IS_ERR(sock)) {
+ vq->link_state = VHOST_VQ_LINK_ASYNC;
+ return sock;
+ }
return ERR_PTR(-ENOTSOCK);
}
+static void vhost_init_link_state(struct vhost_net *n, int index)
+{
+ struct vhost_virtqueue *vq = n->vqs + index;
+
+ WARN_ON(!mutex_is_locked(&vq->mutex));
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+ INIT_LIST_HEAD(&vq->notifier);
+ spin_lock_init(&vq->notify_lock);
+ if (!n->cache) {
+ n->cache = kmem_cache_create("vhost_kiocb",
+ sizeof(struct kiocb), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ }
+ }
+}
+
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
struct socket *sock, *oldsock;
@@ -493,12 +704,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
}
vq = n->vqs + index;
mutex_lock(&vq->mutex);
- sock = get_socket(fd);
+ vq->link_state = VHOST_VQ_LINK_SYNC;
+ sock = get_socket(vq, fd);
if (IS_ERR(sock)) {
r = PTR_ERR(sock);
goto err;
}
+ vhost_init_link_state(n, index);
+
I think we should just teach get_socket to return link_state
in addition to the socket pointer, and pass the returned value to
vhost_init_link_state.
Post by x***@intel.com
/* start polling new socket */
oldsock = vq->private_data;
if (sock == oldsock)
@@ -507,8 +721,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, sock);
vhost_net_enable_vq(n, vq);
- mutex_unlock(&vq->mutex);
+ mutex_unlock(&vq->mutex);
mutex_unlock(&n->dev.mutex);
if (oldsock) {
vhost_net_flush_vq(n, index);
why the change above? Are you sure it's safe? Need to be careful here:
doing everything under vq and dev mutex is much simpler.
If this change is required, need to review locking carefully
to make sure we are not introducing races.
Post by x***@intel.com
}
return r;
+ mutex_unlock(&vq->mutex);
mutex_unlock(&n->dev.mutex);
return r;
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
return 0;
}
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head)
{
struct vring_desc desc;
- unsigned int i, head, found = 0;
- u16 last_avail_idx;
+ unsigned int i = head, found = 0;
int ret;
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
- if (get_user(vq->avail_idx, &vq->avail->idx)) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return vq->num;
- }
-
- if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return vq->num;
- }
-
- /* If there's nothing new since last we looked, return invalid. */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
-
- /* Only get avail ring entries after they have been exposed by guest. */
- rmb();
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return vq->num;
- }
-
- /* If their number is silly, that's an error. */
- if (head >= vq->num) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return vq->num;
- }
-
/* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
if (unlikely(log))
*log_num = 0;
- i = head;
do {
unsigned iov_count = *in_num + *out_num;
if (i >= vq->num) {
@@ -833,8 +788,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
*out_num += ret;
}
} while ((i = next_desc(&desc)) != -1);
+ return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ struct vring_desc desc;
+ unsigned int i, head, found = 0;
+ u16 last_avail_idx;
+ unsigned int ret;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vq->last_avail_idx;
+ if (get_user(vq->avail_idx, &vq->avail->idx)) {
+ vq_err(vq, "Failed to access avail idx at %p\n",
+ &vq->avail->idx);
+ return vq->num;
+ }
+
+ if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+ vq_err(vq, "Guest moved used index from %u to %u",
+ last_avail_idx, vq->avail_idx);
+ return vq->num;
+ }
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->avail_idx == last_avail_idx)
+ return vq->num;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ rmb();
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+ vq_err(vq, "Failed to read head: idx %d address %p\n",
+ last_avail_idx,
+ &vq->avail->ring[last_avail_idx % vq->num]);
+ return vq->num;
+ }
+
+ /* If their number is silly, that's an error. */
+ if (head >= vq->num) {
+ vq_err(vq, "Guest says index %u > %u is available",
+ head, vq->num);
+ return vq->num;
+ }
+
+ ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+ out_num, in_num,
+ log, log_num, head);
/* On success, increment avail index. */
+ if (ret == vq->num)
+ return ret;
vq->last_avail_idx++;
return head;
}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d1f0453..8b95df8 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,11 @@ struct vhost_log {
u64 len;
};
+enum vhost_vq_link_state {
+ VHOST_VQ_LINK_SYNC = 0,
+ VHOST_VQ_LINK_ASYNC = 1,
don't try to align values to the right, just put a single space each
side of =.
Post by x***@intel.com
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -96,6 +101,10 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log log[VHOST_NET_MAX_SG];
+ /*Differiate async socket for 0-copy from normal*/
spaces after /* and before */.
Post by x***@intel.com
+ enum vhost_vq_link_state link_state;
+ struct list_head notifier;
+ spinlock_t notify_lock;
};
struct vhost_dev {
@@ -122,6 +131,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head);
void vhost_discard_vq_desc(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
--
1.5.4.4
x***@intel.com
2010-04-23 07:08:33 UTC
Permalink
From: Xin Xiaohui <***@intel.com>

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui <***@intel.com>
---

Michael,
Post by Michael S. Tsirkin
Post by Michael S. Tsirkin
Can't vhost supply a kiocb completion callback that will handle the list?
Yes, thanks. And with it I also remove the vq->receivr finally.
Thanks
Xiaohui
Nice progress. I commented on some minor issues below.
Thanks!
The updated patch addressed your comments on the minor issues.
Thanks!

Thanks
Xiaohui

drivers/vhost/net.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/vhost.c | 120 ++++++++++++++-----------
drivers/vhost/vhost.h | 14 +++
3 files changed, 314 insertions(+), 56 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 38989d1..18f6c41 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -23,6 +23,8 @@
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
+#include <linux/mpassthru.h>
+#include <linux/aio.h>

#include <net/sock.h>

@@ -48,6 +50,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+ struct kmem_cache *cache;
/* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
@@ -92,11 +95,138 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
}

+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ if (!list_empty(&vq->notifier)) {
+ iocb = list_first_entry(&vq->notifier,
+ struct kiocb, ki_list);
+ list_del(&iocb->ki_list);
+ }
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+ return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+ struct vhost_virtqueue *vq = iocb->private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ list_add_tail(&iocb->ki_list, &vq->notifier);
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static int is_async_vq(struct vhost_virtqueue *vq)
+{
+ return (vq->link_state == VHOST_VQ_LINK_ASYNC);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
+{
+ struct kiocb *iocb = NULL;
+ struct vhost_log *vq_log = NULL;
+ int rx_total_len = 0;
+ unsigned int head, log, in, out;
+ int size;
+
+ if (!is_async_vq(vq))
+ return;
+
+ if (sock->sk->sk_data_ready)
+ sock->sk->sk_data_ready(sock->sk, 0);
+
+ vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+ vq->log : NULL;
+
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, iocb->ki_nbytes);
+ size = iocb->ki_nbytes;
+ head = iocb->ki_pos;
+ rx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
+ kmem_cache_free(net->cache, iocb);
+
+ /* when log is enabled, recomputing the log info is needed,
+ * since these buffers are in async queue, and may not get
+ * the log info before.
+ */
+ if (unlikely(vq_log)) {
+ if (!log)
+ __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in, vq_log,
+ &log, head);
+ vhost_log_write(vq, vq_log, log, size);
+ }
+ if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ int tx_total_len = 0;
+
+ if (!is_async_vq(vq))
+ return;
+
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, 0);
+ tx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
+
+ kmem_cache_free(net->cache, iocb);
+ if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static struct kiocb *create_iocb(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ unsigned head)
+{
+ struct kiocb *iocb = NULL;
+
+ if (!is_async_vq(vq))
+ return NULL;
+
+ iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+ if (!iocb)
+ return NULL;
+ iocb->private = vq;
+ iocb->ki_pos = head;
+ iocb->ki_dtor = handle_iocb;
+ if (vq == &net->dev.vqs[VHOST_NET_VQ_RX]) {
+ iocb->ki_user_data = vq->num;
+ iocb->ki_iovec = vq->hdr;
+ }
+ return iocb;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+ struct kiocb *iocb = NULL;
unsigned head, out, in, s;
struct msghdr msg = {
.msg_name = NULL,
@@ -129,6 +259,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;

+ handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -156,6 +288,13 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+ if (is_async_vq(vq)) {
+ iocb = create_iocb(net, vq, head);
+ if (!iocb)
+ break;
+ }
+
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
@@ -165,12 +304,18 @@ static void handle_tx(struct vhost_net *net)
break;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
- err = sock->ops->sendmsg(NULL, sock, &msg, len);
+ err = sock->ops->sendmsg(iocb, sock, &msg, len);
if (unlikely(err < 0)) {
+ if (is_async_vq(vq))
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
tx_poll_start(net, sock);
break;
}
+
+ if (is_async_vq(vq))
+ continue;
+
if (err != len)
pr_err("Truncated TX packet: "
" len %d != %zd\n", err, len);
@@ -182,6 +327,8 @@ static void handle_tx(struct vhost_net *net)
}
}

+ handle_async_tx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}
@@ -191,6 +338,7 @@ static void handle_tx(struct vhost_net *net)
static void handle_rx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
unsigned head, out, in, log, s;
struct vhost_log *vq_log;
struct msghdr msg = {
@@ -211,7 +359,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq->private_data);
- if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+ if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+ vq->link_state == VHOST_VQ_LINK_SYNC))
return;

use_mm(net->dev.mm);
@@ -219,9 +368,17 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;

+ /* In async cases, when write log is enabled, in case the submitted
+ * buffers did not get log info before the log enabling, so we'd
+ * better recompute the log info when needed. We do this in
+ * handle_async_rx_events_notify().
+ */
+
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;

+ handle_async_rx_events_notify(net, vq, sock);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -250,6 +407,13 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq->iov, in);
+
+ if (is_async_vq(vq)) {
+ iocb = create_iocb(net, vq, head);
+ if (!iocb)
+ break;
+ }
+
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for RX: "
@@ -257,13 +421,20 @@ static void handle_rx(struct vhost_net *net)
iov_length(vq->hdr, s), hdr_size);
break;
}
- err = sock->ops->recvmsg(NULL, sock, &msg,
+
+ err = sock->ops->recvmsg(iocb, sock, &msg,
len, MSG_DONTWAIT | MSG_TRUNC);
/* TODO: Check specific error and bomb out unless EAGAIN? */
if (err < 0) {
+ if (is_async_vq(vq))
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
break;
}
+
+ if (is_async_vq(vq))
+ continue;
+
/* TODO: Should check and handle checksum. */
if (err > len) {
pr_err("Discarded truncated rx packet: "
@@ -289,6 +460,8 @@ static void handle_rx(struct vhost_net *net)
}
}

+ handle_async_rx_events_notify(net, vq, sock);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}
@@ -342,6 +515,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+ n->cache = NULL;

f->private_data = n;

@@ -405,6 +579,18 @@ static void vhost_net_flush(struct vhost_net *n)
vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}

+static void vhost_async_cleanup(struct vhost_net *n)
+{
+ /* clean the notifier */
+ struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
+ if (n->cache) {
+ while ((iocb = notify_dequeue(vq)) != NULL)
+ kmem_cache_free(n->cache, iocb);
+ kmem_cache_destroy(n->cache);
+ }
+}
+
static int vhost_net_release(struct inode *inode, struct file *f)
{
struct vhost_net *n = f->private_data;
@@ -421,6 +607,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
+ vhost_async_cleanup(n);
kfree(n);
return 0;
}
@@ -472,21 +659,58 @@ static struct socket *get_tap_socket(int fd)
return sock;
}

-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+ struct file *file = fget(fd);
+ struct socket *sock;
+ if (!file)
+ return ERR_PTR(-EBADF);
+ sock = mp_get_socket(file);
+ if (IS_ERR(sock))
+ fput(file);
+ return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd,
+ enum vhost_vq_link_state *state)
{
struct socket *sock;
/* special case to disable backend */
if (fd == -1)
return NULL;
+
+ *state = VHOST_VQ_LINK_SYNC;
+
sock = get_raw_socket(fd);
if (!IS_ERR(sock))
return sock;
sock = get_tap_socket(fd);
if (!IS_ERR(sock))
return sock;
+ sock = get_mp_socket(fd);
+ if (!IS_ERR(sock)) {
+ *state = VHOST_VQ_LINK_ASYNC;
+ return sock;
+ }
return ERR_PTR(-ENOTSOCK);
}

+static void vhost_init_link_state(struct vhost_net *n, int index)
+{
+ struct vhost_virtqueue *vq = n->vqs + index;
+
+ WARN_ON(!mutex_is_locked(&vq->mutex));
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+ INIT_LIST_HEAD(&vq->notifier);
+ spin_lock_init(&vq->notify_lock);
+ if (!n->cache) {
+ n->cache = kmem_cache_create("vhost_kiocb",
+ sizeof(struct kiocb), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ }
+ }
+}
+
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
struct socket *sock, *oldsock;
@@ -510,12 +734,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
r = -EFAULT;
goto err_vq;
}
- sock = get_socket(fd);
+ sock = get_socket(vq, fd, &vq->link_state);
if (IS_ERR(sock)) {
r = PTR_ERR(sock);
goto err_vq;
}

+ vhost_init_link_state(n, index);
+
/* start polling new socket */
oldsock = vq->private_data;
if (sock == oldsock)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 3f10194..add77d3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -860,61 +860,17 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
return 0;
}

-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+/* This computes the log info according to the index of buffer */
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head)
{
struct vring_desc desc;
unsigned int i, head, found = 0;
- u16 last_avail_idx;
- int ret;
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
- if (get_user(vq->avail_idx, &vq->avail->idx)) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return vq->num;
- }
-
- if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return vq->num;
- }
-
- /* If there's nothing new since last we looked, return invalid. */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
+ unsigned int ret;

- /* Only get avail ring entries after they have been exposed by guest. */
- smp_rmb();
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return vq->num;
- }
-
- /* If their number is silly, that's an error. */
- if (head >= vq->num) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return vq->num;
- }
-
- /* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
if (unlikely(log))
*log_num = 0;
@@ -978,8 +934,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
*out_num += ret;
}
} while ((i = next_desc(&desc)) != -1);
+ return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ struct vring_desc desc;
+ unsigned int i, head, found = 0;
+ u16 last_avail_idx;
+ unsigned int ret;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vq->last_avail_idx;
+ if (get_user(vq->avail_idx, &vq->avail->idx)) {
+ vq_err(vq, "Failed to access avail idx at %p\n",
+ &vq->avail->idx);
+ return vq->num;
+ }
+
+ if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+ vq_err(vq, "Guest moved used index from %u to %u",
+ last_avail_idx, vq->avail_idx);
+ return vq->num;
+ }
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->avail_idx == last_avail_idx)
+ return vq->num;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ rmb();
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+ vq_err(vq, "Failed to read head: idx %d address %p\n",
+ last_avail_idx,
+ &vq->avail->ring[last_avail_idx % vq->num]);
+ return vq->num;
+ }
+
+ /* If their number is silly, that's an error. */
+ if (head >= vq->num) {
+ vq_err(vq, "Guest says index %u > %u is available",
+ head, vq->num);
+ return vq->num;
+ }
+
+ ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+ out_num, in_num,
+ log, log_num, head);

/* On success, increment avail index. */
+ if (ret == vq->num)
+ return ret;
vq->last_avail_idx++;
return head;
}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 44591ba..3c9cbce 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,11 @@ struct vhost_log {
u64 len;
};

+enum vhost_vq_link_state {
+ VHOST_VQ_LINK_SYNC = 0,
+ VHOST_VQ_LINK_ASYNC = 1,
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -96,6 +101,10 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log log[VHOST_NET_MAX_SG];
+ /* Differiate async socket for 0-copy from normal */
+ enum vhost_vq_link_state link_state;
+ struct list_head notifier;
+ spinlock_t notify_lock;
};

struct vhost_dev {
@@ -124,6 +133,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head);
void vhost_discard_vq_desc(struct vhost_virtqueue *);

int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
--
1.5.4.4
Michael S. Tsirkin
2010-04-24 19:32:32 UTC
Permalink
Post by x***@intel.com
The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.
---
Michael,
Post by Michael S. Tsirkin
Post by Michael S. Tsirkin
Can't vhost supply a kiocb completion callback that will handle the list?
Yes, thanks. And with it I also remove the vq->receivr finally.
Thanks
Xiaohui
Nice progress. I commented on some minor issues below.
Thanks!
The updated patch addressed your comments on the minor issues.
Thanks!
Thanks
Xiaohui
drivers/vhost/net.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/vhost.c | 120 ++++++++++++++-----------
drivers/vhost/vhost.h | 14 +++
3 files changed, 314 insertions(+), 56 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 38989d1..18f6c41 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -23,6 +23,8 @@
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
+#include <linux/mpassthru.h>
+#include <linux/aio.h>
#include <net/sock.h>
@@ -48,6 +50,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+ struct kmem_cache *cache;
/* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
@@ -92,11 +95,138 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
}
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ if (!list_empty(&vq->notifier)) {
+ iocb = list_first_entry(&vq->notifier,
+ struct kiocb, ki_list);
+ list_del(&iocb->ki_list);
+ }
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
+ return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+ struct vhost_virtqueue *vq = iocb->private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vq->notify_lock, flags);
+ list_add_tail(&iocb->ki_list, &vq->notifier);
+ spin_unlock_irqrestore(&vq->notify_lock, flags);
Don't we need to wake up the wq as well?
Post by x***@intel.com
+}
+
+static int is_async_vq(struct vhost_virtqueue *vq)
+{
+ return (vq->link_state == VHOST_VQ_LINK_ASYNC);
() not needed
Post by x***@intel.com
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
+{
+ struct kiocb *iocb = NULL;
+ struct vhost_log *vq_log = NULL;
+ int rx_total_len = 0;
+ unsigned int head, log, in, out;
+ int size;
+
+ if (!is_async_vq(vq))
+ return;
+
+ if (sock->sk->sk_data_ready)
+ sock->sk->sk_data_ready(sock->sk, 0);
+
+ vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+ vq->log : NULL;
+
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, iocb->ki_nbytes);
+ size = iocb->ki_nbytes;
+ head = iocb->ki_pos;
+ rx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
I am confused by the above. Isn't ki_dtor handle_iocb?
Why is it called here?
Post by x***@intel.com
+ kmem_cache_free(net->cache, iocb);
+
+ /* when log is enabled, recomputing the log info is needed,
+ * since these buffers are in async queue, and may not get
+ * the log info before.
+ */
+ if (unlikely(vq_log)) {
+ if (!log)
log is uninitialized now?
Post by x***@intel.com
+ __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in, vq_log,
+ &log, head);
+ vhost_log_write(vq, vq_log, log, size);
+ }
+ if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ int tx_total_len = 0;
+
+ if (!is_async_vq(vq))
+ return;
+
+ while ((iocb = notify_dequeue(vq)) != NULL) {
Please just write this as while (((iocb = notify_dequeue(vq)))
above as well
Post by x***@intel.com
+ vhost_add_used_and_signal(&net->dev, vq,
+ iocb->ki_pos, 0);
pls indent continuation lines to the roght of (
above as well
Post by x***@intel.com
+ tx_total_len += iocb->ki_nbytes;
+
+ if (iocb->ki_dtor)
+ iocb->ki_dtor(iocb);
same question as above
Post by x***@intel.com
+
+ kmem_cache_free(net->cache, iocb);
+ if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+}
+
+static struct kiocb *create_iocb(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ unsigned head)
+{
+ struct kiocb *iocb = NULL;
+
+ if (!is_async_vq(vq))
+ return NULL;
+
+ iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+ if (!iocb)
+ return NULL;
+ iocb->private = vq;
+ iocb->ki_pos = head;
+ iocb->ki_dtor = handle_iocb;
So, dtor calls handle_iocb, but what causes vhost
to wake-up is really poll, right?
Post by x***@intel.com
+ if (vq == &net->dev.vqs[VHOST_NET_VQ_RX]) {
+ iocb->ki_user_data = vq->num;
Is the above used?
Post by x***@intel.com
+ iocb->ki_iovec = vq->hdr;
+ }
+ return iocb;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+ struct kiocb *iocb = NULL;
Why do we need to init iocb to NULL?
Post by x***@intel.com
unsigned head, out, in, s;
struct msghdr msg = {
.msg_name = NULL,
@@ -129,6 +259,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;
+ handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -156,6 +288,13 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+ if (is_async_vq(vq)) {
+ iocb = create_iocb(net, vq, head);
+ if (!iocb)
+ break;
+ }
+
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
@@ -165,12 +304,18 @@ static void handle_tx(struct vhost_net *net)
break;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
- err = sock->ops->sendmsg(NULL, sock, &msg, len);
+ err = sock->ops->sendmsg(iocb, sock, &msg, len);
if (unlikely(err < 0)) {
+ if (is_async_vq(vq))
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
tx_poll_start(net, sock);
break;
}
+
+ if (is_async_vq(vq))
+ continue;
+
if (err != len)
pr_err("Truncated TX packet: "
" len %d != %zd\n", err, len);
@@ -182,6 +327,8 @@ static void handle_tx(struct vhost_net *net)
}
}
+ handle_async_tx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}
@@ -191,6 +338,7 @@ static void handle_tx(struct vhost_net *net)
static void handle_rx(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
unsigned head, out, in, log, s;
struct vhost_log *vq_log;
struct msghdr msg = {
@@ -211,7 +359,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq->private_data);
- if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+ if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+ vq->link_state == VHOST_VQ_LINK_SYNC))
return;
use_mm(net->dev.mm);
@@ -219,9 +368,17 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;
+ /* In async cases, when write log is enabled, in case the submitted
+ * buffers did not get log info before the log enabling, so we'd
+ * better recompute the log info when needed. We do this in
+ * handle_async_rx_events_notify().
+ */
+
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
+ handle_async_rx_events_notify(net, vq, sock);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -250,6 +407,13 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq->iov, in);
+
+ if (is_async_vq(vq)) {
+ iocb = create_iocb(net, vq, head);
+ if (!iocb)
+ break;
+ }
+
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for RX: "
@@ -257,13 +421,20 @@ static void handle_rx(struct vhost_net *net)
iov_length(vq->hdr, s), hdr_size);
break;
}
- err = sock->ops->recvmsg(NULL, sock, &msg,
+
+ err = sock->ops->recvmsg(iocb, sock, &msg,
len, MSG_DONTWAIT | MSG_TRUNC);
/* TODO: Check specific error and bomb out unless EAGAIN? */
if (err < 0) {
+ if (is_async_vq(vq))
+ kmem_cache_free(net->cache, iocb);
vhost_discard_vq_desc(vq);
break;
}
+
+ if (is_async_vq(vq))
+ continue;
+
/* TODO: Should check and handle checksum. */
if (err > len) {
pr_err("Discarded truncated rx packet: "
@@ -289,6 +460,8 @@ static void handle_rx(struct vhost_net *net)
}
}
+ handle_async_rx_events_notify(net, vq, sock);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
}
@@ -342,6 +515,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+ n->cache = NULL;
f->private_data = n;
@@ -405,6 +579,18 @@ static void vhost_net_flush(struct vhost_net *n)
vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}
+static void vhost_async_cleanup(struct vhost_net *n)
+{
+ /* clean the notifier */
+ struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
+ struct kiocb *iocb = NULL;
+ if (n->cache) {
+ while ((iocb = notify_dequeue(vq)) != NULL)
+ kmem_cache_free(n->cache, iocb);
+ kmem_cache_destroy(n->cache);
+ }
+}
+
static int vhost_net_release(struct inode *inode, struct file *f)
{
struct vhost_net *n = f->private_data;
@@ -421,6 +607,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
+ vhost_async_cleanup(n);
kfree(n);
return 0;
}
@@ -472,21 +659,58 @@ static struct socket *get_tap_socket(int fd)
return sock;
}
-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+ struct file *file = fget(fd);
+ struct socket *sock;
+ if (!file)
+ return ERR_PTR(-EBADF);
+ sock = mp_get_socket(file);
+ if (IS_ERR(sock))
+ fput(file);
+ return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd,
+ enum vhost_vq_link_state *state)
{
struct socket *sock;
/* special case to disable backend */
if (fd == -1)
return NULL;
+
+ *state = VHOST_VQ_LINK_SYNC;
+
sock = get_raw_socket(fd);
if (!IS_ERR(sock))
return sock;
sock = get_tap_socket(fd);
if (!IS_ERR(sock))
return sock;
+ sock = get_mp_socket(fd);
+ if (!IS_ERR(sock)) {
+ *state = VHOST_VQ_LINK_ASYNC;
+ return sock;
+ }
return ERR_PTR(-ENOTSOCK);
}
+static void vhost_init_link_state(struct vhost_net *n, int index)
so let's pass link state as parameter, and set it in this function.
And maybe pass in vq, no need for index tricks.
Post by x***@intel.com
+{
+ struct vhost_virtqueue *vq = n->vqs + index;
+
+ WARN_ON(!mutex_is_locked(&vq->mutex));
there's a single place of call, I don't think we need this check.
Post by x***@intel.com
+ if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+ INIT_LIST_HEAD(&vq->notifier);
+ spin_lock_init(&vq->notify_lock);
+ if (!n->cache) {
+ n->cache = kmem_cache_create("vhost_kiocb",
vhost_net_kiocb a better name
Post by x***@intel.com
+ sizeof(struct kiocb), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ }
no need for {} for single statement if.
Post by x***@intel.com
+ }
+}
+
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
struct socket *sock, *oldsock;
@@ -510,12 +734,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
r = -EFAULT;
goto err_vq;
}
- sock = get_socket(fd);
+ sock = get_socket(vq, fd, &vq->link_state);
if (IS_ERR(sock)) {
r = PTR_ERR(sock);
goto err_vq;
}
+ vhost_init_link_state(n, index);
+
/* start polling new socket */
oldsock = vq->private_data;
if (sock == oldsock)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 3f10194..add77d3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -860,61 +860,17 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
return 0;
}
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+/* This computes the log info according to the index of buffer */
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head)
{
struct vring_desc desc;
unsigned int i, head, found = 0;
- u16 last_avail_idx;
- int ret;
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
- if (get_user(vq->avail_idx, &vq->avail->idx)) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return vq->num;
- }
-
- if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return vq->num;
- }
-
- /* If there's nothing new since last we looked, return invalid. */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
+ unsigned int ret;
- /* Only get avail ring entries after they have been exposed by guest. */
- smp_rmb();
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return vq->num;
- }
-
- /* If their number is silly, that's an error. */
- if (head >= vq->num) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return vq->num;
- }
-
- /* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
if (unlikely(log))
*log_num = 0;
@@ -978,8 +934,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
*out_num += ret;
}
} while ((i = next_desc(&desc)) != -1);
+ return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ struct vring_desc desc;
+ unsigned int i, head, found = 0;
+ u16 last_avail_idx;
+ unsigned int ret;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vq->last_avail_idx;
+ if (get_user(vq->avail_idx, &vq->avail->idx)) {
+ vq_err(vq, "Failed to access avail idx at %p\n",
+ &vq->avail->idx);
+ return vq->num;
+ }
+
+ if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+ vq_err(vq, "Guest moved used index from %u to %u",
+ last_avail_idx, vq->avail_idx);
+ return vq->num;
+ }
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->avail_idx == last_avail_idx)
+ return vq->num;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ rmb();
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+ vq_err(vq, "Failed to read head: idx %d address %p\n",
+ last_avail_idx,
+ &vq->avail->ring[last_avail_idx % vq->num]);
+ return vq->num;
+ }
+
+ /* If their number is silly, that's an error. */
+ if (head >= vq->num) {
+ vq_err(vq, "Guest says index %u > %u is available",
+ head, vq->num);
+ return vq->num;
+ }
+
+ ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+ out_num, in_num,
+ log, log_num, head);
/* On success, increment avail index. */
+ if (ret == vq->num)
+ return ret;
vq->last_avail_idx++;
return head;
}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 44591ba..3c9cbce 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,11 @@ struct vhost_log {
u64 len;
};
+enum vhost_vq_link_state {
+ VHOST_VQ_LINK_SYNC = 0,
+ VHOST_VQ_LINK_ASYNC = 1,
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -96,6 +101,10 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log log[VHOST_NET_MAX_SG];
+ /* Differiate async socket for 0-copy from normal */
+ enum vhost_vq_link_state link_state;
+ struct list_head notifier;
+ spinlock_t notify_lock;
};
struct vhost_dev {
@@ -124,6 +133,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int head);
void vhost_discard_vq_desc(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
--
1.5.4.4
Arnd Bergmann
2010-04-15 15:06:43 UTC
Permalink
Post by Xin, Xiaohui
Post by Arnd Bergmann
It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?
I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?
I have not looked into your macvtap code in detail before.
Does the two interface exactly the same? We just want to create a simple
way to do zero-copy. Now it can only support vhost, but in future
we also want it to support directly read/write operations from user space too.
Right now, the features are mostly distinct. Macvtap first of all provides
a "tap" style interface for users, and can also be used by vhost-net.
It also provides a way to share a NIC among a number of guests by software,
though I indent to add support for VMDq and SR-IOV as well. Zero-copy
is also not yet done in macvtap but should be added.

mpassthru right now does not allow sharing a NIC between guests, and
does not have a tap interface for non-vhost operation, but does the
zero-copy that is missing in macvtap.
Post by Xin, Xiaohui
Basically, compared to the interface, I'm more worried about the modification
to net core we have made to implement zero-copy now. If this hardest part
can be done, then any user space interface modifications or integrations are
more easily to be done after that.
I agree that the network stack modifications are the hard part for zero-copy,
and your work on that looks very promising and is complementary to what I've
done with macvtap. Your current user interface looks good for testing this out,
but I think we should not merge it (the interface) upstream if we can get the
same or better result by integrating your buffer management code into macvtap.

I can try to merge your code into macvtap myself if you agree, so you
can focus on getting the internals right.
Post by Xin, Xiaohui
Post by Arnd Bergmann
Not sure what I'm missing, but who calls the vq->receiver? This seems
to be neither in the upstream version of vhost nor introduced by your
patch.
See Patch v3 2/3 I have sent out, it is called by handle_rx() in vhost.
Ok, I see. As a general rule, it's preferred to split a patch series
in a way that makes it possible to apply each patch separately and still
get a working kernel, ideally with more features than the version before
the patch. I believe you could get there by reordering your patches to
make the actual driver the last one in the series.

Not a big problem though, I was mostly looking in the wrong place.
Post by Xin, Xiaohui
Post by Arnd Bergmann
Post by x***@intel.com
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+ ret = -EBUSY;
+
+ if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
+ break;
Your current use of the IFF_MPASSTHRU* flags does not seem to make
any sense whatsoever. You check that this flag is never set, but set
it later yourself and then ignore all flags.
Using that flag is tried to prevent if another one wants to bind the same device
Again. But I will see if it really ignore all other flags.
The ifr variable is on the stack of the mp_chr_ioctl function, and you never
look at the value after setting it. In order to prevent multiple opens
of that device, you probably need to lock out any other users as well,
and make it a property of the underlying device. E.g. you also want to
prevent users on the host from setting an IP address on the NIC and
using it to send and receive data there.

Arnd
Continue reading on narkive:
Loading...