Patchwork [v3,1/1] Shared memory uio_pci driver

login
register
mail settings
Submitter Cam Macdonell
Date March 25, 2010, 6:09 a.m.
Message ID <1269497376-21903-1-git-send-email-cam@cs.ualberta.ca>
Download mbox | patch
Permalink /patch/48498/
State New
Headers show

Comments

Cam Macdonell - March 25, 2010, 6:09 a.m.
This patch adds a driver for my shared memory PCI device using the uio_pci
interface.  The driver has three memory regions.  The first memory region is for
device registers for sending interrupts. The second BAR is for receiving MSI-X
interrupts and the third memory region maps the shared memory.  The device only
exports the first and third memory regions to userspace.

This driver supports MSI-X and regular pin interrupts.  Currently, the number of
MSI vectors is set to 4 which could be increased, but the driver will work with
fewer vectors.  If MSI is not available, then regular interrupts will be used.
---
 drivers/uio/Kconfig       |    8 ++
 drivers/uio/Makefile      |    1 +
 drivers/uio/uio_ivshmem.c |  235 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 244 insertions(+), 0 deletions(-)
 create mode 100644 drivers/uio/uio_ivshmem.c
Michael S. Tsirkin - March 25, 2010, 9:05 a.m.
On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote:
> This patch adds a driver for my shared memory PCI device using the uio_pci
> interface.  The driver has three memory regions.  The first memory region is for
> device registers for sending interrupts. The second BAR is for receiving MSI-X
> interrupts and the third memory region maps the shared memory.  The device only
> exports the first and third memory regions to userspace.
> 
> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
> MSI vectors is set to 4 which could be increased, but the driver will work with
> fewer vectors.  If MSI is not available, then regular interrupts will be used.
> ---
>  drivers/uio/Kconfig       |    8 ++
>  drivers/uio/Makefile      |    1 +
>  drivers/uio/uio_ivshmem.c |  235 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 244 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/uio/uio_ivshmem.c
> 
> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
> index 1da73ec..b92cded 100644
> --- a/drivers/uio/Kconfig
> +++ b/drivers/uio/Kconfig
> @@ -74,6 +74,14 @@ config UIO_SERCOS3
>  
>  	  If you compile this as a module, it will be called uio_sercos3.
>  
> +config UIO_IVSHMEM
> +	tristate "KVM shared memory PCI driver"
> +	default n
> +	help
> +	  Userspace I/O interface for the KVM shared memory device.  This
> +	  driver will make available two memory regions, the first is
> +	  registers and the second is a region for sharing between VMs.
> +
>  config UIO_PCI_GENERIC
>  	tristate "Generic driver for PCI 2.3 and PCI Express cards"
>  	depends on PCI
> diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
> index 18fd818..25c1ca5 100644
> --- a/drivers/uio/Makefile
> +++ b/drivers/uio/Makefile
> @@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC)	+= uio_aec.o
>  obj-$(CONFIG_UIO_SERCOS3)	+= uio_sercos3.o
>  obj-$(CONFIG_UIO_PCI_GENERIC)	+= uio_pci_generic.o
>  obj-$(CONFIG_UIO_NETX)	+= uio_netx.o
> +obj-$(CONFIG_UIO_IVSHMEM) += uio_ivshmem.o
> diff --git a/drivers/uio/uio_ivshmem.c b/drivers/uio/uio_ivshmem.c
> new file mode 100644
> index 0000000..607435b
> --- /dev/null
> +++ b/drivers/uio/uio_ivshmem.c
> @@ -0,0 +1,235 @@
> +/*
> + * UIO IVShmem Driver
> + *
> + * (C) 2009 Cam Macdonell
> + * based on Hilscher CIF card driver (C) 2007 Hans J. Koch <hjk@linutronix.de>
> + *
> + * Licensed under GPL version 2 only.
> + *
> + */
> +
> +#include <linux/device.h>
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/uio_driver.h>
> +
> +#include <asm/io.h>
> +
> +#define IntrStatus 0x04
> +#define IntrMask 0x00
> +
> +struct ivshmem_info {
> +    struct uio_info *uio;
> +    struct pci_dev *dev;
> +    char (*msix_names)[256];
> +    struct msix_entry *msix_entries;
> +    int nvectors;
> +};
> +
> +static irqreturn_t ivshmem_handler(int irq, struct uio_info *dev_info)
> +{
> +
> +    void __iomem *plx_intscr = dev_info->mem[0].internal_addr
> +                    + IntrStatus;
> +    u32 val;
> +
> +    val = readl(plx_intscr);
> +    if (val == 0)
> +        return IRQ_NONE;
> +
> +    printk(KERN_INFO "Regular interrupt (val = %d)\n", val);
> +    return IRQ_HANDLED;
> +}
> +
> +static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
> +{
> +
> +    struct uio_info * dev_info = (struct uio_info *) opaque;
> +
> +    /* we have to do this explicitly when using MSI-X */
> +    uio_event_notify(dev_info);

How does userspace know which vector was triggered?

> +    printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
> +    return IRQ_HANDLED;
> +

extra empty line

> +}
> +
> +static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
> +{
> +    int i, err;
> +    const char *name = "ivshmem";
> +
> +    printk(KERN_INFO "devname is %s\n", name);

These KERN_INFO messages need to be cleaned up, they would be
look confusing in the log.

> +    ivs_info->nvectors = nvectors;
> +
> +
> +    ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
> +            GFP_KERNEL);
> +    ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
> +            GFP_KERNEL);
> +

need to handle errors

> +    for (i = 0; i < nvectors; ++i)
> +        ivs_info->msix_entries[i].entry = i;
> +
> +    err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
> +                    ivs_info->nvectors);
> +    if (err > 0) {
> +        ivs_info->nvectors = err; /* msi-x positive error code
> +                                     returns the number available*/
> +        err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
> +                    ivs_info->nvectors);
> +        if (err > 0) {
> +            printk(KERN_INFO "no MSI (%d). Back to INTx.\n", err);
> +            return -ENOSPC;
> +        }
> +    }

we can also get err < 0.

> +
> +    printk(KERN_INFO "err is %d\n", err);
> +    if (err) return err;

coding style rule violation

> +
> +    for (i = 0; i < ivs_info->nvectors; i++) {
> +
> +        snprintf(ivs_info->msix_names[i], sizeof *ivs_info->msix_names,
> +            "%s-config", name);
> +
> +        ivs_info->msix_entries[i].entry = i;
> +        err = request_irq(ivs_info->msix_entries[i].vector,
> +            ivshmem_msix_handler, 0,
> +            ivs_info->msix_names[i], ivs_info->uio);
> +
> +        if (err) {
> +            return -ENOSPC;

coding style rule violation
no undo on error handling
why override error code with -ENOSPC?

> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int __devinit ivshmem_pci_probe(struct pci_dev *dev,
> +                    const struct pci_device_id *id)
> +{
> +    struct uio_info *info;
> +    struct ivshmem_info * ivshmem_info;
> +    int nvectors = 4;
> +
> +    info = kzalloc(sizeof(struct uio_info), GFP_KERNEL);
> +    if (!info)
> +        return -ENOMEM;
> +
> +    ivshmem_info = kzalloc(sizeof(struct ivshmem_info), GFP_KERNEL);
> +    if (!ivshmem_info) {
> +        kfree(info);
> +        return -ENOMEM;
> +    }
> +
> +    if (pci_enable_device(dev))
> +        goto out_free;
> +
> +    if (pci_request_regions(dev, "ivshmem"))
> +        goto out_disable;
> +
> +    info->mem[0].addr = pci_resource_start(dev, 0);
> +    if (!info->mem[0].addr)
> +        goto out_release;
> +
> +    info->mem[0].size = pci_resource_len(dev, 0);
> +    info->mem[0].internal_addr = pci_ioremap_bar(dev, 0);
> +    if (!info->mem[0].internal_addr) {
> +        printk(KERN_INFO "got a null");
> +        goto out_release;
> +    }
> +
> +    info->mem[0].memtype = UIO_MEM_PHYS;
> +
> +    info->mem[1].addr = pci_resource_start(dev, 2);
> +    if (!info->mem[1].addr)
> +        goto out_unmap;
> +    info->mem[1].internal_addr = pci_ioremap_bar(dev, 2);
> +    if (!info->mem[1].internal_addr)
> +        goto out_unmap;
> +
> +    info->mem[1].size = pci_resource_len(dev, 2);
> +    info->mem[1].memtype = UIO_MEM_PHYS;
> +
> +    ivshmem_info->uio = info;
> +    ivshmem_info->dev = dev;
> +
> +    if (request_msix_vectors(ivshmem_info, nvectors) != 0) {
> +        printk(KERN_INFO "regular IRQs\n");
> +        info->irq = dev->irq;
> +        info->irq_flags = IRQF_SHARED;
> +        info->handler = ivshmem_handler;
> +        writel(0xffffffff, info->mem[0].internal_addr + IntrMask);
> +    } else {
> +        printk(KERN_INFO "MSI-X enabled\n");
> +        info->irq = -1;
> +    }
> +
> +    info->name = "ivshmem";
> +    info->version = "0.0.1";
> +
> +    if (uio_register_device(&dev->dev, info))
> +        goto out_unmap2;
> +
> +    pci_set_drvdata(dev, info);
> +
> +
> +    return 0;
> +out_unmap2:
> +    iounmap(info->mem[2].internal_addr);
> +out_unmap:
> +    iounmap(info->mem[0].internal_addr);
> +out_release:
> +    pci_release_regions(dev);
> +out_disable:
> +    pci_disable_device(dev);
> +out_free:
> +    kfree (info);
> +    return -ENODEV;
> +}
> +
> +static void ivshmem_pci_remove(struct pci_dev *dev)
> +{
> +    struct uio_info *info = pci_get_drvdata(dev);
> +
> +    uio_unregister_device(info);
> +    pci_release_regions(dev);
> +    pci_disable_device(dev);
> +    pci_set_drvdata(dev, NULL);
> +    iounmap(info->mem[0].internal_addr);
> +
> +    kfree (info);
> +}
> +
> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
> +    {
> +        .vendor =    0x1af4,
> +        .device =    0x1110,

vendor ids must be registered with PCI SIG.
this one does not seem to be registered.

> +        .subvendor =    PCI_ANY_ID,
> +        .subdevice =    PCI_ANY_ID,
> +    },
> +    { 0, }
> +};
> +
> +static struct pci_driver ivshmem_pci_driver = {
> +    .name = "uio_ivshmem",
> +    .id_table = ivshmem_pci_ids,
> +    .probe = ivshmem_pci_probe,
> +    .remove = ivshmem_pci_remove,
> +};
> +
> +static int __init ivshmem_init_module(void)
> +{
> +    return pci_register_driver(&ivshmem_pci_driver);
> +}
> +
> +static void __exit ivshmem_exit_module(void)
> +{
> +    pci_unregister_driver(&ivshmem_pci_driver);
> +}
> +
> +module_init(ivshmem_init_module);
> +module_exit(ivshmem_exit_module);
> +
> +MODULE_DEVICE_TABLE(pci, ivshmem_pci_ids);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Cam Macdonell");
> -- 
> 1.6.6.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - March 25, 2010, 9:15 a.m.
On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote:
> This patch adds a driver for my shared memory PCI device using the uio_pci
> interface.  The driver has three memory regions.  The first memory region is for
> device registers for sending interrupts. The second BAR is for receiving MSI-X
> interrupts and the third memory region maps the shared memory.  The device only
> exports the first and third memory regions to userspace.
> 
> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
> MSI vectors is set to 4 which could be increased, but the driver will work with
> fewer vectors.  If MSI is not available, then regular interrupts will be used.

Some high level questions, sorry if they have been raised in the past:
- Can this device use virtio framework?
  This gives us some standards to work off, with feature negotiation,
  ability to detect config changes, support for non-PCI
  platforms, decent documentation that is easy to extend,
  legal id range to use.
  You would thus have your driver in uio/uio_virtio_shmem.c

- Why are you using 32 bit long memory accesses for interrupts?
  16 bit IO eould be enough and it's faster. This what virtio-pci does.

- How was the driver tested?

> ---
>  drivers/uio/Kconfig       |    8 ++
>  drivers/uio/Makefile      |    1 +
>  drivers/uio/uio_ivshmem.c |  235 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 244 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/uio/uio_ivshmem.c
> 
> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
> index 1da73ec..b92cded 100644
> --- a/drivers/uio/Kconfig
> +++ b/drivers/uio/Kconfig
> @@ -74,6 +74,14 @@ config UIO_SERCOS3
>  
>  	  If you compile this as a module, it will be called uio_sercos3.
>  
> +config UIO_IVSHMEM
> +	tristate "KVM shared memory PCI driver"
> +	default n
> +	help
> +	  Userspace I/O interface for the KVM shared memory device.  This
> +	  driver will make available two memory regions, the first is
> +	  registers and the second is a region for sharing between VMs.
> +
>  config UIO_PCI_GENERIC
>  	tristate "Generic driver for PCI 2.3 and PCI Express cards"
>  	depends on PCI
> diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
> index 18fd818..25c1ca5 100644
> --- a/drivers/uio/Makefile
> +++ b/drivers/uio/Makefile
> @@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC)	+= uio_aec.o
>  obj-$(CONFIG_UIO_SERCOS3)	+= uio_sercos3.o
>  obj-$(CONFIG_UIO_PCI_GENERIC)	+= uio_pci_generic.o
>  obj-$(CONFIG_UIO_NETX)	+= uio_netx.o
> +obj-$(CONFIG_UIO_IVSHMEM) += uio_ivshmem.o
> diff --git a/drivers/uio/uio_ivshmem.c b/drivers/uio/uio_ivshmem.c
> new file mode 100644
> index 0000000..607435b
> --- /dev/null
> +++ b/drivers/uio/uio_ivshmem.c
> @@ -0,0 +1,235 @@
> +/*
> + * UIO IVShmem Driver
> + *
> + * (C) 2009 Cam Macdonell
> + * based on Hilscher CIF card driver (C) 2007 Hans J. Koch <hjk@linutronix.de>
> + *
> + * Licensed under GPL version 2 only.
> + *
> + */
> +
> +#include <linux/device.h>
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/uio_driver.h>
> +
> +#include <asm/io.h>
> +
> +#define IntrStatus 0x04
> +#define IntrMask 0x00
> +
> +struct ivshmem_info {
> +    struct uio_info *uio;
> +    struct pci_dev *dev;
> +    char (*msix_names)[256];
> +    struct msix_entry *msix_entries;
> +    int nvectors;
> +};
> +
> +static irqreturn_t ivshmem_handler(int irq, struct uio_info *dev_info)
> +{
> +
> +    void __iomem *plx_intscr = dev_info->mem[0].internal_addr
> +                    + IntrStatus;
> +    u32 val;
> +
> +    val = readl(plx_intscr);
> +    if (val == 0)
> +        return IRQ_NONE;
> +
> +    printk(KERN_INFO "Regular interrupt (val = %d)\n", val);
> +    return IRQ_HANDLED;
> +}
> +
> +static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
> +{
> +
> +    struct uio_info * dev_info = (struct uio_info *) opaque;
> +
> +    /* we have to do this explicitly when using MSI-X */
> +    uio_event_notify(dev_info);
> +    printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
> +    return IRQ_HANDLED;
> +
> +}
> +
> +static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
> +{
> +    int i, err;
> +    const char *name = "ivshmem";
> +
> +    printk(KERN_INFO "devname is %s\n", name);
> +    ivs_info->nvectors = nvectors;
> +
> +
> +    ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
> +            GFP_KERNEL);
> +    ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
> +            GFP_KERNEL);
> +
> +    for (i = 0; i < nvectors; ++i)
> +        ivs_info->msix_entries[i].entry = i;
> +
> +    err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
> +                    ivs_info->nvectors);
> +    if (err > 0) {
> +        ivs_info->nvectors = err; /* msi-x positive error code
> +                                     returns the number available*/
> +        err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
> +                    ivs_info->nvectors);
> +        if (err > 0) {
> +            printk(KERN_INFO "no MSI (%d). Back to INTx.\n", err);
> +            return -ENOSPC;
> +        }
> +    }
> +
> +    printk(KERN_INFO "err is %d\n", err);
> +    if (err) return err;
> +
> +    for (i = 0; i < ivs_info->nvectors; i++) {
> +
> +        snprintf(ivs_info->msix_names[i], sizeof *ivs_info->msix_names,
> +            "%s-config", name);
> +
> +        ivs_info->msix_entries[i].entry = i;
> +        err = request_irq(ivs_info->msix_entries[i].vector,
> +            ivshmem_msix_handler, 0,
> +            ivs_info->msix_names[i], ivs_info->uio);
> +
> +        if (err) {
> +            return -ENOSPC;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int __devinit ivshmem_pci_probe(struct pci_dev *dev,
> +                    const struct pci_device_id *id)
> +{
> +    struct uio_info *info;
> +    struct ivshmem_info * ivshmem_info;
> +    int nvectors = 4;
> +
> +    info = kzalloc(sizeof(struct uio_info), GFP_KERNEL);
> +    if (!info)
> +        return -ENOMEM;
> +
> +    ivshmem_info = kzalloc(sizeof(struct ivshmem_info), GFP_KERNEL);
> +    if (!ivshmem_info) {
> +        kfree(info);
> +        return -ENOMEM;
> +    }
> +
> +    if (pci_enable_device(dev))
> +        goto out_free;
> +
> +    if (pci_request_regions(dev, "ivshmem"))
> +        goto out_disable;
> +
> +    info->mem[0].addr = pci_resource_start(dev, 0);
> +    if (!info->mem[0].addr)
> +        goto out_release;
> +
> +    info->mem[0].size = pci_resource_len(dev, 0);
> +    info->mem[0].internal_addr = pci_ioremap_bar(dev, 0);
> +    if (!info->mem[0].internal_addr) {
> +        printk(KERN_INFO "got a null");
> +        goto out_release;
> +    }
> +
> +    info->mem[0].memtype = UIO_MEM_PHYS;
> +
> +    info->mem[1].addr = pci_resource_start(dev, 2);
> +    if (!info->mem[1].addr)
> +        goto out_unmap;
> +    info->mem[1].internal_addr = pci_ioremap_bar(dev, 2);
> +    if (!info->mem[1].internal_addr)
> +        goto out_unmap;
> +
> +    info->mem[1].size = pci_resource_len(dev, 2);
> +    info->mem[1].memtype = UIO_MEM_PHYS;
> +
> +    ivshmem_info->uio = info;
> +    ivshmem_info->dev = dev;
> +
> +    if (request_msix_vectors(ivshmem_info, nvectors) != 0) {
> +        printk(KERN_INFO "regular IRQs\n");
> +        info->irq = dev->irq;
> +        info->irq_flags = IRQF_SHARED;
> +        info->handler = ivshmem_handler;
> +        writel(0xffffffff, info->mem[0].internal_addr + IntrMask);
> +    } else {
> +        printk(KERN_INFO "MSI-X enabled\n");
> +        info->irq = -1;
> +    }
> +
> +    info->name = "ivshmem";
> +    info->version = "0.0.1";
> +
> +    if (uio_register_device(&dev->dev, info))
> +        goto out_unmap2;
> +
> +    pci_set_drvdata(dev, info);
> +
> +
> +    return 0;
> +out_unmap2:
> +    iounmap(info->mem[2].internal_addr);
> +out_unmap:
> +    iounmap(info->mem[0].internal_addr);
> +out_release:
> +    pci_release_regions(dev);
> +out_disable:
> +    pci_disable_device(dev);
> +out_free:
> +    kfree (info);
> +    return -ENODEV;
> +}
> +
> +static void ivshmem_pci_remove(struct pci_dev *dev)
> +{
> +    struct uio_info *info = pci_get_drvdata(dev);
> +
> +    uio_unregister_device(info);
> +    pci_release_regions(dev);
> +    pci_disable_device(dev);
> +    pci_set_drvdata(dev, NULL);
> +    iounmap(info->mem[0].internal_addr);
> +
> +    kfree (info);
> +}
> +
> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
> +    {
> +        .vendor =    0x1af4,
> +        .device =    0x1110,
> +        .subvendor =    PCI_ANY_ID,
> +        .subdevice =    PCI_ANY_ID,
> +    },
> +    { 0, }
> +};
> +
> +static struct pci_driver ivshmem_pci_driver = {
> +    .name = "uio_ivshmem",
> +    .id_table = ivshmem_pci_ids,
> +    .probe = ivshmem_pci_probe,
> +    .remove = ivshmem_pci_remove,
> +};
> +
> +static int __init ivshmem_init_module(void)
> +{
> +    return pci_register_driver(&ivshmem_pci_driver);
> +}
> +
> +static void __exit ivshmem_exit_module(void)
> +{
> +    pci_unregister_driver(&ivshmem_pci_driver);
> +}
> +
> +module_init(ivshmem_init_module);
> +module_exit(ivshmem_exit_module);
> +
> +MODULE_DEVICE_TABLE(pci, ivshmem_pci_ids);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Cam Macdonell");
> -- 
> 1.6.6.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity - March 25, 2010, 9:40 a.m.
On 03/25/2010 11:15 AM, Michael S. Tsirkin wrote:
>
> - Why are you using 32 bit long memory accesses for interrupts?
>    16 bit IO eould be enough and it's faster. This what virtio-pci does.
>
>    

Why is 16 bit io faster?
Michael S. Tsirkin - March 25, 2010, 9:44 a.m.
On Thu, Mar 25, 2010 at 11:40:09AM +0200, Avi Kivity wrote:
> On 03/25/2010 11:15 AM, Michael S. Tsirkin wrote:
>>
>> - Why are you using 32 bit long memory accesses for interrupts?
>>    16 bit IO eould be enough and it's faster. This what virtio-pci does.
>>
>>    
>
> Why is 16 bit io faster?

Something to do with need for emulation to get address/data
for pci memory accesses?

> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - March 25, 2010, 9:46 a.m.
On 03/25/2010 08:09 AM, Cam Macdonell wrote:
> This patch adds a driver for my shared memory PCI device using the uio_pci
> interface.  The driver has three memory regions.  The first memory region is for
> device registers for sending interrupts. The second BAR is for receiving MSI-X
> interrupts and the third memory region maps the shared memory.  The device only
> exports the first and third memory regions to userspace.
>
> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
> MSI vectors is set to 4 which could be increased, but the driver will work with
> fewer vectors.  If MSI is not available, then regular interrupts will be used.
>    

There is now a generic PCI 2.3 driver that can handle all PCI devices.  
It doesn't support MSI, but if we add MSI support then it can be used 
without the need for a specialized driver.
Avi Kivity - March 25, 2010, 9:47 a.m.
On 03/25/2010 11:05 AM, Michael S. Tsirkin wrote:
> coding style rule violation
>

Plus it uses spaces instead of tabs for indents.
Avi Kivity - March 25, 2010, 9:58 a.m.
On 03/25/2010 11:44 AM, Michael S. Tsirkin wrote:
> On Thu, Mar 25, 2010 at 11:40:09AM +0200, Avi Kivity wrote:
>    
>> On 03/25/2010 11:15 AM, Michael S. Tsirkin wrote:
>>      
>>> - Why are you using 32 bit long memory accesses for interrupts?
>>>     16 bit IO eould be enough and it's faster. This what virtio-pci does.
>>>
>>>
>>>        
>> Why is 16 bit io faster?
>>      
> Something to do with need for emulation to get address/data
> for pci memory accesses?
>    

pio is definitely faster than mmio (all that is needed is to set one bit 
on the BAR).  But 32 vs. 16 makes no difference.
Michael S. Tsirkin - March 25, 2010, 10:07 a.m.
On Thu, Mar 25, 2010 at 11:58:30AM +0200, Avi Kivity wrote:
> On 03/25/2010 11:44 AM, Michael S. Tsirkin wrote:
>> On Thu, Mar 25, 2010 at 11:40:09AM +0200, Avi Kivity wrote:
>>    
>>> On 03/25/2010 11:15 AM, Michael S. Tsirkin wrote:
>>>      
>>>> - Why are you using 32 bit long memory accesses for interrupts?
>>>>     16 bit IO eould be enough and it's faster. This what virtio-pci does.
>>>>
>>>>
>>>>        
>>> Why is 16 bit io faster?
>>>      
>> Something to do with need for emulation to get address/data
>> for pci memory accesses?
>>    
>
> pio is definitely faster than mmio (all that is needed is to set one bit  
> on the BAR).  But 32 vs. 16 makes no difference.

Right. That's what I meant.

> -- 
> error compiling committee.c: too many arguments to function
Cam Macdonell - March 25, 2010, 4:18 p.m.
On Thu, Mar 25, 2010 at 3:15 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote:
>> This patch adds a driver for my shared memory PCI device using the uio_pci
>> interface.  The driver has three memory regions.  The first memory region is for
>> device registers for sending interrupts. The second BAR is for receiving MSI-X
>> interrupts and the third memory region maps the shared memory.  The device only
>> exports the first and third memory regions to userspace.
>>
>> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
>> MSI vectors is set to 4 which could be increased, but the driver will work with
>> fewer vectors.  If MSI is not available, then regular interrupts will be used.
>
> Some high level questions, sorry if they have been raised in the past:
> - Can this device use virtio framework?
>  This gives us some standards to work off, with feature negotiation,
>  ability to detect config changes, support for non-PCI
>  platforms, decent documentation that is easy to extend,
>  legal id range to use.
>  You would thus have your driver in uio/uio_virtio_shmem.c

There has been previous discussion of virtio, however while virtio is
good for exporting guest memory, it's not ideal for importing memory
into a guest.

>
> - Why are you using 32 bit long memory accesses for interrupts?
>  16 bit IO eould be enough and it's faster. This what virtio-pci does.
>
> - How was the driver tested?

I have some test programs in the git repo I linked to.  I've been
using some simple producer/consumer tests to test the interrupt
framework.

>
>> ---
>>  drivers/uio/Kconfig       |    8 ++
>>  drivers/uio/Makefile      |    1 +
>>  drivers/uio/uio_ivshmem.c |  235 +++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 244 insertions(+), 0 deletions(-)
>>  create mode 100644 drivers/uio/uio_ivshmem.c
>>
>> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
>> index 1da73ec..b92cded 100644
>> --- a/drivers/uio/Kconfig
>> +++ b/drivers/uio/Kconfig
>> @@ -74,6 +74,14 @@ config UIO_SERCOS3
>>
>>         If you compile this as a module, it will be called uio_sercos3.
>>
>> +config UIO_IVSHMEM
>> +     tristate "KVM shared memory PCI driver"
>> +     default n
>> +     help
>> +       Userspace I/O interface for the KVM shared memory device.  This
>> +       driver will make available two memory regions, the first is
>> +       registers and the second is a region for sharing between VMs.
>> +
>>  config UIO_PCI_GENERIC
>>       tristate "Generic driver for PCI 2.3 and PCI Express cards"
>>       depends on PCI
>> diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
>> index 18fd818..25c1ca5 100644
>> --- a/drivers/uio/Makefile
>> +++ b/drivers/uio/Makefile
>> @@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC) += uio_aec.o
>>  obj-$(CONFIG_UIO_SERCOS3)    += uio_sercos3.o
>>  obj-$(CONFIG_UIO_PCI_GENERIC)        += uio_pci_generic.o
>>  obj-$(CONFIG_UIO_NETX)       += uio_netx.o
>> +obj-$(CONFIG_UIO_IVSHMEM) += uio_ivshmem.o
>> diff --git a/drivers/uio/uio_ivshmem.c b/drivers/uio/uio_ivshmem.c
>> new file mode 100644
>> index 0000000..607435b
>> --- /dev/null
>> +++ b/drivers/uio/uio_ivshmem.c
>> @@ -0,0 +1,235 @@
>> +/*
>> + * UIO IVShmem Driver
>> + *
>> + * (C) 2009 Cam Macdonell
>> + * based on Hilscher CIF card driver (C) 2007 Hans J. Koch <hjk@linutronix.de>
>> + *
>> + * Licensed under GPL version 2 only.
>> + *
>> + */
>> +
>> +#include <linux/device.h>
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/uio_driver.h>
>> +
>> +#include <asm/io.h>
>> +
>> +#define IntrStatus 0x04
>> +#define IntrMask 0x00
>> +
>> +struct ivshmem_info {
>> +    struct uio_info *uio;
>> +    struct pci_dev *dev;
>> +    char (*msix_names)[256];
>> +    struct msix_entry *msix_entries;
>> +    int nvectors;
>> +};
>> +
>> +static irqreturn_t ivshmem_handler(int irq, struct uio_info *dev_info)
>> +{
>> +
>> +    void __iomem *plx_intscr = dev_info->mem[0].internal_addr
>> +                    + IntrStatus;
>> +    u32 val;
>> +
>> +    val = readl(plx_intscr);
>> +    if (val == 0)
>> +        return IRQ_NONE;
>> +
>> +    printk(KERN_INFO "Regular interrupt (val = %d)\n", val);
>> +    return IRQ_HANDLED;
>> +}
>> +
>> +static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
>> +{
>> +
>> +    struct uio_info * dev_info = (struct uio_info *) opaque;
>> +
>> +    /* we have to do this explicitly when using MSI-X */
>> +    uio_event_notify(dev_info);
>> +    printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
>> +    return IRQ_HANDLED;
>> +
>> +}
>> +
>> +static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
>> +{
>> +    int i, err;
>> +    const char *name = "ivshmem";
>> +
>> +    printk(KERN_INFO "devname is %s\n", name);
>> +    ivs_info->nvectors = nvectors;
>> +
>> +
>> +    ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
>> +            GFP_KERNEL);
>> +    ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
>> +            GFP_KERNEL);
>> +
>> +    for (i = 0; i < nvectors; ++i)
>> +        ivs_info->msix_entries[i].entry = i;
>> +
>> +    err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
>> +                    ivs_info->nvectors);
>> +    if (err > 0) {
>> +        ivs_info->nvectors = err; /* msi-x positive error code
>> +                                     returns the number available*/
>> +        err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
>> +                    ivs_info->nvectors);
>> +        if (err > 0) {
>> +            printk(KERN_INFO "no MSI (%d). Back to INTx.\n", err);
>> +            return -ENOSPC;
>> +        }
>> +    }
>> +
>> +    printk(KERN_INFO "err is %d\n", err);
>> +    if (err) return err;
>> +
>> +    for (i = 0; i < ivs_info->nvectors; i++) {
>> +
>> +        snprintf(ivs_info->msix_names[i], sizeof *ivs_info->msix_names,
>> +            "%s-config", name);
>> +
>> +        ivs_info->msix_entries[i].entry = i;
>> +        err = request_irq(ivs_info->msix_entries[i].vector,
>> +            ivshmem_msix_handler, 0,
>> +            ivs_info->msix_names[i], ivs_info->uio);
>> +
>> +        if (err) {
>> +            return -ENOSPC;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int __devinit ivshmem_pci_probe(struct pci_dev *dev,
>> +                    const struct pci_device_id *id)
>> +{
>> +    struct uio_info *info;
>> +    struct ivshmem_info * ivshmem_info;
>> +    int nvectors = 4;
>> +
>> +    info = kzalloc(sizeof(struct uio_info), GFP_KERNEL);
>> +    if (!info)
>> +        return -ENOMEM;
>> +
>> +    ivshmem_info = kzalloc(sizeof(struct ivshmem_info), GFP_KERNEL);
>> +    if (!ivshmem_info) {
>> +        kfree(info);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    if (pci_enable_device(dev))
>> +        goto out_free;
>> +
>> +    if (pci_request_regions(dev, "ivshmem"))
>> +        goto out_disable;
>> +
>> +    info->mem[0].addr = pci_resource_start(dev, 0);
>> +    if (!info->mem[0].addr)
>> +        goto out_release;
>> +
>> +    info->mem[0].size = pci_resource_len(dev, 0);
>> +    info->mem[0].internal_addr = pci_ioremap_bar(dev, 0);
>> +    if (!info->mem[0].internal_addr) {
>> +        printk(KERN_INFO "got a null");
>> +        goto out_release;
>> +    }
>> +
>> +    info->mem[0].memtype = UIO_MEM_PHYS;
>> +
>> +    info->mem[1].addr = pci_resource_start(dev, 2);
>> +    if (!info->mem[1].addr)
>> +        goto out_unmap;
>> +    info->mem[1].internal_addr = pci_ioremap_bar(dev, 2);
>> +    if (!info->mem[1].internal_addr)
>> +        goto out_unmap;
>> +
>> +    info->mem[1].size = pci_resource_len(dev, 2);
>> +    info->mem[1].memtype = UIO_MEM_PHYS;
>> +
>> +    ivshmem_info->uio = info;
>> +    ivshmem_info->dev = dev;
>> +
>> +    if (request_msix_vectors(ivshmem_info, nvectors) != 0) {
>> +        printk(KERN_INFO "regular IRQs\n");
>> +        info->irq = dev->irq;
>> +        info->irq_flags = IRQF_SHARED;
>> +        info->handler = ivshmem_handler;
>> +        writel(0xffffffff, info->mem[0].internal_addr + IntrMask);
>> +    } else {
>> +        printk(KERN_INFO "MSI-X enabled\n");
>> +        info->irq = -1;
>> +    }
>> +
>> +    info->name = "ivshmem";
>> +    info->version = "0.0.1";
>> +
>> +    if (uio_register_device(&dev->dev, info))
>> +        goto out_unmap2;
>> +
>> +    pci_set_drvdata(dev, info);
>> +
>> +
>> +    return 0;
>> +out_unmap2:
>> +    iounmap(info->mem[2].internal_addr);
>> +out_unmap:
>> +    iounmap(info->mem[0].internal_addr);
>> +out_release:
>> +    pci_release_regions(dev);
>> +out_disable:
>> +    pci_disable_device(dev);
>> +out_free:
>> +    kfree (info);
>> +    return -ENODEV;
>> +}
>> +
>> +static void ivshmem_pci_remove(struct pci_dev *dev)
>> +{
>> +    struct uio_info *info = pci_get_drvdata(dev);
>> +
>> +    uio_unregister_device(info);
>> +    pci_release_regions(dev);
>> +    pci_disable_device(dev);
>> +    pci_set_drvdata(dev, NULL);
>> +    iounmap(info->mem[0].internal_addr);
>> +
>> +    kfree (info);
>> +}
>> +
>> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
>> +    {
>> +        .vendor =    0x1af4,
>> +        .device =    0x1110,
>> +        .subvendor =    PCI_ANY_ID,
>> +        .subdevice =    PCI_ANY_ID,
>> +    },
>> +    { 0, }
>> +};
>> +
>> +static struct pci_driver ivshmem_pci_driver = {
>> +    .name = "uio_ivshmem",
>> +    .id_table = ivshmem_pci_ids,
>> +    .probe = ivshmem_pci_probe,
>> +    .remove = ivshmem_pci_remove,
>> +};
>> +
>> +static int __init ivshmem_init_module(void)
>> +{
>> +    return pci_register_driver(&ivshmem_pci_driver);
>> +}
>> +
>> +static void __exit ivshmem_exit_module(void)
>> +{
>> +    pci_unregister_driver(&ivshmem_pci_driver);
>> +}
>> +
>> +module_init(ivshmem_init_module);
>> +module_exit(ivshmem_exit_module);
>> +
>> +MODULE_DEVICE_TABLE(pci, ivshmem_pci_ids);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR("Cam Macdonell");
>> --
>> 1.6.6.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
Anthony Liguori - March 25, 2010, 4:23 p.m.
On 03/25/2010 11:18 AM, Cam Macdonell wrote:
> On Thu, Mar 25, 2010 at 3:15 AM, Michael S. Tsirkin<mst@redhat.com>  wrote:
>    
>> On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote:
>>      
>>> This patch adds a driver for my shared memory PCI device using the uio_pci
>>> interface.  The driver has three memory regions.  The first memory region is for
>>> device registers for sending interrupts. The second BAR is for receiving MSI-X
>>> interrupts and the third memory region maps the shared memory.  The device only
>>> exports the first and third memory regions to userspace.
>>>
>>> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
>>> MSI vectors is set to 4 which could be increased, but the driver will work with
>>> fewer vectors.  If MSI is not available, then regular interrupts will be used.
>>>        
>> Some high level questions, sorry if they have been raised in the past:
>> - Can this device use virtio framework?
>>   This gives us some standards to work off, with feature negotiation,
>>   ability to detect config changes, support for non-PCI
>>   platforms, decent documentation that is easy to extend,
>>   legal id range to use.
>>   You would thus have your driver in uio/uio_virtio_shmem.c
>>      
> There has been previous discussion of virtio, however while virtio is
> good for exporting guest memory, it's not ideal for importing memory
> into a guest.
>    

virtio is a DMA-based API which means that it doesn't assume cache 
coherent shared memory.  The PCI transport takes advantage of cache 
coherent shared memory but it's not strictly required.

Memory sharing in virtio would be a layering violation because it forces 
cache coherent shared memory for all virtio transports.

Regards,

Anthony Liguori
Cam Macdonell - March 25, 2010, 4:24 p.m.
On Thu, Mar 25, 2010 at 3:46 AM, Avi Kivity <avi@redhat.com> wrote:
> On 03/25/2010 08:09 AM, Cam Macdonell wrote:
>>
>> This patch adds a driver for my shared memory PCI device using the uio_pci
>> interface.  The driver has three memory regions.  The first memory region
>> is for
>> device registers for sending interrupts. The second BAR is for receiving
>> MSI-X
>> interrupts and the third memory region maps the shared memory.  The device
>> only
>> exports the first and third memory regions to userspace.
>>
>> This driver supports MSI-X and regular pin interrupts.  Currently, the
>> number of
>> MSI vectors is set to 4 which could be increased, but the driver will work
>> with
>> fewer vectors.  If MSI is not available, then regular interrupts will be
>> used.
>>
>
> There is now a generic PCI 2.3 driver that can handle all PCI devices.  It
> doesn't support MSI, but if we add MSI support then it can be used without
> the need for a specialized driver.

Agreed, I'd be happy to use the generic driver if MSI is there.  What
would MSI support for UIO look like?  An array of "struct uio_irq" for
the different vectors?

Cam

>
> --
> error compiling committee.c: too many arguments to function
>
>
Cam Macdonell - March 25, 2010, 4:30 p.m.
On Thu, Mar 25, 2010 at 3:05 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote:
>> This patch adds a driver for my shared memory PCI device using the uio_pci
>> interface.  The driver has three memory regions.  The first memory region is for
>> device registers for sending interrupts. The second BAR is for receiving MSI-X
>> interrupts and the third memory region maps the shared memory.  The device only
>> exports the first and third memory regions to userspace.
>>
>> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
>> MSI vectors is set to 4 which could be increased, but the driver will work with
>> fewer vectors.  If MSI is not available, then regular interrupts will be used.
>> ---
>>  drivers/uio/Kconfig       |    8 ++
>>  drivers/uio/Makefile      |    1 +
>>  drivers/uio/uio_ivshmem.c |  235 +++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 244 insertions(+), 0 deletions(-)
>>  create mode 100644 drivers/uio/uio_ivshmem.c
>>
>> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
>> index 1da73ec..b92cded 100644
>> --- a/drivers/uio/Kconfig
>> +++ b/drivers/uio/Kconfig
>> @@ -74,6 +74,14 @@ config UIO_SERCOS3
>>
>>         If you compile this as a module, it will be called uio_sercos3.
>>
>> +config UIO_IVSHMEM
>> +     tristate "KVM shared memory PCI driver"
>> +     default n
>> +     help
>> +       Userspace I/O interface for the KVM shared memory device.  This
>> +       driver will make available two memory regions, the first is
>> +       registers and the second is a region for sharing between VMs.
>> +
>>  config UIO_PCI_GENERIC
>>       tristate "Generic driver for PCI 2.3 and PCI Express cards"
>>       depends on PCI
>> diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
>> index 18fd818..25c1ca5 100644
>> --- a/drivers/uio/Makefile
>> +++ b/drivers/uio/Makefile
>> @@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC) += uio_aec.o
>>  obj-$(CONFIG_UIO_SERCOS3)    += uio_sercos3.o
>>  obj-$(CONFIG_UIO_PCI_GENERIC)        += uio_pci_generic.o
>>  obj-$(CONFIG_UIO_NETX)       += uio_netx.o
>> +obj-$(CONFIG_UIO_IVSHMEM) += uio_ivshmem.o
>> diff --git a/drivers/uio/uio_ivshmem.c b/drivers/uio/uio_ivshmem.c
>> new file mode 100644
>> index 0000000..607435b
>> --- /dev/null
>> +++ b/drivers/uio/uio_ivshmem.c
>> @@ -0,0 +1,235 @@
>> +/*
>> + * UIO IVShmem Driver
>> + *
>> + * (C) 2009 Cam Macdonell
>> + * based on Hilscher CIF card driver (C) 2007 Hans J. Koch <hjk@linutronix.de>
>> + *
>> + * Licensed under GPL version 2 only.
>> + *
>> + */
>> +
>> +#include <linux/device.h>
>> +#include <linux/module.h>
>> +#include <linux/pci.h>
>> +#include <linux/uio_driver.h>
>> +
>> +#include <asm/io.h>
>> +
>> +#define IntrStatus 0x04
>> +#define IntrMask 0x00
>> +
>> +struct ivshmem_info {
>> +    struct uio_info *uio;
>> +    struct pci_dev *dev;
>> +    char (*msix_names)[256];
>> +    struct msix_entry *msix_entries;
>> +    int nvectors;
>> +};
>> +
>> +static irqreturn_t ivshmem_handler(int irq, struct uio_info *dev_info)
>> +{
>> +
>> +    void __iomem *plx_intscr = dev_info->mem[0].internal_addr
>> +                    + IntrStatus;
>> +    u32 val;
>> +
>> +    val = readl(plx_intscr);
>> +    if (val == 0)
>> +        return IRQ_NONE;
>> +
>> +    printk(KERN_INFO "Regular interrupt (val = %d)\n", val);
>> +    return IRQ_HANDLED;
>> +}
>> +
>> +static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
>> +{
>> +
>> +    struct uio_info * dev_info = (struct uio_info *) opaque;
>> +
>> +    /* we have to do this explicitly when using MSI-X */
>> +    uio_event_notify(dev_info);
>
> How does userspace know which vector was triggered?

Right now, it doesn't.  If a user had a particular need they would
need to write their own uio driver.  I guess this leads to a
discussion of MSI support in UIO and how they would work with the
userspace.

>
>> +    printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
>> +    return IRQ_HANDLED;
>> +
>
> extra empty line
>
>> +}
>> +
>> +static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
>> +{
>> +    int i, err;
>> +    const char *name = "ivshmem";
>> +
>> +    printk(KERN_INFO "devname is %s\n", name);
>
> These KERN_INFO messages need to be cleaned up, they would be
> look confusing in the log.

Agreed.  I will clean most of these out.

>
>> +    ivs_info->nvectors = nvectors;
>> +
>> +
>> +    ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
>> +            GFP_KERNEL);
>> +    ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
>> +            GFP_KERNEL);
>> +
>
> need to handle errors
>
>> +    for (i = 0; i < nvectors; ++i)
>> +        ivs_info->msix_entries[i].entry = i;
>> +
>> +    err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
>> +                    ivs_info->nvectors);
>> +    if (err > 0) {
>> +        ivs_info->nvectors = err; /* msi-x positive error code
>> +                                     returns the number available*/
>> +        err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
>> +                    ivs_info->nvectors);
>> +        if (err > 0) {
>> +            printk(KERN_INFO "no MSI (%d). Back to INTx.\n", err);
>> +            return -ENOSPC;
>> +        }
>> +    }
>
> we can also get err < 0.
>
>> +
>> +    printk(KERN_INFO "err is %d\n", err);
>> +    if (err) return err;
>
> coding style rule violation
>
>> +
>> +    for (i = 0; i < ivs_info->nvectors; i++) {
>> +
>> +        snprintf(ivs_info->msix_names[i], sizeof *ivs_info->msix_names,
>> +            "%s-config", name);
>> +
>> +        ivs_info->msix_entries[i].entry = i;
>> +        err = request_irq(ivs_info->msix_entries[i].vector,
>> +            ivshmem_msix_handler, 0,
>> +            ivs_info->msix_names[i], ivs_info->uio);
>> +
>> +        if (err) {
>> +            return -ENOSPC;
>
> coding style rule violation
> no undo on error handling
> why override error code with -ENOSPC?

Ah, I think I've confused linux and qemu coding styles perhaps.  I'll
fix these and others above.

>
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int __devinit ivshmem_pci_probe(struct pci_dev *dev,
>> +                    const struct pci_device_id *id)
>> +{
>> +    struct uio_info *info;
>> +    struct ivshmem_info * ivshmem_info;
>> +    int nvectors = 4;
>> +
>> +    info = kzalloc(sizeof(struct uio_info), GFP_KERNEL);
>> +    if (!info)
>> +        return -ENOMEM;
>> +
>> +    ivshmem_info = kzalloc(sizeof(struct ivshmem_info), GFP_KERNEL);
>> +    if (!ivshmem_info) {
>> +        kfree(info);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    if (pci_enable_device(dev))
>> +        goto out_free;
>> +
>> +    if (pci_request_regions(dev, "ivshmem"))
>> +        goto out_disable;
>> +
>> +    info->mem[0].addr = pci_resource_start(dev, 0);
>> +    if (!info->mem[0].addr)
>> +        goto out_release;
>> +
>> +    info->mem[0].size = pci_resource_len(dev, 0);
>> +    info->mem[0].internal_addr = pci_ioremap_bar(dev, 0);
>> +    if (!info->mem[0].internal_addr) {
>> +        printk(KERN_INFO "got a null");
>> +        goto out_release;
>> +    }
>> +
>> +    info->mem[0].memtype = UIO_MEM_PHYS;
>> +
>> +    info->mem[1].addr = pci_resource_start(dev, 2);
>> +    if (!info->mem[1].addr)
>> +        goto out_unmap;
>> +    info->mem[1].internal_addr = pci_ioremap_bar(dev, 2);
>> +    if (!info->mem[1].internal_addr)
>> +        goto out_unmap;
>> +
>> +    info->mem[1].size = pci_resource_len(dev, 2);
>> +    info->mem[1].memtype = UIO_MEM_PHYS;
>> +
>> +    ivshmem_info->uio = info;
>> +    ivshmem_info->dev = dev;
>> +
>> +    if (request_msix_vectors(ivshmem_info, nvectors) != 0) {
>> +        printk(KERN_INFO "regular IRQs\n");
>> +        info->irq = dev->irq;
>> +        info->irq_flags = IRQF_SHARED;
>> +        info->handler = ivshmem_handler;
>> +        writel(0xffffffff, info->mem[0].internal_addr + IntrMask);
>> +    } else {
>> +        printk(KERN_INFO "MSI-X enabled\n");
>> +        info->irq = -1;
>> +    }
>> +
>> +    info->name = "ivshmem";
>> +    info->version = "0.0.1";
>> +
>> +    if (uio_register_device(&dev->dev, info))
>> +        goto out_unmap2;
>> +
>> +    pci_set_drvdata(dev, info);
>> +
>> +
>> +    return 0;
>> +out_unmap2:
>> +    iounmap(info->mem[2].internal_addr);
>> +out_unmap:
>> +    iounmap(info->mem[0].internal_addr);
>> +out_release:
>> +    pci_release_regions(dev);
>> +out_disable:
>> +    pci_disable_device(dev);
>> +out_free:
>> +    kfree (info);
>> +    return -ENODEV;
>> +}
>> +
>> +static void ivshmem_pci_remove(struct pci_dev *dev)
>> +{
>> +    struct uio_info *info = pci_get_drvdata(dev);
>> +
>> +    uio_unregister_device(info);
>> +    pci_release_regions(dev);
>> +    pci_disable_device(dev);
>> +    pci_set_drvdata(dev, NULL);
>> +    iounmap(info->mem[0].internal_addr);
>> +
>> +    kfree (info);
>> +}
>> +
>> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
>> +    {
>> +        .vendor =    0x1af4,
>> +        .device =    0x1110,
>
> vendor ids must be registered with PCI SIG.
> this one does not seem to be registered.
>
>> +        .subvendor =    PCI_ANY_ID,
>> +        .subdevice =    PCI_ANY_ID,
>> +    },
>> +    { 0, }
>> +};
>> +
>> +static struct pci_driver ivshmem_pci_driver = {
>> +    .name = "uio_ivshmem",
>> +    .id_table = ivshmem_pci_ids,
>> +    .probe = ivshmem_pci_probe,
>> +    .remove = ivshmem_pci_remove,
>> +};
>> +
>> +static int __init ivshmem_init_module(void)
>> +{
>> +    return pci_register_driver(&ivshmem_pci_driver);
>> +}
>> +
>> +static void __exit ivshmem_exit_module(void)
>> +{
>> +    pci_unregister_driver(&ivshmem_pci_driver);
>> +}
>> +
>> +module_init(ivshmem_init_module);
>> +module_exit(ivshmem_exit_module);
>> +
>> +MODULE_DEVICE_TABLE(pci, ivshmem_pci_ids);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR("Cam Macdonell");
>> --
>> 1.6.6.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
Avi Kivity - March 25, 2010, 4:32 p.m.
On 03/25/2010 06:23 PM, Anthony Liguori wrote:
>> There has been previous discussion of virtio, however while virtio is
>> good for exporting guest memory, it's not ideal for importing memory
>> into a guest.
>
> virtio is a DMA-based API which means that it doesn't assume cache 
> coherent shared memory.  The PCI transport takes advantage of cache 
> coherent shared memory but it's not strictly required.

Aren't we violating this by not using dma_alloc_coherent() for the queues?
Michael S. Tsirkin - March 25, 2010, 4:33 p.m.
On Thu, Mar 25, 2010 at 11:23:05AM -0500, Anthony Liguori wrote:
> On 03/25/2010 11:18 AM, Cam Macdonell wrote:
>> On Thu, Mar 25, 2010 at 3:15 AM, Michael S. Tsirkin<mst@redhat.com>  wrote:
>>    
>>> On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote:
>>>      
>>>> This patch adds a driver for my shared memory PCI device using the uio_pci
>>>> interface.  The driver has three memory regions.  The first memory region is for
>>>> device registers for sending interrupts. The second BAR is for receiving MSI-X
>>>> interrupts and the third memory region maps the shared memory.  The device only
>>>> exports the first and third memory regions to userspace.
>>>>
>>>> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
>>>> MSI vectors is set to 4 which could be increased, but the driver will work with
>>>> fewer vectors.  If MSI is not available, then regular interrupts will be used.
>>>>        
>>> Some high level questions, sorry if they have been raised in the past:
>>> - Can this device use virtio framework?
>>>   This gives us some standards to work off, with feature negotiation,
>>>   ability to detect config changes, support for non-PCI
>>>   platforms, decent documentation that is easy to extend,
>>>   legal id range to use.
>>>   You would thus have your driver in uio/uio_virtio_shmem.c
>>>      
>> There has been previous discussion of virtio, however while virtio is
>> good for exporting guest memory, it's not ideal for importing memory
>> into a guest.
>>    
>
> virtio is a DMA-based API which means that it doesn't assume cache  
> coherent shared memory.  The PCI transport takes advantage of cache  
> coherent shared memory but it's not strictly required.
>
> Memory sharing in virtio would be a layering violation because it forces  
> cache coherent shared memory for all virtio transports.
>
> Regards,
>
> Anthony Liguori

I am not sure I understand the argument.  We can still reuse feature
negotiation, notifications etc from virtio.  If some transports can not
support cache coherent shared memory, the device won't work there.
But it won't work there anyway, even without using virtio.
We are not forcing all devices to assume cache coherency.

In other words, let's not fall into midlayer mistake, let's
treat virtio as a library.
Michael S. Tsirkin - March 25, 2010, 4:34 p.m.
On Thu, Mar 25, 2010 at 10:30:42AM -0600, Cam Macdonell wrote:
> On Thu, Mar 25, 2010 at 3:05 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote:
> >> This patch adds a driver for my shared memory PCI device using the uio_pci
> >> interface.  The driver has three memory regions.  The first memory region is for
> >> device registers for sending interrupts. The second BAR is for receiving MSI-X
> >> interrupts and the third memory region maps the shared memory.  The device only
> >> exports the first and third memory regions to userspace.
> >>
> >> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
> >> MSI vectors is set to 4 which could be increased, but the driver will work with
> >> fewer vectors.  If MSI is not available, then regular interrupts will be used.
> >> ---
> >>  drivers/uio/Kconfig       |    8 ++
> >>  drivers/uio/Makefile      |    1 +
> >>  drivers/uio/uio_ivshmem.c |  235 +++++++++++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 244 insertions(+), 0 deletions(-)
> >>  create mode 100644 drivers/uio/uio_ivshmem.c
> >>
> >> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
> >> index 1da73ec..b92cded 100644
> >> --- a/drivers/uio/Kconfig
> >> +++ b/drivers/uio/Kconfig
> >> @@ -74,6 +74,14 @@ config UIO_SERCOS3
> >>
> >>         If you compile this as a module, it will be called uio_sercos3.
> >>
> >> +config UIO_IVSHMEM
> >> +     tristate "KVM shared memory PCI driver"
> >> +     default n
> >> +     help
> >> +       Userspace I/O interface for the KVM shared memory device.  This
> >> +       driver will make available two memory regions, the first is
> >> +       registers and the second is a region for sharing between VMs.
> >> +
> >>  config UIO_PCI_GENERIC
> >>       tristate "Generic driver for PCI 2.3 and PCI Express cards"
> >>       depends on PCI
> >> diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
> >> index 18fd818..25c1ca5 100644
> >> --- a/drivers/uio/Makefile
> >> +++ b/drivers/uio/Makefile
> >> @@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC) += uio_aec.o
> >>  obj-$(CONFIG_UIO_SERCOS3)    += uio_sercos3.o
> >>  obj-$(CONFIG_UIO_PCI_GENERIC)        += uio_pci_generic.o
> >>  obj-$(CONFIG_UIO_NETX)       += uio_netx.o
> >> +obj-$(CONFIG_UIO_IVSHMEM) += uio_ivshmem.o
> >> diff --git a/drivers/uio/uio_ivshmem.c b/drivers/uio/uio_ivshmem.c
> >> new file mode 100644
> >> index 0000000..607435b
> >> --- /dev/null
> >> +++ b/drivers/uio/uio_ivshmem.c
> >> @@ -0,0 +1,235 @@
> >> +/*
> >> + * UIO IVShmem Driver
> >> + *
> >> + * (C) 2009 Cam Macdonell
> >> + * based on Hilscher CIF card driver (C) 2007 Hans J. Koch <hjk@linutronix.de>
> >> + *
> >> + * Licensed under GPL version 2 only.
> >> + *
> >> + */
> >> +
> >> +#include <linux/device.h>
> >> +#include <linux/module.h>
> >> +#include <linux/pci.h>
> >> +#include <linux/uio_driver.h>
> >> +
> >> +#include <asm/io.h>
> >> +
> >> +#define IntrStatus 0x04
> >> +#define IntrMask 0x00
> >> +
> >> +struct ivshmem_info {
> >> +    struct uio_info *uio;
> >> +    struct pci_dev *dev;
> >> +    char (*msix_names)[256];
> >> +    struct msix_entry *msix_entries;
> >> +    int nvectors;
> >> +};
> >> +
> >> +static irqreturn_t ivshmem_handler(int irq, struct uio_info *dev_info)
> >> +{
> >> +
> >> +    void __iomem *plx_intscr = dev_info->mem[0].internal_addr
> >> +                    + IntrStatus;
> >> +    u32 val;
> >> +
> >> +    val = readl(plx_intscr);
> >> +    if (val == 0)
> >> +        return IRQ_NONE;
> >> +
> >> +    printk(KERN_INFO "Regular interrupt (val = %d)\n", val);
> >> +    return IRQ_HANDLED;
> >> +}
> >> +
> >> +static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
> >> +{
> >> +
> >> +    struct uio_info * dev_info = (struct uio_info *) opaque;
> >> +
> >> +    /* we have to do this explicitly when using MSI-X */
> >> +    uio_event_notify(dev_info);
> >
> > How does userspace know which vector was triggered?
> 
> Right now, it doesn't.  If a user had a particular need they would
> need to write their own uio driver.  I guess this leads to a
> discussion of MSI support in UIO and how they would work with the
> userspace.

So why request more than one vector then?

> >
> >> +    printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
> >> +    return IRQ_HANDLED;
> >> +
> >
> > extra empty line
> >
> >> +}
> >> +
> >> +static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
> >> +{
> >> +    int i, err;
> >> +    const char *name = "ivshmem";
> >> +
> >> +    printk(KERN_INFO "devname is %s\n", name);
> >
> > These KERN_INFO messages need to be cleaned up, they would be
> > look confusing in the log.
> 
> Agreed.  I will clean most of these out.
> 
> >
> >> +    ivs_info->nvectors = nvectors;
> >> +
> >> +
> >> +    ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
> >> +            GFP_KERNEL);
> >> +    ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
> >> +            GFP_KERNEL);
> >> +
> >
> > need to handle errors
> >
> >> +    for (i = 0; i < nvectors; ++i)
> >> +        ivs_info->msix_entries[i].entry = i;
> >> +
> >> +    err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
> >> +                    ivs_info->nvectors);
> >> +    if (err > 0) {
> >> +        ivs_info->nvectors = err; /* msi-x positive error code
> >> +                                     returns the number available*/
> >> +        err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
> >> +                    ivs_info->nvectors);
> >> +        if (err > 0) {
> >> +            printk(KERN_INFO "no MSI (%d). Back to INTx.\n", err);
> >> +            return -ENOSPC;
> >> +        }
> >> +    }
> >
> > we can also get err < 0.
> >
> >> +
> >> +    printk(KERN_INFO "err is %d\n", err);
> >> +    if (err) return err;
> >
> > coding style rule violation
> >
> >> +
> >> +    for (i = 0; i < ivs_info->nvectors; i++) {
> >> +
> >> +        snprintf(ivs_info->msix_names[i], sizeof *ivs_info->msix_names,
> >> +            "%s-config", name);
> >> +
> >> +        ivs_info->msix_entries[i].entry = i;
> >> +        err = request_irq(ivs_info->msix_entries[i].vector,
> >> +            ivshmem_msix_handler, 0,
> >> +            ivs_info->msix_names[i], ivs_info->uio);
> >> +
> >> +        if (err) {
> >> +            return -ENOSPC;
> >
> > coding style rule violation
> > no undo on error handling
> > why override error code with -ENOSPC?
> 
> Ah, I think I've confused linux and qemu coding styles perhaps.  I'll
> fix these and others above.
> 
> >
> >> +        }
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int __devinit ivshmem_pci_probe(struct pci_dev *dev,
> >> +                    const struct pci_device_id *id)
> >> +{
> >> +    struct uio_info *info;
> >> +    struct ivshmem_info * ivshmem_info;
> >> +    int nvectors = 4;
> >> +
> >> +    info = kzalloc(sizeof(struct uio_info), GFP_KERNEL);
> >> +    if (!info)
> >> +        return -ENOMEM;
> >> +
> >> +    ivshmem_info = kzalloc(sizeof(struct ivshmem_info), GFP_KERNEL);
> >> +    if (!ivshmem_info) {
> >> +        kfree(info);
> >> +        return -ENOMEM;
> >> +    }
> >> +
> >> +    if (pci_enable_device(dev))
> >> +        goto out_free;
> >> +
> >> +    if (pci_request_regions(dev, "ivshmem"))
> >> +        goto out_disable;
> >> +
> >> +    info->mem[0].addr = pci_resource_start(dev, 0);
> >> +    if (!info->mem[0].addr)
> >> +        goto out_release;
> >> +
> >> +    info->mem[0].size = pci_resource_len(dev, 0);
> >> +    info->mem[0].internal_addr = pci_ioremap_bar(dev, 0);
> >> +    if (!info->mem[0].internal_addr) {
> >> +        printk(KERN_INFO "got a null");
> >> +        goto out_release;
> >> +    }
> >> +
> >> +    info->mem[0].memtype = UIO_MEM_PHYS;
> >> +
> >> +    info->mem[1].addr = pci_resource_start(dev, 2);
> >> +    if (!info->mem[1].addr)
> >> +        goto out_unmap;
> >> +    info->mem[1].internal_addr = pci_ioremap_bar(dev, 2);
> >> +    if (!info->mem[1].internal_addr)
> >> +        goto out_unmap;
> >> +
> >> +    info->mem[1].size = pci_resource_len(dev, 2);
> >> +    info->mem[1].memtype = UIO_MEM_PHYS;
> >> +
> >> +    ivshmem_info->uio = info;
> >> +    ivshmem_info->dev = dev;
> >> +
> >> +    if (request_msix_vectors(ivshmem_info, nvectors) != 0) {
> >> +        printk(KERN_INFO "regular IRQs\n");
> >> +        info->irq = dev->irq;
> >> +        info->irq_flags = IRQF_SHARED;
> >> +        info->handler = ivshmem_handler;
> >> +        writel(0xffffffff, info->mem[0].internal_addr + IntrMask);
> >> +    } else {
> >> +        printk(KERN_INFO "MSI-X enabled\n");
> >> +        info->irq = -1;
> >> +    }
> >> +
> >> +    info->name = "ivshmem";
> >> +    info->version = "0.0.1";
> >> +
> >> +    if (uio_register_device(&dev->dev, info))
> >> +        goto out_unmap2;
> >> +
> >> +    pci_set_drvdata(dev, info);
> >> +
> >> +
> >> +    return 0;
> >> +out_unmap2:
> >> +    iounmap(info->mem[2].internal_addr);
> >> +out_unmap:
> >> +    iounmap(info->mem[0].internal_addr);
> >> +out_release:
> >> +    pci_release_regions(dev);
> >> +out_disable:
> >> +    pci_disable_device(dev);
> >> +out_free:
> >> +    kfree (info);
> >> +    return -ENODEV;
> >> +}
> >> +
> >> +static void ivshmem_pci_remove(struct pci_dev *dev)
> >> +{
> >> +    struct uio_info *info = pci_get_drvdata(dev);
> >> +
> >> +    uio_unregister_device(info);
> >> +    pci_release_regions(dev);
> >> +    pci_disable_device(dev);
> >> +    pci_set_drvdata(dev, NULL);
> >> +    iounmap(info->mem[0].internal_addr);
> >> +
> >> +    kfree (info);
> >> +}
> >> +
> >> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
> >> +    {
> >> +        .vendor =    0x1af4,
> >> +        .device =    0x1110,
> >
> > vendor ids must be registered with PCI SIG.
> > this one does not seem to be registered.
> >
> >> +        .subvendor =    PCI_ANY_ID,
> >> +        .subdevice =    PCI_ANY_ID,
> >> +    },
> >> +    { 0, }
> >> +};
> >> +
> >> +static struct pci_driver ivshmem_pci_driver = {
> >> +    .name = "uio_ivshmem",
> >> +    .id_table = ivshmem_pci_ids,
> >> +    .probe = ivshmem_pci_probe,
> >> +    .remove = ivshmem_pci_remove,
> >> +};
> >> +
> >> +static int __init ivshmem_init_module(void)
> >> +{
> >> +    return pci_register_driver(&ivshmem_pci_driver);
> >> +}
> >> +
> >> +static void __exit ivshmem_exit_module(void)
> >> +{
> >> +    pci_unregister_driver(&ivshmem_pci_driver);
> >> +}
> >> +
> >> +module_init(ivshmem_init_module);
> >> +module_exit(ivshmem_exit_module);
> >> +
> >> +MODULE_DEVICE_TABLE(pci, ivshmem_pci_ids);
> >> +MODULE_LICENSE("GPL v2");
> >> +MODULE_AUTHOR("Cam Macdonell");
> >> --
> >> 1.6.6.1
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe kvm" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> >
Avi Kivity - March 25, 2010, 4:35 p.m.
On 03/25/2010 06:24 PM, Cam Macdonell wrote:
>
>> There is now a generic PCI 2.3 driver that can handle all PCI devices.  It
>> doesn't support MSI, but if we add MSI support then it can be used without
>> the need for a specialized driver.
>>      
> Agreed, I'd be happy to use the generic driver if MSI is there.  What
> would MSI support for UIO look like?  An array of "struct uio_irq" for
> the different vectors?
>    

I'm not familiar with the uio internals, but for the interface, an 
ioctl() on the fd to assign an eventfd to an MSI vector.  Similar to 
ioeventfd, but instead of mapping a doorbell to an eventfd, it maps a 
real MSI to an eventfd.

That would be very useful for device assignment: we can pick up a uio 
device, map its vectors, and give them to a guest.
Avi Kivity - March 25, 2010, 4:36 p.m.
On 03/25/2010 11:05 AM, Michael S. Tsirkin wrote:
>
>> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
>> +    {
>> +        .vendor =    0x1af4,
>> +        .device =    0x1110,
>>      
> vendor ids must be registered with PCI SIG.
> this one does not seem to be registered.
>    

That's the Qumranet vendor ID.
Michael S. Tsirkin - March 25, 2010, 4:37 p.m.
On Thu, Mar 25, 2010 at 06:36:02PM +0200, Avi Kivity wrote:
> On 03/25/2010 11:05 AM, Michael S. Tsirkin wrote:
>>
>>> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
>>> +    {
>>> +        .vendor =    0x1af4,
>>> +        .device =    0x1110,
>>>      
>> vendor ids must be registered with PCI SIG.
>> this one does not seem to be registered.
>>    
>
> That's the Qumranet vendor ID.

Isn't virtio_pci matching that id already?

> -- 
> error compiling committee.c: too many arguments to function
Michael S. Tsirkin - March 25, 2010, 4:40 p.m.
On Thu, Mar 25, 2010 at 06:32:15PM +0200, Avi Kivity wrote:
> On 03/25/2010 06:23 PM, Anthony Liguori wrote:
>>> There has been previous discussion of virtio, however while virtio is
>>> good for exporting guest memory, it's not ideal for importing memory
>>> into a guest.
>>
>> virtio is a DMA-based API which means that it doesn't assume cache  
>> coherent shared memory.  The PCI transport takes advantage of cache  
>> coherent shared memory but it's not strictly required.
>
> Aren't we violating this by not using dma_alloc_coherent() for the queues?

I don't see what changing this would buys us though, unless
a non-cache coherent architecture implements kvm.
TCG runs everything on a single processor so there are
no cache coherency issues.

> -- 
> error compiling committee.c: too many arguments to function
Michael S. Tsirkin - March 25, 2010, 4:42 p.m.
On Thu, Mar 25, 2010 at 10:24:20AM -0600, Cam Macdonell wrote:
> On Thu, Mar 25, 2010 at 3:46 AM, Avi Kivity <avi@redhat.com> wrote:
> > On 03/25/2010 08:09 AM, Cam Macdonell wrote:
> >>
> >> This patch adds a driver for my shared memory PCI device using the uio_pci
> >> interface.  The driver has three memory regions.  The first memory region
> >> is for
> >> device registers for sending interrupts. The second BAR is for receiving
> >> MSI-X
> >> interrupts and the third memory region maps the shared memory.  The device
> >> only
> >> exports the first and third memory regions to userspace.
> >>
> >> This driver supports MSI-X and regular pin interrupts.  Currently, the
> >> number of
> >> MSI vectors is set to 4 which could be increased, but the driver will work
> >> with
> >> fewer vectors.  If MSI is not available, then regular interrupts will be
> >> used.
> >>
> >
> > There is now a generic PCI 2.3 driver that can handle all PCI devices.  It
> > doesn't support MSI, but if we add MSI support then it can be used without
> > the need for a specialized driver.
> 
> Agreed, I'd be happy to use the generic driver if MSI is there.  What
> would MSI support for UIO look like?  An array of "struct uio_irq" for
> the different vectors?
> 
> Cam

My idea was to supply a way to bind eventfd to a vector.

> >
> > --
> > error compiling committee.c: too many arguments to function
> >
> >
Avi Kivity - March 25, 2010, 4:50 p.m.
On 03/25/2010 06:40 PM, Michael S. Tsirkin wrote:
> On Thu, Mar 25, 2010 at 06:32:15PM +0200, Avi Kivity wrote:
>    
>> On 03/25/2010 06:23 PM, Anthony Liguori wrote:
>>      
>>>> There has been previous discussion of virtio, however while virtio is
>>>> good for exporting guest memory, it's not ideal for importing memory
>>>> into a guest.
>>>>          
>>> virtio is a DMA-based API which means that it doesn't assume cache
>>> coherent shared memory.  The PCI transport takes advantage of cache
>>> coherent shared memory but it's not strictly required.
>>>        
>> Aren't we violating this by not using dma_alloc_coherent() for the queues?
>>      
> I don't see what changing this would buys us though, unless
> a non-cache coherent architecture implements kvm.
>    

We're preventing people from implementing virtio devices in hardware, 
not that anyone's doing that.
Avi Kivity - March 25, 2010, 4:51 p.m.
On 03/25/2010 06:37 PM, Michael S. Tsirkin wrote:
> On Thu, Mar 25, 2010 at 06:36:02PM +0200, Avi Kivity wrote:
>    
>> On 03/25/2010 11:05 AM, Michael S. Tsirkin wrote:
>>      
>>>        
>>>> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
>>>> +    {
>>>> +        .vendor =    0x1af4,
>>>> +        .device =    0x1110,
>>>>
>>>>          
>>> vendor ids must be registered with PCI SIG.
>>> this one does not seem to be registered.
>>>
>>>        
>> That's the Qumranet vendor ID.
>>      
> Isn't virtio_pci matching that id already?
>
>    

virtio matches devices 1000-1100 or something.
Cam Macdonell - March 25, 2010, 5:07 p.m.
On Thu, Mar 25, 2010 at 10:34 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Thu, Mar 25, 2010 at 10:30:42AM -0600, Cam Macdonell wrote:
>> On Thu, Mar 25, 2010 at 3:05 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > On Thu, Mar 25, 2010 at 12:09:36AM -0600, Cam Macdonell wrote:
>> >> This patch adds a driver for my shared memory PCI device using the uio_pci
>> >> interface.  The driver has three memory regions.  The first memory region is for
>> >> device registers for sending interrupts. The second BAR is for receiving MSI-X
>> >> interrupts and the third memory region maps the shared memory.  The device only
>> >> exports the first and third memory regions to userspace.
>> >>
>> >> This driver supports MSI-X and regular pin interrupts.  Currently, the number of
>> >> MSI vectors is set to 4 which could be increased, but the driver will work with
>> >> fewer vectors.  If MSI is not available, then regular interrupts will be used.
>> >> ---
>> >>  drivers/uio/Kconfig       |    8 ++
>> >>  drivers/uio/Makefile      |    1 +
>> >>  drivers/uio/uio_ivshmem.c |  235 +++++++++++++++++++++++++++++++++++++++++++++
>> >>  3 files changed, 244 insertions(+), 0 deletions(-)
>> >>  create mode 100644 drivers/uio/uio_ivshmem.c
>> >>
>> >> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
>> >> index 1da73ec..b92cded 100644
>> >> --- a/drivers/uio/Kconfig
>> >> +++ b/drivers/uio/Kconfig
>> >> @@ -74,6 +74,14 @@ config UIO_SERCOS3
>> >>
>> >>         If you compile this as a module, it will be called uio_sercos3.
>> >>
>> >> +config UIO_IVSHMEM
>> >> +     tristate "KVM shared memory PCI driver"
>> >> +     default n
>> >> +     help
>> >> +       Userspace I/O interface for the KVM shared memory device.  This
>> >> +       driver will make available two memory regions, the first is
>> >> +       registers and the second is a region for sharing between VMs.
>> >> +
>> >>  config UIO_PCI_GENERIC
>> >>       tristate "Generic driver for PCI 2.3 and PCI Express cards"
>> >>       depends on PCI
>> >> diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
>> >> index 18fd818..25c1ca5 100644
>> >> --- a/drivers/uio/Makefile
>> >> +++ b/drivers/uio/Makefile
>> >> @@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC) += uio_aec.o
>> >>  obj-$(CONFIG_UIO_SERCOS3)    += uio_sercos3.o
>> >>  obj-$(CONFIG_UIO_PCI_GENERIC)        += uio_pci_generic.o
>> >>  obj-$(CONFIG_UIO_NETX)       += uio_netx.o
>> >> +obj-$(CONFIG_UIO_IVSHMEM) += uio_ivshmem.o
>> >> diff --git a/drivers/uio/uio_ivshmem.c b/drivers/uio/uio_ivshmem.c
>> >> new file mode 100644
>> >> index 0000000..607435b
>> >> --- /dev/null
>> >> +++ b/drivers/uio/uio_ivshmem.c
>> >> @@ -0,0 +1,235 @@
>> >> +/*
>> >> + * UIO IVShmem Driver
>> >> + *
>> >> + * (C) 2009 Cam Macdonell
>> >> + * based on Hilscher CIF card driver (C) 2007 Hans J. Koch <hjk@linutronix.de>
>> >> + *
>> >> + * Licensed under GPL version 2 only.
>> >> + *
>> >> + */
>> >> +
>> >> +#include <linux/device.h>
>> >> +#include <linux/module.h>
>> >> +#include <linux/pci.h>
>> >> +#include <linux/uio_driver.h>
>> >> +
>> >> +#include <asm/io.h>
>> >> +
>> >> +#define IntrStatus 0x04
>> >> +#define IntrMask 0x00
>> >> +
>> >> +struct ivshmem_info {
>> >> +    struct uio_info *uio;
>> >> +    struct pci_dev *dev;
>> >> +    char (*msix_names)[256];
>> >> +    struct msix_entry *msix_entries;
>> >> +    int nvectors;
>> >> +};
>> >> +
>> >> +static irqreturn_t ivshmem_handler(int irq, struct uio_info *dev_info)
>> >> +{
>> >> +
>> >> +    void __iomem *plx_intscr = dev_info->mem[0].internal_addr
>> >> +                    + IntrStatus;
>> >> +    u32 val;
>> >> +
>> >> +    val = readl(plx_intscr);
>> >> +    if (val == 0)
>> >> +        return IRQ_NONE;
>> >> +
>> >> +    printk(KERN_INFO "Regular interrupt (val = %d)\n", val);
>> >> +    return IRQ_HANDLED;
>> >> +}
>> >> +
>> >> +static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
>> >> +{
>> >> +
>> >> +    struct uio_info * dev_info = (struct uio_info *) opaque;
>> >> +
>> >> +    /* we have to do this explicitly when using MSI-X */
>> >> +    uio_event_notify(dev_info);
>> >
>> > How does userspace know which vector was triggered?
>>
>> Right now, it doesn't.  If a user had a particular need they would
>> need to write their own uio driver.  I guess this leads to a
>> discussion of MSI support in UIO and how they would work with the
>> userspace.
>
> So why request more than one vector then?

No strong reason, to some extent an artifact of my playing with uio
and MSI together.  But, since interrupts on vector 0 are raised on
guest joins/exits, a user could modify the driver to ignore those if
they wanted.  Also, this way the device doesn't need to know if the
guest is using 1 or 2 vectors.

>
>> >
>> >> +    printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
>> >> +    return IRQ_HANDLED;
>> >> +
>> >
>> > extra empty line
>> >
>> >> +}
>> >> +
>> >> +static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
>> >> +{
>> >> +    int i, err;
>> >> +    const char *name = "ivshmem";
>> >> +
>> >> +    printk(KERN_INFO "devname is %s\n", name);
>> >
>> > These KERN_INFO messages need to be cleaned up, they would be
>> > look confusing in the log.
>>
>> Agreed.  I will clean most of these out.
>>
>> >
>> >> +    ivs_info->nvectors = nvectors;
>> >> +
>> >> +
>> >> +    ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
>> >> +            GFP_KERNEL);
>> >> +    ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
>> >> +            GFP_KERNEL);
>> >> +
>> >
>> > need to handle errors
>> >
>> >> +    for (i = 0; i < nvectors; ++i)
>> >> +        ivs_info->msix_entries[i].entry = i;
>> >> +
>> >> +    err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
>> >> +                    ivs_info->nvectors);
>> >> +    if (err > 0) {
>> >> +        ivs_info->nvectors = err; /* msi-x positive error code
>> >> +                                     returns the number available*/
>> >> +        err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
>> >> +                    ivs_info->nvectors);
>> >> +        if (err > 0) {
>> >> +            printk(KERN_INFO "no MSI (%d). Back to INTx.\n", err);
>> >> +            return -ENOSPC;
>> >> +        }
>> >> +    }
>> >
>> > we can also get err < 0.
>> >
>> >> +
>> >> +    printk(KERN_INFO "err is %d\n", err);
>> >> +    if (err) return err;
>> >
>> > coding style rule violation
>> >
>> >> +
>> >> +    for (i = 0; i < ivs_info->nvectors; i++) {
>> >> +
>> >> +        snprintf(ivs_info->msix_names[i], sizeof *ivs_info->msix_names,
>> >> +            "%s-config", name);
>> >> +
>> >> +        ivs_info->msix_entries[i].entry = i;
>> >> +        err = request_irq(ivs_info->msix_entries[i].vector,
>> >> +            ivshmem_msix_handler, 0,
>> >> +            ivs_info->msix_names[i], ivs_info->uio);
>> >> +
>> >> +        if (err) {
>> >> +            return -ENOSPC;
>> >
>> > coding style rule violation
>> > no undo on error handling
>> > why override error code with -ENOSPC?
>>
>> Ah, I think I've confused linux and qemu coding styles perhaps.  I'll
>> fix these and others above.
>>
>> >
>> >> +        }
>> >> +    }
>> >> +
>> >> +    return 0;
>> >> +}
>> >> +
>> >> +static int __devinit ivshmem_pci_probe(struct pci_dev *dev,
>> >> +                    const struct pci_device_id *id)
>> >> +{
>> >> +    struct uio_info *info;
>> >> +    struct ivshmem_info * ivshmem_info;
>> >> +    int nvectors = 4;
>> >> +
>> >> +    info = kzalloc(sizeof(struct uio_info), GFP_KERNEL);
>> >> +    if (!info)
>> >> +        return -ENOMEM;
>> >> +
>> >> +    ivshmem_info = kzalloc(sizeof(struct ivshmem_info), GFP_KERNEL);
>> >> +    if (!ivshmem_info) {
>> >> +        kfree(info);
>> >> +        return -ENOMEM;
>> >> +    }
>> >> +
>> >> +    if (pci_enable_device(dev))
>> >> +        goto out_free;
>> >> +
>> >> +    if (pci_request_regions(dev, "ivshmem"))
>> >> +        goto out_disable;
>> >> +
>> >> +    info->mem[0].addr = pci_resource_start(dev, 0);
>> >> +    if (!info->mem[0].addr)
>> >> +        goto out_release;
>> >> +
>> >> +    info->mem[0].size = pci_resource_len(dev, 0);
>> >> +    info->mem[0].internal_addr = pci_ioremap_bar(dev, 0);
>> >> +    if (!info->mem[0].internal_addr) {
>> >> +        printk(KERN_INFO "got a null");
>> >> +        goto out_release;
>> >> +    }
>> >> +
>> >> +    info->mem[0].memtype = UIO_MEM_PHYS;
>> >> +
>> >> +    info->mem[1].addr = pci_resource_start(dev, 2);
>> >> +    if (!info->mem[1].addr)
>> >> +        goto out_unmap;
>> >> +    info->mem[1].internal_addr = pci_ioremap_bar(dev, 2);
>> >> +    if (!info->mem[1].internal_addr)
>> >> +        goto out_unmap;
>> >> +
>> >> +    info->mem[1].size = pci_resource_len(dev, 2);
>> >> +    info->mem[1].memtype = UIO_MEM_PHYS;
>> >> +
>> >> +    ivshmem_info->uio = info;
>> >> +    ivshmem_info->dev = dev;
>> >> +
>> >> +    if (request_msix_vectors(ivshmem_info, nvectors) != 0) {
>> >> +        printk(KERN_INFO "regular IRQs\n");
>> >> +        info->irq = dev->irq;
>> >> +        info->irq_flags = IRQF_SHARED;
>> >> +        info->handler = ivshmem_handler;
>> >> +        writel(0xffffffff, info->mem[0].internal_addr + IntrMask);
>> >> +    } else {
>> >> +        printk(KERN_INFO "MSI-X enabled\n");
>> >> +        info->irq = -1;
>> >> +    }
>> >> +
>> >> +    info->name = "ivshmem";
>> >> +    info->version = "0.0.1";
>> >> +
>> >> +    if (uio_register_device(&dev->dev, info))
>> >> +        goto out_unmap2;
>> >> +
>> >> +    pci_set_drvdata(dev, info);
>> >> +
>> >> +
>> >> +    return 0;
>> >> +out_unmap2:
>> >> +    iounmap(info->mem[2].internal_addr);
>> >> +out_unmap:
>> >> +    iounmap(info->mem[0].internal_addr);
>> >> +out_release:
>> >> +    pci_release_regions(dev);
>> >> +out_disable:
>> >> +    pci_disable_device(dev);
>> >> +out_free:
>> >> +    kfree (info);
>> >> +    return -ENODEV;
>> >> +}
>> >> +
>> >> +static void ivshmem_pci_remove(struct pci_dev *dev)
>> >> +{
>> >> +    struct uio_info *info = pci_get_drvdata(dev);
>> >> +
>> >> +    uio_unregister_device(info);
>> >> +    pci_release_regions(dev);
>> >> +    pci_disable_device(dev);
>> >> +    pci_set_drvdata(dev, NULL);
>> >> +    iounmap(info->mem[0].internal_addr);
>> >> +
>> >> +    kfree (info);
>> >> +}
>> >> +
>> >> +static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
>> >> +    {
>> >> +        .vendor =    0x1af4,
>> >> +        .device =    0x1110,
>> >
>> > vendor ids must be registered with PCI SIG.
>> > this one does not seem to be registered.
>> >
>> >> +        .subvendor =    PCI_ANY_ID,
>> >> +        .subdevice =    PCI_ANY_ID,
>> >> +    },
>> >> +    { 0, }
>> >> +};
>> >> +
>> >> +static struct pci_driver ivshmem_pci_driver = {
>> >> +    .name = "uio_ivshmem",
>> >> +    .id_table = ivshmem_pci_ids,
>> >> +    .probe = ivshmem_pci_probe,
>> >> +    .remove = ivshmem_pci_remove,
>> >> +};
>> >> +
>> >> +static int __init ivshmem_init_module(void)
>> >> +{
>> >> +    return pci_register_driver(&ivshmem_pci_driver);
>> >> +}
>> >> +
>> >> +static void __exit ivshmem_exit_module(void)
>> >> +{
>> >> +    pci_unregister_driver(&ivshmem_pci_driver);
>> >> +}
>> >> +
>> >> +module_init(ivshmem_init_module);
>> >> +module_exit(ivshmem_exit_module);
>> >> +
>> >> +MODULE_DEVICE_TABLE(pci, ivshmem_pci_ids);
>> >> +MODULE_LICENSE("GPL v2");
>> >> +MODULE_AUTHOR("Cam Macdonell");
>> >> --
>> >> 1.6.6.1
>> >>
>> >> --
>> >> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> >> the body of a message to majordomo@vger.kernel.org
>> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> >
>> >
>
>
Cam Macdonell - March 26, 2010, 4:14 p.m.
On Thu, Mar 25, 2010 at 10:35 AM, Avi Kivity <avi@redhat.com> wrote:
> On 03/25/2010 06:24 PM, Cam Macdonell wrote:
>>
>>> There is now a generic PCI 2.3 driver that can handle all PCI devices.
>>>  It
>>> doesn't support MSI, but if we add MSI support then it can be used
>>> without
>>> the need for a specialized driver.
>>>
>>
>> Agreed, I'd be happy to use the generic driver if MSI is there.  What
>> would MSI support for UIO look like?  An array of "struct uio_irq" for
>> the different vectors?
>>
>
> I'm not familiar with the uio internals, but for the interface, an ioctl()
> on the fd to assign an eventfd to an MSI vector.  Similar to ioeventfd, but
> instead of mapping a doorbell to an eventfd, it maps a real MSI to an
> eventfd.

uio will never support ioctls.  Maybe irqcontrol could be extended?

>
> That would be very useful for device assignment: we can pick up a uio
> device, map its vectors, and give them to a guest.
>
>
> --
> error compiling committee.c: too many arguments to function
>
>
Avi Kivity - March 27, 2010, 5:48 p.m.
On 03/26/2010 07:14 PM, Cam Macdonell wrote:
>
>> I'm not familiar with the uio internals, but for the interface, an ioctl()
>> on the fd to assign an eventfd to an MSI vector.  Similar to ioeventfd, but
>> instead of mapping a doorbell to an eventfd, it maps a real MSI to an
>> eventfd.
>>      
> uio will never support ioctls.

Why not?

> Maybe irqcontrol could be extended?
>    

What's irqcontrol?
Michael S. Tsirkin - March 28, 2010, 7:47 a.m.
On Sat, Mar 27, 2010 at 08:48:34PM +0300, Avi Kivity wrote:
> On 03/26/2010 07:14 PM, Cam Macdonell wrote:
>>
>>> I'm not familiar with the uio internals, but for the interface, an ioctl()
>>> on the fd to assign an eventfd to an MSI vector.  Similar to ioeventfd, but
>>> instead of mapping a doorbell to an eventfd, it maps a real MSI to an
>>> eventfd.
>>>      
>> uio will never support ioctls.
>
> Why not?
>
>> Maybe irqcontrol could be extended?
>>    
>
> What's irqcontrol?

uio accepts 32 bit writes to the char device file. We can encode
the fd number there, and use the high bit to signal assign/deassign.

> -- 
> Do not meddle in the internals of kernels, for they are subtle and quick to panic.
Avi Kivity - March 28, 2010, 8:02 a.m.
On 03/28/2010 10:47 AM, Michael S. Tsirkin wrote:
>>
>>> Maybe irqcontrol could be extended?
>>>
>>>        
>> What's irqcontrol?
>>      
> uio accepts 32 bit writes to the char device file. We can encode
> the fd number there, and use the high bit to signal assign/deassign.
>    

Ugh.  Very unexpandable.
Michael S. Tsirkin - March 28, 2010, 9:40 a.m.
On Sun, Mar 28, 2010 at 11:02:11AM +0300, Avi Kivity wrote:
> On 03/28/2010 10:47 AM, Michael S. Tsirkin wrote:
>>>
>>>> Maybe irqcontrol could be extended?
>>>>
>>>>        
>>> What's irqcontrol?
>>>      
>> uio accepts 32 bit writes to the char device file. We can encode
>> the fd number there, and use the high bit to signal assign/deassign.
>>    
>
> Ugh.  Very unexpandable.

It currently fails on any non-4 byte write.
So if we need more bits in the future we can always teach it
about e.g. 8 byte writes.

Do you think it's worth it doing it now already, and using
8 byte writes for msi mapping?


> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - March 28, 2010, 9:45 a.m.
On 03/28/2010 12:40 PM, Michael S. Tsirkin wrote:
>>> uio accepts 32 bit writes to the char device file. We can encode
>>> the fd number there, and use the high bit to signal assign/deassign.
>>>
>>>        
>> Ugh.  Very unexpandable.
>>      
> It currently fails on any non-4 byte write.
> So if we need more bits in the future we can always teach it
> about e.g. 8 byte writes.
>
> Do you think it's worth it doing it now already, and using
> 8 byte writes for msi mapping?
>    

Aren't ioctls a lot simpler?

Multiplexing multiple functions on write()s is just ioctls done uglier.
Michael S. Tsirkin - March 28, 2010, 10:31 a.m.
On Sun, Mar 28, 2010 at 12:45:02PM +0300, Avi Kivity wrote:
> On 03/28/2010 12:40 PM, Michael S. Tsirkin wrote:
>>>> uio accepts 32 bit writes to the char device file. We can encode
>>>> the fd number there, and use the high bit to signal assign/deassign.
>>>>
>>>>        
>>> Ugh.  Very unexpandable.
>>>      
>> It currently fails on any non-4 byte write.
>> So if we need more bits in the future we can always teach it
>> about e.g. 8 byte writes.
>>
>> Do you think it's worth it doing it now already, and using
>> 8 byte writes for msi mapping?
>>    
>
> Aren't ioctls a lot simpler?
>
> Multiplexing multiple functions on write()s is just ioctls done uglier.

I don't have an opinion here.

Writes do have an advantage that strace can show the buffer
content without being patched.

Further, something along the lines proposed above means that
we do not need to depend in a system header to get
the functionality.

> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - March 28, 2010, 11:12 a.m.
On 03/28/2010 01:31 PM, Michael S. Tsirkin wrote:
>
>> Aren't ioctls a lot simpler?
>>
>> Multiplexing multiple functions on write()s is just ioctls done uglier.
>>      
> I don't have an opinion here.
>
> Writes do have an advantage that strace can show the buffer
> content without being patched.
>    

ioctls encode the buffer size into the ioctl number, so in theory strace 
doesn't need to be taught about an ioctl to show its buffer.

> Further, something along the lines proposed above means that
> we do not need to depend in a system header to get
> the functionality.
>    

Yes, point users at the code and let them figure out how to do stuff.
Jamie Lokier - March 28, 2010, 1:28 p.m.
Avi Kivity wrote:
> ioctls encode the buffer size into the ioctl number, so in theory strace 
> doesn't need to be taught about an ioctl to show its buffer.

Unfortunately ioctl numbers don't always follow that rule :-(
But maybe that's just awful proprietary drivers that I've seen.

Anyway, strace should be taught how to read kernel headers to get
ioctl types ;-)

-- Jamie
malc - March 28, 2010, 1:47 p.m.
On Sun, 28 Mar 2010, Jamie Lokier wrote:

> Avi Kivity wrote:
> > ioctls encode the buffer size into the ioctl number, so in theory strace 
> > doesn't need to be taught about an ioctl to show its buffer.
> 
> Unfortunately ioctl numbers don't always follow that rule :-(

It's not a rule to begin with, since there's no way to enforce it.

> But maybe that's just awful proprietary drivers that I've seen.
> 
> Anyway, strace should be taught how to read kernel headers to get
> ioctl types ;-)
> 
> -- Jamie
> 
>
Cam Macdonell - March 28, 2010, 7:48 p.m.
On Sat, Mar 27, 2010 at 11:48 AM, Avi Kivity <avi@redhat.com> wrote:
> On 03/26/2010 07:14 PM, Cam Macdonell wrote:
>>
>>> I'm not familiar with the uio internals, but for the interface, an
>>> ioctl()
>>> on the fd to assign an eventfd to an MSI vector.  Similar to ioeventfd,
>>> but
>>> instead of mapping a doorbell to an eventfd, it maps a real MSI to an
>>> eventfd.
>>>
>>
>> uio will never support ioctls.
>
> Why not?

Perhaps I spoke too strongly, but it was rejected before

http://thread.gmane.org/gmane.linux.kernel/756481

With a compelling case perhaps it could be added.
Avi Kivity - March 29, 2010, 8:59 p.m.
On 03/28/2010 10:48 PM, Cam Macdonell wrote:
> On Sat, Mar 27, 2010 at 11:48 AM, Avi Kivity<avi@redhat.com>  wrote:
>    
>> On 03/26/2010 07:14 PM, Cam Macdonell wrote:
>>      
>>>        
>>>> I'm not familiar with the uio internals, but for the interface, an
>>>> ioctl()
>>>> on the fd to assign an eventfd to an MSI vector.  Similar to ioeventfd,
>>>> but
>>>> instead of mapping a doorbell to an eventfd, it maps a real MSI to an
>>>> eventfd.
>>>>
>>>>          
>>> uio will never support ioctls.
>>>        
>> Why not?
>>      
> Perhaps I spoke too strongly, but it was rejected before
>
> http://thread.gmane.org/gmane.linux.kernel/756481
>
> With a compelling case perhaps it could be added.
>    

Ah, the usual "ioctls are ugly, go away".

It could be done via sysfs:

   $ cat /sys/.../msix/max-interrupts
   256
   $ echo 4 > /sys/.../msix/allocate
   $ # subdirectories 0 1 2 3 magically appear
   $ # bind fd 13 to msix
   $ echo 13 > /sys/.../msix/2/bind-fd
   $ # from now on, msix interrupt 2 will call eventfd_signal() on fd 13

Call me old fashioned, but I prefer ioctls.
Cam Macdonell - March 30, 2010, 2:52 p.m.
On Mon, Mar 29, 2010 at 2:59 PM, Avi Kivity <avi@redhat.com> wrote:
> On 03/28/2010 10:48 PM, Cam Macdonell wrote:
>>
>> On Sat, Mar 27, 2010 at 11:48 AM, Avi Kivity<avi@redhat.com>  wrote:
>>
>>>
>>> On 03/26/2010 07:14 PM, Cam Macdonell wrote:
>>>
>>>>
>>>>
>>>>>
>>>>> I'm not familiar with the uio internals, but for the interface, an
>>>>> ioctl()
>>>>> on the fd to assign an eventfd to an MSI vector.  Similar to ioeventfd,
>>>>> but
>>>>> instead of mapping a doorbell to an eventfd, it maps a real MSI to an
>>>>> eventfd.
>>>>>
>>>>>
>>>>
>>>> uio will never support ioctls.
>>>>
>>>
>>> Why not?
>>>
>>
>> Perhaps I spoke too strongly, but it was rejected before
>>
>> http://thread.gmane.org/gmane.linux.kernel/756481
>>
>> With a compelling case perhaps it could be added.
>>
>
> Ah, the usual "ioctls are ugly, go away".
>
> It could be done via sysfs:
>
>  $ cat /sys/.../msix/max-interrupts
>  256
>  $ echo 4 > /sys/.../msix/allocate
>  $ # subdirectories 0 1 2 3 magically appear
>  $ # bind fd 13 to msix
>  $ echo 13 > /sys/.../msix/2/bind-fd
>  $ # from now on, msix interrupt 2 will call eventfd_signal() on fd 13
>
> Call me old fashioned, but I prefer ioctls.

Good point.  iiuc, the goal relative to ioctls in UIO was to not have
device drivers creating their own device-specific ABIs and drivers
that are just massive switch statements.  Having ioctls that support
functions for UIO in general, such as pairing msi vectors to eventfds,
does not go against that goal.

>
> --
> Do not meddle in the internals of kernels, for they are subtle and quick to
> panic.
>
>
Michael S. Tsirkin - March 31, 2010, 9:12 a.m.
On Mon, Mar 29, 2010 at 11:59:24PM +0300, Avi Kivity wrote:
> On 03/28/2010 10:48 PM, Cam Macdonell wrote:
>> On Sat, Mar 27, 2010 at 11:48 AM, Avi Kivity<avi@redhat.com>  wrote:
>>    
>>> On 03/26/2010 07:14 PM, Cam Macdonell wrote:
>>>      
>>>>        
>>>>> I'm not familiar with the uio internals, but for the interface, an
>>>>> ioctl()
>>>>> on the fd to assign an eventfd to an MSI vector.  Similar to ioeventfd,
>>>>> but
>>>>> instead of mapping a doorbell to an eventfd, it maps a real MSI to an
>>>>> eventfd.
>>>>>
>>>>>          
>>>> uio will never support ioctls.
>>>>        
>>> Why not?
>>>      
>> Perhaps I spoke too strongly, but it was rejected before
>>
>> http://thread.gmane.org/gmane.linux.kernel/756481
>>
>> With a compelling case perhaps it could be added.
>>    
>
> Ah, the usual "ioctls are ugly, go away".
>
> It could be done via sysfs:
>
>   $ cat /sys/.../msix/max-interrupts
>   256

This one can be discovered with existing sysfs.

>   $ echo 4 > /sys/.../msix/allocate
>   $ # subdirectories 0 1 2 3 magically appear
>   $ # bind fd 13 to msix

There's no way to know, when qemu starts, how many vectors will be used
by driver.  So I think we can just go ahead and allocate as many vectors
as supported by device at the moment when the first eventfd is bound.

>   $ echo 13 > /sys/.../msix/2/bind-fd

I think that this one can't work, both fd 13 and sysfs file will get
closed when /bin/echo process exits.

>   $ # from now on, msix interrupt 2 will call eventfd_signal() on fd 13
>
> Call me old fashioned, but I prefer ioctls.

I think we will have to use ioctl or irqcontrol because of lifetime
issues outlines above.

> -- 
> Do not meddle in the internals of kernels, for they are subtle and quick to panic.
Avi Kivity - April 1, 2010, 8:58 a.m.
On 03/31/2010 12:12 PM, Michael S. Tsirkin wrote:
>
>
>>    $ echo 4>  /sys/.../msix/allocate
>>    $ # subdirectories 0 1 2 3 magically appear
>>    $ # bind fd 13 to msix
>>      
> There's no way to know, when qemu starts, how many vectors will be used
> by driver.  So I think we can just go ahead and allocate as many vectors
> as supported by device at the moment when the first eventfd is bound.
>    

That will cause a huge amount of vectors to be allocated.  It's better 
to do this dynamically in response to guest programming.

>>    $ echo 13>  /sys/.../msix/2/bind-fd
>>      
> I think that this one can't work, both fd 13 and sysfs file will get
> closed when /bin/echo process exits.
>    

I meant figuratively.  It's pointless to bind an eventfd from a shell.  
You'd use fprintf() from qemu to bind the eventfd.

>>    $ # from now on, msix interrupt 2 will call eventfd_signal() on fd 13
>>
>> Call me old fashioned, but I prefer ioctls.
>>      
> I think we will have to use ioctl or irqcontrol because of lifetime
> issues outlines above.
>    

The lifetime issues don't exist if you do all that from qemu.  That's 
not to say I prefer sysfs to ioctl.

What's irqcontrol?
Avi Kivity - April 1, 2010, 8:59 a.m.
On 03/30/2010 05:52 PM, Cam Macdonell wrote:
>
>> Ah, the usual "ioctls are ugly, go away".
>>
>> It could be done via sysfs:
>>
>>   $ cat /sys/.../msix/max-interrupts
>>   256
>>   $ echo 4>  /sys/.../msix/allocate
>>   $ # subdirectories 0 1 2 3 magically appear
>>   $ # bind fd 13 to msix
>>   $ echo 13>  /sys/.../msix/2/bind-fd
>>   $ # from now on, msix interrupt 2 will call eventfd_signal() on fd 13
>>
>> Call me old fashioned, but I prefer ioctls.
>>      
> Good point.  iiuc, the goal relative to ioctls in UIO was to not have
> device drivers creating their own device-specific ABIs and drivers
> that are just massive switch statements.  Having ioctls that support
> functions for UIO in general, such as pairing msi vectors to eventfds,
> does not go against that goal.
>    

Device specific ioctls are clearly a bad idea for uio.
Michael S. Tsirkin - April 1, 2010, 10:59 a.m.
On Thu, Apr 01, 2010 at 11:58:29AM +0300, Avi Kivity wrote:
> On 03/31/2010 12:12 PM, Michael S. Tsirkin wrote:
>>
>>
>>>    $ echo 4>  /sys/.../msix/allocate
>>>    $ # subdirectories 0 1 2 3 magically appear
>>>    $ # bind fd 13 to msix
>>>      
>> There's no way to know, when qemu starts, how many vectors will be used
>> by driver.  So I think we can just go ahead and allocate as many vectors
>> as supported by device at the moment when the first eventfd is bound.
>>    
>
> That will cause a huge amount of vectors to be allocated.  It's better  
> to do this dynamically in response to guest programming.

guest unmasks vectors one by one.
Linux does not have an API to allocate vectors one by one now.

>>>    $ echo 13>  /sys/.../msix/2/bind-fd
>>>      
>> I think that this one can't work, both fd 13 and sysfs file will get
>> closed when /bin/echo process exits.
>>    
>
> I meant figuratively.  It's pointless to bind an eventfd from a shell.   
> You'd use fprintf() from qemu to bind the eventfd.
>
>>>    $ # from now on, msix interrupt 2 will call eventfd_signal() on fd 13
>>>
>>> Call me old fashioned, but I prefer ioctls.
>>>      
>> I think we will have to use ioctl or irqcontrol because of lifetime
>> issues outlines above.
>>    
>
> The lifetime issues don't exist if you do all that from qemu.  That's  
> not to say I prefer sysfs to ioctl.
>
> What's irqcontrol?

uio core accepts 32 bit writes and passes the value written
as int to an irqcontrol callback in the device.

> -- 
> error compiling committee.c: too many arguments to function
Avi Kivity - April 1, 2010, 11:27 a.m.
On 04/01/2010 01:59 PM, Michael S. Tsirkin wrote:
> On Thu, Apr 01, 2010 at 11:58:29AM +0300, Avi Kivity wrote:
>    
>> On 03/31/2010 12:12 PM, Michael S. Tsirkin wrote:
>>      
>>>
>>>        
>>>>     $ echo 4>   /sys/.../msix/allocate
>>>>     $ # subdirectories 0 1 2 3 magically appear
>>>>     $ # bind fd 13 to msix
>>>>
>>>>          
>>> There's no way to know, when qemu starts, how many vectors will be used
>>> by driver.  So I think we can just go ahead and allocate as many vectors
>>> as supported by device at the moment when the first eventfd is bound.
>>>
>>>        
>> That will cause a huge amount of vectors to be allocated.  It's better
>> to do this dynamically in response to guest programming.
>>      
> guest unmasks vectors one by one.
> Linux does not have an API to allocate vectors one by one now.
>    

Well, maybe it should.  I'm worried that the guest could exhaust host 
irqs if we allocate the maximum amount.

>> What's irqcontrol?
>>      
> uio core accepts 32 bit writes and passes the value written
> as int to an irqcontrol callback in the device.
>    

I see.  In this case I withdraw my earlier objection about using write() 
to control eventfd binding, as it's clearly interrupt related.  I still 
prefer an explicit ioctl though.  I think you suggested to allow 
irqcontrol to support >4 byte writes, that should also work.

Patch

diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index 1da73ec..b92cded 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -74,6 +74,14 @@  config UIO_SERCOS3
 
 	  If you compile this as a module, it will be called uio_sercos3.
 
+config UIO_IVSHMEM
+	tristate "KVM shared memory PCI driver"
+	default n
+	help
+	  Userspace I/O interface for the KVM shared memory device.  This
+	  driver will make available two memory regions, the first is
+	  registers and the second is a region for sharing between VMs.
+
 config UIO_PCI_GENERIC
 	tristate "Generic driver for PCI 2.3 and PCI Express cards"
 	depends on PCI
diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
index 18fd818..25c1ca5 100644
--- a/drivers/uio/Makefile
+++ b/drivers/uio/Makefile
@@ -6,3 +6,4 @@  obj-$(CONFIG_UIO_AEC)	+= uio_aec.o
 obj-$(CONFIG_UIO_SERCOS3)	+= uio_sercos3.o
 obj-$(CONFIG_UIO_PCI_GENERIC)	+= uio_pci_generic.o
 obj-$(CONFIG_UIO_NETX)	+= uio_netx.o
+obj-$(CONFIG_UIO_IVSHMEM) += uio_ivshmem.o
diff --git a/drivers/uio/uio_ivshmem.c b/drivers/uio/uio_ivshmem.c
new file mode 100644
index 0000000..607435b
--- /dev/null
+++ b/drivers/uio/uio_ivshmem.c
@@ -0,0 +1,235 @@ 
+/*
+ * UIO IVShmem Driver
+ *
+ * (C) 2009 Cam Macdonell
+ * based on Hilscher CIF card driver (C) 2007 Hans J. Koch <hjk@linutronix.de>
+ *
+ * Licensed under GPL version 2 only.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/uio_driver.h>
+
+#include <asm/io.h>
+
+#define IntrStatus 0x04
+#define IntrMask 0x00
+
+struct ivshmem_info {
+    struct uio_info *uio;
+    struct pci_dev *dev;
+    char (*msix_names)[256];
+    struct msix_entry *msix_entries;
+    int nvectors;
+};
+
+static irqreturn_t ivshmem_handler(int irq, struct uio_info *dev_info)
+{
+
+    void __iomem *plx_intscr = dev_info->mem[0].internal_addr
+                    + IntrStatus;
+    u32 val;
+
+    val = readl(plx_intscr);
+    if (val == 0)
+        return IRQ_NONE;
+
+    printk(KERN_INFO "Regular interrupt (val = %d)\n", val);
+    return IRQ_HANDLED;
+}
+
+static irqreturn_t ivshmem_msix_handler(int irq, void *opaque)
+{
+
+    struct uio_info * dev_info = (struct uio_info *) opaque;
+
+    /* we have to do this explicitly when using MSI-X */
+    uio_event_notify(dev_info);
+    printk(KERN_INFO "MSI-X interrupt (%d)\n", irq);
+    return IRQ_HANDLED;
+
+}
+
+static int request_msix_vectors(struct ivshmem_info *ivs_info, int nvectors)
+{
+    int i, err;
+    const char *name = "ivshmem";
+
+    printk(KERN_INFO "devname is %s\n", name);
+    ivs_info->nvectors = nvectors;
+
+
+    ivs_info->msix_entries = kmalloc(nvectors * sizeof *ivs_info->msix_entries,
+            GFP_KERNEL);
+    ivs_info->msix_names = kmalloc(nvectors * sizeof *ivs_info->msix_names,
+            GFP_KERNEL);
+
+    for (i = 0; i < nvectors; ++i)
+        ivs_info->msix_entries[i].entry = i;
+
+    err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
+                    ivs_info->nvectors);
+    if (err > 0) {
+        ivs_info->nvectors = err; /* msi-x positive error code
+                                     returns the number available*/
+        err = pci_enable_msix(ivs_info->dev, ivs_info->msix_entries,
+                    ivs_info->nvectors);
+        if (err > 0) {
+            printk(KERN_INFO "no MSI (%d). Back to INTx.\n", err);
+            return -ENOSPC;
+        }
+    }
+
+    printk(KERN_INFO "err is %d\n", err);
+    if (err) return err;
+
+    for (i = 0; i < ivs_info->nvectors; i++) {
+
+        snprintf(ivs_info->msix_names[i], sizeof *ivs_info->msix_names,
+            "%s-config", name);
+
+        ivs_info->msix_entries[i].entry = i;
+        err = request_irq(ivs_info->msix_entries[i].vector,
+            ivshmem_msix_handler, 0,
+            ivs_info->msix_names[i], ivs_info->uio);
+
+        if (err) {
+            return -ENOSPC;
+        }
+    }
+
+    return 0;
+}
+
+static int __devinit ivshmem_pci_probe(struct pci_dev *dev,
+                    const struct pci_device_id *id)
+{
+    struct uio_info *info;
+    struct ivshmem_info * ivshmem_info;
+    int nvectors = 4;
+
+    info = kzalloc(sizeof(struct uio_info), GFP_KERNEL);
+    if (!info)
+        return -ENOMEM;
+
+    ivshmem_info = kzalloc(sizeof(struct ivshmem_info), GFP_KERNEL);
+    if (!ivshmem_info) {
+        kfree(info);
+        return -ENOMEM;
+    }
+
+    if (pci_enable_device(dev))
+        goto out_free;
+
+    if (pci_request_regions(dev, "ivshmem"))
+        goto out_disable;
+
+    info->mem[0].addr = pci_resource_start(dev, 0);
+    if (!info->mem[0].addr)
+        goto out_release;
+
+    info->mem[0].size = pci_resource_len(dev, 0);
+    info->mem[0].internal_addr = pci_ioremap_bar(dev, 0);
+    if (!info->mem[0].internal_addr) {
+        printk(KERN_INFO "got a null");
+        goto out_release;
+    }
+
+    info->mem[0].memtype = UIO_MEM_PHYS;
+
+    info->mem[1].addr = pci_resource_start(dev, 2);
+    if (!info->mem[1].addr)
+        goto out_unmap;
+    info->mem[1].internal_addr = pci_ioremap_bar(dev, 2);
+    if (!info->mem[1].internal_addr)
+        goto out_unmap;
+
+    info->mem[1].size = pci_resource_len(dev, 2);
+    info->mem[1].memtype = UIO_MEM_PHYS;
+
+    ivshmem_info->uio = info;
+    ivshmem_info->dev = dev;
+
+    if (request_msix_vectors(ivshmem_info, nvectors) != 0) {
+        printk(KERN_INFO "regular IRQs\n");
+        info->irq = dev->irq;
+        info->irq_flags = IRQF_SHARED;
+        info->handler = ivshmem_handler;
+        writel(0xffffffff, info->mem[0].internal_addr + IntrMask);
+    } else {
+        printk(KERN_INFO "MSI-X enabled\n");
+        info->irq = -1;
+    }
+
+    info->name = "ivshmem";
+    info->version = "0.0.1";
+
+    if (uio_register_device(&dev->dev, info))
+        goto out_unmap2;
+
+    pci_set_drvdata(dev, info);
+
+
+    return 0;
+out_unmap2:
+    iounmap(info->mem[2].internal_addr);
+out_unmap:
+    iounmap(info->mem[0].internal_addr);
+out_release:
+    pci_release_regions(dev);
+out_disable:
+    pci_disable_device(dev);
+out_free:
+    kfree (info);
+    return -ENODEV;
+}
+
+static void ivshmem_pci_remove(struct pci_dev *dev)
+{
+    struct uio_info *info = pci_get_drvdata(dev);
+
+    uio_unregister_device(info);
+    pci_release_regions(dev);
+    pci_disable_device(dev);
+    pci_set_drvdata(dev, NULL);
+    iounmap(info->mem[0].internal_addr);
+
+    kfree (info);
+}
+
+static struct pci_device_id ivshmem_pci_ids[] __devinitdata = {
+    {
+        .vendor =    0x1af4,
+        .device =    0x1110,
+        .subvendor =    PCI_ANY_ID,
+        .subdevice =    PCI_ANY_ID,
+    },
+    { 0, }
+};
+
+static struct pci_driver ivshmem_pci_driver = {
+    .name = "uio_ivshmem",
+    .id_table = ivshmem_pci_ids,
+    .probe = ivshmem_pci_probe,
+    .remove = ivshmem_pci_remove,
+};
+
+static int __init ivshmem_init_module(void)
+{
+    return pci_register_driver(&ivshmem_pci_driver);
+}
+
+static void __exit ivshmem_exit_module(void)
+{
+    pci_unregister_driver(&ivshmem_pci_driver);
+}
+
+module_init(ivshmem_init_module);
+module_exit(ivshmem_exit_module);
+
+MODULE_DEVICE_TABLE(pci, ivshmem_pci_ids);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Cam Macdonell");