
[4/7] vbus-proxy: add a pci-to-vbus bridge

Message ID 20090803171751.17268.48169.stgit@dev.haskins.net
State RFC, archived
Delegated to: David Miller

Commit Message

Gregory Haskins Aug. 3, 2009, 5:17 p.m. UTC
This patch adds a pci-based driver to interface between a host VBUS
and the guest's vbus-proxy bus model.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/vbus/Kconfig      |   10 +
 drivers/vbus/Makefile     |    3 
 drivers/vbus/pci-bridge.c |  824 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/Kbuild      |    1 
 include/linux/vbus_pci.h  |  127 +++++++
 5 files changed, 965 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vbus/pci-bridge.c
 create mode 100644 include/linux/vbus_pci.h



Comments

Arnd Bergmann Aug. 6, 2009, 2:42 p.m. UTC | #1
On Monday 03 August 2009, Gregory Haskins wrote:
> This patch adds a pci-based driver to interface between the a host VBUS
> and the guest's vbus-proxy bus model.
> 
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>

This seems to be duplicating parts of virtio-pci that could be kept
common by extending the virtio code. Layering on top of virtio
would also make it possible to use the same features you add
on top of other transports (e.g. the s390 virtio code) without
adding yet another backend for each of them.

> +static int
> +vbus_pci_hypercall(unsigned long nr, void *data, unsigned long len)
> +{
> +	struct vbus_pci_hypercall params = {
> +		.vector = nr,
> +		.len    = len,
> +		.datap  = __pa(data),
> +	};
> +	unsigned long flags;
> +	int ret;
> +
> +	spin_lock_irqsave(&vbus_pci.lock, flags);
> +
> +	memcpy_toio(&vbus_pci.regs->hypercall.data, &params, sizeof(params));
> +	ret = ioread32(&vbus_pci.regs->hypercall.result);
> +
> +	spin_unlock_irqrestore(&vbus_pci.lock, flags);
> +
> +	return ret;
> +}

The functionality looks reasonable but please don't call this a hypercall.
A hypercall would be hypervisor specific by definition while this one
is device specific if I understand it correctly. How about "command queue",
"mailbox", "message queue", "devcall" or something else that we have in
existing PCI devices?

> +
> +static int
> +vbus_pci_device_open(struct vbus_device_proxy *vdev, int version, int flags)
> +{
> +	struct vbus_pci_device *dev = to_dev(vdev);
> +	struct vbus_pci_deviceopen params;
> +	int ret;
> +
> +	if (dev->handle)
> +		return -EINVAL;
> +
> +	params.devid   = vdev->id;
> +	params.version = version;
> +
> +	ret = vbus_pci_hypercall(VBUS_PCI_HC_DEVOPEN,
> +				 &params, sizeof(params));
> +	if (ret < 0)
> +		return ret;
> +
> +	dev->handle = params.handle;
> +
> +	return 0;
> +}

This seems to add an artificial abstraction that does not make sense
if you stick to the PCI abstraction. The two sensible and common models
for virtual devices that I've seen are:

* The hypervisor knows what virtual resources exist and provides them
  to the guest. The guest owns them as soon as they show up in the
  bus (e.g. PCI) probe. The 'handle' is preexisting.

* The guest starts without any devices and asks for resources it wants
  to access. There is no probing of resources but the guest issues
  a hypercall to get a handle to a newly created virtual device
  (or -ENODEV).

What is your reasoning for requiring both a probe and an allocation?

> +static int
> +vbus_pci_device_shm(struct vbus_device_proxy *vdev, int id, int prio,
> +		    void *ptr, size_t len,
> +		    struct shm_signal_desc *sdesc, struct shm_signal **signal,
> +		    int flags)
> +{
> +	struct vbus_pci_device *dev = to_dev(vdev);
> +	struct _signal *_signal = NULL;
> +	struct vbus_pci_deviceshm params;
> +	unsigned long iflags;
> +	int ret;
> +
> +	if (!dev->handle)
> +		return -EINVAL;
> +
> +	params.devh   = dev->handle;
> +	params.id     = id;
> +	params.flags  = flags;
> +	params.datap  = (u64)__pa(ptr);
> +	params.len    = len;
> +
> +	if (signal) {
> +		/*
> +		 * The signal descriptor must be embedded within the
> +		 * provided ptr
> +		 */
> +		if (!sdesc
> +		    || (len < sizeof(*sdesc))
> +		    || ((void *)sdesc < ptr)
> +		    || ((void *)sdesc > (ptr + len - sizeof(*sdesc))))
> +			return -EINVAL;
> +
> +		_signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
> +		if (!_signal)
> +			return -ENOMEM;
> +
> +		_signal_init(&_signal->signal, sdesc, &_signal_ops);
> +
> +		/*
> +		 * take another reference for the host.  This is dropped
> +		 * by a SHMCLOSE event
> +		 */
> +		shm_signal_get(&_signal->signal);
> +
> +		params.signal.offset = (u64)sdesc - (u64)ptr;
> +		params.signal.prio   = prio;
> +		params.signal.cookie = (u64)_signal;
> +
> +	} else
> +		params.signal.offset = -1; /* yes, this is a u32, but its ok */
> +
> +	ret = vbus_pci_hypercall(VBUS_PCI_HC_DEVSHM,
> +				 &params, sizeof(params));
> +	if (ret < 0) {
> +		if (_signal) {
> +			/*
> +			 * We held two references above, so we need to drop
> +			 * both of them
> +			 */
> +			shm_signal_put(&_signal->signal);
> +			shm_signal_put(&_signal->signal);
> +		}
> +
> +		return ret;
> +	}
> +
> +	if (signal) {
> +		_signal->handle = ret;
> +
> +		spin_lock_irqsave(&vbus_pci.lock, iflags);
> +
> +		list_add_tail(&_signal->list, &dev->shms);
> +
> +		spin_unlock_irqrestore(&vbus_pci.lock, iflags);
> +
> +		shm_signal_get(&_signal->signal);
> +		*signal = &_signal->signal;
> +	}
> +
> +	return 0;
> +}

This could be implemented by virtio devices as well, right?

> +static int
> +vbus_pci_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
> +		     size_t len, int flags)
> +{
> +	struct vbus_pci_device *dev = to_dev(vdev);
> +	struct vbus_pci_devicecall params = {
> +		.devh  = dev->handle,
> +		.func  = func,
> +		.datap = (u64)__pa(data),
> +		.len   = len,
> +		.flags = flags,
> +	};
> +
> +	if (!dev->handle)
> +		return -EINVAL;
> +
> +	return vbus_pci_hypercall(VBUS_PCI_HC_DEVCALL, &params, sizeof(params));
> +}

Why the indirection? It seems to me that you could do the simpler

static int
vbus_pci_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
		     size_t len, int flags)
{
	struct vbus_pci_device *dev = to_dev(vdev);
	struct vbus_pci_hypercall params = {
		.vector = func,
		.len    = len,
		.datap  = __pa(data),
	};
	spin_lock_irqsave(&dev.lock, flags);
	memcpy_toio(&dev.regs->hypercall.data, &params, sizeof(params));
	ret = ioread32(&dev.regs->hypercall.result);
	spin_unlock_irqrestore(&dev.lock, flags);

	return ret;
}

This gets rid of your 'handle' and the unwinding through an extra pointer
indirection. You just need to make sure that the device specific call numbers
don't conflict with any global ones.

> +
> +static struct ioq_notifier eventq_notifier;

> ...

> +/* Invoked whenever the hypervisor ioq_signal()s our eventq */
> +static void
> +eventq_wakeup(struct ioq_notifier *notifier)
> +{
> +	struct ioq_iterator iter;
> +	int ret;
> +
> +	/* We want to iterate on the head of the in-use index */
> +	ret = ioq_iter_init(&vbus_pci.eventq, &iter, ioq_idxtype_inuse, 0);
> +	BUG_ON(ret < 0);
> +
> +	ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
> +	BUG_ON(ret < 0);
> +
> +	/*
> +	 * The EOM is indicated by finding a packet that is still owned by
> +	 * the south side.
> +	 *
> +	 * FIXME: This in theory could run indefinitely if the host keeps
> +	 * feeding us events since there is nothing like a NAPI budget.  We
> +	 * might need to address that
> +	 */
> +	while (!iter.desc->sown) {
> +		struct ioq_ring_desc *desc  = iter.desc;
> +		struct vbus_pci_event *event;
> +
> +		event = (struct vbus_pci_event *)desc->cookie;
> +
> +		switch (event->eventid) {
> +		case VBUS_PCI_EVENT_DEVADD:
> +			event_devadd(&event->data.add);
> +			break;
> +		case VBUS_PCI_EVENT_DEVDROP:
> +			event_devdrop(&event->data.handle);
> +			break;
> +		case VBUS_PCI_EVENT_SHMSIGNAL:
> +			event_shmsignal(&event->data.handle);
> +			break;
> +		case VBUS_PCI_EVENT_SHMCLOSE:
> +			event_shmclose(&event->data.handle);
> +			break;
> +		default:
> +			printk(KERN_WARNING "VBUS_PCI: Unexpected event %d\n",
> +			       event->eventid);
> +			break;
> +		};
> +
> +		memset(event, 0, sizeof(*event));
> +
> +		/* Advance the in-use head */
> +		ret = ioq_iter_pop(&iter, 0);
> +		BUG_ON(ret < 0);
> +	}
> +
> +	/* And let the south side know that we changed the queue */
> +	ioq_signal(&vbus_pci.eventq, 0);
> +}

Ah, so you have a global event queue and your own device hotplug mechanism.
But why would you then still use PCI to back it? We already have PCI hotplug
to add and remove devices and you have defined per device notifier queues 
that you can use for waking up the device, right?

	Arnd <><
Gregory Haskins Aug. 6, 2009, 3:59 p.m. UTC | #2
>>> On 8/6/2009 at 10:42 AM, in message <200908061642.40614.arnd@arndb.de>, Arnd
Bergmann <arnd@arndb.de> wrote: 
> On Monday 03 August 2009, Gregory Haskins wrote:
>> This patch adds a pci-based driver to interface between the a host VBUS
>> and the guest's vbus-proxy bus model.
>> 
>> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> 
> This seems to be duplicating parts of virtio-pci that could be kept
> common by extending the virtio code. Layering on top of virtio
> would also make it possible to use the same features you add
> on top of other transports (e.g. the s390 virtio code) without
> adding yet another backend for each of them.

This doesn't make sense to me, but I suspect we are both looking at what this code does differently.  I am under the impression that you may believe that there is one of these objects per vbus device.  Note that this is just a bridge to vbus, so there is only one of these per system with potentially many vbus devices behind it.

In essence, this driver's job is to populate the "vbus-proxy" LDM bus with objects that it finds across the PCI-OTHER bridge.  This would actually sit below the virtio components in the stack, so it doesn't make sense (to me) to turn around and build this on top of virtio.  But perhaps I am missing something you are seeing.

Can you elaborate?

> 
>> +static int
>> +vbus_pci_hypercall(unsigned long nr, void *data, unsigned long len)
>> +{
>> +	struct vbus_pci_hypercall params = {
>> +		.vector = nr,
>> +		.len    = len,
>> +		.datap  = __pa(data),
>> +	};
>> +	unsigned long flags;
>> +	int ret;
>> +
>> +	spin_lock_irqsave(&vbus_pci.lock, flags);
>> +
>> +	memcpy_toio(&vbus_pci.regs->hypercall.data, &params, sizeof(params));
>> +	ret = ioread32(&vbus_pci.regs->hypercall.result);
>> +
>> +	spin_unlock_irqrestore(&vbus_pci.lock, flags);
>> +
>> +	return ret;
>> +}
> 
> The functionality looks reasonable but please don't call this a hypercall.

Heh, I guess it's just semantics.  The reason why it's called a hypercall is twofold:

1) In previous versions (vbus-v3 and earlier), it actually *was* literally a KVM-hypercall.
2) In this current version, it is purely a PCI device doing PIO, but it still acts exactly like a hypercall on the backend (via the ioeventfd mechanism in KVM).

That said, I am not married to the name, so I can come up with something more appropriate/descriptive.

> A hypercall would be hypervisor specific by definition while this one
> is device specific if I understand it correctly. How about "command queue",
> "mailbox", "message queue", "devcall" or something else that we have in
> existing PCI devices?
> 
>> +
>> +static int
>> +vbus_pci_device_open(struct vbus_device_proxy *vdev, int version, int 
> flags)
>> +{
>> +	struct vbus_pci_device *dev = to_dev(vdev);
>> +	struct vbus_pci_deviceopen params;
>> +	int ret;
>> +
>> +	if (dev->handle)
>> +		return -EINVAL;
>> +
>> +	params.devid   = vdev->id;
>> +	params.version = version;
>> +
>> +	ret = vbus_pci_hypercall(VBUS_PCI_HC_DEVOPEN,
>> +				 &params, sizeof(params));
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	dev->handle = params.handle;
>> +
>> +	return 0;
>> +}
> 
> This seems to add an artificial abstraction that does not make sense
> if you stick to the PCI abstraction.

I think there may be confusion about what is going on here.  The "device-open" pertains to a vbus device *beyond* the bridge, not the PCI device (the bridge) itself.  Nor is the vbus device a PCI device.

What's happening here is somewhat analogous to a PCI config-cycle.  It's a way to open a channel to a device beyond the bridge in _response_ to a probe.

We have a way to enumerate devices present beyond the bridge (this yields a "device-id")  but in order to actually talk to the device, you must first call DEVOPEN(id).  When a device-id is enumerated, it generates a probe() event on vbus-proxy.  The responding driver in question would then turn around and issue the handle = dev->open(VERSION) to see if it is compatible with the device, and to establish a context for further communication.

The reason why DEVOPEN returns a unique handle is to help ensure that the driver has established proper context before allowing other calls.
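For illustration only, here is a minimal sketch of what a responding driver's probe might look like under this scheme, assuming the open op is reached through the proxy's ops table as in this series (the driver name and EXAMPLE_ABI_VERSION are hypothetical):

static int example_vbus_probe(struct vbus_device_proxy *vdev)
{
	int ret;

	/* negotiate the ABI and establish context (the "handle") */
	ret = vdev->ops->open(vdev, EXAMPLE_ABI_VERSION, 0);
	if (ret < 0)
		return ret;	/* version mismatch, or device already open */

	/* ... shm regions, queues, etc. would be set up here ... */

	return 0;
}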

> The two sensible and common models
> for virtual devices that I've seen are:
> 
> * The hypervisor knows what virtual resources exist and provides them
>   to the guest. The guest owns them as soon as they show up in the
>   bus (e.g. PCI) probe. The 'handle' is preexisting.
> 
> * The guest starts without any devices and asks for resources it wants
>   to access. There is no probing of resources but the guest issues
>   a hypercall to get a handle to a newly created virtual device
>   (or -ENODEV).
> 
> What is your reasoning for requiring both a probe and an allocation?

Answered above, I think.

> 
>> +static int
>> +vbus_pci_device_shm(struct vbus_device_proxy *vdev, int id, int prio,
>> +		    void *ptr, size_t len,
>> +		    struct shm_signal_desc *sdesc, struct shm_signal **signal,
>> +		    int flags)
>> +{
>> +	struct vbus_pci_device *dev = to_dev(vdev);
>> +	struct _signal *_signal = NULL;
>> +	struct vbus_pci_deviceshm params;
>> +	unsigned long iflags;
>> +	int ret;
>> +
>> +	if (!dev->handle)
>> +		return -EINVAL;
>> +
>> +	params.devh   = dev->handle;
>> +	params.id     = id;
>> +	params.flags  = flags;
>> +	params.datap  = (u64)__pa(ptr);
>> +	params.len    = len;
>> +
>> +	if (signal) {
>> +		/*
>> +		 * The signal descriptor must be embedded within the
>> +		 * provided ptr
>> +		 */
>> +		if (!sdesc
>> +		    || (len < sizeof(*sdesc))
>> +		    || ((void *)sdesc < ptr)
>> +		    || ((void *)sdesc > (ptr + len - sizeof(*sdesc))))
>> +			return -EINVAL;
>> +
>> +		_signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
>> +		if (!_signal)
>> +			return -ENOMEM;
>> +
>> +		_signal_init(&_signal->signal, sdesc, &_signal_ops);
>> +
>> +		/*
>> +		 * take another reference for the host.  This is dropped
>> +		 * by a SHMCLOSE event
>> +		 */
>> +		shm_signal_get(&_signal->signal);
>> +
>> +		params.signal.offset = (u64)sdesc - (u64)ptr;
>> +		params.signal.prio   = prio;
>> +		params.signal.cookie = (u64)_signal;
>> +
>> +	} else
>> +		params.signal.offset = -1; /* yes, this is a u32, but its ok */
>> +
>> +	ret = vbus_pci_hypercall(VBUS_PCI_HC_DEVSHM,
>> +				 &params, sizeof(params));
>> +	if (ret < 0) {
>> +		if (_signal) {
>> +			/*
>> +			 * We held two references above, so we need to drop
>> +			 * both of them
>> +			 */
>> +			shm_signal_put(&_signal->signal);
>> +			shm_signal_put(&_signal->signal);
>> +		}
>> +
>> +		return ret;
>> +	}
>> +
>> +	if (signal) {
>> +		_signal->handle = ret;
>> +
>> +		spin_lock_irqsave(&vbus_pci.lock, iflags);
>> +
>> +		list_add_tail(&_signal->list, &dev->shms);
>> +
>> +		spin_unlock_irqrestore(&vbus_pci.lock, iflags);
>> +
>> +		shm_signal_get(&_signal->signal);
>> +		*signal = &_signal->signal;
>> +	}
>> +
>> +	return 0;
>> +}
> 
> This could be implemented by virtio devices as well, right?

The big difference with dev->shm() is that it is not bound to a particular ABI within the shared memory (as opposed to virtio, which assumes a virtio ABI).  This just creates an empty shared-memory region (with a bidirectional signaling path) on which you can overlay a variety of structures (virtio included).  You can of course also use non-ring-based structures, such as, say, an array of idempotent state.

The point is that, once this is done, you have a shared-memory region and a way (via the shm-signal) to bidirectionally signal changes to that memory region.  You can then build bigger things with it, like virtqueues.
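As a rough sketch of that usage, assuming the ->shm() op with the signature shown above (the region size, MY_SHM_ID, and how the leading bytes are used are purely illustrative):

static int example_setup_shm(struct vbus_device_proxy *vdev,
			     struct shm_signal **signal)
{
	size_t len = PAGE_SIZE;
	void *ptr = kzalloc(len, GFP_KERNEL);
	struct shm_signal_desc *sdesc;

	if (!ptr)
		return -ENOMEM;

	/* the signal descriptor must be embedded within the region */
	sdesc = ptr + len - sizeof(*sdesc);

	/*
	 * whatever structure you overlay (virtio ring, state array, ...)
	 * lives in the bytes before sdesc
	 */
	return vdev->ops->shm(vdev, MY_SHM_ID, 0, ptr, len, sdesc, signal, 0);
}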

> 
>> +static int
>> +vbus_pci_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
>> +		     size_t len, int flags)
>> +{
>> +	struct vbus_pci_device *dev = to_dev(vdev);
>> +	struct vbus_pci_devicecall params = {
>> +		.devh  = dev->handle,
>> +		.func  = func,
>> +		.datap = (u64)__pa(data),
>> +		.len   = len,
>> +		.flags = flags,
>> +	};
>> +
>> +	if (!dev->handle)
>> +		return -EINVAL;
>> +
>> +	return vbus_pci_hypercall(VBUS_PCI_HC_DEVCALL, &params, sizeof(params));
>> +}
> 
> Why the indirection? It seems to me that you could do the simpler

What indirection?

/me looks below and thinks he sees the confusion..

> 
> static int
> vbus_pci_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
> 		     size_t len, int flags)
> {
> 	struct vbus_pci_device *dev = to_dev(vdev);
> 	struct vbus_pci_hypercall params = {
> 		.vector = func,
> 		.len    = len,
> 		.datap  = __pa(data),
> 	};
> 	spin_lock_irqsave(&dev.lock, flags);
> 	memcpy_toio(&dev.regs->hypercall.data, &params, sizeof(params));
> 	ret = ioread32(&dev.regs->hypercall.result);
> 	spin_unlock_irqrestore(&dev.lock, flags);
> 
> 	return ret;
> }
> 
> This gets rid of your 'handle' and the unwinding through an extra pointer
> indirection. You just need to make sure that the device specific call 
> numbers
> don't conflict with any global ones.

Ah, now I see the confusion...

DEVCALL is sending a synchronous call to a specific device beyond the bridge.  The MMIO going on here against dev.regs->hypercall.data is sending a synchronous call to the bridge itself.  They are distinctly different ;)

> 
>> +
>> +static struct ioq_notifier eventq_notifier;
> 
>> ...
> 
>> +/* Invoked whenever the hypervisor ioq_signal()s our eventq */
>> +static void
>> +eventq_wakeup(struct ioq_notifier *notifier)
>> +{
>> +	struct ioq_iterator iter;
>> +	int ret;
>> +
>> +	/* We want to iterate on the head of the in-use index */
>> +	ret = ioq_iter_init(&vbus_pci.eventq, &iter, ioq_idxtype_inuse, 0);
>> +	BUG_ON(ret < 0);
>> +
>> +	ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
>> +	BUG_ON(ret < 0);
>> +
>> +	/*
>> +	 * The EOM is indicated by finding a packet that is still owned by
>> +	 * the south side.
>> +	 *
>> +	 * FIXME: This in theory could run indefinitely if the host keeps
>> +	 * feeding us events since there is nothing like a NAPI budget.  We
>> +	 * might need to address that
>> +	 */
>> +	while (!iter.desc->sown) {
>> +		struct ioq_ring_desc *desc  = iter.desc;
>> +		struct vbus_pci_event *event;
>> +
>> +		event = (struct vbus_pci_event *)desc->cookie;
>> +
>> +		switch (event->eventid) {
>> +		case VBUS_PCI_EVENT_DEVADD:
>> +			event_devadd(&event->data.add);
>> +			break;
>> +		case VBUS_PCI_EVENT_DEVDROP:
>> +			event_devdrop(&event->data.handle);
>> +			break;
>> +		case VBUS_PCI_EVENT_SHMSIGNAL:
>> +			event_shmsignal(&event->data.handle);
>> +			break;
>> +		case VBUS_PCI_EVENT_SHMCLOSE:
>> +			event_shmclose(&event->data.handle);
>> +			break;
>> +		default:
>> +			printk(KERN_WARNING "VBUS_PCI: Unexpected event %d\n",
>> +			       event->eventid);
>> +			break;
>> +		};
>> +
>> +		memset(event, 0, sizeof(*event));
>> +
>> +		/* Advance the in-use head */
>> +		ret = ioq_iter_pop(&iter, 0);
>> +		BUG_ON(ret < 0);
>> +	}
>> +
>> +	/* And let the south side know that we changed the queue */
>> +	ioq_signal(&vbus_pci.eventq, 0);
>> +}
> 
> Ah, so you have a global event queue and your own device hotplug mechanism.
> But why would you then still use PCI to back it?

PCI is only used as a PCI-to-vbus bridge.  Beyond that, the bridge populates the vbus-proxy LDM bus with the vbus devices it sees beyond the bridge.

That said, the event queue is sending me events such as "device added" and "shm-signal", which are then reflected up into the general model.

> We already have PCI hotplug to add and remove devices

Yep, and I do actually use that...to get a probe for the bridge itself ;)

>and you have defined per device notifier queues that you can use for waking up the device, right?

Only per bridge.  vbus drivers themselves allocate additional dynamic shm-signal channels that are tunneled through the bridge's eventq(s).

Kind Regards,
-Greg
Arnd Bergmann Aug. 6, 2009, 5:03 p.m. UTC | #3
On Thursday 06 August 2009, you wrote:
> >>> On 8/6/2009 at 10:42 AM, in message <200908061642.40614.arnd@arndb.de>, Arnd
> Bergmann <arnd@arndb.de> wrote: 
> > On Monday 03 August 2009, Gregory Haskins wrote:
> >> This patch adds a pci-based driver to interface between the a host VBUS
> >> and the guest's vbus-proxy bus model.
> >> 
> >> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> > 
> > This seems to be duplicating parts of virtio-pci that could be kept
> > common by extending the virtio code. Layering on top of virtio
> > would also make it possible to use the same features you add
> > on top of other transports (e.g. the s390 virtio code) without
> > adding yet another backend for each of them.
> 
> This doesn't make sense to me, but I suspect we are both looking at what this
> code does differently.  I am under the impression that you may believe that
> there is one of these objects per vbus device.  Note that this is just a bridge
> to vbus, so there is only one of these per system with potentially many vbus
> devices behind it.

Right, this did not become clear from the posting. For virtio, we discussed
a model like this in the beginning and then rejected it in favour of a
"one PCI device per virtio device" model, which I now think is a better
approach than your pci-to-vbus bridge.
 
> In essence, this driver's job is to populate the "vbus-proxy" LDM bus with
> objects that it finds across the PCI-OTHER bridge.  This would actually sit
> below the virtio components in the stack, so it doesnt make sense (to me) to
> turn around and build this on top of virtio.  But perhaps I am missing
> something you are seeing.
> 
> Can you elaborate?

Your PCI device does not serve any real purpose as far as I can tell; you
could just as well have a root device as a parent for all the vbus devices
if you do your device probing like this.

However, assuming that you do the IMHO right thing to do probing like
virtio with a PCI device for each slave, the code will be almost the same
as virtio-pci and the two can be the same.

> > This seems to add an artificial abstraction that does not make sense
> > if you stick to the PCI abstraction.
> 
> I think there may be confusion about what is going on here.  The "device-open"
> pertains to a vbus device *beyond* the bridge, not the PCI device (the bridge)
> itself.  Nor is the vbus device a PCI device.
> 
> Whats happening here is somewhat analogous to a PCI config-cycle.  Its
> a way to open a channel to a device beyond the bridge in _response_ to
> a probe.
> 
> We have a way to enumerate devices present beyond the bridge (this yields
> a "device-id")  but in order to actually talk to the device, you must first
> call DEVOPEN(id).  When a device-id is enumerated, it generates a probe()
> event on vbus-proxy.  The responding driver in question would then turn
> around and issue the handle = dev->open(VERSION) to see if it is compatible
> with the device, and to establish a context for further communication.
> 
> The reason why DEVOPEN returns a unique handle is to help ensure that the
> driver has established proper context before allowing other calls.

So assuming this kind of bus is the right idea (which I think it's not),
why can't the host assume they are open to start with and you go and
enumerate the devices on the bridge, creating a vbus_device for each
one as you go. Then you just need to match the vbus drivers with the
devices by some string or vendor/device ID tuple.

> > 
> > This could be implemented by virtio devices as well, right?
> 
> The big difference with dev->shm() is that it is not bound to
> a particular ABI within the shared-memory (as opposed to
> virtio, which assumes a virtio ABI).  This just creates a
> n empty shared-memory region (with a bidirectional signaling
> path) which you can overlay a variety of structures (virtio
> included).  You can of course also use non-ring based
> structures, such as, say, an array of idempotent state.
>
> The point is that, once this is done, you have a shared-memory
> region and a way (via the shm-signal) to bidirectionally signal
> changes to that memory region.  You can then build bigger
> things with it, like virtqueues.

Let me try to rephrase my point: I believe you can implement
the shm/ioq data transport on top of the virtio bus level, by
adding shm and signal functions to struct virtio_config_ops
alongside find_vqs() so that a virtio_device can have either
any combination of virtqueues, shm and ioq.

> > static int
> > vbus_pci_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
> > 		     size_t len, int flags)
> > {
> > 	struct vbus_pci_device *dev = to_dev(vdev);
> > 	struct vbus_pci_hypercall params = {
> > 		.vector = func,
> > 		.len    = len,
> > 		.datap  = __pa(data),
> > 	};
> > 	spin_lock_irqsave(&dev.lock, flags);
> > 	memcpy_toio(&dev.regs->hypercall.data, &params, sizeof(params));
> > 	ret = ioread32(&dev.regs->hypercall.result);
> > 	spin_unlock_irqrestore(&dev.lock, flags);
> > 
> > 	return ret;
> > }
> > 
> > This gets rid of your 'handle' and the unwinding through an extra pointer
> > indirection. You just need to make sure that the device specific call 
> > numbers
> > don't conflict with any global ones.
> 
> Ah, now I see the confusion...
> 
> DEVCALL is sending a synchronous call to a specific device beyond the bridge.  The MMIO going on here against dev.regs->hypercall.data is sending a synchronous call to the bridge itself.  They are distinctly different ;)

well, my point earlier was that they probably should not be different  ;-)

	Arnd <><
Gregory Haskins Aug. 6, 2009, 9:04 p.m. UTC | #4
>>> On 8/6/2009 at  1:03 PM, in message <200908061903.05083.arnd@arndb.de>, Arnd
Bergmann <arnd@arndb.de> wrote: 
> On Thursday 06 August 2009, you wrote:
>> >>> On 8/6/2009 at 10:42 AM, in message <200908061642.40614.arnd@arndb.de>, Arnd
>> Bergmann <arnd@arndb.de> wrote: 
>> > On Monday 03 August 2009, Gregory Haskins wrote:
>> >> This patch adds a pci-based driver to interface between the a host VBUS
>> >> and the guest's vbus-proxy bus model.
>> >> 
>> >> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
>> > 
>> > This seems to be duplicating parts of virtio-pci that could be kept
>> > common by extending the virtio code. Layering on top of virtio
>> > would also make it possible to use the same features you add
>> > on top of other transports (e.g. the s390 virtio code) without
>> > adding yet another backend for each of them.
>> 
>> This doesn't make sense to me, but I suspect we are both looking at what 
> this
>> code does differently.  I am under the impression that you may believe that
>> there is one of these objects per vbus device.  Note that this is just a 
> bridge
>> to vbus, so there is only one of these per system with potentially many vbus
>> devices behind it.
> 
> Right, this did not become clear from the posting. For virtio, we discussed
> a model like this in the beginning and then rejected it in favour of a
> "one PCI device per virtio device" model, which I now think is a better
> approach than your pci-to-vbus bridge.

I agree that the 1:1 model may have made sense for QEMU-based virtio.  I think you will find it has diminished value here, however.

Here are some of my arguments against it:

1) there is an ample PCI model that is easy to work with when you are in QEMU and using its device model (and you get it for free).  It's the path of least resistance.  For something in-kernel, it is more awkward to try to coordinate the in-kernel state with the PCI state.  Afaict, you either need to have it live partially in both places, or you need some PCI emulation in the kernel.

2) The signal model for the 1:1 design is not very flexible IMO.
    2a) I want to be able to allocate dynamic signal paths, not pre-allocate msi-x vectors at dev-add.
    2b) I also want to collapse multiple interrupts together so as to minimize the context-switch rate (inject + EIO overhead).  My design effectively has "NAPI" for interrupt handling (see the budgeted sketch below).  This helps when the system needs it the most: heavy IO.

3) The 1:1 model is not buying us much in terms of hotplug.  We don't really "use" PCI very much even in virtio.  It's a thin shim of uniform dev-ids resurfaced to the virtio-bus as something else.  With LDM, hotplug is ridiculously easy anyway, so who cares.  I already need an event channel for (2b) anyway, so the devadd/devdrop events are trivial to handle.

4) communicating with something efficiently in-kernel requires more finesse than basic PIO/MMIO.  There are tricks you can do to get around this, but with 1:1 you would have to do this trick repeatedly for each device.  Even with a library solution to help, you still have per-cpu .data overhead and cpu hotplug overhead to get maximum performance.  With my "bridge" model, I do it once, which I believe is ideal.

5) 1:1 is going to quickly populate the available MMIO/PIO and IDT slots for any kind of medium to large configuration.  The bridge model scales better in this regard.

So based on that, I think the bridge model works better for vbus.  Perhaps you can convince me otherwise ;)
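For reference, the budgeted sketch mentioned in 2b: a bounded variant of the eventq_wakeup() loop quoted earlier, along the lines its FIXME suggests.  EVENTQ_BUDGET and the eventq_work deferral are hypothetical; the ioq calls mirror the ones in the patch.

#define EVENTQ_BUDGET 64

static void eventq_wakeup_budgeted(struct ioq_notifier *notifier)
{
	struct ioq_iterator iter;
	int processed = 0;
	int ret;

	ret = ioq_iter_init(&vbus_pci.eventq, &iter, ioq_idxtype_inuse, 0);
	BUG_ON(ret < 0);

	ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
	BUG_ON(ret < 0);

	/* same dispatch loop as eventq_wakeup(), but bounded */
	while (!iter.desc->sown && processed++ < EVENTQ_BUDGET) {
		/* ... dispatch event->eventid exactly as in eventq_wakeup() ... */

		ret = ioq_iter_pop(&iter, 0);
		BUG_ON(ret < 0);
	}

	if (!iter.desc->sown)
		schedule_work(&eventq_work);	/* hypothetical deferral of the rest */

	ioq_signal(&vbus_pci.eventq, 0);
}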

>  
>> In essence, this driver's job is to populate the "vbus-proxy" LDM bus with
>> objects that it finds across the PCI-OTHER bridge.  This would actually sit
>> below the virtio components in the stack, so it doesnt make sense (to me) to
>> turn around and build this on top of virtio.  But perhaps I am missing
>> something you are seeing.
>> 
>> Can you elaborate?
> 
> Your PCI device does not serve any real purpose as far as I can tell

That is certainly debatable.  Its purpose is as follows:

1) Allows a guest to discover the vbus feature (fwiw: I used to do this with cpuid)
2) Allows the guest to establish proper context to communicate with the feature (mmio, pio, and msi) (fwiw: i used to use hypercalls)
3) Access the virtual-devices that have been configured for the feature

Correct me if I am wrong:  Isn't this more or less the exact intent of something like an LDM bus (vbus-proxy) and a PCI-BRIDGE?  Other than the possibility that there might be some mergeable overlap (still debatable), I don't think it's fair to say that this does not serve a purpose.

>, you could just as well have a root device as a parent for all the vbus devices
> if you do your device probing like this.

Yes, I suppose the "bridge" could have been advertised as a virtio-based root device.  In this way, the virtio probe() would replace my pci probe() for feature discovery, and a virtqueue could replace my msi+ioq for the eventq channel.

I see a few issues with that, however:

1) The virtqueue library, while a perfectly nice ring design at the metadata level, does not have an API that is friendly to kernel-to-kernel communication.  It was designed more for frontend use to some remote backend.  The IOQ library, on the other hand, was specifically designed to support use as kernel-to-kernel (see north/south designations).  So this made life easier for me.  To do what you propose, the eventq channel would need to terminate in kernel, and I would thus be forced to deal with the potential API problems.

2) I would need to have Avi et al. allocate a virtio vector to use from their namespace, which I am sure they won't be willing to do until they accept my design.  Today, I have a nice conflict-free PCI ID to use as I see fit.

I'm sure neither of these hurdles is insurmountable, but I am left scratching my head as to why it's worth the effort.  It seems to me it's a "six of one, half a dozen of the other" kind of scenario.  Either I write a qemu PCI device and pci-bridge driver, or I write a qemu virtio-device and virtio root driver.

In short: What does this buy us, or did you mean something else?  

> 
> However, assuming that you do the IMHO right thing to do probing like
> virtio with a PCI device for each slave, the code will be almost the same
> as virtio-pci and the two can be the same.

Can you elaborate?

> 
>> > This seems to add an artificial abstraction that does not make sense
>> > if you stick to the PCI abstraction.
>> 
>> I think there may be confusion about what is going on here.  The 
> "device-open"
>> pertains to a vbus device *beyond* the bridge, not the PCI device (the 
> bridge)
>> itself.  Nor is the vbus device a PCI device.
>> 
>> Whats happening here is somewhat analogous to a PCI config-cycle.  Its
>> a way to open a channel to a device beyond the bridge in _response_ to
>> a probe.
>> 
>> We have a way to enumerate devices present beyond the bridge (this yields
>> a "device-id")  but in order to actually talk to the device, you must first
>> call DEVOPEN(id).  When a device-id is enumerated, it generates a probe()
>> event on vbus-proxy.  The responding driver in question would then turn
>> around and issue the handle = dev->open(VERSION) to see if it is compatible
>> with the device, and to establish a context for further communication.
>> 
>> The reason why DEVOPEN returns a unique handle is to help ensure that the
>> driver has established proper context before allowing other calls.
> 
> So assuming this kind of bus is the right idea (which I think it's not),
> why can't the host assume they are open to start with

Read on..

>and you go and enumerate the devices on the bridge, creating a vbus_device for each
> one as you go.

That's exactly what it does.

> Then you just need to match the vbus drivers with the
> devices by some string or vendor/device ID tuple.
> 

Yep, that's right too.  Then, when the driver gets a ->probe(), it does a dev->open() to check various state:

a) can the device be opened?  If it has a max-open policy (most will have a max-open = 1 policy) and something else already has the device open, it will fail (this will not be common).
b) is the driver ABI revision compatible with the device ABI revision?  This is like checking the pci config-space revision number.

For an example, see drivers/net/vbus-enet.c, line 764:

http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=drivers/net/vbus-enet.c;h=7220f43723adc5b0bece1bc37974fae1b034cd9e;hb=b3b2339efbd4e754b1c85f8bc8f85f21a1a1f509#l764

It's simply a check to see if the driver and device are compatible, and therefore whether the probe should succeed.  Nothing more.  I think what I have done is similar to how most buses (like PCI) work today (a la revision-number checks with a config-cycle).

Regarding the id->handle indirection:

Internally, the DEVOPEN call translates an "id" to a "handle".  The handle is just a token to help ensure that the caller actually opened the device successfully.  Note that the "id" namespace is 0-based.  Therefore, something like an errant DEVCALL(0) would be indistinguishable from a legit request.  Using the handle abstraction gives me a slightly more robust mechanism to ensure the caller actually meant to call the host, and was in the proper context to do so.  For one thing, if the device had never been opened, this would have failed before it ever reached the model.  It's one more check I can do at the infrastructure level, and one less thing each model has to look out for.

Is the id->handle translation critical?  No, I'm sure we could live without it, but I also don't think it hurts anything.  It allows the overall code to be slightly more robust, and the individual model code to be slightly less complicated.  Therefore, I don't see a problem.
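To make the robustness point concrete, a host-side sketch of the check the handle enables (the struct and the table helper here are hypothetical, not code from this series):

/*
 * A raw 0-based id cannot be told apart from "never opened" (0),
 * whereas an opaque non-zero handle can be validated up front,
 * before the request ever reaches the device model.
 */
static struct vbus_conn *devcall_lookup(u64 devh)
{
	if (!devh)
		return NULL;			/* device was never opened */

	return handle_table_find(devh);		/* NULL if stale or bogus */
}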

>> > 
>> > This could be implemented by virtio devices as well, right?
>> 
>> The big difference with dev->shm() is that it is not bound to
>> a particular ABI within the shared-memory (as opposed to
>> virtio, which assumes a virtio ABI).  This just creates a
>> n empty shared-memory region (with a bidirectional signaling
>> path) which you can overlay a variety of structures (virtio
>> included).  You can of course also use non-ring based
>> structures, such as, say, an array of idempotent state.
>>
>> The point is that, once this is done, you have a shared-memory
>> region and a way (via the shm-signal) to bidirectionally signal
>> changes to that memory region.  You can then build bigger
>> things with it, like virtqueues.
> 
> Let me try to rephrase my point: I believe you can implement
> the shm/ioq data transport on top of the virtio bus level, by
> adding shm and signal functions to struct virtio_config_ops
> alongside find_vqs() so that a virtio_device can have either
> any combination of virtqueues, shm and ioq.

Yes, I believe this might be doable, but I don't know virtio well enough to say for sure.

> 
>> > static int
>> > vbus_pci_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
>> > 		     size_t len, int flags)
>> > {
>> > 	struct vbus_pci_device *dev = to_dev(vdev);
>> > 	struct vbus_pci_hypercall params = {
>> > 		.vector = func,
>> > 		.len    = len,
>> > 		.datap  = __pa(data),
>> > 	};
>> > 	spin_lock_irqsave(&dev.lock, flags);
>> > 	memcpy_toio(&dev.regs->hypercall.data, &params, sizeof(params));
>> > 	ret = ioread32(&dev.regs->hypercall.result);
>> > 	spin_unlock_irqrestore(&dev.lock, flags);
>> > 
>> > 	return ret;
>> > }
>> > 
>> > This gets rid of your 'handle' and the unwinding through an extra pointer
>> > indirection. You just need to make sure that the device specific call 
>> > numbers
>> > don't conflict with any global ones.
>> 
>> Ah, now I see the confusion...
>> 
>> DEVCALL is sending a synchronous call to a specific device beyond the 
> bridge.  The MMIO going on here against dev.regs->hypercall.data is sending a 
> synchronous call to the bridge itself.  They are distinctly different ;)
> 
> well, my point earlier was that they probably should not be different  ;-)

Ok :)

I still do not see how they could be merged in a way that is both a) worth the effort, and b) doesn't compromise my design.  But I will keep an open mind if you want to continue the conversation.

Thanks for all the feedback.  I do appreciate it.

Kind Regards,
-Greg

Arnd Bergmann Aug. 6, 2009, 10:57 p.m. UTC | #5
On Thursday 06 August 2009, Gregory Haskins wrote:
> >>> On 8/6/2009 at  1:03 PM, in message <200908061903.05083.arnd@arndb.de>, Arnd Bergmann <arnd@arndb.de> wrote: 
> Here are some of my arguments against it:
> 
> 1) there is an ample PCI model that is easy to work with when you are in QEMU and using its device model (and you get it for free).  Its the path of least resistance.  For something in kernel, it is more awkward to try to coordinate the in-kernel state with the PCI state.  Afaict, you either need to have it live partially in both places, or you need some PCI emulation in the kernel.

True, if the whole hypervisor is in the host kernel, then doing full PCI emulation would be
insane. I was assuming that all of the setup code still lived in host user space.
What is the reason why it cannot? Do you want to use something other than qemu,
do you think this will impact performance, or something else?

> 2) The signal model for the 1:1 design is not very flexible IMO.
>     2a) I want to be able to allocate dynamic signal paths, not pre-allocate msi-x vectors at dev-add.

I believe msi-x implies that the interrupt vectors get added by the device driver
at run time, unlike legacy interrupts or msi. It's been a while since I dealt with
that though.

>     2b) I also want to collapse multiple interrupts together so as to minimize the context switch rate (inject + EIO overhead).  My design effectively has "NAPI" for interrupt handling.  This helps when the system needs it the most: heavy IO.

That sounds like a very useful concept in general, but this seems to be a
detail of the interrupt controller implementation. If the IO-APIC cannot
do what you want here, maybe we just need a paravirtual IRQ controller
driver, like e.g. the PS3 has.

> 3) The 1:1 model is not buying us much in terms of hotplug.  We don't really "use" PCI very much even in virtio.  Its a thin-shim of uniform dev-ids to resurface to the virtio-bus as something else.  With LDM, hotplug is ridiculously easy anyway, so who cares.  I already need an event channel anyway for (2b) anyway, so the devadd/devdrop events are trivial to handle.

I agree for Linux guests, but when you want to run other guest operating systems,
PCI hotplug is probably the most common interface for this. AFAIK, the windows
virtio-net driver does not at all have a concept of a virtio layer but is simply
a network driver for a PCI card. The same could be applied to any other device,
possibly with some library code doing all the queue handling in a common way.

> 4) communicating with something efficiently in-kernel requires more finesse than basic PIO/MMIO.  There are tricks you can do to get around this, but with 1:1 you would have to do this trick repeatedly for each device.  Even with a library solution to help, you still have per-cpu .data overhead and cpu hotplug overhead to get maximum performance.  With my "bridge" model, I do it once, which I believe is ideal.
>
> 5) 1:1 is going to quickly populate the available MMIO/PIO and IDT slots for any kind of medium to large configuration.  The bridge model scales better in this regard.

We don't need to rely on PIO, it's just the common interface that all hypervisors
can easily support. We could have different underlying methods for the communication
if space or performance becomes a bottleneck because of this.

> So based on that, I think the bridge model works better for vbus.  Perhaps you can convince me otherwise ;)

Being able to define all of it in the host kernel seems to be the major
advantage of your approach, the other points you mentioned are less
important IMHO. The question is whether that is indeed a worthy goal,
or if it should just live in user space as with the qemu PCI code.

> >> In essence, this driver's job is to populate the "vbus-proxy" LDM bus with
> >> objects that it finds across the PCI-OTHER bridge.  This would actually sit
> >> below the virtio components in the stack, so it doesnt make sense (to me) to
> >> turn around and build this on top of virtio.  But perhaps I am missing
> >> something you are seeing.
> >> 
> >> Can you elaborate?
> > 
> > Your PCI device does not serve any real purpose as far as I can tell
> 
> That is certainly debatable.  Its purpose is as follows:
> 
> 1) Allows a guest to discover the vbus feature (fwiw: I used to do this with cpuid)

true, I missed that.

> 2) Allows the guest to establish proper context to communicate with the feature (mmio, pio, and msi) (fwiw: i used to use hypercalls)
> 3) Access the virtual-devices that have been configured for the feature
> 
> Correct me if I am wrong:  Isn't this more of less the exact intent of something like an LDM bus (vbus-proxy) and a PCI-BRIDGE?  Other than the possibility that there might be some mergable overlap (still debatable), I don't think its fair to say that this does not serve a purpose.

I guess you are right on that. An interesting variation of that would be to make the
child devices of it virtio devices again, though: instead of the PCI emulation code
in the host kernel, you could define a simpler interface to the same effect. So the
root device would be a virtio-pci device, below which you can have virtio-virtio
devices.

> >, you could just as well have a root device as a parent for all the vbus devices
> > if you do your device probing like this.
> 
> Yes, I suppose the "bridge" could have been advertised as a virtio-based root device.  In this way, the virtio probe() would replace my pci probe() for feature discovery, and a virtqueue could replace my msi+ioq for the eventq channel.
>
> I see a few issues with that, however:
> 
> 1) The virtqueue library, while a perfectly nice ring design at the metadata level, does not have an API that is friendly to kernel-to-kernel communication.  It was designed more for frontend use to some remote backend.  The IOQ library on the other hand, was specifically designed to support use as kernel-to-kernel (see north/south designations).  So this made life easier for me.  To do what you propose, the eventq channel would need to terminate in kernel, and I would thus be forced to deal that the potential API problems.

Well, virtqueues are not that bad for kernel-to-kernel communication, as Ira mentioned
referring to his virtio-over-PCI driver. You can have virtqueues on both sides, having
the host kernel create a pair of virtqueues (one in user aka guest space, one in the host
kernel), with the host virtqueue_ops doing copy_{to,from}_user to move data between them.

If you have that, you can actually use the same virtio_net driver in both guest and
host kernel, just communicating over different virtio implementations. Interestingly,
that would mean that you no longer need a separation between guest and host device
drivers (vbus and vbus-proxy in your case) but could use the same device abstraction
with just different transports to back the shm-signal or virtqueue.
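A purely conceptual sketch of that pairing (none of these names are an existing virtio API; the kick path is elided):

struct vq_pair {
	struct virtqueue *host_vq;	/* host-kernel half of the pair          */
	void __user	 *guest_ring;	/* guest/user mapping of the same ring   */
};

/* the host side publishes data by copying into the guest-visible ring */
static int vq_pair_put(struct vq_pair *p, size_t off,
		       const void *buf, size_t len)
{
	if (copy_to_user(p->guest_ring + off, buf, len))
		return -EFAULT;

	/* ... then kick host_vq through its normal notify path ... */
	return 0;
}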
 
> 2) I would need to have Avi et. al. allocate a virtio vector to use from their namespace, which I am sure they wont be willing to do until they accept my design.  Today, I have a nice conflict free PCI ID to use as I see fit.

My impression is the opposite: as long as you try to reinvent everything at once,
you face opposition, but if you just improve parts of the existing design one
by one (like eventfd), I think you will find lots of support.

> Im sure both of these hurdles are not insurmountable, but I am left scratching my head as to why its worth the effort.  It seems to me its a "six of one, half-dozen of the other" kind of scenario.  Either I write a qemu PCI device and pci-bridge driver, or I write a qemu virtio-devicve and virtio root driver.
> 
> In short: What does this buy us, or did you mean something else?  

In my last reply, I was thinking of a root device that can not be probed like a PCI device.

> > However, assuming that you do the IMHO right thing to do probing like
> > virtio with a PCI device for each slave, the code will be almost the same
> > as virtio-pci and the two can be the same.
> 
> Can you elaborate?

Well, let me revise based on the discussion:

The main point that remains is that I think a vbus-proxy should be the same as a
virtio device. This could be done by having (as in my earlier mails) a PCI device
per vbus-proxy, with devcall implemented in PIO or config-space and additional
shm/shm-signal, or it could be a single virtio device from virtio-pci or one
of the other existing providers that connects you with a new virtio provider
sitting in the host kernel. This provider has child devices for any endpoint
(virtio-net, venet, ...) that is implemented in the host kernel.

> >and you go and enumerate the devices on the bridge, creating a vbus_device for each
> > one as you go.
> 
> Thats exactly what it does.
> 
> > Then you just need to match the vbus drivers with the
> > devices by some string or vendor/device ID tuple.
> > 
> 
> Yep, thats right too.  Then, when the driver gets a ->probe(), it does an dev->open() to check various state:
> 
> a) can the device be opened?  if it has an max-open policy (most will have a max-open = 1 policy) and something else already has the device open, it will fail (this will not be common).
> b) is the driver ABI revision compatible with the device ABI revision?  This is like checking the pci config-space revision number.
> 
> For an example, see drivers/net/vbus-enet.c, line 764:
> 
> http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=drivers/net/vbus-enet.c;h=7220f43723adc5b0bece1bc37974fae1b034cd9e;hb=b3b2339efbd4e754b1c85f8bc8f85f21a1a1f509#l764
> 
> Its simple a check to see if the driver and device are compatible, and therefore the probe should succeed.  Nothing more.  I think what I have done is similar to how most buses (like PCI) work today (ala revision number checks with a config-cycle).

ok. 
 
> Regarding the id->handle indirection:
> 
> Internally, the DEVOPEN call translates an "id" to a "handle".  The handle is just a token to help ensure that the caller actually opened the device successfully.  Note that the "id" namespace is 0 based.  Therefore, something like an errant DEVCALL(0) would be indistinguishable from a legit request.  Using the handle abstraction gives me a slightly more robust mechanism to ensure the caller actually meant to call the host, and was in the proper context to do so.  For one thing, if the device had never been opened, this would have failed before it ever reached the model.  Its one more check I can do at the infrastructure level, and one less thing each model has to look out for.
> 
> Is the id->handle translation critical?  No, i'm sure we could live without it, but I also don't think it hurts anything.  It allows the overall code to be slightly more robust, and the individual model code to be slightly less complicated.  Therefore, I don't see a problem.

Right, assuming your model with all vbus devices behind a single PCI device, your
handle does not hurt, it's the equivalent of a bus/dev/fn number or an MMIO address.

	Arnd <><
Gregory Haskins Aug. 7, 2009, 4:42 a.m. UTC | #6
>>> On 8/6/2009 at  6:57 PM, in message <200908070057.54795.arnd@arndb.de>, Arnd
Bergmann <arnd@arndb.de> wrote: 
> On Thursday 06 August 2009, Gregory Haskins wrote:
>> >>> On 8/6/2009 at  1:03 PM, in message <200908061903.05083.arnd@arndb.de>, Arnd 
> Bergmann <arnd@arndb.de> wrote: 
>> Here are some of my arguments against it:
>> 
>> 1) there is an ample PCI model that is easy to work with when you are in 
> QEMU and using its device model (and you get it for free).  Its the path of 
> least resistance.  For something in kernel, it is more awkward to try to 
> coordinate the in-kernel state with the PCI state.  Afaict, you either need to 
> have it live partially in both places, or you need some PCI emulation in the 
> kernel.
> 
> True, if the whole hypervisor is in the host kernel, then doing full PCI 
> emulation would be
> insane.

In this case, the entire bus is more or less self-contained and in-kernel.  We technically *do* still have qemu present, however, so you are right there.

> I was assuming that all of the setup code still lived in host user 
> space.

Very little.  Just enough to register the PCI device, handle the MMIO/PIO/MSI configuration, etc.  All the bus management uses the standard vbus management interface (configfs/sysfs)

> What is the reason why it cannot?

It's not that it "can't", per se.  It's just awkward to have it live in two places, and I would need to coordinate in-kernel changes with userspace, etc.  Today I do not need to do this: i.e., the model in userspace is very simple.

> Do you want to use something other than qemu,

Well, only in the sense that vbus has its own management interface and bus model, and I want them to be used.

> do you think this will impact performance, or something else?

Performance is not a concern for this aspect of operation.

> 
>> 2) The signal model for the 1:1 design is not very flexible IMO.
>>     2a) I want to be able to allocate dynamic signal paths, not pre-allocate 
> msi-x vectors at dev-add.
> 
> I believe msi-x implies that the interrupt vectors get added by the device 
> driver
> at run time, unlike legacy interrupts or msi. It's been a while since I 
> dealt with
> that though.

Yeah, it's been a while for me too.  I would have to look at the spec again.

My understanding was that it's just a slight variation of msi, with some of the constraints revised (no <= 32 vector limit, etc).  Perhaps it is fancier than that and 2a is unfounded.  TBD.

> 
>>     2b) I also want to collapse multiple interrupts together so as to 
> minimize the context switch rate (inject + EIO overhead).  My design 
> effectively has "NAPI" for interrupt handling.  This helps when the system 
> needs it the most: heavy IO.
> 
> That sounds like a very useful concept in general, but this seems to be a
> detail of the interrupt controller implementation. If the IO-APIC cannot
> do what you want here, maybe we just need a paravirtual IRQ controller
> driver, like e.g. the PS3 has.

Yeah, I agree this could be a function of the APIC code.  Do note that I mentioned this in passing to Avi a few months ago but FWIW he indicated at that time that he is not interested in making the APIC PV.

Also, I almost forgot an important one.  Add:

   2c) Interrupt prioritization.  I want to be able to assign priority to interrupts and handle them in priority order.

> 
>> 3) The 1:1 model is not buying us much in terms of hotplug.  We don't really 
> "use" PCI very much even in virtio.  Its a thin-shim of uniform dev-ids to 
> resurface to the virtio-bus as something else.  With LDM, hotplug is 
> ridiculously easy anyway, so who cares.  I already need an event channel 
> anyway for (2b) anyway, so the devadd/devdrop events are trivial to handle.
> 
> I agree for Linux guests, but when you want to run other guest operating 
> systems,
> PCI hotplug is probably the most common interface for this. AFAIK, the 
> windows
> virtio-net driver does not at all have a concept of a virtio layer but is 
> simply
> a network driver for a PCI card. The same could be applied any other device,
> possibly with some library code doing all the queue handling in a common 
> way.l

I was told it also has a layering like Linux, but I haven't actually seen the code myself, so I do not know if this is true.

> 
>> 4) communicating with something efficiently in-kernel requires more finesse 
> than basic PIO/MMIO.  There are tricks you can do to get around this, but 
> with 1:1 you would have to do this trick repeatedly for each device.  Even 
> with a library solution to help, you still have per-cpu .data overhead and cpu 
> hotplug overhead to get maximum performance.  With my "bridge" model, I do it 
> once, which I believe is ideal.
>>
>> 5) 1:1 is going to quickly populate the available MMIO/PIO and IDT slots for 
> any kind of medium to large configuration.  The bridge model scales better in 
> this regard.
> 
> We don't need to rely on PIO, it's just the common interface that all 
> hypervisors
> can easily support. We could have different underlying methods for the 
> communication
> if space or performance becomes a bottleneck because of this.

Heh...I already proposed an alternative, which incidentally was shot down:

http://lkml.org/lkml/2009/5/5/132

(in the end, I think we agreed that a technique of tunneling PIO/MMIO over hypercalls would be better than introducing a new namespace.  But we also decided that the difference between PIO and PIOoHC was too small to care, and we don't care about non-x86)

But in any case, my comment still stands: 1:1 puts load on the PIO emulation (even if you use PIOoHC).  I am not sure this can be easily worked around.

> 
>> So based on that, I think the bridge model works better for vbus.  Perhaps 
> you can convince me otherwise ;)
> 
> Being able to define all of it in the host kernel seems to be the major
> advantage of your approach, the other points you mentioned are less
> important IMHO. The question is whether that is indeed a worthy goal,
> or if it should just live in user space as with the qemu PCI code.

I don't think we can gloss over these so easily.  They are all important to me, particularly 2b and 2c.

> 
>> >> In essence, this driver's job is to populate the "vbus-proxy" LDM bus with
>> >> objects that it finds across the PCI-OTHER bridge.  This would actually sit
>> >> below the virtio components in the stack, so it doesn't make sense (to me) to
>> >> turn around and build this on top of virtio.  But perhaps I am missing
>> >> something you are seeing.
>> >> 
>> >> Can you elaborate?
>> > 
>> > Your PCI device does not serve any real purpose as far as I can tell
>> 
>> That is certainly debatable.  Its purpose is as follows:
>> 
>> 1) Allows a guest to discover the vbus feature (fwiw: I used to do this with cpuid)
> 
> true, I missed that.
> 
>> 2) Allows the guest to establish proper context to communicate with the feature (mmio, pio, and msi) (fwiw: I used to use hypercalls)
>> 3) Access the virtual-devices that have been configured for the feature
>> 
>> Correct me if I am wrong:  Isn't this more or less the exact intent of
>> something like an LDM bus (vbus-proxy) and a PCI-BRIDGE?  Other than the
>> possibility that there might be some mergeable overlap (still debatable), I
>> don't think it's fair to say that this does not serve a purpose.
> 
> I guess you are right on that. An interesting variation of that would be to
> make the child devices of it virtio devices again though: Instead of the
> PCI emulation code in the host kernel, you could define a simpler interface
> to the same effect. So the root device would be a virtio-pci device, below
> which you can have virtio-virtio devices.

Interesting...but note that I think this is effectively what I do today (with virtio-vbus), except you wouldn't have the explicit vbus-proxy model underneath.  Also, if 1:1 via PCI is important for Windows, that solution would have the same problem that the virtio-vbus model does.


> 
>> >, you could just as well have a root device as a parent for all the vbus devices
>> > if you do your device probing like this.
>> 
>> Yes, I suppose the "bridge" could have been advertised as a virtio-based root
>> device.  In this way, the virtio probe() would replace my pci probe() for
>> feature discovery, and a virtqueue could replace my msi+ioq for the eventq
>> channel.
>>
>> I see a few issues with that, however:
>> 
>> 1) The virtqueue library, while a perfectly nice ring design at the metadata
>> level, does not have an API that is friendly to kernel-to-kernel
>> communication.  It was designed more for frontend use to some remote
>> backend.  The IOQ library, on the other hand, was specifically designed to
>> support use as kernel-to-kernel (see north/south designations).  So this
>> made life easier for me.  To do what you propose, the eventq channel would
>> need to terminate in kernel, and I would thus be forced to deal with the
>> potential API problems.
> 
> Well, virtqueues are not that bad for kernel-to-kernel communication, as
> Ira mentioned referring to his virtio-over-PCI driver. You can have
> virtqueues on both sides, having the host kernel create a pair of
> virtqueues (one in user aka guest space, one in the host kernel), with the
> host virtqueue_ops doing copy_{to,from}_user to move data between them.

It's been a while since I looked, so perhaps I am wrong here.  I will look again.

> 
> If you have that, you can actually use the same virtio_net driver in both
> guest and host kernel, just communicating over different virtio
> implementations. Interestingly, that would mean that you no longer need a
> separation between guest and host device drivers (vbus and vbus-proxy in
> your case) but could use the same device abstraction with just different
> transports to back the shm-signal or virtqueue.

Actually, I think there are some problems with that model (such as management of the interface).  virtio-net really wants to connect to a virtio-net backend (such as the one in qemu or vbus); it wasn't designed to connect back-to-back like that.  I think you will quickly run into problems similar to what Ira faced with virtio-over-PCI with that model.

>  
>> 2) I would need to have Avi et al. allocate a virtio vector to use from
>> their namespace, which I am sure they won't be willing to do until they
>> accept my design.  Today, I have a nice conflict-free PCI ID to use as I
>> see fit.
> 
> My impression is the opposite: as long as you try to reinvent everything at
> once, you face opposition, but if you just improve parts of the existing
> design one by one (like eventfd), I think you will find lots of support.
> 
>> I'm sure both of these hurdles are not insurmountable, but I am left
>> scratching my head as to why it's worth the effort.  It seems to me it's a
>> "six of one, half-dozen of the other" kind of scenario.  Either I write a
>> qemu PCI device and pci-bridge driver, or I write a qemu virtio-device and
>> virtio root driver.
>> 
>> In short: What does this buy us, or did you mean something else?  
> 
> In my last reply, I was thinking of a root device that can not be probed 
> like a PCI device.

IIUC, because you missed the "feature discovery" function of the bridge, you thought this was possible but now see it is problematic?  Or are you saying that this concept is still valid and should be considered?  I think it's the former, but wanted to be sure we were on the same page.

> 
>> > However, assuming that you do the IMHO right thing to do probing like
>> > virtio with a PCI device for each slave, the code will be almost the same
>> > as virtio-pci and the two can be the same.
>> 
>> Can you elaborate?
> 
> Well, let me revise based on the discussion:
> 
> The main point that remains is that I think a vbus-proxy should be the
> same as a virtio device. This could be done by having (as in my earlier
> mails) a PCI device per vbus-proxy, with devcall implemented in PIO or
> config-space and additional shm/shm-signal,

So the problem with this model comes back to the points I made earlier (such as 2b and 2c).

I do agree with you that the *lack* of this model may be problematic for Windows, depending on the answer w.r.t. what the Windows drivers look like.

> or it could be a single virtio device from virtio-pci or one of the other
> existing providers that connects you with a new virtio provider sitting
> in the host kernel. This provider has child devices for any endpoint
> (virtio-net, venet, ...) that is implemented in the host kernel.

This is an interesting idea, but I think it also has problems.

What we do get from having the explicit vbus-proxy exposed in the stack (aside from being able to support "vbus native" drivers, like venet) is a neat way to map vbus-isms into virtio-isms.  For instance, vbus uses a string-based device id, and virtio uses a PCI-ID.  With vbus-proxy as the intermediate layer, a device whose vbus type is "virtio" knows to issue dev->call(GETID) to obtain the PCI-ID of the underlying virtio device, and to publish that result to virtio-bus.

Without this intermediate layer, the vbus identity scheme would have to be compatible with the virtio PCI-ID-based scheme, and I think that is suboptimal for the overall design of vbus.
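
As a rough sketch of what I mean (hypothetical: the GETID function number and the virtio-vbus glue are not part of this patch; it only assumes the vbus_device_proxy ops from this series), the adapter's probe would do something like:

#define VIRTIO_VBUS_FUNC_GETID  0        /* placeholder devcall number */

static int virtio_vbus_probe(struct vbus_device_proxy *vdev)
{
	u32 virtio_id;
	int ret;

	ret = vdev->ops->open(vdev, 0 /* ABI version */, 0);
	if (ret < 0)
		return ret;

	/* ask the backend which virtio device-ID it implements ... */
	ret = vdev->ops->call(vdev, VIRTIO_VBUS_FUNC_GETID,
			      &virtio_id, sizeof(virtio_id), 0);
	if (ret < 0)
		return ret;

	/* ... and register a virtio_device with that ID on virtio-bus */
	return 0;
}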

[snip]

>  
>> Regarding the id->handle indirection:
>> 
>> Internally, the DEVOPEN call translates an "id" to a "handle".  The handle
>> is just a token to help ensure that the caller actually opened the device
>> successfully.  Note that the "id" namespace is 0-based.  Therefore, something
>> like an errant DEVCALL(0) would be indistinguishable from a legit request.
>> Using the handle abstraction gives me a slightly more robust mechanism to
>> ensure the caller actually meant to call the host, and was in the proper
>> context to do so.  For one thing, if the device had never been opened, this
>> would have failed before it ever reached the model.  It's one more check I
>> can do at the infrastructure level, and one less thing each model has to
>> look out for.
>> 
>> Is the id->handle translation critical?  No, I'm sure we could live without
>> it, but I also don't think it hurts anything.  It allows the overall code to
>> be slightly more robust, and the individual model code to be slightly less
>> complicated.  Therefore, I don't see a problem.
> 
> Right, assuming your model with all vbus devices behind a single PCI
> device, your handle does not hurt, it's the equivalent of a bus/dev/fn
> number or an MMIO address.

Agreed
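
FWIW, on the host side the benefit really is just a cheap up-front sanity check, conceptually something like this (the names here are made up; the real host-side code is not part of this patch):

struct vbus_connection;                                 /* opaque, host-side */
extern struct vbus_connection *lookup_handle(__u64 devh);      /* stand-in */

static struct vbus_connection *devh_to_conn(__u64 devh)
{
	/*
	 * 0 is never handed out as a handle, so an uninitialized or errant
	 * DEVCALL(0) fails here instead of ever reaching device-model code.
	 */
	if (!devh)
		return NULL;

	return lookup_handle(devh);
}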

Thanks Arnd,
-Greg

Arnd Bergmann Aug. 7, 2009, 2:57 p.m. UTC | #7
On Friday 07 August 2009, Gregory Haskins wrote:
> >>> Arnd Bergmann <arnd@arndb.de> wrote: 
> > On Thursday 06 August 2009, Gregory Haskins wrote:
> > 
> > >     2b) I also want to collapse multiple interrupts together so as to 
> > > minimize the context switch rate (inject + EIO overhead).  My design 
> > > effectively has "NAPI" for interrupt handling.  This helps when the system 
> > > needs it the most: heavy IO.
> > 
> > That sounds like a very useful concept in general, but this seems to be a
> > detail of the interrupt controller implementation. If the IO-APIC cannot
> > do what you want here, maybe we just need a paravirtual IRQ controller
> > driver, like e.g. the PS3 has.
> 
> Yeah, I agree this could be a function of the APIC code.  Do note that I
> mentioned this in passing to Avi a few months ago but FWIW he indicated
> at that time that he is not interested in making the APIC PV.
> 
> Also, I almost forgot an important one.  Add:
> 
>    2c) Interrupt prioritization.  I want to be able to assign priority
>    to interrupts and handle them in priority order.

I think this part of the interface has developed in the wrong direction
because you confused two questions:

1. should you build an advanced interrupt mechanism for virtual drivers?
2. how should you build an advanced interrupt mechanism for virtual drivers?

My guess is that when Avi said he did not want a paravirtual IO-APIC,
he implied that the existing one is good enough (maybe Avi can clarify that
point himself) answering question 1, while you took that as an indication
that the code should live elsewhere instead, answering question 2.

What you built with the shm-signal code is essentially a paravirtual nested
interrupt controller by another name, and deeply integrated into a new
bigger subsystem. I believe that this has significant disadvantages
over the approach of making it a standard interrupt controller driver:

* It completely avoids the infrastructure that we have built into Linux
  to deal with interrupts, e.g. /proc/interrupts statistics, IRQ
  balancing and CPU affinity.

* It makes it impossible to quantify the value of the feature to start with,
  which could be used to answer question 1 above.

* Less importantly, it does not work with any other drivers that might
  also benefit from a new interrupt controller -- if it is indeed better
  than the one we already have.

	Arnd <><
Gregory Haskins Aug. 7, 2009, 3:44 p.m. UTC | #8
>>> On 8/7/2009 at 10:57 AM, in message <200908071657.32858.arnd@arndb.de>, Arnd
Bergmann <arnd@arndb.de> wrote: 
> On Friday 07 August 2009, Gregory Haskins wrote:
>> >>> Arnd Bergmann <arnd@arndb.de> wrote: 
>> > On Thursday 06 August 2009, Gregory Haskins wrote:
>> > 
>> > >     2b) I also want to collapse multiple interrupts together so as to 
>> > > minimize the context switch rate (inject + EIO overhead).  My design 
>> > > effectively has "NAPI" for interrupt handling.  This helps when the system 
>> > > needs it the most: heavy IO.
>> > 
>> > That sounds like a very useful concept in general, but this seems to be a
>> > detail of the interrupt controller implementation. If the IO-APIC cannot
>> > do what you want here, maybe we just need a paravirtual IRQ controller
>> > driver, like e.g. the PS3 has.
>> 
>> Yeah, I agree this could be a function of the APIC code.  Do note that I
>> mentioned this in passing to Avi a few months ago but FWIW he indicated
>> at that time that he is not interested in making the APIC PV.
>> 
>> Also, I almost forgot an important one.  Add:
>> 
>>    2c) Interrupt prioritization.  I want to be able to assign priority
>>    to interrupts and handle them in priority order.
> 
> I think this part of the interface has developed into the wrong direction
> because you confused two questions:
> 
> 1. should you build an advanced interrupt mechanism for virtual drivers?
> 2. how should you build an advanced interrupt mechanism for virtual drivers?
> 
> My guess is that when Avi said he did not want a paravirtual IO-APIC,
> he implied that the existing one is good enough (maybe Avi can clarify that
> point himself) answering question 1, while you took that as an indication
> that the code should live elsewhere instead, answering question 2.
> 
> What you built with the shm-signal code is essentially a paravirtual nested
> interrupt controller by another name, and deeply integrated into a new
> bigger subsystem. I believe that this has significant disadvantages
> over the approach of making it a standard interrupt controller driver:
> 
> * It completely avoids the infrastructure that we have built into Linux
>   to deal with interrupts, e.g. /proc/interrupts statistics, IRQ
>   balancing and CPU affinity.
> 
> * It makes it impossible to quantify the value of the feature to start with,
>   which could be used to answer question 1 above.
> 
> * Less importantly, it does not work with any other drivers that might
>   also benefit from a new interrupt controller -- if it is indeed better
>   than the one we already have.
> 
> 	Arnd <><

Hi Arnd,

I don't strongly disagree with anything you said (except perhaps with the suggestion that I confused the question).  I agree that the PCI-bridge effectively implements something akin to an interrupt controller.  I agree that this interrupt controller, if indeed superior (I believe it is), can only benefit devices behind the bridge instead of all of KVM (this in and of itself doesn't bother me, since I plan on basing all of my performance work on that bus, but I digress.  Also note that this is not dissimilar to how other bridge+bus combinations (think USB, SCSI) operate).  I agree that a potentially more ideal solution would be a proper generic PV interrupt controller that exhibited traits similar to what I describe (priority, inject+EIO overhead reduction, etc.) so that all of KVM benefited.

The issue wasn't that I didn't know these things.  The issue is that I have no control over whether such an intrusive change to KVM (and the guest arch code) is accepted (and at least one relevant maintainer expressed dissatisfaction (*) with the idea when it was proposed).  Conversely, I am the maintainer of AlacrityVM, so I do have control over the bridge design. ;)  Also note that this particular design decision is completely encapsulated within alacrityvm's components.  IOW, I am not foisting my ideas on the entire kernel tree: if someone doesn't like what I have done, they can choose not to use alacrity, and it's as if my ideas never existed.  The important thing about this distinction is that I am not changing how core Linux or core KVM works in the process, only the one little piece of the world that particularly interests me.

That said, if attitudes about some of these ideas have changed, I may be able to break that piece out and start submitting it to kvm@ as some kind of PV interrupt controller.  I would only be interested in doing so if Avi et al. express an openness to the idea; i.e., I don't want to waste my time any more than anyone else's.

Kind Regards,
-Greg

(*) I think Avi said something to the effect of "you are falling into the 'lets PV the world' trap"

Ira Snyder Aug. 7, 2009, 3:55 p.m. UTC | #9
On Thu, Aug 06, 2009 at 10:42:00PM -0600, Gregory Haskins wrote:
> >>> On 8/6/2009 at  6:57 PM, in message <200908070057.54795.arnd@arndb.de>, Arnd
> Bergmann <arnd@arndb.de> wrote: 

[ big snip ]

> >> I see a few issues with that, however:
> >> 
> >> 1) The virtqueue library, while a perfectly nice ring design at the metadata 
> > level, does not have an API that is friendly to kernel-to-kernel communication. 
> >  It was designed more for frontend use to some remote backend.  The IOQ 
> > library on the other hand, was specifically designed to support use as 
> > kernel-to-kernel (see north/south designations).  So this made life easier 
> > for me.  To do what you propose, the eventq channel would need to terminate 
> > in kernel, and I would thus be forced to deal that the potential API 
> > problems.
> > 
> > Well, virtqueues are not that bad for kernel-to-kernel communication, as Ira 
> > mentioned
> > referring to his virtio-over-PCI driver. You can have virtqueues on both 
> > sides, having
> > the host kernel create a pair of virtqueues (one in user aka guest space, 
> > one in the host
> > kernel), with the host virtqueue_ops doing copy_{to,from}_user to move data 
> > between them.
> 
> Its been a while since I looked, so perhaps I am wrong here.  I will look again.
> 
> > 
> > If you have that, you can actually use the same virtio_net driver in both 
> > guest and
> > host kernel, just communicating over different virtio implementations. 
> > Interestingly,
> > that would mean that you no longer need a separation between guest and host 
> > device
> > drivers (vbus and vbus-proxy in your case) but could use the same device 
> > abstraction
> > with just different transports to back the shm-signal or virtqueue.
> 
> Actually, I think there are some problems with that model (such as management of the interface).  virtio-net really wants to connect to a virtio-net-backend (such as the one in qemu or vbus).  It wasn't designed to connect back to back like that.  I think you will quickly run into problems similar to what Ira faced with virtio-over-PCI with that model.
> 

Getting the virtio-net devices talking to each other over PCI was not
terribly difficult. However, the capabilities negotiation works in a
VERY awkward way. The capabilities negotiation was really designed with
a virtio-net-backend in mind. Unless I'm missing something obvious, it
is essentially broken for the case where two virtio-net's are talking to
each other.

For example, imagine the situation where you'd like the guest to get a
specific MAC address, but you do not care what MAC address the host
receives.

Normally, you'd set struct virtio_net_config's mac[] field, and set the
VIRTIO_NET_F_MAC feature. However, when you have two virtio-net's
communicating directly, this doesn't work.

Let me explain with a quick diagram. The results described are the
values RETURNED from virtio_config_ops->get() and
virtio_config_ops->get_features() when called by the system in question.

Guest System
1) struct virtio_net_config->mac[]: 00:11:22:33:44:55
2) features: VIRTIO_NET_F_MAC

Host System
1) struct virtio_net_config->mac[]: unset
2) features: VIRTIO_NET_F_MAC unset

In this case, the feature negotiation code will not accept the
VIRTIO_NET_F_MAC feature, and both systems will generate random MAC
addresses. Not the behavior we want at all.

I "fixed" the problem by ALWAYS setting a random MAC address, and ALWAYS
setting the VIRTIO_NET_F_MAC feature. By doing this, both sides always
negotiate the VIRTIO_NET_F_MAC feature.
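
In code, the workaround amounted to something like this on both sides
(paraphrased, not the exact driver source; fixup_mac_config() is just for
illustration):

#include <linux/etherdevice.h>
#include <linux/virtio_net.h>

/*
 * Always claim a MAC: generate a random one and advertise the feature,
 * so that feature negotiation between two virtio-nets always agrees.
 */
static void fixup_mac_config(struct virtio_net_config *cfg, u32 *features)
{
	random_ether_addr(cfg->mac);
	*features |= (1 << VIRTIO_NET_F_MAC);
}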

In conclusion, the feature negotiation works fine for driver features,
such as VIRTIO_NET_MRG_RXBUF or VIRTIO_NET_F_GSO, but completely breaks
down for user-controlled features, like VIRTIO_NET_F_MAC.

I think the vbus configfs interface works great for this situation,
because it has an obvious and separate backend. It is obvious where the
configuration information is coming from.

With my powerpc hardware, it should be easily possible to have at least
6 devices, each with two virtqueues, one for tx and one for rx. (The
limit is caused by the amount of distinct kick() events I can generate.)
This could allow many combinations of devices, such as:

* 1 virtio-net, 1 virtio-console
* 3 virtio-net, 2 virtio-console
* 6 virtio-net
* etc.

In all honesty, I don't really care so much about the management
interface for my purposes. A static configuration of devices works for
me. However, I doubt that would ever be accepted into the upstream
kernel, which is what I'm really concerned about. I hate seeing drivers
live out-of-tree.

Getting two virtio-net's talking to each other had one other major
problem: the fields in struct virtio_net_hdr are not defined with a
constant endianness. When connecting two virtio-net's running on
different machines, they may have different endianness, as in my case
between a big-endian powerpc guest and a little-endian x86 host. I'm not
confident that qemu-system-ppc, running on x86, using virtio for the
network interface, even works at all. (I have not tested it.)
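
Just to illustrate what a fix might look like (hypothetical: nothing in the
current header mandates a wire endianness), if the format were declared
little-endian then a big-endian peer would have to swap every multi-byte
field explicitly, e.g.:

#include <linux/virtio_net.h>
#include <asm/byteorder.h>

/*
 * Hypothetical: only needed if the header were defined as little-endian
 * on the wire.  Today each side writes CPU-native values, so a big-endian
 * ppc guest and a little-endian x86 host disagree on e.g. gso_size.
 */
static void virtio_net_hdr_to_le(struct virtio_net_hdr *hdr)
{
	hdr->hdr_len     = cpu_to_le16(hdr->hdr_len);
	hdr->gso_size    = cpu_to_le16(hdr->gso_size);
	hdr->csum_start  = cpu_to_le16(hdr->csum_start);
	hdr->csum_offset = cpu_to_le16(hdr->csum_offset);
}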

Sorry that this got somewhat long-winded and went a little off-topic.
Ira
Gregory Haskins Aug. 7, 2009, 6:25 p.m. UTC | #10
Ira W. Snyder wrote:

<big snip>

> With my powerpc hardware, it should be easily possible to have at least
> 6 devices, each with two virtqueues, one for tx and one for rx. (The
> limit is caused by the amount of distinct kick() events I can generate.)
> This could allow many combinations of devices, such as:
> 
> * 1 virtio-net, 1 virtio-console
> * 3 virtio-net, 2 virtio-console
> * 6 virtio-net
> * etc.

Note that the vbus "connector" design abstracts details such as kick
events, so you are not limited by the number of physical interrupts on
your hardware, per se.

You can actually have an arbitrarily wide namespace for your virtqueues
running over a single interrupt, if you were so inclined.  (Note that you
can also have a 1:1 mapping with interrupts if you like.)

If you look at the design of the vbus-pcibridge connector, it actually
aggregates the entire namespace into between 1 and 8 interrupts
(depending on availability, and separated by priority level).  If
multiple vectors are not available, the entire protocol can fall back to
running over a single vector.
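
Purely as an illustration of the idea (the actual policy lives in the
host-side connector and is not part of this patch; the helper below is made
up), mapping an arbitrary number of prioritized signals onto however many
vectors are available can be as simple as:

/* illustrative only: group signals by priority onto 1..nvectors vectors,
 * degenerating to a single shared vector when that is all we have */
static unsigned int signal_to_vector(unsigned int prio, unsigned int nvectors)
{
	if (nvectors <= 1)
		return 0;                       /* single-vector fallback */

	return prio < nvectors ? prio : nvectors - 1;
}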

Kind Regards,
-Greg

Patch

diff --git a/drivers/vbus/Kconfig b/drivers/vbus/Kconfig
index e1939f5..87c545d 100644
--- a/drivers/vbus/Kconfig
+++ b/drivers/vbus/Kconfig
@@ -12,3 +12,13 @@  config VBUS_PROXY
 	in a virtualization solution which implements virtual-bus devices
 	on the backend, say Y.  If unsure, say N.
 
+config VBUS_PCIBRIDGE
+       tristate "PCI to Virtual-Bus bridge"
+       depends on PCI
+       depends on VBUS_PROXY
+       select IOQ
+       default n
+       help
+        Provides a way to bridge host side vbus devices via a PCI-BRIDGE
+        object.  If you are running virtualization with vbus devices on the
+        host, and the vbus is exposed via PCI, say Y.  Otherwise, say N.
diff --git a/drivers/vbus/Makefile b/drivers/vbus/Makefile
index a29a1e0..944b7f1 100644
--- a/drivers/vbus/Makefile
+++ b/drivers/vbus/Makefile
@@ -1,3 +1,6 @@ 
 
 vbus-proxy-objs += bus-proxy.o
 obj-$(CONFIG_VBUS_PROXY) += vbus-proxy.o
+
+vbus-pcibridge-objs += pci-bridge.o
+obj-$(CONFIG_VBUS_PCIBRIDGE) += vbus-pcibridge.o
diff --git a/drivers/vbus/pci-bridge.c b/drivers/vbus/pci-bridge.c
new file mode 100644
index 0000000..b21a9a3
--- /dev/null
+++ b/drivers/vbus/pci-bridge.c
@@ -0,0 +1,824 @@ 
+/*
+ * Copyright (C) 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ *	Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/mm.h>
+#include <linux/workqueue.h>
+#include <linux/ioq.h>
+#include <linux/interrupt.h>
+#include <linux/vbus_driver.h>
+#include <linux/vbus_pci.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");
+
+#define VBUS_PCI_NAME "pci-to-vbus-bridge"
+
+struct vbus_pci {
+	spinlock_t                lock;
+	struct pci_dev           *dev;
+	struct ioq                eventq;
+	struct vbus_pci_event    *ring;
+	struct vbus_pci_regs     *regs;
+	void                     *piosignal;
+	int                       irq;
+	int                       enabled:1;
+};
+
+static struct vbus_pci vbus_pci;
+
+struct vbus_pci_device {
+	char                     type[VBUS_MAX_DEVTYPE_LEN];
+	u64                      handle;
+	struct list_head         shms;
+	struct vbus_device_proxy vdev;
+	struct work_struct       add;
+	struct work_struct       drop;
+};
+
+/*
+ * -------------------
+ * common routines
+ * -------------------
+ */
+
+static int
+vbus_pci_hypercall(unsigned long nr, void *data, unsigned long len)
+{
+	struct vbus_pci_hypercall params = {
+		.vector = nr,
+		.len    = len,
+		.datap  = __pa(data),
+	};
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&vbus_pci.lock, flags);
+
+	memcpy_toio(&vbus_pci.regs->hypercall.data, &params, sizeof(params));
+	ret = ioread32(&vbus_pci.regs->hypercall.result);
+
+	spin_unlock_irqrestore(&vbus_pci.lock, flags);
+
+	return ret;
+}
+
+struct vbus_pci_device *
+to_dev(struct vbus_device_proxy *vdev)
+{
+	return container_of(vdev, struct vbus_pci_device, vdev);
+}
+
+static void
+_signal_init(struct shm_signal *signal, struct shm_signal_desc *desc,
+	     struct shm_signal_ops *ops)
+{
+	desc->magic = SHM_SIGNAL_MAGIC;
+	desc->ver   = SHM_SIGNAL_VER;
+
+	shm_signal_init(signal, shm_locality_north, ops, desc);
+}
+
+/*
+ * -------------------
+ * _signal
+ * -------------------
+ */
+
+struct _signal {
+	struct vbus_pci   *pcivbus;
+	struct shm_signal  signal;
+	u32                handle;
+	struct rb_node     node;
+	struct list_head   list;
+};
+
+static struct _signal *
+to_signal(struct shm_signal *signal)
+{
+       return container_of(signal, struct _signal, signal);
+}
+
+static int
+_signal_inject(struct shm_signal *signal)
+{
+	struct _signal *_signal = to_signal(signal);
+
+	iowrite32(_signal->handle, vbus_pci.piosignal);
+
+	return 0;
+}
+
+static void
+_signal_release(struct shm_signal *signal)
+{
+	struct _signal *_signal = to_signal(signal);
+
+	kfree(_signal);
+}
+
+static struct shm_signal_ops _signal_ops = {
+	.inject  = _signal_inject,
+	.release = _signal_release,
+};
+
+/*
+ * -------------------
+ * vbus_device_proxy routines
+ * -------------------
+ */
+
+static int
+vbus_pci_device_open(struct vbus_device_proxy *vdev, int version, int flags)
+{
+	struct vbus_pci_device *dev = to_dev(vdev);
+	struct vbus_pci_deviceopen params;
+	int ret;
+
+	if (dev->handle)
+		return -EINVAL;
+
+	params.devid   = vdev->id;
+	params.version = version;
+
+	ret = vbus_pci_hypercall(VBUS_PCI_HC_DEVOPEN,
+				 &params, sizeof(params));
+	if (ret < 0)
+		return ret;
+
+	dev->handle = params.handle;
+
+	return 0;
+}
+
+static int
+vbus_pci_device_close(struct vbus_device_proxy *vdev, int flags)
+{
+	struct vbus_pci_device *dev = to_dev(vdev);
+	unsigned long iflags;
+	int ret;
+
+	if (!dev->handle)
+		return -EINVAL;
+
+	spin_lock_irqsave(&vbus_pci.lock, iflags);
+
+	while (!list_empty(&dev->shms)) {
+		struct _signal *_signal;
+
+		_signal = list_first_entry(&dev->shms, struct _signal, list);
+
+		list_del(&_signal->list);
+
+		spin_unlock_irqrestore(&vbus_pci.lock, iflags);
+		shm_signal_put(&_signal->signal);
+		spin_lock_irqsave(&vbus_pci.lock, iflags);
+	}
+
+	spin_unlock_irqrestore(&vbus_pci.lock, iflags);
+
+	/*
+	 * The DEVICECLOSE will implicitly close all of the shm on the
+	 * host-side, so there is no need to do an explicit per-shm
+	 * hypercall
+	 */
+	ret = vbus_pci_hypercall(VBUS_PCI_HC_DEVCLOSE,
+				 &dev->handle, sizeof(dev->handle));
+
+	if (ret < 0)
+		printk(KERN_ERR "VBUS-PCI: Error closing device %s/%lld: %d\n",
+		       vdev->type, vdev->id, ret);
+
+	dev->handle = 0;
+
+	return 0;
+}
+
+static int
+vbus_pci_device_shm(struct vbus_device_proxy *vdev, int id, int prio,
+		    void *ptr, size_t len,
+		    struct shm_signal_desc *sdesc, struct shm_signal **signal,
+		    int flags)
+{
+	struct vbus_pci_device *dev = to_dev(vdev);
+	struct _signal *_signal = NULL;
+	struct vbus_pci_deviceshm params;
+	unsigned long iflags;
+	int ret;
+
+	if (!dev->handle)
+		return -EINVAL;
+
+	params.devh   = dev->handle;
+	params.id     = id;
+	params.flags  = flags;
+	params.datap  = (u64)__pa(ptr);
+	params.len    = len;
+
+	if (signal) {
+		/*
+		 * The signal descriptor must be embedded within the
+		 * provided ptr
+		 */
+		if (!sdesc
+		    || (len < sizeof(*sdesc))
+		    || ((void *)sdesc < ptr)
+		    || ((void *)sdesc > (ptr + len - sizeof(*sdesc))))
+			return -EINVAL;
+
+		_signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
+		if (!_signal)
+			return -ENOMEM;
+
+		_signal_init(&_signal->signal, sdesc, &_signal_ops);
+
+		/*
+		 * take another reference for the host.  This is dropped
+		 * by a SHMCLOSE event
+		 */
+		shm_signal_get(&_signal->signal);
+
+		params.signal.offset = (u64)sdesc - (u64)ptr;
+		params.signal.prio   = prio;
+		params.signal.cookie = (u64)_signal;
+
+	} else
+		params.signal.offset = -1; /* yes, this is a u32, but its ok */
+
+	ret = vbus_pci_hypercall(VBUS_PCI_HC_DEVSHM,
+				 &params, sizeof(params));
+	if (ret < 0) {
+		if (_signal) {
+			/*
+			 * We held two references above, so we need to drop
+			 * both of them
+			 */
+			shm_signal_put(&_signal->signal);
+			shm_signal_put(&_signal->signal);
+		}
+
+		return ret;
+	}
+
+	if (signal) {
+		_signal->handle = ret;
+
+		spin_lock_irqsave(&vbus_pci.lock, iflags);
+
+		list_add_tail(&_signal->list, &dev->shms);
+
+		spin_unlock_irqrestore(&vbus_pci.lock, iflags);
+
+		shm_signal_get(&_signal->signal);
+		*signal = &_signal->signal;
+	}
+
+	return 0;
+}
+
+static int
+vbus_pci_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
+		     size_t len, int flags)
+{
+	struct vbus_pci_device *dev = to_dev(vdev);
+	struct vbus_pci_devicecall params = {
+		.devh  = dev->handle,
+		.func  = func,
+		.datap = (u64)__pa(data),
+		.len   = len,
+		.flags = flags,
+	};
+
+	if (!dev->handle)
+		return -EINVAL;
+
+	return vbus_pci_hypercall(VBUS_PCI_HC_DEVCALL, &params, sizeof(params));
+}
+
+static void
+vbus_pci_device_release(struct vbus_device_proxy *vdev)
+{
+	struct vbus_pci_device *_dev = to_dev(vdev);
+
+	vbus_pci_device_close(vdev, 0);
+
+	kfree(_dev);
+}
+
+struct vbus_device_proxy_ops vbus_pci_device_ops = {
+	.open    = vbus_pci_device_open,
+	.close   = vbus_pci_device_close,
+	.shm     = vbus_pci_device_shm,
+	.call    = vbus_pci_device_call,
+	.release = vbus_pci_device_release,
+};
+
+/*
+ * -------------------
+ * vbus events
+ * -------------------
+ */
+
+static void
+deferred_devadd(struct work_struct *work)
+{
+	struct vbus_pci_device *new;
+	int ret;
+
+	new = container_of(work, struct vbus_pci_device, add);
+
+	ret = vbus_device_proxy_register(&new->vdev);
+	if (ret < 0)
+		panic("failed to register device %lld(%s): %d\n",
+		      new->vdev.id, new->type, ret);
+}
+
+static void
+deferred_devdrop(struct work_struct *work)
+{
+	struct vbus_pci_device *dev;
+
+	dev = container_of(work, struct vbus_pci_device, drop);
+	vbus_device_proxy_unregister(&dev->vdev);
+}
+
+static void
+event_devadd(struct vbus_pci_add_event *event)
+{
+	struct vbus_pci_device *new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "VBUS_PCI: Out of memory on add_event\n");
+		return;
+	}
+
+	INIT_LIST_HEAD(&new->shms);
+
+	memcpy(new->type, event->type, VBUS_MAX_DEVTYPE_LEN);
+	new->vdev.type        = new->type;
+	new->vdev.id          = event->id;
+	new->vdev.ops         = &vbus_pci_device_ops;
+
+	dev_set_name(&new->vdev.dev, "%lld", event->id);
+
+	INIT_WORK(&new->add, deferred_devadd);
+	INIT_WORK(&new->drop, deferred_devdrop);
+
+	schedule_work(&new->add);
+}
+
+static void
+event_devdrop(struct vbus_pci_handle_event *event)
+{
+	struct vbus_device_proxy *dev = vbus_device_proxy_find(event->handle);
+
+	if (!dev) {
+		printk(KERN_WARNING "VBUS-PCI: devdrop failed: %lld\n",
+		       event->handle);
+		return;
+	}
+
+	schedule_work(&to_dev(dev)->drop);
+}
+
+static void
+event_shmsignal(struct vbus_pci_handle_event *event)
+{
+	struct _signal *_signal = (struct _signal *)event->handle;
+
+	_shm_signal_wakeup(&_signal->signal);
+}
+
+static void
+event_shmclose(struct vbus_pci_handle_event *event)
+{
+	struct _signal *_signal = (struct _signal *)event->handle;
+
+	/*
+	 * This reference was taken during the DEVICESHM call
+	 */
+	shm_signal_put(&_signal->signal);
+}
+
+/*
+ * -------------------
+ * eventq routines
+ * -------------------
+ */
+
+static struct ioq_notifier eventq_notifier;
+
+static int __init
+eventq_init(int qlen)
+{
+	struct ioq_iterator iter;
+	int ret;
+	int i;
+
+	vbus_pci.ring = kzalloc(sizeof(struct vbus_pci_event) * qlen,
+				GFP_KERNEL);
+	if (!vbus_pci.ring)
+		return -ENOMEM;
+
+	/*
+	 * We want to iterate on the "valid" index.  By default the iterator
+	 * will not "autoupdate" which means it will not hypercall the host
+	 * with our changes.  This is good, because we are really just
+	 * initializing stuff here anyway.  Note that you can always manually
+	 * signal the host with ioq_signal() if the autoupdate feature is not
+	 * used.
+	 */
+	ret = ioq_iter_init(&vbus_pci.eventq, &iter, ioq_idxtype_valid, 0);
+	BUG_ON(ret < 0);
+
+	/*
+	 * Seek to the tail of the valid index (which should be our first
+	 * item since the queue is brand-new)
+	 */
+	ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+	BUG_ON(ret < 0);
+
+	/*
+	 * Now populate each descriptor with an empty vbus_event and mark it
+	 * valid
+	 */
+	for (i = 0; i < qlen; i++) {
+		struct vbus_pci_event *event = &vbus_pci.ring[i];
+		size_t                 len   = sizeof(*event);
+		struct ioq_ring_desc  *desc  = iter.desc;
+
+		BUG_ON(iter.desc->valid);
+
+		desc->cookie = (u64)event;
+		desc->ptr    = (u64)__pa(event);
+		desc->len    = len; /* total length  */
+		desc->valid  = 1;
+
+		/*
+		 * This push operation will simultaneously advance the
+		 * valid-tail index and increment our position in the queue
+		 * by one.
+		 */
+		ret = ioq_iter_push(&iter, 0);
+		BUG_ON(ret < 0);
+	}
+
+	vbus_pci.eventq.notifier = &eventq_notifier;
+
+	/*
+	 * And finally, ensure that we can receive notification
+	 */
+	ioq_notify_enable(&vbus_pci.eventq, 0);
+
+	return 0;
+}
+
+/* Invoked whenever the hypervisor ioq_signal()s our eventq */
+static void
+eventq_wakeup(struct ioq_notifier *notifier)
+{
+	struct ioq_iterator iter;
+	int ret;
+
+	/* We want to iterate on the head of the in-use index */
+	ret = ioq_iter_init(&vbus_pci.eventq, &iter, ioq_idxtype_inuse, 0);
+	BUG_ON(ret < 0);
+
+	ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+	BUG_ON(ret < 0);
+
+	/*
+	 * The EOM is indicated by finding a packet that is still owned by
+	 * the south side.
+	 *
+	 * FIXME: This in theory could run indefinitely if the host keeps
+	 * feeding us events since there is nothing like a NAPI budget.  We
+	 * might need to address that
+	 */
+	while (!iter.desc->sown) {
+		struct ioq_ring_desc *desc  = iter.desc;
+		struct vbus_pci_event *event;
+
+		event = (struct vbus_pci_event *)desc->cookie;
+
+		switch (event->eventid) {
+		case VBUS_PCI_EVENT_DEVADD:
+			event_devadd(&event->data.add);
+			break;
+		case VBUS_PCI_EVENT_DEVDROP:
+			event_devdrop(&event->data.handle);
+			break;
+		case VBUS_PCI_EVENT_SHMSIGNAL:
+			event_shmsignal(&event->data.handle);
+			break;
+		case VBUS_PCI_EVENT_SHMCLOSE:
+			event_shmclose(&event->data.handle);
+			break;
+		default:
+			printk(KERN_WARNING "VBUS_PCI: Unexpected event %d\n",
+			       event->eventid);
+			break;
+		};
+
+		memset(event, 0, sizeof(*event));
+
+		/* Advance the in-use head */
+		ret = ioq_iter_pop(&iter, 0);
+		BUG_ON(ret < 0);
+	}
+
+	/* And let the south side know that we changed the queue */
+	ioq_signal(&vbus_pci.eventq, 0);
+}
+
+static struct ioq_notifier eventq_notifier = {
+	.signal = &eventq_wakeup,
+};
+
+/* Injected whenever the host issues an ioq_signal() on the eventq */
+irqreturn_t
+eventq_intr(int irq, void *dev)
+{
+	_shm_signal_wakeup(vbus_pci.eventq.signal);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * -------------------
+ */
+
+static int
+eventq_signal_inject(struct shm_signal *signal)
+{
+	/* The eventq uses the special-case handle=0 */
+	iowrite32(0, vbus_pci.piosignal);
+
+	return 0;
+}
+
+static void
+eventq_signal_release(struct shm_signal *signal)
+{
+	kfree(signal);
+}
+
+static struct shm_signal_ops eventq_signal_ops = {
+	.inject  = eventq_signal_inject,
+	.release = eventq_signal_release,
+};
+
+/*
+ * -------------------
+ */
+
+static void
+eventq_ioq_release(struct ioq *ioq)
+{
+	/* released as part of the vbus_pci object */
+}
+
+static struct ioq_ops eventq_ioq_ops = {
+	.release = eventq_ioq_release,
+};
+
+/*
+ * -------------------
+ */
+
+static void
+vbus_pci_release(void)
+{
+	if (vbus_pci.irq > 0)
+		free_irq(vbus_pci.irq, NULL);
+
+	if (vbus_pci.piosignal)
+		pci_iounmap(vbus_pci.dev, (void *)vbus_pci.piosignal);
+
+	if (vbus_pci.regs)
+		pci_iounmap(vbus_pci.dev, (void *)vbus_pci.regs);
+
+	pci_release_regions(vbus_pci.dev);
+	pci_disable_device(vbus_pci.dev);
+
+	kfree(vbus_pci.eventq.head_desc);
+	kfree(vbus_pci.ring);
+
+	vbus_pci.enabled = false;
+}
+
+static int __init
+vbus_pci_open(void)
+{
+	struct vbus_pci_negotiate params = {
+		.magic        = VBUS_PCI_ABI_MAGIC,
+		.version      = VBUS_PCI_HC_VERSION,
+		.capabilities = 0,
+	};
+
+	return vbus_pci_hypercall(VBUS_PCI_HC_NEGOTIATE,
+				  &params, sizeof(params));
+}
+
+#define QLEN 1024
+
+static int __init
+vbus_pci_register(void)
+{
+	struct vbus_pci_busreg params = {
+		.count = 1,
+		.eventq = {
+			{
+				.count = QLEN,
+				.ring  = (u64)__pa(vbus_pci.eventq.head_desc),
+				.data  = (u64)__pa(vbus_pci.ring),
+			},
+		},
+	};
+
+	return vbus_pci_hypercall(VBUS_PCI_HC_BUSREG, &params, sizeof(params));
+}
+
+static int __init
+_ioq_init(size_t ringsize, struct ioq *ioq, struct ioq_ops *ops)
+{
+	struct shm_signal    *signal = NULL;
+	struct ioq_ring_head *head = NULL;
+	size_t                len  = IOQ_HEAD_DESC_SIZE(ringsize);
+
+	head = kzalloc(len, GFP_KERNEL | GFP_DMA);
+	if (!head)
+		return -ENOMEM;
+
+	signal = kzalloc(sizeof(*signal), GFP_KERNEL);
+	if (!signal) {
+		kfree(head);
+		return -ENOMEM;
+	}
+
+	head->magic     = IOQ_RING_MAGIC;
+	head->ver	= IOQ_RING_VER;
+	head->count     = ringsize;
+
+	_signal_init(signal, &head->signal, &eventq_signal_ops);
+
+	ioq_init(ioq, ops, ioq_locality_north, head, signal, ringsize);
+
+	return 0;
+}
+
+static int __devinit
+vbus_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int ret;
+
+	if (vbus_pci.enabled)
+		return -EEXIST; /* we only support one bridge per kernel */
+
+	if (pdev->revision != VBUS_PCI_ABI_VERSION) {
+		printk(KERN_DEBUG "VBUS_PCI: expected ABI version %d, got %d\n",
+		       VBUS_PCI_ABI_VERSION,
+		       pdev->revision);
+		return -ENODEV;
+	}
+
+	vbus_pci.dev = pdev;
+
+	ret = pci_enable_device(pdev);
+	if (ret < 0)
+		return ret;
+
+	ret = pci_request_regions(pdev, VBUS_PCI_NAME);
+	if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Could not init BARs: %d\n", ret);
+		goto out_fail;
+	}
+
+	vbus_pci.regs = pci_iomap(pdev, 0, sizeof(struct vbus_pci_regs));
+	if (!vbus_pci.regs) {
+		printk(KERN_ERR "VBUS_PCI: Could not map BARs\n");
+		goto out_fail;
+	}
+
+	vbus_pci.piosignal = pci_iomap(pdev, 1, sizeof(u32));
+	if (!vbus_pci.piosignal) {
+		printk(KERN_ERR "VBUS_PCI: Could not map BARs\n");
+		goto out_fail;
+	}
+
+	ret = vbus_pci_open();
+	if (ret < 0) {
+		printk(KERN_DEBUG "VBUS_PCI: Could not negotiate with host: %d\n",
+		       ret);
+		goto out_fail;
+	}
+
+	/*
+	 * Allocate an IOQ to use for host-2-guest event notification
+	 */
+	ret = _ioq_init(QLEN, &vbus_pci.eventq, &eventq_ioq_ops);
+	if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Could not init eventq: %d\n", ret);
+		goto out_fail;
+	}
+
+	ret = eventq_init(QLEN);
+	if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Could not set up ring: %d\n", ret);
+		goto out_fail;
+	}
+
+	ret = pci_enable_msi(pdev);
+	if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Could not enable MSI: %d\n", ret);
+		goto out_fail;
+	}
+
+	vbus_pci.irq = pdev->irq;
+
+	ret = request_irq(pdev->irq, eventq_intr, 0, "vbus", NULL);
+	if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Failed to register IRQ %d: %d\n",
+		       pdev->irq, ret);
+		goto out_fail;
+	}
+
+	/*
+	 * Finally register our queue on the host to start receiving events
+	 */
+	ret = vbus_pci_register();
+	if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Could not register with host: %d\n",
+		       ret);
+		goto out_fail;
+	}
+
+	vbus_pci.enabled = true;
+
+	printk(KERN_INFO "Virtual-Bus: Copyright (c) 2009, " \
+	       "Gregory Haskins <ghaskins@novell.com>\n");
+
+	return 0;
+
+ out_fail:
+	vbus_pci_release();
+
+	return ret;
+}
+
+static void __devexit
+vbus_pci_remove(struct pci_dev *pdev)
+{
+	vbus_pci_release();
+}
+
+static DEFINE_PCI_DEVICE_TABLE(vbus_pci_tbl) = {
+	{ PCI_DEVICE(0x11da, 0x2000) },
+	{ 0 },
+};
+
+MODULE_DEVICE_TABLE(pci, vbus_pci_tbl);
+
+static struct pci_driver vbus_pci_driver = {
+	.name     = VBUS_PCI_NAME,
+	.id_table = vbus_pci_tbl,
+	.probe    = vbus_pci_probe,
+	.remove   = vbus_pci_remove,
+};
+
+int __init
+vbus_pci_init(void)
+{
+	memset(&vbus_pci, 0, sizeof(vbus_pci));
+	spin_lock_init(&vbus_pci.lock);
+
+	return pci_register_driver(&vbus_pci_driver);
+}
+
+static void __exit
+vbus_pci_exit(void)
+{
+	pci_unregister_driver(&vbus_pci_driver);
+}
+
+module_init(vbus_pci_init);
+module_exit(vbus_pci_exit);
+
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 32b3eb8..fa15bbf 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -358,6 +358,7 @@  unifdef-y += uio.h
 unifdef-y += unistd.h
 unifdef-y += usbdevice_fs.h
 unifdef-y += utsname.h
+unifdef-y += vbus_pci.h
 unifdef-y += videodev2.h
 unifdef-y += videodev.h
 unifdef-y += virtio_config.h
diff --git a/include/linux/vbus_pci.h b/include/linux/vbus_pci.h
new file mode 100644
index 0000000..e18ff59
--- /dev/null
+++ b/include/linux/vbus_pci.h
@@ -0,0 +1,127 @@ 
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * PCI to Virtual-Bus Bridge
+ *
+ * Author:
+ *      Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VBUS_PCI_H
+#define _LINUX_VBUS_PCI_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define VBUS_PCI_ABI_MAGIC 0xbf53eef5
+#define VBUS_PCI_ABI_VERSION 1
+#define VBUS_PCI_HC_VERSION 1
+
+enum {
+	VBUS_PCI_HC_NEGOTIATE,
+	VBUS_PCI_HC_BUSREG,
+	VBUS_PCI_HC_DEVOPEN,
+	VBUS_PCI_HC_DEVCLOSE,
+	VBUS_PCI_HC_DEVCALL,
+	VBUS_PCI_HC_DEVSHM,
+
+	VBUS_PCI_HC_MAX,      /* must be last */
+};
+
+struct vbus_pci_negotiate {
+	__u32 magic;
+	__u32 version;
+	__u64 capabilities;
+};
+
+struct vbus_pci_deviceopen {
+	__u32 devid;
+	__u32 version; /* device ABI version */
+	__u64 handle; /* return value for devh */
+};
+
+struct vbus_pci_devicecall {
+	__u64 devh;   /* device-handle (returned from DEVICEOPEN) */
+	__u32 func;
+	__u32 len;
+	__u32 flags;
+	__u64 datap;
+};
+
+struct vbus_pci_deviceshm {
+	__u64 devh;   /* device-handle (returned from DEVICEOPEN) */
+	__u32 id;
+	__u32 len;
+	__u32 flags;
+	struct {
+		__u32 offset;
+		__u32 prio;
+		__u64 cookie; /* token to pass back when signaling client */
+	} signal;
+	__u64 datap;
+};
+
+struct vbus_pci_hypercall {
+	__u32 vector;
+	__u32 len;
+	__u64 datap;
+};
+
+struct vbus_pci_regs {
+	struct {
+		struct vbus_pci_hypercall data;
+		__u32                     result;
+	} hypercall;
+};
+
+struct vbus_pci_eventqreg {
+	__u32 count;
+	__u64 ring;
+	__u64 data;
+};
+
+struct vbus_pci_busreg {
+	__u32 count;  /* supporting multiple queues allows for prio, etc */
+	struct vbus_pci_eventqreg eventq[1];
+};
+
+enum vbus_pci_eventid {
+	VBUS_PCI_EVENT_DEVADD,
+	VBUS_PCI_EVENT_DEVDROP,
+	VBUS_PCI_EVENT_SHMSIGNAL,
+	VBUS_PCI_EVENT_SHMCLOSE,
+};
+
+#define VBUS_MAX_DEVTYPE_LEN 128
+
+struct vbus_pci_add_event {
+	__u64 id;
+	char  type[VBUS_MAX_DEVTYPE_LEN];
+};
+
+struct vbus_pci_handle_event {
+	__u64 handle;
+};
+
+struct vbus_pci_event {
+	__u32 eventid;
+	union {
+		struct vbus_pci_add_event    add;
+		struct vbus_pci_handle_event handle;
+	} data;
+};
+
+#endif /* _LINUX_VBUS_PCI_H */