Patchwork [7/7,v2] drivers/misc: introduce Freescale hypervisor management driver

login
register
mail settings
Submitter Timur Tabi
Date June 1, 2011, 6:35 p.m.
Message ID <1306953337-15698-1-git-send-email-timur@freescale.com>
Download mbox | patch
Permalink /patch/98233/
State Superseded
Headers show

Comments

Timur Tabi - June 1, 2011, 6:35 p.m.
The Freescale hypervisor management driver provides several services to
drivers and applications related to the Freescale hypervisor:

1. An ioctl interface for querying and managing partitions

2. A file interface to reading incoming doorbells

3. An interrupt handler for shutting down the partition upon receiving the
   shutdown doorbell from a manager partition

4. An interface for receiving callbacks when a managed partition shuts down.

Signed-off-by: Timur Tabi <timur@freescale.com>
---
 drivers/misc/Kconfig           |    7 +
 drivers/misc/Makefile          |    1 +
 drivers/misc/fsl_hypervisor.c  |  941 ++++++++++++++++++++++++++++++++++++++++
 include/linux/Kbuild           |    1 +
 include/linux/fsl_hypervisor.h |  203 +++++++++
 5 files changed, 1153 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/fsl_hypervisor.c
 create mode 100644 include/linux/fsl_hypervisor.h
Alan Cox - June 1, 2011, 7:46 p.m.
O> +	/* One partition must be local, the other must be remote.  In other
> +	   words, if source and target are both -1, or are both not -1, then
> +	   return an error. */
> +	if ((param.source == -1) == (param.target == -1))
> +		return -EINVAL;

Excess brackets (I just mention that one in passing)

> +	/* pages is an array of struct page pointers that's initialized by
> +	   get_user_pages() */
> +	pages = kzalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
> +	if (!pages) {
> +		pr_debug("fsl-hv: could not allocate page list\n");
> +		return -ENOMEM;
> +	}

pages allocated

> +
> +	/* sg_list is the list of fh_sg_list objects that we pass to the
> +	   hypervisor */
> +	sg_list_unaligned = kmalloc(nr_pages * sizeof(struct fh_sg_list) +
> +		sizeof(struct fh_sg_list) - 1, GFP_KERNEL);
> +	if (!sg_list_unaligned) {
> +		pr_debug("fsl-hv: could not allocate S/G list\n");

but not freed on this path

Although the other paths look fine.

> +exit:
> +	if (pages) {
> +		for (i = 0; i < nr_pages; i++)
> +			if (pages[i])
> +				page_cache_release(pages[i]);
> +	}
> +
> +	kfree(sg_list_unaligned);
> +	kfree(pages);

> +static char *strdup_from_user(const char __user *ustr, size_t max)
> +{
> +	size_t len;
> +	char *str;
> +
> +	len = strnlen_user(ustr, max);
> +	if (len > max)
> +		return ERR_PTR(-ENAMETOOLONG);
> +
> +	str = kmalloc(len, GFP_KERNEL);
> +	if (!str)
> +		return ERR_PTR(-ENOMEM);
> +
> +	if (copy_from_user(str, ustr, len))
> +		return ERR_PTR(-EFAULT);
> +
> +	return str;
> +}

This belongs on lib/ if its of general use which I think it perhaps is
and if we don't have one already.


> +	default:
> +		pr_debug("fsl-hv: unknown ioctl %u\n", cmd);
> +		ret = -ENOIOCTLCMD;

-ENOTTY

(-ENOIOCTLCMD is an internal indicator designed so driver layers can say
'dunno, try the next layer up')

> +/* Linked list of processes that have us open */
> +struct list_head db_list;

static ?


> + * We use the same interrupt handler for all doorbells.  Whenever a doorbell
> + * is rung, and we receive an interrupt, we just put the handle for that
> + * doorbell (passed to us as *data) into all of the queues.

I wonder if these should be presented as IRQs or whether that makes no
sense ?


> +static irqreturn_t fsl_hv_state_change_isr(int irq, void *data)
> +{
> +	unsigned int status;
> +	struct doorbell_isr *dbisr = data;
> +	int ret;
> +
> +	/* Determine the new state, and if it's stopped, notify the clients. */
> +	ret = fh_partition_get_status(dbisr->partition, &status);
> +	if (!ret && (status == FH_PARTITION_STOPPED))
> +		schedule_work(&dbisr->work);
> +
> +	/* Call the normal handler */
> +	return fsl_hv_isr(irq, (void *) (uintptr_t) dbisr->doorbell);
> +}

Would a threaded IRQ clean this up by avoiding the queue/work stuff ?


> +static irqreturn_t fsl_hv_shutdown_isr(int irq, void *data)
> +{
> +	schedule_work(&power_off);
> +
> +	/* We should never get here */

Probably worth printing something if you do (panic(...) ?)
Timur Tabi - June 1, 2011, 8:24 p.m.
Alan Cox wrote:
> O> +	/* One partition must be local, the other must be remote.  In other
>> +	   words, if source and target are both -1, or are both not -1, then
>> +	   return an error. */
>> +	if ((param.source == -1) == (param.target == -1))
>> +		return -EINVAL;
> 
> Excess brackets (I just mention that one in passing)'

Do you mean excess parentheses?  If so, then I don't see how.  "(param.source ==
-1)" and "(param.target == -1)" are expressions that return a boolean.  I'm
comparing the two boolean results to see if they're equal.  I want to make sure
that the compiler doesn't do something like this:

	if (param.source == (-1 == (param.target == -1)))

I don't even know what that means, but it's not what I want.

>> +static char *strdup_from_user(const char __user *ustr, size_t max)
>> +{
>> +	size_t len;
>> +	char *str;
>> +
>> +	len = strnlen_user(ustr, max);
>> +	if (len > max)
>> +		return ERR_PTR(-ENAMETOOLONG);
>> +
>> +	str = kmalloc(len, GFP_KERNEL);
>> +	if (!str)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	if (copy_from_user(str, ustr, len))
>> +		return ERR_PTR(-EFAULT);
>> +
>> +	return str;
>> +}
> 
> This belongs on lib/ if its of general use which I think it perhaps is
> and if we don't have one already.

Where exactly in lib/ should it go?  lib/strings.c seems too low-level for a
function like this.  lib/string_helpers.c is for sprintf-like functions.  And
it's too generic for lib/powerpc/

>> +	default:
>> +		pr_debug("fsl-hv: unknown ioctl %u\n", cmd);
>> +		ret = -ENOIOCTLCMD;
> 
> -ENOTTY
> 
> (-ENOIOCTLCMD is an internal indicator designed so driver layers can say
> 'dunno, try the next layer up')
> 
>> +/* Linked list of processes that have us open */
>> +struct list_head db_list;
> 
> static ?
> 
> 
>> + * We use the same interrupt handler for all doorbells.  Whenever a doorbell
>> + * is rung, and we receive an interrupt, we just put the handle for that
>> + * doorbell (passed to us as *data) into all of the queues.
> 
> I wonder if these should be presented as IRQs or whether that makes no
> sense ?

Well, the "handles" are supposed to be just unique numbers.  In this case, they
are IRQs, but we don't want to expose that.  The application is supposed to scan
the device tree looking for the doorbell nodes that it wants, and in those nodes
there is a 'reg' property that contains the handle for that doorbell.  So the
numbers just need to match.  What they represent is not relevant.

>> +static irqreturn_t fsl_hv_state_change_isr(int irq, void *data)
>> +{
>> +	unsigned int status;
>> +	struct doorbell_isr *dbisr = data;
>> +	int ret;
>> +
>> +	/* Determine the new state, and if it's stopped, notify the clients. */
>> +	ret = fh_partition_get_status(dbisr->partition, &status);
>> +	if (!ret && (status == FH_PARTITION_STOPPED))
>> +		schedule_work(&dbisr->work);
>> +
>> +	/* Call the normal handler */
>> +	return fsl_hv_isr(irq, (void *) (uintptr_t) dbisr->doorbell);
>> +}
> 
> Would a threaded IRQ clean this up by avoiding the queue/work stuff ?

Yes, I think so.  V3 coming up.
Alan Cox - June 1, 2011, 8:34 p.m.
On Wed, 1 Jun 2011 15:24:51 -0500
Timur Tabi <timur@freescale.com> wrote:

> Alan Cox wrote:
> > O> +	/* One partition must be local, the other must be remote.  In other
> >> +	   words, if source and target are both -1, or are both not -1, then
> >> +	   return an error. */
> >> +	if ((param.source == -1) == (param.target == -1))
> >> +		return -EINVAL;
> > 
> > Excess brackets (I just mention that one in passing)'
> 
> Do you mean excess parentheses?  If so, then I don't see how.  "(param.source ==
> -1)" and "(param.target == -1)" are expressions that return a boolean.  I'm
> comparing the two boolean results to see if they're equal

Ok - it's a bit non obvious but yes fair enough and it is commented.

> Where exactly in lib/ should it go?  lib/strings.c seems too low-level for a
> function like this.  lib/string_helpers.c is for sprintf-like functions.  And
> it's too generic for lib/powerpc/

I'd submit it to lib/string personally but I'm not sure where would be
better. If someone doesn't like lib/string they can suggest a better
place 8)

> Well, the "handles" are supposed to be just unique numbers.  In this case, they
> are IRQs, but we don't want to expose that.  The application is supposed to scan
> the device tree looking for the doorbell nodes that it wants, and in those nodes
> there is a 'reg' property that contains the handle for that doorbell.  So the
> numbers just need to match.  What they represent is not relevant.

Ok so they are always going to be exposed to users not to drivers, in
which case yes it makes sense. If they are going to get used by kernel
drivers as well an IRQ interface would probably also make sense.

Alan
Scott Wood - June 1, 2011, 8:54 p.m.
On Wed, 1 Jun 2011 20:46:18 +0100
Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:

> > +static char *strdup_from_user(const char __user *ustr, size_t max)
> > +{
> > +	size_t len;
> > +	char *str;
> > +
> > +	len = strnlen_user(ustr, max);
> > +	if (len > max)
> > +		return ERR_PTR(-ENAMETOOLONG);

if (len >= max)

> > +	str = kmalloc(len, GFP_KERNEL);
> > +	if (!str)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	if (copy_from_user(str, ustr, len))
> > +		return ERR_PTR(-EFAULT);
> > +
> > +	return str;
> > +}

Memory leak on the EFAULT path

If strnlen_user gets an exception, it'll return zero, causing a
zero-length kmalloc.  Will kmalloc(0, ...) return NULL?  If so, a bad
user pointer would result in -ENOMEM rather than -EFAULT.

> > +	default:
> > +		pr_debug("fsl-hv: unknown ioctl %u\n", cmd);
> > +		ret = -ENOIOCTLCMD;
> 
> -ENOTTY
> 
> (-ENOIOCTLCMD is an internal indicator designed so driver layers can say
> 'dunno, try the next layer up')

There's a check for -ENOIOCTLCMD in vfs_ioctl() -- though it generates
-EINVAL rather than -ENOTTY (why?).

Using -ENOIOCTLCMD consistently would make it easier to refactor a
toplevel ioctl handler into a nested one, plus consistency is good in
general.

> > + * We use the same interrupt handler for all doorbells.  Whenever a doorbell
> > + * is rung, and we receive an interrupt, we just put the handle for that
> > + * doorbell (passed to us as *data) into all of the queues.
> 
> I wonder if these should be presented as IRQs or whether that makes no
> sense ?

They are presented as IRQs.  This driver registers the same handler for all
doorbell IRQs, and pipes the notifications up to userspace, but you could
also directly register a kernel handler for an individual doorbell IRQ.

> > +static irqreturn_t fsl_hv_shutdown_isr(int irq, void *data)
> > +{
> > +	schedule_work(&power_off);
> > +
> > +	/* We should never get here */
> 
> Probably worth printing something if you do (panic(...) ?)

I don't think the comment is accurate.  We've just scheduled the workqueue,
not waited for it to complete.

Timur, shouldn't this schedule orderly_poweroff rather than
kernel_power_off?

-Scott
Arnd Bergmann - June 1, 2011, 9:40 p.m.
On Wednesday 01 June 2011, Timur Tabi wrote:
> The Freescale hypervisor management driver provides several services to
> drivers and applications related to the Freescale hypervisor:
> 
> 1. An ioctl interface for querying and managing partitions
> 
> 2. A file interface to reading incoming doorbells
> 
> 3. An interrupt handler for shutting down the partition upon receiving the
>    shutdown doorbell from a manager partition
> 
> 4. An interface for receiving callbacks when a managed partition shuts down.
> 
> Signed-off-by: Timur Tabi <timur@freescale.com>
> ---
>  drivers/misc/Kconfig           |    7 +
>  drivers/misc/Makefile          |    1 +
>  drivers/misc/fsl_hypervisor.c  |  941 ++++++++++++++++++++++++++++++++++++++++
>  include/linux/Kbuild           |    1 +
>  include/linux/fsl_hypervisor.h |  203 +++++++++
>  5 files changed, 1153 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/misc/fsl_hypervisor.c
>  create mode 100644 include/linux/fsl_hypervisor.h

I think drivers/misc is not the right place for this, but I'm not completely
sure what is. drivers/firmware would be better at least, but virt/fsl might
also be ok.

> +static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set)
> +{
> +	struct fsl_hv_ioctl_prop param;
> +	char __user *upath, *upropname;
> +	void __user *upropval;
> +	char *path = NULL, *propname = NULL;
> +	void *propval = NULL;
> +	int ret = 0;
> +

I'm not convinced that an ioctl interface is the right way to work with
device tree properties. A more natural way would be to export it as
a file system, or maybe as a flattened device tree blob (the latter option
would require changing the hypervisor interface, which might not be
possible).

> +/**
> + * fsl_hv_ioctl: ioctl main entry point
> + */
> +static long fsl_hv_ioctl(struct file *file, unsigned int cmd,
> +			 unsigned long argaddr)
> +{
> +	union fsl_hv_ioctl_param __user *arg =
> +		(union fsl_hv_ioctl_param __user *)argaddr;
> +	long ret;
> +

For an ioctl, please follow the normal pattern of defining a separate
structure for each case, no union.

You can use a void __user * in the common ioctl function, and pass that
to the typed argument list in the specific functions.

	Arnd
Alan Cox - June 1, 2011, 9:45 p.m.
> There's a check for -ENOIOCTLCMD in vfs_ioctl() -- though it generates
> -EINVAL rather than -ENOTTY (why?).

Good question - perhaps someone didn't read the spec 8)
Scott Wood - June 1, 2011, 10:24 p.m.
On Wed, 1 Jun 2011 23:40:14 +0200
Arnd Bergmann <arnd@arndb.de> wrote:

> > +static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set)
> > +{
> > +	struct fsl_hv_ioctl_prop param;
> > +	char __user *upath, *upropname;
> > +	void __user *upropval;
> > +	char *path = NULL, *propname = NULL;
> > +	void *propval = NULL;
> > +	int ret = 0;
> > +
> 
> I'm not convinced that an ioctl interface is the right way to work with
> device tree properties. A more natural way would be to export it as
> a file system, or maybe as a flattened device tree blob (the latter option
> would require changing the hypervisor interface, which might not be
> possible).

I wanted to have the hypervisor take an update dtb (we already have special
meta-properties for things like deletion as part of the hv config
mechanism).  But others on the project wanted to keep it simple, and so
get/set property it was. :-/

It's unlikely to change at this point without a real need.

As for a filesystem interface, it's not a good match either.  You can't
iterate over anything to read out the full tree from the hv.  You can't
delete anything.  You can't create empty nodes.  The hv interface was meant
to enable some specific management actions, rather than to provide general
device tree access.  This driver is a thin wrapper around the management
hcalls.

There would still be other ioctls needed for starting/stopping the
partition, etc.

> > +/**
> > + * fsl_hv_ioctl: ioctl main entry point
> > + */
> > +static long fsl_hv_ioctl(struct file *file, unsigned int cmd,
> > +			 unsigned long argaddr)
> > +{
> > +	union fsl_hv_ioctl_param __user *arg =
> > +		(union fsl_hv_ioctl_param __user *)argaddr;
> > +	long ret;
> > +
> 
> For an ioctl, please follow the normal pattern of defining a separate
> structure for each case, no union.

And have fsl_hypervisor.h provide the full set of proper ioctl numbers, with
the specific struct for each ioctl referenced, rather than having the client program do
"ioctl(f, _IOWR(0, cmd, union fsl_hv_ioctl_param), p)".

-Scott
Timur Tabi - June 2, 2011, 9:28 p.m.
Arnd Bergmann wrote:
> I think drivers/misc is not the right place for this, but I'm not completely
> sure what is. drivers/firmware would be better at least, but virt/fsl might
> also be ok.

I don't think it's correct to think of a hypervisor as firmware, so I don't
think drivers/firmware is better.

I'm not sure that creating virt/fsl and putting the driver in there is a good
idea, because it will be the only driver in that directory.  Unlike KVM, this
driver is just a collection of front-ends to our hypervisor API.  The actual
hypervisor is completely separate.  That's why I put it in drivers/misc, because
it's just a single driver with a miscellaneous collection of interfaces.

We also don't want to have any Kconfig options that "turn on" hypervisor
support.  I've intentionally made support for the hypervisor part of the normal
platform code, and the device tree is used to determine whether that code is
active or not.

So virt/fsl seems like overkill to me.

> I'm not convinced that an ioctl interface is the right way to work with
> device tree properties. A more natural way would be to export it as
> a file system, or maybe as a flattened device tree blob (the latter option
> would require changing the hypervisor interface, which might not be
> possible).

As Scott said, this is just a front-end to the hypervisor API, and we already
have applications that depend on the ioctl interface.  Considering that this
driver is specific to the Freescale hypervisor, so I don't see any harm in using
ioctls.

> For an ioctl, please follow the normal pattern of defining a separate
> structure for each case, no union.

Ok.  This will break our existing applications, but it's a minor fix.
Timur Tabi - June 3, 2011, 2:44 p.m.
Arnd Bergmann wrote:
> For an ioctl, please follow the normal pattern of defining a separate
> structure for each case, no union.
> 
> You can use a void __user * in the common ioctl function, and pass that
> to the typed argument list in the specific functions.

I have a GPL question.  This header file is currently licensed under the  GPL v2
only.  Does that mean that any application that includes this header file so
that it can talk to the driver/hypervisor also needs to be licensed under the GPL?
Arnd Bergmann - June 3, 2011, 3:17 p.m.
On Friday 03 June 2011, Timur Tabi wrote:
> Arnd Bergmann wrote:
> > For an ioctl, please follow the normal pattern of defining a separate
> > structure for each case, no union.
> > 
> > You can use a void __user * in the common ioctl function, and pass that
> > to the typed argument list in the specific functions.
> 
> I have a GPL question.  This header file is currently licensed under the  GPL v2
> only.  Does that mean that any application that includes this header file so
> that it can talk to the driver/hypervisor also needs to be licensed under the GPL?

If you have a license question, ask your lawyer.

Common answers that you would hear are:

* User space interfaces of the kernel are excluded from the License by the
  "normal system call" exception in linux/COPYING.

* If the header files only contain interfaces but no code, they are not
  copyrighted.

* If you want to ship a copy of the file with a user space application source,
  but have to make it available under multiple licenses to do that, e.g. dual
  GPL/BSD.

	Arnd
Arnd Bergmann - June 3, 2011, 3:24 p.m.
On Thursday 02 June 2011, Timur Tabi wrote:
> Arnd Bergmann wrote:
> > I think drivers/misc is not the right place for this, but I'm not completely
> > sure what is. drivers/firmware would be better at least, but virt/fsl might
> > also be ok.
> 
> I don't think it's correct to think of a hypervisor as firmware, so I don't
> think drivers/firmware is better.
> 
> I'm not sure that creating virt/fsl and putting the driver in there is a good
> idea, because it will be the only driver in that directory.  Unlike KVM, this
> driver is just a collection of front-ends to our hypervisor API.  The actual
> hypervisor is completely separate.  That's why I put it in drivers/misc, because
> it's just a single driver with a miscellaneous collection of interfaces.

Ok, fair enough. If nobody has a strong preference any other way, just make it
drivers/firmware then.

> We also don't want to have any Kconfig options that "turn on" hypervisor
> support.  I've intentionally made support for the hypervisor part of the normal
> platform code, and the device tree is used to determine whether that code is
> active or not.
> 
> So virt/fsl seems like overkill to me.

Ok. Obviously, you still have the option to move the code later if you require
more interfaces to talk to your hypervisor, and then you can still add virt/fsl,
so no problem.

> > I'm not convinced that an ioctl interface is the right way to work with
> > device tree properties. A more natural way would be to export it as
> > a file system, or maybe as a flattened device tree blob (the latter option
> > would require changing the hypervisor interface, which might not be
> > possible).
> 
> As Scott said, this is just a front-end to the hypervisor API, and we already
> have applications that depend on the ioctl interface.  Considering that this
> driver is specific to the Freescale hypervisor, so I don't see any harm in using
> ioctls.

Good interface design is always important, one of the reasons is serving as
an example for other people doing similar drivers. There are a lot of
companies writing hypervisors and they all need interfaces like this.

> > For an ioctl, please follow the normal pattern of defining a separate
> > structure for each case, no union.
> 
> Ok.  This will break our existing applications, but it's a minor fix.

Ok. Better to break it now than accidentally break it later because of
some side-effect of a strange interface, e.g. when the union would change in
size.

	Arnd
Timur Tabi - June 3, 2011, 3:28 p.m.
Arnd Bergmann wrote:
>> > I don't think it's correct to think of a hypervisor as firmware, so I don't
>> > think drivers/firmware is better.
>> > 
>> > I'm not sure that creating virt/fsl and putting the driver in there is a good
>> > idea, because it will be the only driver in that directory.  Unlike KVM, this
>> > driver is just a collection of front-ends to our hypervisor API.  The actual
>> > hypervisor is completely separate.  That's why I put it in drivers/misc, because
>> > it's just a single driver with a miscellaneous collection of interfaces.

> Ok, fair enough. If nobody has a strong preference any other way, just make it
> drivers/firmware then.

Did you mean to say drivers/misc?
Arnd Bergmann - June 3, 2011, 3:28 p.m.
On Thursday 02 June 2011, Scott Wood wrote:
> I wanted to have the hypervisor take an update dtb (we already have special
> meta-properties for things like deletion as part of the hv config
> mechanism).  But others on the project wanted to keep it simple, and so
> get/set property it was. :-/
> 
> It's unlikely to change at this point without a real need.
> 
> As for a filesystem interface, it's not a good match either. 
> You can't iterate over anything to read out the full tree from the hv.

kexec iterates over /proc/device-tree to create a dts blob.

> You can't delete anything. 

rm, rmdir

> You can't create empty nodes.

mkdir

> The hv interface was meant to enable some specific management actions,
> rather than to provide general device tree access.  This driver is a thin
> wrapper around the management hcalls.

A file system would be a slightly more abstract interface to do the
same thing, I gues.

> There would still be other ioctls needed for starting/stopping the
> partition, etc.

Right, although you could model them as a file interface as well.
KVMfs is one example doing that.

	Arnd
Scott Wood - June 3, 2011, 4:22 p.m.
On Fri, 3 Jun 2011 17:28:43 +0200
Arnd Bergmann <arnd@arndb.de> wrote:

> On Thursday 02 June 2011, Scott Wood wrote:
> > I wanted to have the hypervisor take an update dtb (we already have special
> > meta-properties for things like deletion as part of the hv config
> > mechanism).  But others on the project wanted to keep it simple, and so
> > get/set property it was. :-/
> > 
> > It's unlikely to change at this point without a real need.
> > 
> > As for a filesystem interface, it's not a good match either. 
> > You can't iterate over anything to read out the full tree from the hv.
> 
> kexec iterates over /proc/device-tree to create a dts blob.

That's irrelevant, because we're not talking about that device tree.  We're
talking about the device tree of another hypervisor guest.

> > You can't delete anything. 
> 
> rm, rmdir
> 
> > You can't create empty nodes.
> 
> mkdir

I know how to operate a filesystem.  You can't do these operations *on
another guest's device tree through the hv interface*.

> > There would still be other ioctls needed for starting/stopping the
> > partition, etc.
> 
> Right, although you could model them as a file interface as well.
> KVMfs is one example doing that.

And what would be the benefit of this major restructuring and added
complexity?

-Scott
Arnd Bergmann - June 6, 2011, 3:42 p.m.
On Friday 03 June 2011, Timur Tabi wrote:
> Arnd Bergmann wrote:
> >> > I don't think it's correct to think of a hypervisor as firmware, so I don't
> >> > think drivers/firmware is better.
> >> > 
> >> > I'm not sure that creating virt/fsl and putting the driver in there is a good
> >> > idea, because it will be the only driver in that directory.  Unlike KVM, this
> >> > driver is just a collection of front-ends to our hypervisor API.  The actual
> >> > hypervisor is completely separate.  That's why I put it in drivers/misc, because
> >> > it's just a single driver with a miscellaneous collection of interfaces.
> 
> > Ok, fair enough. If nobody has a strong preference any other way, just make it
> > drivers/firmware then.
> 
> Did you mean to say drivers/misc?

Sorry, I misread your first sentence above. I thought you said that you prefer
drivers/firmware over virt/fsl. drivers/misc is definitely the wrong
place for this, please choose a better one. Maybe drivers/virt/ ?

	Arnd
Timur Tabi - June 6, 2011, 3:48 p.m.
Arnd Bergmann wrote:
> Sorry, I misread your first sentence above. I thought you said that you prefer
> drivers/firmware over virt/fsl. drivers/misc is definitely the wrong
> place for this, please choose a better one. Maybe drivers/virt/ ?

I'll be more than happy to go with the consensus, but I don't think it makes
sense to create a new directory just for this one, limited-use driver.  You're
the only person who's complained about drivers/misc.  I'm pretty sure that if I
put it in drivers/virt, I'll get more complaints.

I still don't understand what's wrong with drivers/misc, especially since my
driver registers as a "misc" driver.
Arnd Bergmann - June 6, 2011, 3:53 p.m.
On Friday 03 June 2011, Scott Wood wrote:
> On Fri, 3 Jun 2011 17:28:43 +0200
> Arnd Bergmann <arnd@arndb.de> wrote:
> 
> > On Thursday 02 June 2011, Scott Wood wrote:
> > > I wanted to have the hypervisor take an update dtb (we already have special
> > > meta-properties for things like deletion as part of the hv config
> > > mechanism).  But others on the project wanted to keep it simple, and so
> > > get/set property it was. :-/
> > > 
> > > It's unlikely to change at this point without a real need.
> > > 
> > > As for a filesystem interface, it's not a good match either. 
> > > You can't iterate over anything to read out the full tree from the hv.
> > 
> > kexec iterates over /proc/device-tree to create a dts blob.
> 
> That's irrelevant, because we're not talking about that device tree.  We're
> talking about the device tree of another hypervisor guest.

I understand that it's a different device tree. That doesn't mean we
can't use the same tools.

> > > You can't delete anything. 
> > 
> > rm, rmdir
> > 
> > > You can't create empty nodes.
> > 
> > mkdir
> 
> I know how to operate a filesystem.  You can't do these operations *on
> another guest's device tree through the hv interface*.

Why not? From a device driver perspective, it's not much of a difference
whether you export a device (or hypervisor, firmware, ...) setting
as an ioctl or an inode operation.

> > > There would still be other ioctls needed for starting/stopping the
> > > partition, etc.
> > 
> > Right, although you could model them as a file interface as well.
> > KVMfs is one example doing that.
> 
> And what would be the benefit of this major restructuring and added
> complexity?

I think it would be a slightly better abstraction, and the complexity
is not as big as you make it sound. I'm mainly countering your statement
that it would be a bad interface or that would not possible to do.

I'm not that opposed to having an ioctl interface for your hypervisor
interface, but I am opposed to making design choices based on
a bad representations of facts or not having considered the options
that other people suggest.

	Arnd
Arnd Bergmann - June 6, 2011, 4:03 p.m.
On Monday 06 June 2011, Timur Tabi wrote:
> Arnd Bergmann wrote:
> > Sorry, I misread your first sentence above. I thought you said that you prefer
> > drivers/firmware over virt/fsl. drivers/misc is definitely the wrong
> > place for this, please choose a better one. Maybe drivers/virt/ ?
> 
> I'll be more than happy to go with the consensus, but I don't think it makes
> sense to create a new directory just for this one, limited-use driver.  You're
> the only person who's complained about drivers/misc.  I'm pretty sure that if I
> put it in drivers/virt, I'll get more complaints.
> 
> I still don't understand what's wrong with drivers/misc, especially since my
> driver registers as a "misc" driver.

I basically think that drivers/misc is wrong for most of the stuff that is
already in there, either because the drivers actually fit into a subsystem
together with other drivers or because they contain rather horrible code.

The idea that drivers using misc_register belong into drivers/misc is a
common misconception. Traditionally they go to drivers/char, which would
still be a better choice, and most "misc" drivers are actually part of a
proper subsystem, while most drivers in drivers/misc don't have a character
device interface.

When we talked about the situation of drivers/misc and drivers/char at
one of the recent conferences, a broad consensus was that they are in
need of a maintainer, which I foolishly signed up for. Deepak wanted
to send an update to the MAINTAINERS file for this (I guess I can do
that too, since he must have forgotten about it), but the main idea is
that I'm there to say no to any driver that someone tries to add there,
unless there are really good reasons why it is actually a good place
to live for that driver.

	Arnd
Timur Tabi - June 6, 2011, 4:09 p.m.
Arnd Bergmann wrote:
> When we talked about the situation of drivers/misc and drivers/char at
> one of the recent conferences, a broad consensus was that they are in
> need of a maintainer, which I foolishly signed up for. Deepak wanted
> to send an update to the MAINTAINERS file for this (I guess I can do
> that too, since he must have forgotten about it), but the main idea is
> that I'm there to say no to any driver that someone tries to add there,
> unless there are really good reasons why it is actually a good place
> to live for that driver.

Can you give me an example of a driver that *does* belong in drivers/misc?
Frankly, I just don't see what's wrong with a repository of various drivers that
don't really belong anywhere else.

And what about my concern that my driver will be the only one in drivers/virt?
Arnd Bergmann - June 6, 2011, 4:24 p.m.
On Monday 06 June 2011, Timur Tabi wrote:
> Arnd Bergmann wrote:
> > When we talked about the situation of drivers/misc and drivers/char at
> > one of the recent conferences, a broad consensus was that they are in
> > need of a maintainer, which I foolishly signed up for. Deepak wanted
> > to send an update to the MAINTAINERS file for this (I guess I can do
> > that too, since he must have forgotten about it), but the main idea is
> > that I'm there to say no to any driver that someone tries to add there,
> > unless there are really good reasons why it is actually a good place
> > to live for that driver.
> 
> Can you give me an example of a driver that does belong in drivers/misc?

Not really.

> Frankly, I just don't see what's wrong with a repository of various drivers that
> don't really belong anywhere else.

The main problem is that for the most part it's a pile of crap that
nobody wants to look at, so drivers getting added there see basically
no real review.

> And what about my concern that my driver will be the only one in drivers/virt?

I have no doubt that more of these will come. Chris Metcalf is currently
looking for a home for his tilera hypervisor drivers, and we have the
microsoft hyperv drivers in drivers/staging, so they will hopefully
move to a proper place later. We also have similar drivers in other
places, e.g. drivers/ps3/ps3-sys-manager.c, drivers/s390/char/vmcp.c
or parts of drivers/xen.

	Arnd
Timur Tabi - June 6, 2011, 4:27 p.m.
Arnd Bergmann wrote:
> I have no doubt that more of these will come. Chris Metcalf is currently
> looking for a home for his tilera hypervisor drivers, and we have the
> microsoft hyperv drivers in drivers/staging, so they will hopefully
> move to a proper place later. We also have similar drivers in other
> places, e.g. drivers/ps3/ps3-sys-manager.c, drivers/s390/char/vmcp.c
> or parts of drivers/xen.

Alright, drivers/virt it is.  I'll post a v4 once everyone else has had a chance
to comment on v3.
Scott Wood - June 6, 2011, 6:15 p.m.
On Mon, 6 Jun 2011 17:53:09 +0200
Arnd Bergmann <arnd@arndb.de> wrote:

> On Friday 03 June 2011, Scott Wood wrote:
> > On Fri, 3 Jun 2011 17:28:43 +0200
> > Arnd Bergmann <arnd@arndb.de> wrote:
> > 
> > > On Thursday 02 June 2011, Scott Wood wrote:
> > > > I wanted to have the hypervisor take an update dtb (we already have special
> > > > meta-properties for things like deletion as part of the hv config
> > > > mechanism).  But others on the project wanted to keep it simple, and so
> > > > get/set property it was. :-/
> > > > 
> > > > It's unlikely to change at this point without a real need.
> > > > 
> > > > As for a filesystem interface, it's not a good match either. 
> > > > You can't iterate over anything to read out the full tree from the hv.
> > > 
> > > kexec iterates over /proc/device-tree to create a dts blob.
> > 
> > That's irrelevant, because we're not talking about that device tree.  We're
> > talking about the device tree of another hypervisor guest.
> 
> I understand that it's a different device tree. That doesn't mean we
> can't use the same tools.

The kernel does not have the same sort of access to this tree.

> > > > You can't delete anything. 
> > > 
> > > rm, rmdir
> > > 
> > > > You can't create empty nodes.
> > > 
> > > mkdir
> > 
> > I know how to operate a filesystem.  You can't do these operations *on
> > another guest's device tree through the hv interface*.
> 
> Why not?

Because the hypervisor does not support it.  It provides only getprop and
setprop.  I think you took my "you can't do that" statements to be a
statement about limitations of using a filesystem interfcae -- quite the
opposite, I was saying the hv functionality is too limited to support a
filesystem interface with normal semantics.

> > And what would be the benefit of this major restructuring and added
> > complexity?
> 
> I think it would be a slightly better abstraction, and the complexity
> is not as big as you make it sound. I'm mainly countering your statement
> that it would be a bad interface or that would not possible to do.
> 
> I'm not that opposed to having an ioctl interface for your hypervisor
> interface, but I am opposed to making design choices based on
> a bad representations of facts or not having considered the options
> that other people suggest.

I don't really see how a filesystem is a better abstraction for a wrapper
around a procedural interface.  A somewhat better argument is that ioctls
are a pain, and Linux doesn't have a better way to expose a procedural
interface, that doesn't require a wrapper program -- though as the wrapper
already exists, and the fs interface would probably be sufficiently awkward
that people would still use a wrapper, that doesn't buy us too much either.

This is not being proposed as any sort of standard kernel API, just a way
for userspace to get access to implementation-specific hcalls.
Implementation-specific backdoor is practically the definition of ioctl. :-)

I would be interested to see a concrete proposal for what this would look
like as a filesystem, though, based on the actual operations that are
available.  How would you deal with getting all the parameters in,
performing the operation, and getting the results back?  What about when
multiple processes are doing this at the same time?  What would the memcpy
hcall look like?

-Scott
Arnd Bergmann - June 6, 2011, 7:48 p.m.
On Monday 06 June 2011 20:15:16 Scott Wood wrote:
> On Mon, 6 Jun 2011 17:53:09 +0200
> Arnd Bergmann <arnd@arndb.de> wrote:

> > > > > You can't delete anything. 
> > > > 
> > > > rm, rmdir
> > > > 
> > > > > You can't create empty nodes.
> > > > 
> > > > mkdir
> > > 
> > > I know how to operate a filesystem.  You can't do these operations *on
> > > another guest's device tree through the hv interface*.
> > 
> > Why not?
> 
> Because the hypervisor does not support it.  It provides only getprop and
> setprop.  I think you took my "you can't do that" statements to be a
> statement about limitations of using a filesystem interfcae -- quite the
> opposite, I was saying the hv functionality is too limited to support a
> filesystem interface with normal semantics.

Ok, sorry for the confusion on my part. It makes a lot more sense now.

> > > And what would be the benefit of this major restructuring and added
> > > complexity?
> > 
> > I think it would be a slightly better abstraction, and the complexity
> > is not as big as you make it sound. I'm mainly countering your statement
> > that it would be a bad interface or that would not possible to do.
> > 
> > I'm not that opposed to having an ioctl interface for your hypervisor
> > interface, but I am opposed to making design choices based on
> > a bad representations of facts or not having considered the options
> > that other people suggest.
> 
> I don't really see how a filesystem is a better abstraction for a wrapper
> around a procedural interface.  A somewhat better argument is that ioctls
> are a pain, and Linux doesn't have a better way to expose a procedural
> interface, that doesn't require a wrapper program -- though as the wrapper
> already exists, and the fs interface would probably be sufficiently awkward
> that people would still use a wrapper, that doesn't buy us too much either.
> 
> This is not being proposed as any sort of standard kernel API, just a way
> for userspace to get access to implementation-specific hcalls.
> Implementation-specific backdoor is practically the definition of ioctl. :-)
> 
> I would be interested to see a concrete proposal for what this would look
> like as a filesystem, though, based on the actual operations that are
> available.  How would you deal with getting all the parameters in,
> performing the operation, and getting the results back?  What about when
> multiple processes are doing this at the same time?  What would the memcpy
> hcall look like?

In fs/libfs.c, we have support for "simple transaction files", where you
write input parameters into a file and then read it back to get the
results. They are very rarely used, but can serve as a way to replace ioctls
with file operations where that makes sense.

For an inter-partition memcpy, a better interface might be a pipe
representation: You have a namespace that is shared between two
partitions, so each side can create directory entries with arbitrary
names in one of the subdirectories of the file system. Then you can
open the file for reading on one side and writing on the other side
and when both sides have started the respective operation, the hypervisor
will copy data. The possible ways to use that functionality are countless.

Similarly, you can mmap a file to get inter-partition shared memory.

	Arnd
Chris Metcalf - June 6, 2011, 9:01 p.m.
On 6/6/2011 12:24 PM, Arnd Bergmann wrote:
> On Monday 06 June 2011, Timur Tabi wrote:.
>> And what about my concern that my driver will be the only one in drivers/virt?
> I have no doubt that more of these will come. Chris Metcalf is currently
> looking for a home for his tilera hypervisor drivers, and we have the
> microsoft hyperv drivers in drivers/staging, so they will hopefully
> move to a proper place later. We also have similar drivers in other
> places, e.g. drivers/ps3/ps3-sys-manager.c, drivers/s390/char/vmcp.c
> or parts of drivers/xen.

It might help if someone (Arnd?) wanted to propose a statement of what
drivers/virt was really for.  If it's for any Linux driver that upcalls to
a hypervisor for any reason, then the Tilera paravirtualized drivers fit in
well.  If it's intended more for drivers that guests running under a
hypervisor can use to talk to the hypervisor itself (e.g. managing
notifications that a hypervisor delivers to a guest to cause it to shut
down or take other actions), then it doesn't seem like the Tilera
paravirtualized device drivers belong there, since they're just using the
Tilera hypervisor synchronously to do I/O or get/set device and driver state.
Konrad Rzeszutek Wilk - June 6, 2011, 9:23 p.m.
On Mon, Jun 06, 2011 at 05:01:36PM -0400, Chris Metcalf wrote:
> On 6/6/2011 12:24 PM, Arnd Bergmann wrote:
> > On Monday 06 June 2011, Timur Tabi wrote:.
> >> And what about my concern that my driver will be the only one in drivers/virt?
> > I have no doubt that more of these will come. Chris Metcalf is currently
> > looking for a home for his tilera hypervisor drivers, and we have the
> > microsoft hyperv drivers in drivers/staging, so they will hopefully
> > move to a proper place later. We also have similar drivers in other
> > places, e.g. drivers/ps3/ps3-sys-manager.c, drivers/s390/char/vmcp.c
> > or parts of drivers/xen.
> 
> It might help if someone (Arnd?) wanted to propose a statement of what
> drivers/virt was really for.  If it's for any Linux driver that upcalls to

Was for? I am not seeing it in v3.0-rc2?

> a hypervisor for any reason, then the Tilera paravirtualized drivers fit in
> well.  If it's intended more for drivers that guests running under a
> hypervisor can use to talk to the hypervisor itself (e.g. managing

I believe that the code that deals with specific subsystem (so block API
for example) would reside in subsystem directory (so drivers/block would have
your virtualization block driver). This allows the maintainer of block
to make sure your driver is OK.

> notifications that a hypervisor delivers to a guest to cause it to shut
> down or take other actions), then it doesn't seem like the Tilera

That looks to be arch/<x>/tilera/virt/ candidate?

> paravirtualized device drivers belong there, since they're just using the
> Tilera hypervisor synchronously to do I/O or get/set device and driver state.

Well, I/O sounds like block API or network API. But then you are also
doing management ioctl - which implies "drivers". "drivers/tilera" does not
work?

> 
> -- 
> Chris Metcalf, Tilera Corp.
> http://www.tilera.com
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
Chris Metcalf - June 6, 2011, 11:04 p.m.
For context, the most recent patch for the tile driver in question is here:

https://patchwork.kernel.org/patch/843892/

On 6/6/2011 5:23 PM, Konrad Rzeszutek Wilk wrote:
> On Mon, Jun 06, 2011 at 05:01:36PM -0400, Chris Metcalf wrote:
>> On 6/6/2011 12:24 PM, Arnd Bergmann wrote:
>>> On Monday 06 June 2011, Timur Tabi wrote:.
>>>> And what about my concern that my driver will be the only one in drivers/virt?
>>> I have no doubt that more of these will come. Chris Metcalf is currently
>>> looking for a home for his tilera hypervisor drivers, and we have the
>>> microsoft hyperv drivers in drivers/staging, so they will hopefully
>>> move to a proper place later. We also have similar drivers in other
>>> places, e.g. drivers/ps3/ps3-sys-manager.c, drivers/s390/char/vmcp.c
>>> or parts of drivers/xen.
>> It might help if someone (Arnd?) wanted to propose a statement of what
>> drivers/virt was really for.  If it's for any Linux driver that upcalls to
> Was for? I am not seeing it in v3.0-rc2?

Sorry, maybe a questionable idiom, but please read past tense in the quoted
text as meaning present tense :-)

>> a hypervisor for any reason, then the Tilera paravirtualized drivers fit in
>> well.  If it's intended more for drivers that guests running under a
>> hypervisor can use to talk to the hypervisor itself (e.g. managing
> I believe that the code that deals with specific subsystem (so block API
> for example) would reside in subsystem directory (so drivers/block would have
> your virtualization block driver). This allows the maintainer of block
> to make sure your driver is OK.

Sure, makes sense.  The new push (as I understand it) is to group primarily
by function, not by bus or architecture.

>> notifications that a hypervisor delivers to a guest to cause it to shut
>> down or take other actions), then it doesn't seem like the Tilera
> That looks to be arch/<x>/tilera/virt/ candidate?

Arnd, among others, has suggested that all drivers live in "drivers"
somewhere, so "arch/tile" may not be the best place.  (To be fair, I
originally had this driver in arch/tile/drivers/, so your idea is certainly
reasonable!)

>> paravirtualized device drivers belong there, since they're just using the
>> Tilera hypervisor synchronously to do I/O or get/set device and driver state.
> Well, I/O sounds like block API or network API. But then you are also
> doing management ioctl - which implies "drivers". "drivers/tilera" does not
> work?

There is certainly precedent for drivers that don't fit cleanly into an
existing category to go in drivers/<arch>, e.g. drivers/s390,
drivers/parisc, etc.  There is also drivers/platform/x86, though that seems
to be for the bus "platform drivers" rather than just a random character
driver like the one in question.

I don't have a particular opinion here; I'm just hoping to develop enough
consensus that I can ask Linus to pull the driver without generating
controversy :-)
Arnd Bergmann - June 7, 2011, 7:08 a.m.
On Tuesday 07 June 2011 01:04:40 Chris Metcalf wrote:
> For context, the most recent patch for the tile driver in question is here:
> 
> https://patchwork.kernel.org/patch/843892/
> 
> On 6/6/2011 5:23 PM, Konrad Rzeszutek Wilk wrote:
> > On Mon, Jun 06, 2011 at 05:01:36PM -0400, Chris Metcalf wrote:
>
> >> a hypervisor for any reason, then the Tilera paravirtualized drivers fit in
> >> well.  If it's intended more for drivers that guests running under a
> >> hypervisor can use to talk to the hypervisor itself (e.g. managing
> > I believe that the code that deals with specific subsystem (so block API
> > for example) would reside in subsystem directory (so drivers/block would have
> > your virtualization block driver). This allows the maintainer of block
> > to make sure your driver is OK.
> 
> Sure, makes sense.  The new push (as I understand it) is to group primarily
> by function, not by bus or architecture.

Yes.

> >> notifications that a hypervisor delivers to a guest to cause it to shut
> >> down or take other actions), then it doesn't seem like the Tilera
> > That looks to be arch/<x>/tilera/virt/ candidate?
> 
> Arnd, among others, has suggested that all drivers live in "drivers"
> somewhere, so "arch/tile" may not be the best place.  (To be fair, I
> originally had this driver in arch/tile/drivers/, so your idea is certainly
> reasonable!)
> 
> >> paravirtualized device drivers belong there, since they're just using the
> >> Tilera hypervisor synchronously to do I/O or get/set device and driver state.
> > Well, I/O sounds like block API or network API. But then you are also
> > doing management ioctl - which implies "drivers". "drivers/tilera" does not
> > work?
> 
> There is certainly precedent for drivers that don't fit cleanly into an
> existing category to go in drivers/<arch>, e.g. drivers/s390,
> drivers/parisc, etc.  There is also drivers/platform/x86, though that seems
> to be for the bus "platform drivers" rather than just a random character
> driver like the one in question.
> 
> I don't have a particular opinion here; I'm just hoping to develop enough
> consensus that I can ask Linus to pull the driver without generating
> controversy :-)

The drivers/s390 and drivers/parisc directories are from a distant past,
we should not add new ones like them. drivers/platform is controversial,
but I think it's ok for stuff that manages platform specific quirks.
The main problem with that is that it doesn't work for embedded systems,
by extension every ARM specific driver could go into drivers/platform/...
and we don't want that.

You can probably argue that the tile drivers do fit in here as long as
they are specific to the hypervisor and not to some SOC specific hardware.

	Arnd
Chris Metcalf - June 7, 2011, 4:49 p.m.
On 6/7/2011 3:08 AM, Arnd Bergmann wrote:
> On Tuesday 07 June 2011 01:04:40 Chris Metcalf wrote:
>> There is certainly precedent for drivers that don't fit cleanly into an
>> existing category to go in drivers/<arch>, e.g. drivers/s390,
>> drivers/parisc, etc.  There is also drivers/platform/x86, though that seems
>> to be for the bus "platform drivers" rather than just a random character
>> driver like the one in question.
> The drivers/s390 and drivers/parisc directories are from a distant past,
> we should not add new ones like them. drivers/platform is controversial,
> but I think it's ok for stuff that manages platform specific quirks.
> The main problem with that is that it doesn't work for embedded systems,
> by extension every ARM specific driver could go into drivers/platform/...
> and we don't want that.
>
> You can probably argue that the tile drivers do fit in here as long as
> they are specific to the hypervisor and not to some SOC specific hardware.

Can you clarify that?   I think you're contrasting something like an ARM
core that was licensed and put in a SoC by some random vendor, and you
could have an endless stream of drivers for that case.  The Tilera core
isn't being licensed; it's sold more like an Intel chip with a fixed set of
interfaces available only from Tilera.  The particular interface in
question here is SPI, and the core itself knows how to boot the chip over
SPI by finding an SPI ROM and reading the boot stream out of it directly
after power-up.

So does that match with your model of "drivers/platform/tile"?  Maybe we
have a winner!  :-)
Arnd Bergmann - June 7, 2011, 7:16 p.m.
On Tuesday 07 June 2011 18:49:02 Chris Metcalf wrote:
> > You can probably argue that the tile drivers do fit in here as long as
> > they are specific to the hypervisor and not to some SOC specific hardware.
> 
> Can you clarify that?   I think you're contrasting something like an ARM
> core that was licensed and put in a SoC by some random vendor, and you
> could have an endless stream of drivers for that case.  The Tilera core
> isn't being licensed; it's sold more like an Intel chip with a fixed set of
> interfaces available only from Tilera.  The particular interface in
> question here is SPI, and the core itself knows how to boot the chip over
> SPI by finding an SPI ROM and reading the boot stream out of it directly
> after power-up.
> 
> So does that match with your model of "drivers/platform/tile"?  Maybe we
> have a winner!  :-)

I'm not really against drivers/platform/tile for this, the only potential
problem that I see with this is that having more stuff in drivers/platform
might lead to having even more other stuff in there that should really
go into another place.

Obviously, if the device is a raw SPI host, the driver should actually go
into drivers/spi/spi_tile.c rather than drivers/platform/tile/spi.c.

For the spi flash driver that goes through the hypervisor abstraction,
I think drivers/virt/tile would be better than driver/platform/tile,
but we should really have a new "abstract flash character driver" subsystem
for that.

	Arnd
Timur Tabi - June 7, 2011, 7:20 p.m.
Arnd Bergmann wrote:
> For the spi flash driver that goes through the hypervisor abstraction,
> I think drivers/virt/tile would be better than driver/platform/tile,
> but we should really have a new "abstract flash character driver" subsystem
> for that.

Why should it matter that the SPI flash driver goes through the hypervisor
abstration?  One of the patches in this patchset is a TTY driver that goes
through the Freescale hypervisor.  I put the drivers in drivers/tty.
Arnd Bergmann - June 7, 2011, 7:34 p.m.
On Tuesday 07 June 2011 21:20:50 Timur Tabi wrote:
> Arnd Bergmann wrote:
> > For the spi flash driver that goes through the hypervisor abstraction,
> > I think drivers/virt/tile would be better than driver/platform/tile,
> > but we should really have a new "abstract flash character driver" subsystem
> > for that.
> 
> Why should it matter that the SPI flash driver goes through the hypervisor
> abstration?  One of the patches in this patchset is a TTY driver that goes
> through the Freescale hypervisor.  I put the drivers in drivers/tty.

The driver in question is for a hypervisor that abstracts the flash memory
using a read/write interface. There is no way you can represent that as
a SPI host driver.

	Arnd

Patch

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index d80dcde..3e016b3 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -216,6 +216,13 @@  config ENCLOSURE_SERVICES
 	  driver (SCSI/ATA) which supports enclosures
 	  or a SCSI enclosure device (SES) to use these services.
 
+config FSL_HV_MANAGER
+	tristate "Freescale hypervisor management driver"
+	depends on FSL_SOC
+	help
+	  This driver allows applications to communicate with the Freescale
+	  Hypervisor.
+
 config SGI_XP
 	tristate "Support communication between SGI SSIs"
 	depends on NET
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 848e846..d93bd76 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -20,6 +20,7 @@  obj-$(CONFIG_SENSORS_BH1770)	+= bh1770glc.o
 obj-$(CONFIG_SENSORS_APDS990X)	+= apds990x.o
 obj-$(CONFIG_SGI_IOC4)		+= ioc4.o
 obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
+obj-$(CONFIG_FSL_HV_MANAGER)	+= fsl_hypervisor.o
 obj-$(CONFIG_KGDB_TESTS)	+= kgdbts.o
 obj-$(CONFIG_SGI_XP)		+= sgi-xp/
 obj-$(CONFIG_SGI_GRU)		+= sgi-gru/
diff --git a/drivers/misc/fsl_hypervisor.c b/drivers/misc/fsl_hypervisor.c
new file mode 100644
index 0000000..a03aa7b
--- /dev/null
+++ b/drivers/misc/fsl_hypervisor.c
@@ -0,0 +1,941 @@ 
+/** @file
+ * Freescale Hypervisor Management Driver
+ *
+ * This driver contains functions to support the Freescale hypervisor.
+ */
+/* Copyright (C) 2008-2010 Freescale Semiconductor, Inc.
+ * Author: Timur Tabi <timur@freescale.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/of.h>
+#include <linux/reboot.h>
+#include <linux/uaccess.h>
+#include <linux/notifier.h>
+
+#include <linux/io.h>
+#include <asm/fsl_hcalls.h>
+
+#include <linux/fsl_hypervisor.h>
+
+static BLOCKING_NOTIFIER_HEAD(failover_subscribers);
+
+/**
+ * ioctl_restart: ioctl interface for FSL_HV_IOCTL_PARTITION_RESTART
+ *
+ * Restart a running partition
+ */
+static long ioctl_restart(struct fsl_hv_ioctl_restart __user *p)
+{
+	struct fsl_hv_ioctl_restart param;
+
+	/* Get the parameters from the user */
+	if (copy_from_user(&param, p, sizeof(struct fsl_hv_ioctl_restart)))
+		return -EFAULT;
+
+	param.ret = fh_partition_restart(param.partition);
+
+	if (copy_to_user(&p->ret, &param.ret, sizeof(__u32)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * ioctl_status: ioctl interface for FSL_HV_IOCTL_PARTITION_STATUS
+ *
+ * Query the status of a partition
+ */
+static long ioctl_status(struct fsl_hv_ioctl_status __user *p)
+{
+	struct fsl_hv_ioctl_status param;
+	u32 status;
+
+	/* Get the parameters from the user */
+	if (copy_from_user(&param, p, sizeof(struct fsl_hv_ioctl_status)))
+		return -EFAULT;
+
+	param.ret = fh_partition_get_status(param.partition, &status);
+	if (!param.ret)
+		param.status = status;
+
+	if (copy_to_user(p, &param, sizeof(struct fsl_hv_ioctl_status)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * ioctl_start: ioctl interface for FSL_HV_IOCTL_PARTITION_START
+ *
+ * Start a stopped partition.
+ */
+static long ioctl_start(struct fsl_hv_ioctl_start __user *p)
+{
+	struct fsl_hv_ioctl_start param;
+
+	/* Get the parameters from the user */
+	if (copy_from_user(&param, p, sizeof(struct fsl_hv_ioctl_start)))
+		return -EFAULT;
+
+	param.ret = fh_partition_start(param.partition, param.entry_point,
+				       param.load);
+
+	if (copy_to_user(&p->ret, &param.ret, sizeof(__u32)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * ioctl_stop: ioctl interface for FSL_HV_IOCTL_PARTITION_STOP
+ *
+ * Stop a running partition
+ */
+static long ioctl_stop(struct fsl_hv_ioctl_stop __user *p)
+{
+	struct fsl_hv_ioctl_stop param;
+
+	/* Get the parameters from the user */
+	if (copy_from_user(&param, p, sizeof(struct fsl_hv_ioctl_stop)))
+		return -EFAULT;
+
+	param.ret = fh_partition_stop(param.partition);
+
+	if (copy_to_user(&p->ret, &param.ret, sizeof(__u32)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * ioctl_memcpy: ioctl interface for FSL_HV_IOCTL_MEMCPY
+ *
+ * The FH_MEMCPY hypercall takes an array of address/address/size structures
+ * to represent the data being copied.  As a convenience to the user, this
+ * ioctl takes a user-create buffer and a pointer to a guest physically
+ * contiguous buffer in the remote partition, and creates the
+ * address/address/size array for the hypercall.
+ */
+static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
+{
+	struct fsl_hv_ioctl_memcpy param;
+
+	struct page **pages = NULL;
+	void *sg_list_unaligned = NULL;
+	struct fh_sg_list *sg_list = NULL;
+
+	unsigned int nr_pages;
+	unsigned long lb_offset; /* Offset within a page of the local buffer */
+
+	unsigned int i;
+	long ret = 0;
+	phys_addr_t remote_paddr; /* The next address in the remote buffer */
+	uint32_t count; /* The number of bytes left to copy */
+
+	/* Get the parameters from the user */
+	if (copy_from_user(&param, p, sizeof(struct fsl_hv_ioctl_memcpy)))
+		return -EFAULT;
+
+	/* One partition must be local, the other must be remote.  In other
+	   words, if source and target are both -1, or are both not -1, then
+	   return an error. */
+	if ((param.source == -1) == (param.target == -1))
+		return -EINVAL;
+
+	/*
+	 * The array of pages returned by get_user_pages() covers only
+	 * page-aligned memory.  Since the user buffer is probably not
+	 * page-aligned, we need to handle the discrepancy.
+	 *
+	 * We calculate the offset within a page of the S/G list, and make
+	 * adjustments accordingly.  This will result in a page list that looks
+	 * like this:
+	 *
+	 *      ----    <-- first page starts before the buffer
+	 *     |    |
+	 *     |////|-> ----
+	 *     |////|  |    |
+	 *      ----   |    |
+	 *             |    |
+	 *      ----   |    |
+	 *     |////|  |    |
+	 *     |////|  |    |
+	 *     |////|  |    |
+	 *      ----   |    |
+	 *             |    |
+	 *      ----   |    |
+	 *     |////|  |    |
+	 *     |////|  |    |
+	 *     |////|  |    |
+	 *      ----   |    |
+	 *             |    |
+	 *      ----   |    |
+	 *     |////|  |    |
+	 *     |////|-> ----
+	 *     |    |   <-- last page ends after the buffer
+	 *      ----
+	 *
+	 * The distance between the start of the first page and the start of the
+	 * buffer is lb_offset.  The hashed (///) areas are the parts of the
+	 * page list that contain the actual buffer.
+	 *
+	 * The advantage of this approach is that the number of pages is
+	 * equal to the number of entries in the S/G list that we give to the
+	 * hypervisor.
+	 */
+	lb_offset = param.local_vaddr & (PAGE_SIZE - 1);
+	nr_pages = (param.count + lb_offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	/* Allocate the buffers we need */
+
+	/* pages is an array of struct page pointers that's initialized by
+	   get_user_pages() */
+	pages = kzalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		pr_debug("fsl-hv: could not allocate page list\n");
+		return -ENOMEM;
+	}
+
+	/* sg_list is the list of fh_sg_list objects that we pass to the
+	   hypervisor */
+	sg_list_unaligned = kmalloc(nr_pages * sizeof(struct fh_sg_list) +
+		sizeof(struct fh_sg_list) - 1, GFP_KERNEL);
+	if (!sg_list_unaligned) {
+		pr_debug("fsl-hv: could not allocate S/G list\n");
+		return -ENOMEM;
+	}
+	sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list));
+
+	/* Get the physical addresses of the source buffer */
+	down_read(&current->mm->mmap_sem);
+	ret = get_user_pages(current, current->mm,
+		param.local_vaddr - lb_offset, nr_pages,
+		(param.source == -1) ? READ : WRITE,
+		0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (ret != nr_pages) {
+		/* get_user_pages() failed */
+		pr_debug("fsl-hv: could not lock source buffer\n");
+		ret = -EACCES;
+		goto exit;
+	}
+
+	/* reset ret here */
+	ret = 0;
+
+	/* Build the fh_sg_list[] array.  The first page is special
+	   because it's misaligned.*/
+	if (param.source == -1) {
+		sg_list[0].source = page_to_phys(pages[0]) + lb_offset;
+		sg_list[0].target = param.remote_paddr;
+	} else {
+		sg_list[0].source = param.remote_paddr;
+		sg_list[0].target = page_to_phys(pages[0]) + lb_offset;
+	}
+	sg_list[0].size = min_t(uint64_t, param.count, PAGE_SIZE - lb_offset);
+
+	remote_paddr = param.remote_paddr + sg_list[0].size;
+	count = param.count - sg_list[0].size;
+
+	for (i = 1; i < nr_pages; i++) {
+		if (param.source == -1) {
+			/* local to remote */
+			sg_list[i].source = page_to_phys(pages[i]);
+			sg_list[i].target = remote_paddr;
+		} else {
+			/* remote to local */
+			sg_list[i].source = remote_paddr;
+			sg_list[i].target = page_to_phys(pages[i]);
+		}
+		sg_list[i].size = min_t(uint64_t, count, PAGE_SIZE);
+
+		remote_paddr += sg_list[i].size;
+		count -= sg_list[i].size;
+	}
+
+	param.ret = fh_partition_memcpy(param.source, param.target,
+		virt_to_phys(sg_list), nr_pages);
+
+exit:
+	if (pages) {
+		for (i = 0; i < nr_pages; i++)
+			if (pages[i])
+				page_cache_release(pages[i]);
+	}
+
+	kfree(sg_list_unaligned);
+	kfree(pages);
+
+	if (!ret)
+		if (copy_to_user(&p->ret, &param.ret, sizeof(__u32)))
+			return -EFAULT;
+
+	return ret;
+}
+
+/**
+ * ioctl_doorbell: ioctl interface for FSL_HV_IOCTL_DOORBELL
+ *
+ * Ring a doorbell
+ */
+static long ioctl_doorbell(struct fsl_hv_ioctl_doorbell __user *p)
+{
+	struct fsl_hv_ioctl_doorbell param;
+
+	/* Get the parameters from the user */
+	if (copy_from_user(&param, p, sizeof(struct fsl_hv_ioctl_doorbell)))
+		return -EFAULT;
+
+	param.ret = ev_doorbell_send(param.doorbell);
+
+	if (copy_to_user(&p->ret, &param.ret, sizeof(__u32)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static char *strdup_from_user(const char __user *ustr, size_t max)
+{
+	size_t len;
+	char *str;
+
+	len = strnlen_user(ustr, max);
+	if (len > max)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(str, ustr, len))
+		return ERR_PTR(-EFAULT);
+
+	return str;
+}
+
+static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set)
+{
+	struct fsl_hv_ioctl_prop param;
+	char __user *upath, *upropname;
+	void __user *upropval;
+	char *path = NULL, *propname = NULL;
+	void *propval = NULL;
+	int ret = 0;
+
+	/* Get the parameters from the user */
+	if (copy_from_user(&param, p, sizeof(struct fsl_hv_ioctl_prop)))
+		return -EFAULT;
+
+	upath = (char __user *)(uintptr_t)param.path;
+	upropname = (char __user *)(uintptr_t)param.propname;
+	upropval = (void __user *)(uintptr_t)param.propval;
+
+	path = strdup_from_user(upath, FH_DTPROP_MAX_PATHLEN);
+	if (IS_ERR(path)) {
+		ret = PTR_ERR(path);
+		goto out;
+	}
+
+	propname = strdup_from_user(upropname, FH_DTPROP_MAX_PATHLEN);
+	if (IS_ERR(propname)) {
+		ret = PTR_ERR(propname);
+		goto out;
+	}
+
+	if (param.proplen > FH_DTPROP_MAX_PROPLEN) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	propval = kmalloc(param.proplen, GFP_KERNEL);
+	if (!propval) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (set) {
+		if (copy_from_user(propval, upropval, param.proplen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		param.ret = fh_partition_set_dtprop(param.handle,
+						    virt_to_phys(path),
+						    virt_to_phys(propname),
+						    virt_to_phys(propval),
+						    param.proplen);
+	} else {
+		param.ret = fh_partition_get_dtprop(param.handle,
+						    virt_to_phys(path),
+						    virt_to_phys(propname),
+						    virt_to_phys(propval),
+						    &param.proplen);
+
+		if (param.ret == 0) {
+			if (copy_to_user(upropval, propval, param.proplen) ||
+			    put_user(param.proplen, &p->proplen)) {
+				ret = -EFAULT;
+				goto out;
+			}
+		}
+	}
+
+	if (put_user(param.ret, &p->ret))
+		ret = -EFAULT;
+
+out:
+	kfree(path);
+	kfree(propval);
+	kfree(propname);
+
+	return ret;
+}
+
+/**
+ * fsl_hv_ioctl: ioctl main entry point
+ */
+static long fsl_hv_ioctl(struct file *file, unsigned int cmd,
+			 unsigned long argaddr)
+{
+	union fsl_hv_ioctl_param __user *arg =
+		(union fsl_hv_ioctl_param __user *)argaddr;
+	long ret;
+
+	/* Make sure the application is called the right driver. */
+	if (_IOC_TYPE(cmd) != 0) {
+		pr_debug("fsl-hv: ioctl type %u should be 0\n", _IOC_TYPE(cmd));
+		return -EINVAL;
+	}
+
+	/* Make sure the application set the direction flag correctly. */
+	if (_IOC_DIR(cmd) != (_IOC_READ | _IOC_WRITE)) {
+		pr_debug("fsl-hv: ioctl direction should be _IOWR\n");
+		return -EINVAL;
+	}
+
+	/* Make sure the application is passing the right structure to us. */
+	if (_IOC_SIZE(cmd) < sizeof(union fsl_hv_ioctl_param)) {
+		pr_debug("fsl-hv: ioctl size %u is too small (should be %u)\n",
+			 _IOC_SIZE(cmd), sizeof(union fsl_hv_ioctl_param));
+		return -EINVAL;
+	}
+
+	switch (_IOC_NR(cmd)) {
+	case FSL_HV_IOCTL_PARTITION_RESTART:
+		ret = ioctl_restart(&arg->restart);
+		break;
+	case FSL_HV_IOCTL_PARTITION_GET_STATUS:
+		ret = ioctl_status(&arg->status);
+		break;
+	case FSL_HV_IOCTL_PARTITION_START:
+		ret = ioctl_start(&arg->start);
+		break;
+	case FSL_HV_IOCTL_PARTITION_STOP:
+		ret = ioctl_stop(&arg->stop);
+		break;
+	case FSL_HV_IOCTL_MEMCPY:
+		ret = ioctl_memcpy(&arg->memcpy);
+		break;
+	case FSL_HV_IOCTL_DOORBELL:
+		ret = ioctl_doorbell(&arg->doorbell);
+		break;
+	case FSL_HV_IOCTL_GETPROP:
+		ret = ioctl_dtprop(&arg->prop, 0);
+		break;
+	case FSL_HV_IOCTL_SETPROP:
+		ret = ioctl_dtprop(&arg->prop, 1);
+		break;
+	default:
+		pr_debug("fsl-hv: unknown ioctl %u\n", cmd);
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+
+	return ret;
+}
+
+/* Linked list of processes that have us open */
+struct list_head db_list;
+
+/* spinlock for db_list */
+static DEFINE_SPINLOCK(db_list_lock);
+
+/* The size of the doorbell event queue.  This must be a power of two. */
+#define QSIZE	16
+
+/* Returns the next head/tail pointer, wrapping around the queue if necessary */
+#define nextp(x) (((x) + 1) & (QSIZE - 1))
+
+/* Per-open data structure */
+struct doorbell_queue {
+	struct list_head list;
+	spinlock_t lock;
+	wait_queue_head_t wait;
+	unsigned int head;
+	unsigned int tail;
+	uint32_t q[QSIZE];
+};
+
+/* Linked list of ISRs that we registered */
+struct list_head isr_list;
+
+/* Per-ISR data structure */
+struct doorbell_isr {
+	struct list_head list;
+	unsigned int irq;
+	uint32_t doorbell;	/* The doorbell handle */
+	uint32_t partition;	/* The partition handle, if used */
+	struct work_struct work;
+};
+
+/**
+ * fsl_hv_isr - interrupt handler for all doorbells
+ * @param irq - the IRQ (a.k.a. receive handle)
+ *
+ * We use the same interrupt handler for all doorbells.  Whenever a doorbell
+ * is rung, and we receive an interrupt, we just put the handle for that
+ * doorbell (passed to us as *data) into all of the queues.
+ *
+ */
+static irqreturn_t fsl_hv_isr(int irq, void *data)
+{
+	struct doorbell_queue *dbq;
+	unsigned long flags;
+
+	/* Prevent another core from modifying db_list */
+	spin_lock_irqsave(&db_list_lock, flags);
+
+	list_for_each_entry(dbq, &db_list, list) {
+		if (dbq->head != nextp(dbq->tail)) {
+			dbq->q[dbq->tail] = (uint32_t) (uintptr_t) data;
+			/* This memory barrier eliminates the need to grab
+			 * the spinlock for dbq.
+			 */
+			smp_wmb();
+			dbq->tail = nextp(dbq->tail);
+			wake_up_interruptible(&dbq->wait);
+		}
+	}
+
+	spin_unlock_irqrestore(&db_list_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * fsl_hv_state_change_work_func -- state change worker function
+ *
+ * The state change notification arrives in an interrupt, but we can't call
+ * blocking_notifier_call_chain() in an interrupt handler.  We could call
+ * atomic_notifier_call_chain(), but that would require the clients' call-back
+ * function to run in interrupt context.  Since we don't want to impose that
+ * restriction on the clients, we create a work queue to process the
+ * notification in kernel context.
+ */
+static void fsl_hv_state_change_work_func(struct work_struct *work)
+{
+	struct doorbell_isr *dbisr =
+		container_of(work, struct doorbell_isr, work);
+
+	blocking_notifier_call_chain(&failover_subscribers, dbisr->partition,
+				     NULL);
+}
+
+/**
+ * fsl_hv_state_change_isr - interrupt handler for state-change doorbells
+ */
+static irqreturn_t fsl_hv_state_change_isr(int irq, void *data)
+{
+	unsigned int status;
+	struct doorbell_isr *dbisr = data;
+	int ret;
+
+	/* Determine the new state, and if it's stopped, notify the clients. */
+	ret = fh_partition_get_status(dbisr->partition, &status);
+	if (!ret && (status == FH_PARTITION_STOPPED))
+		schedule_work(&dbisr->work);
+
+	/* Call the normal handler */
+	return fsl_hv_isr(irq, (void *) (uintptr_t) dbisr->doorbell);
+}
+
+/**
+ * fsl_hv_poll - returns a bitmask indicating whether a read will block
+ *
+ * @return unsigned int
+ */
+static unsigned int fsl_hv_poll(struct file *filp, struct poll_table_struct *p)
+{
+	struct doorbell_queue *dbq = filp->private_data;
+	unsigned long flags;
+	unsigned int mask;
+
+	spin_lock_irqsave(&dbq->lock, flags);
+
+	poll_wait(filp, &dbq->wait, p);
+	mask = (dbq->head == dbq->tail) ? 0 : (POLLIN | POLLRDNORM);
+
+	spin_unlock_irqrestore(&dbq->lock, flags);
+
+	return mask;
+}
+
+/**
+ * fsl_hv_read - return the handles for any incoming doorbells
+ *
+ * If there are doorbell handles in the queue for this open instance, then
+ * return them to the caller as an array of 32-bit integers.  Otherwise,
+ * block until there is at least one handle to return.
+ */
+static ssize_t fsl_hv_read(struct file *filp, char __user *buf, size_t len,
+			   loff_t *off)
+{
+	struct doorbell_queue *dbq = filp->private_data;
+	uint32_t __user *p = (uint32_t __user *) buf; /* for put_user() */
+	unsigned long flags;
+	ssize_t count = 0;
+
+	/* Make sure we stop when the user buffer is full. */
+	while (len >= sizeof(uint32_t)) {
+		uint32_t dbell;	/* Local copy of doorbell queue data */
+
+		spin_lock_irqsave(&dbq->lock, flags);
+
+		/* If the queue is empty, then either we're done or we need
+		 * to block.  If the application specified O_NONBLOCK, then
+		 * we return the appropriate error code.
+		 */
+		if (dbq->head == dbq->tail) {
+			spin_unlock_irqrestore(&dbq->lock, flags);
+			if (count)
+				break;
+			if (filp->f_flags & O_NONBLOCK)
+				return -EAGAIN;
+			if (wait_event_interruptible(dbq->wait,
+						     dbq->head != dbq->tail))
+				return -ERESTARTSYS;
+			continue;
+		}
+
+		/* Even though we have an smp_wmb() in the ISR, the core
+		 * might speculatively execute the "dbell = ..." below while
+		 * it's evaluating the if-statement above.  In that case, the
+		 * value put into dbell could be stale if the core accepts the
+		 * speculation. To prevent that, we need a read memory barrier
+		 * here as well.
+		 */
+		smp_rmb();
+
+		/* Copy the data to a temporary local buffer, because
+		 * we can't call copy_to_user() from inside a spinlock
+		 */
+		dbell = dbq->q[dbq->head];
+		dbq->head = nextp(dbq->head);
+
+		spin_unlock_irqrestore(&dbq->lock, flags);
+
+		if (put_user(dbell, p))
+			return -EFAULT;
+		p++;
+		count += sizeof(uint32_t);
+		len -= sizeof(uint32_t);
+	}
+
+	return count;
+}
+
+/**
+ * fsl_hv_open - open the driver
+ *
+ * Open the driver and prepare for reading doorbells.
+ *
+ * Every time an application opens the driver, we create a doorbell queue
+ * for that file handle.  This queue is used for any incoming doorbells.
+ */
+static int fsl_hv_open(struct inode *inode, struct file *filp)
+{
+	struct doorbell_queue *dbq;
+	unsigned long flags;
+	int ret = 0;
+
+	dbq = kzalloc(sizeof(struct doorbell_queue), GFP_KERNEL);
+	if (!dbq) {
+		pr_err("fsl-hv: out of memory\n");
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&dbq->lock);
+	init_waitqueue_head(&dbq->wait);
+
+	spin_lock_irqsave(&db_list_lock, flags);
+	list_add(&dbq->list, &db_list);
+	spin_unlock_irqrestore(&db_list_lock, flags);
+
+	filp->private_data = dbq;
+
+	return ret;
+}
+
+/**
+ * fsl_hv_close - close the driver
+ */
+static int fsl_hv_close(struct inode *inode, struct file *filp)
+{
+	struct doorbell_queue *dbq = filp->private_data;
+	unsigned long flags;
+
+	int ret = 0;
+
+	spin_lock_irqsave(&db_list_lock, flags);
+	list_del(&dbq->list);
+	spin_unlock_irqrestore(&db_list_lock, flags);
+
+	kfree(dbq);
+
+	return ret;
+}
+
+static const struct file_operations fsl_hv_fops = {
+	.owner = THIS_MODULE,
+	.open = fsl_hv_open,
+	.release = fsl_hv_close,
+	.poll = fsl_hv_poll,
+	.read = fsl_hv_read,
+	.unlocked_ioctl = fsl_hv_ioctl,
+};
+
+static struct miscdevice fsl_hv_misc_dev = {
+	MISC_DYNAMIC_MINOR,
+	"fsl-hv",
+	&fsl_hv_fops
+};
+
+static DECLARE_WORK(power_off, (work_func_t) kernel_power_off);
+
+static irqreturn_t fsl_hv_shutdown_isr(int irq, void *data)
+{
+	schedule_work(&power_off);
+
+	/* We should never get here */
+	return IRQ_NONE;
+}
+
+/**
+ * get_parent_handle -- returns the handle of the parent of the given node
+ *
+ * The handle is the value of the 'reg' property
+ */
+static int get_parent_handle(struct device_node *np)
+{
+	struct device_node *parent;
+	const uint32_t *prop;
+	int len;
+
+	parent = of_get_parent(np);
+	if (!parent)
+		/* It's not really possible for this to fail */
+		return -ENODEV;
+
+	prop = of_get_property(parent, "reg", &len);
+	of_node_put(parent);
+
+	if (!prop || (len != sizeof(uint32_t)))
+		/* This can happen only if the node is malformed */
+		return -ENODEV;
+
+	return *prop;
+}
+
+/**
+ * fsl_hv_failover_register -- register a callback for failover events
+ *
+ * This function is called by device drivers to register their callback
+ * functions for fail-over events.
+ */
+int fsl_hv_failover_register(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&failover_subscribers, nb);
+}
+EXPORT_SYMBOL(fsl_hv_failover_register);
+
+/**
+ * fsl_hv_failover_unregister -- unregister a callback for failover events
+ */
+int fsl_hv_failover_unregister(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&failover_subscribers, nb);
+}
+EXPORT_SYMBOL(fsl_hv_failover_unregister);
+
+/**
+ * has_fsl_hypervisor - return TRUE if we're running under FSL hypervisor
+ *
+ * This function checks to see if we're running under the Freescale
+ * hypervisor, and returns zero if we're not, or non-zero if we are.
+ *
+ * First, it checks if MSR[GS]==1, which means we're running under some
+ * hypervisor.  Then it checks if there is a hypervisor node in the device
+ * tree.  Currently, that means there needs to be a node in the root called
+ * "hypervisor" and which has a property named "fsl,hv-version".
+ */
+static int has_fsl_hypervisor(void)
+{
+	struct device_node *node;
+	int ret;
+
+	if (!(mfmsr() & MSR_GS))
+		return 0;
+
+	node = of_find_node_by_path("/hypervisor");
+	if (!node)
+		return 0;
+
+	ret = of_find_property(node, "fsl,hv-version", NULL) != NULL;
+
+	of_node_put(node);
+
+	return ret;
+}
+
+/**
+ * fsl_hypervisor_init: Freescale hypervisor management driver init
+ *
+ * This function is called when this module is loaded.
+ *
+ * Register ourselves as a miscellaneous driver.  This will register the
+ * fops structure and create the right sysfs entries for udev.
+ */
+static int __init fsl_hypervisor_init(void)
+{
+	struct device_node *np;
+	struct doorbell_isr *dbisr, *n;
+	int ret;
+
+	pr_info("Freescale hypervisor management driver\n");
+
+	if (!has_fsl_hypervisor()) {
+		pr_info("fsl-hv: no hypervisor found\n");
+		return -ENODEV;
+	}
+
+	ret = misc_register(&fsl_hv_misc_dev);
+	if (ret) {
+		pr_err("fsl-hv: cannot register device\n");
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&db_list);
+	INIT_LIST_HEAD(&isr_list);
+
+	for_each_compatible_node(np, NULL, "epapr,hv-receive-doorbell") {
+		unsigned int irq;
+		const uint32_t *handle;
+
+		handle = of_get_property(np, "interrupts", NULL);
+		irq = irq_of_parse_and_map(np, 0);
+		if (!handle || (irq == NO_IRQ)) {
+			pr_err("fsl-hv: no 'interrupts' property in %s node\n",
+				np->full_name);
+			continue;
+		}
+
+		dbisr = kzalloc(sizeof(*dbisr), GFP_KERNEL);
+		if (!dbisr)
+			goto out_of_memory;
+
+		dbisr->irq = irq;
+		dbisr->doorbell = *handle;
+		INIT_WORK(&dbisr->work, fsl_hv_state_change_work_func);
+
+		if (of_device_is_compatible(np, "fsl,hv-shutdown-doorbell")) {
+			/* The shutdown doorbell gets its own ISR */
+			ret = request_irq(irq, fsl_hv_shutdown_isr, 0,
+					  np->name, dbisr);
+		} else if (of_device_is_compatible(np,
+			"fsl,hv-state-change-doorbell")) {
+			/* The state change doorbell triggers a notification if
+			 * the state of the managed partition changes to
+			 * "stopped". We need a separate interrupt handler for
+			 * that, and we also need to know the handle of the
+			 * target partition, not just the handle of the
+			 * doorbell.
+			 */
+			dbisr->partition = ret = get_parent_handle(np);
+			if (ret < 0) {
+				pr_err("fsl-hv: node %s has missing or "
+				       "malformed parent\n", np->full_name);
+				kfree(dbisr);
+				continue;
+			}
+			ret = request_irq(irq, fsl_hv_state_change_isr, 0,
+					  np->name, dbisr);
+		} else
+			ret = request_irq(irq, fsl_hv_isr, 0, np->name, dbisr);
+
+		if (ret < 0) {
+			pr_err("fsl-hv: could not request irq %u for node %s\n",
+			       irq, np->full_name);
+			kfree(dbisr);
+			continue;
+		}
+
+		list_add(&dbisr->list, &isr_list);
+
+		pr_info("fsl-hv: registered handler for doorbell %u\n",
+			*handle);
+	}
+
+	return 0;
+
+out_of_memory:
+	list_for_each_entry_safe(dbisr, n, &isr_list, list) {
+		free_irq(dbisr->irq, dbisr);
+		list_del(&dbisr->list);
+		kfree(dbisr);
+	}
+
+	misc_deregister(&fsl_hv_misc_dev);
+
+	return -ENOMEM;
+}
+
+/**
+ * fsl_hypervisor_exit: Freescale hypervisor management driver termination
+ *
+ * This function is called when this driver is unloaded.
+ */
+static void __exit fsl_hypervisor_exit(void)
+{
+	struct doorbell_isr *dbisr, *n;
+
+	list_for_each_entry_safe(dbisr, n, &isr_list, list) {
+		free_irq(dbisr->irq, dbisr);
+		list_del(&dbisr->list);
+		kfree(dbisr);
+	}
+
+	misc_deregister(&fsl_hv_misc_dev);
+}
+
+module_init(fsl_hypervisor_init);
+module_exit(fsl_hypervisor_exit);
+
+MODULE_AUTHOR("Timur Tabi <timur@freescale.com>");
+MODULE_DESCRIPTION("Freescale hypervisor management driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 75cf611..68c341a 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -134,6 +134,7 @@  header-y += firewire-cdev.h
 header-y += firewire-constants.h
 header-y += flat.h
 header-y += fs.h
+header-y += fsl_hypervisor.h
 header-y += fuse.h
 header-y += futex.h
 header-y += gameport.h
diff --git a/include/linux/fsl_hypervisor.h b/include/linux/fsl_hypervisor.h
new file mode 100644
index 0000000..63740a2
--- /dev/null
+++ b/include/linux/fsl_hypervisor.h
@@ -0,0 +1,203 @@ 
+/*
+ * Freescale hypervisor ioctl interface
+ *
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc.
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ *
+ * This file is used by the Freescale hypervisor management driver.  It can
+ * also be included by applications that need to communicate with the driver
+ * via the ioctl interface.
+ */
+
+#ifndef FSL_HYPERVISOR_H
+#define FSL_HYPERVISOR_H
+
+#include <linux/types.h>
+
+/**
+ * Freescale hypervisor ioctl parameter
+ */
+union fsl_hv_ioctl_param {
+
+	/**
+	 * @ret: Return value.
+	 *
+	 * This is always the first word of any structure.
+	 */
+	__u32 ret;
+
+	/**
+	 * struct fsl_hv_ioctl_restart: restart a partition
+	 * @ret: return error code from the hypervisor
+	 * @partition: the ID of the partition to restart, or -1 for the
+	 *             calling partition
+	 *
+	 * Used by FSL_HV_IOCTL_PARTITION_RESTART
+	 */
+	struct fsl_hv_ioctl_restart {
+		__u32 ret;
+		__u32 partition;
+	} restart;
+
+	/**
+	 * struct fsl_hv_ioctl_status: get a partition's status
+	 * @ret: return error code from the hypervisor
+	 * @partition: the ID of the partition to query, or -1 for the
+	 *             calling partition
+	 * @status: The returned status of the partition
+	 *
+	 * Used by FSL_HV_IOCTL_PARTITION_GET_STATUS
+	 *
+	 * Values of 'status':
+	 *    0 = Stopped
+	 *    1 = Running
+	 *    2 = Starting
+	 *    3 = Stopping
+	 */
+	struct fsl_hv_ioctl_status {
+		__u32 ret;
+		__u32 partition;
+		__u32 status;
+	} status;
+
+	/**
+	 * struct fsl_hv_ioctl_start: start a partition
+	 * @ret: return error code from the hypervisor
+	 * @partition: the ID of the partition to control
+	 * @entry_point: The offset within the guest IMA to start execution
+	 * @load: If non-zero, reload the partition's images before starting
+	 *
+	 * Used by FSL_HV_IOCTL_PARTITION_START
+	 */
+	struct fsl_hv_ioctl_start {
+		__u32 ret;
+		__u32 partition;
+		__u32 entry_point;
+		__u32 load;
+	} start;
+
+	/**
+	 * struct fsl_hv_ioctl_stop: stop a partition
+	 * @ret: return error code from the hypervisor
+	 * @partition: the ID of the partition to stop, or -1 for the calling
+	 *             partition
+	 *
+	 * Used by FSL_HV_IOCTL_PARTITION_STOP
+	 */
+	struct fsl_hv_ioctl_stop {
+		__u32 ret;
+		__u32 partition;
+	} stop;
+
+	/**
+	 * struct fsl_hv_ioctl_memcpy: copy memory between partitions
+	 * @ret: return error code from the hypervisor
+	 * @source: the partition ID of the source partition, or -1 for this
+	 *          partition
+	 * @target: the partition ID of the target partition, or -1 for this
+	 *          partition
+	 * @local_addr: user-space virtual address of a buffer in the local
+	 *              partition
+	 * @remote_addr: guest physical address of a buffer in the
+	 *           remote partition
+	 * @count: the number of bytes to copy.  Both the local and remote
+	 *         buffers must be at least 'count' bytes long
+	 *
+	 * Used by FSL_HV_IOCTL_MEMCPY
+	 *
+	 * The 'local' partition is the partition that calls this ioctl.  The
+	 * 'remote' partition is a different partition.  The data is copied from
+	 * the 'source' paritition' to the 'target' partition.
+	 *
+	 * The buffer in the remote partition must be guest physically
+	 * contiguous.
+	 *
+	 * This ioctl does not support copying memory between two remote
+	 * partitions or within the same partition, so either 'source' or
+	 * 'target' (but not both) must be -1.  In other words, either
+	 *
+	 *      source == local and target == remote
+	 * or
+	 *      source == remote and target == local
+	 */
+	struct fsl_hv_ioctl_memcpy {
+		__u32 ret;
+		__u32 source;
+		__u32 target;
+		__u64 local_vaddr;
+		__u64 remote_paddr;
+		__u64 count;
+	} memcpy;
+
+	/**
+	 * struct fsl_hv_ioctl_doorbell: ring a doorbell
+	 * @ret: return error code from the hypervisor
+	 * @doorbell: the handle of the doorbell to ring doorbell
+	 *
+	 * Used by FSL_HV_IOCTL_DOORBELL
+	 */
+	struct fsl_hv_ioctl_doorbell {
+		__u32 ret;
+		__u32 doorbell;
+	} doorbell;
+
+	/**
+	 * struct fsl_hv_ioctl_prop: get/set a device tree property
+	 * @ret: return error code from the hypervisor
+	 * @handle: handle of partition whose tree to access
+	 * @path: virtual address of path name of node to access
+	 * @propname: virtual address of name of property to access
+	 * @propval: virtual address of property data buffer
+	 * @proplen: Size of property data buffer
+	 *
+	 * Used by FSL_HV_IOCTL_DOORBELL
+	 */
+	struct fsl_hv_ioctl_prop {
+		__u32 ret;
+		__u32 handle;
+		__u64 path;
+		__u64 propname;
+		__u64 propval;
+		__u32 proplen;
+	} prop;
+};
+
+/*
+ * ioctl commands.
+ */
+enum {
+	FSL_HV_IOCTL_PARTITION_RESTART = 1, /* Boot another partition */
+	FSL_HV_IOCTL_PARTITION_GET_STATUS = 2, /* Boot another partition */
+	FSL_HV_IOCTL_PARTITION_START = 3, /* Boot another partition */
+	FSL_HV_IOCTL_PARTITION_STOP = 4, /* Stop this or another partition */
+	FSL_HV_IOCTL_MEMCPY = 5, /* Copy data from one partition to another */
+	FSL_HV_IOCTL_DOORBELL = 6, /* Ring a doorbell */
+
+	/* Get a property from another guest's device tree */
+	FSL_HV_IOCTL_GETPROP = 7,
+
+	/* Set a property in another guest's device tree */
+	FSL_HV_IOCTL_SETPROP = 8,
+};
+
+#ifdef __KERNEL__
+
+/**
+ * fsl_hv_event_register -- register a callback for failover events
+ *
+ * This function is called by device drivers to register their callback
+ * functions for fail-over events.
+ */
+int fsl_hv_failover_register(struct notifier_block *nb);
+
+/**
+ * fsl_hv_event_unregister -- unregister a callback for failover events
+ */
+int fsl_hv_failover_unregister(struct notifier_block *nb);
+
+#endif
+
+#endif