diff mbox

[RFC,01/17] shm-signal: shared-memory signals

Message ID 20090331184252.28333.70623.stgit@dev.haskins.net
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Gregory Haskins March 31, 2009, 6:42 p.m. UTC
This interface provides a bidirectional shared-memory based signaling
mechanism.  It can be used by any entities which desire efficient
communication via shared memory.  The implementation details of the
signaling are abstracted so that they may transcend a wide variety
of locale boundaries (e.g. userspace/kernel, guest/host, etc).

The shm_signal mechanism supports event masking as well as spurious
event delivery mitigation.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 include/linux/shm_signal.h |  188 ++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig                |   10 ++
 lib/Makefile               |    1 
 lib/shm_signal.c           |  186 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 385 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/shm_signal.h
 create mode 100644 lib/shm_signal.c


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Avi Kivity March 31, 2009, 8:44 p.m. UTC | #1
Gregory Haskins wrote:
> This interface provides a bidirectional shared-memory based signaling
> mechanism.  It can be used by any entities which desire efficient
> communication via shared memory.  The implementation details of the
> signaling are abstracted so that they may transcend a wide variety
> of locale boundaries (e.g. userspace/kernel, guest/host, etc).
>
> The shm_signal mechanism supports event masking as well as spurious
> event delivery mitigation.
> +
> +/*
> + *---------
> + * The following structures represent data that is shared across boundaries
> + * which may be quite disparate from one another (e.g. Windows vs Linux,
> + * 32 vs 64 bit, etc).  Therefore, care has been taken to make sure they
> + * present data in a manner that is independent of the environment.
> + *-----------
> + */
> +
> +#define SHM_SIGNAL_MAGIC 0x58fa39df
> +#define SHM_SIGNAL_VER   1
> +
> +struct shm_signal_irq {
> +	__u8                  enabled;
> +	__u8                  pending;
> +	__u8                  dirty;
> +};
>   

Some ABIs may choose to pad this, suggest explicit padding.

> +
> +enum shm_signal_locality {
> +	shm_locality_north,
> +	shm_locality_south,
> +};
> +
> +struct shm_signal_desc {
> +	__u32                 magic;
> +	__u32                 ver;
> +	struct shm_signal_irq irq[2];
> +};
>   

Similarly, this should be padded to 0 (mod 8).

Instead of versions, I prefer feature flags which can be independently 
enabled or disabled.

> +
> +/* --- END SHARED STRUCTURES --- */
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/interrupt.h>
> +
> +struct shm_signal_notifier {
> +	void (*signal)(struct shm_signal_notifier *);
> +};
>   

This means "->inject() has been called from the other side"?

(reading below I see this is so.  not used to reading well commented 
code... :)

> +
> +struct shm_signal;
> +
> +struct shm_signal_ops {
> +	int      (*inject)(struct shm_signal *s);
> +	void     (*fault)(struct shm_signal *s, const char *fmt, ...);
>   

Eww.  Must we involve strings and printf formats?

> +	void     (*release)(struct shm_signal *s);
> +};
> +
> +/*
> + * signaling protocol:
> + *
> + * each side of the shm_signal has an "irq" structure with the following
> + * fields:
> + *
> + *    - enabled: controlled by shm_signal_enable/disable() to mask/unmask
> + *               the notification locally
> + *    - dirty:   indicates if the shared-memory is dirty or clean.  This
> + *               is updated regardless of the enabled/pending state so that
> + *               the state is always accurately tracked.
> + *    - pending: indicates if a signal is pending to the remote locale.
> + *               This allows us to determine if a remote-notification is
> + *               already in flight to optimize spurious notifications away.
> + */
>   

When you overlay a ring on top of this, won't the ring indexes convey 
the same information as ->dirty?
Gregory Haskins March 31, 2009, 8:58 p.m. UTC | #2
Avi Kivity wrote:
> Gregory Haskins wrote:
>> This interface provides a bidirectional shared-memory based signaling
>> mechanism.  It can be used by any entities which desire efficient
>> communication via shared memory.  The implementation details of the
>> signaling are abstracted so that they may transcend a wide variety
>> of locale boundaries (e.g. userspace/kernel, guest/host, etc).
>>
>> The shm_signal mechanism supports event masking as well as spurious
>> event delivery mitigation.
>> +
>> +/*
>> + *---------
>> + * The following structures represent data that is shared across
>> boundaries
>> + * which may be quite disparate from one another (e.g. Windows vs
>> Linux,
>> + * 32 vs 64 bit, etc).  Therefore, care has been taken to make sure
>> they
>> + * present data in a manner that is independent of the environment.
>> + *-----------
>> + */
>> +
>> +#define SHM_SIGNAL_MAGIC 0x58fa39df
>> +#define SHM_SIGNAL_VER   1
>> +
>> +struct shm_signal_irq {
>> +    __u8                  enabled;
>> +    __u8                  pending;
>> +    __u8                  dirty;
>> +};
>>   
>
> Some ABIs may choose to pad this, suggest explicit padding.

Yeah, good idea.  What is the official way to do this these days?  Are
GCC pragmas allowed?

>
>> +
>> +enum shm_signal_locality {
>> +    shm_locality_north,
>> +    shm_locality_south,
>> +};
>> +
>> +struct shm_signal_desc {
>> +    __u32                 magic;
>> +    __u32                 ver;
>> +    struct shm_signal_irq irq[2];
>> +};
>>   
>
> Similarly, this should be padded to 0 (mod 8).
Ack

>
> Instead of versions, I prefer feature flags which can be independently
> enabled or disabled.

Totally agreed.  If you look, most of the ABI has a type of "NEGCAP"
(negotiate capabilities) feature.  The version number is a contingency
plan in case I still have to break it for whatever reason.   I will
always opt for the feature bits over bumping the version when it's
feasible (especially if/when this is actually in the field).

>
>> +
>> +/* --- END SHARED STRUCTURES --- */
>> +
>> +#ifdef __KERNEL__
>> +
>> +#include <linux/interrupt.h>
>> +
>> +struct shm_signal_notifier {
>> +    void (*signal)(struct shm_signal_notifier *);
>> +};
>>   
>
> This means "->inject() has been called from the other side"?

Yep
>
> (reading below I see this is so.  not used to reading well commented
> code... :)

:)

>
>> +
>> +struct shm_signal;
>> +
>> +struct shm_signal_ops {
>> +    int      (*inject)(struct shm_signal *s);
>> +    void     (*fault)(struct shm_signal *s, const char *fmt, ...);
>>   
>
> Eww.  Must we involve strings and printf formats?

This is still somewhat of an immature part of the design.  It's supposed
to be used so that by default, its a panic.  But on the host side, we
can do something like inject a machine-check.  That way malicious/broken
guests cannot (should not? ;) be able to take down the host.  Note today
I do not map this to anything other than the default panic, so this
needs some love.

But given the asynchronous nature of the fault, I want to be sure we
have decent accounting to avoid bug reports like "silent MCE kills the
guest" ;)  At least this way, we can log the fault string somewhere to
get a clue.

>
>> +    void     (*release)(struct shm_signal *s);
>> +};
>> +
>> +/*
>> + * signaling protocol:
>> + *
>> + * each side of the shm_signal has an "irq" structure with the
>> following
>> + * fields:
>> + *
>> + *    - enabled: controlled by shm_signal_enable/disable() to
>> mask/unmask
>> + *               the notification locally
>> + *    - dirty:   indicates if the shared-memory is dirty or clean. 
>> This
>> + *               is updated regardless of the enabled/pending state
>> so that
>> + *               the state is always accurately tracked.
>> + *    - pending: indicates if a signal is pending to the remote locale.
>> + *               This allows us to determine if a
>> remote-notification is
>> + *               already in flight to optimize spurious
>> notifications away.
>> + */
>>   
>
> When you overlay a ring on top of this, won't the ring indexes convey
> the same information as ->dirty?

I agree that the information may be redundant with components of the
broader shm state.  However, we need this state at this level of scope
in order to function optimally, so I dont think its a huge deal to have
this here as well.  Afterall, the shm_signal library can only assess its
internal state.  We would have to teach it how to glean the broader
state through some mechanism otherwise (callback, perhaps), but I don't
think its worth it.

-Greg

>
>
Avi Kivity March 31, 2009, 9:05 p.m. UTC | #3
Gregory Haskins wrote:
>>> +struct shm_signal_irq {
>>> +    __u8                  enabled;
>>> +    __u8                  pending;
>>> +    __u8                  dirty;
>>> +};
>>>   
>>>       
>> Some ABIs may choose to pad this, suggest explicit padding.
>>     
>
> Yeah, good idea.  What is the official way to do this these days?  Are
> GCC pragmas allowed?
>
>   

I just add a __u8 pad[5] in such cases.

>>> +
>>> +struct shm_signal;
>>> +
>>> +struct shm_signal_ops {
>>> +    int      (*inject)(struct shm_signal *s);
>>> +    void     (*fault)(struct shm_signal *s, const char *fmt, ...);
>>>   
>>>       
>> Eww.  Must we involve strings and printf formats?
>>     
>
> This is still somewhat of a immature part of the design.  Its supposed
> to be used so that by default, its a panic.  But on the host side, we
> can do something like inject a machine-check.  That way malicious/broken
> guests cannot (should not? ;) be able to take down the host.  Note today
> I do not map this to anything other than the default panic, so this
> needs some love.
>
> But given the asynchronous nature of the fault, I want to be sure we
> have decent accounting to avoid bug reports like "silent MCE kills the
> guest" ;)  At least this way, we can log the fault string somewhere to
> get a clue.
>   

I see.

This raises a point I've been thinking of - the symmetrical nature of 
the API vs the asymmetrical nature of guest/host or user/kernel
interfaces.  This is most pronounced in ->inject(); in the host->guest 
direction this is async (host can continue processing while the guest is 
handling the interrupt), whereas in the guest->host direction it is 
synchronous (the guest is blocked while the host is processing the call, 
unless the host explicitly hands off work to a different thread).
Gregory Haskins April 1, 2009, 12:12 p.m. UTC | #4
Avi Kivity wrote:
> Gregory Haskins wrote:
>>>> +struct shm_signal_irq {
>>>> +    __u8                  enabled;
>>>> +    __u8                  pending;
>>>> +    __u8                  dirty;
>>>> +};
>>>>         
>>> Some ABIs may choose to pad this, suggest explicit padding.
>>>     
>>
>> Yeah, good idea.  What is the official way to do this these days?  Are
>> GCC pragmas allowed?
>>
>>   
>
> I just add a __u8 pad[5] in such cases.

Oh, duh.  Dumb question.  I was getting confused with "pack", not pad.  :)

>
>>>> +
>>>> +struct shm_signal;
>>>> +
>>>> +struct shm_signal_ops {
>>>> +    int      (*inject)(struct shm_signal *s);
>>>> +    void     (*fault)(struct shm_signal *s, const char *fmt, ...);
>>>>         
>>> Eww.  Must we involve strings and printf formats?
>>>     
>>
>> This is still somewhat of a immature part of the design.  Its supposed
>> to be used so that by default, its a panic.  But on the host side, we
>> can do something like inject a machine-check.  That way malicious/broken
>> guests cannot (should not? ;) be able to take down the host.  Note today
>> I do not map this to anything other than the default panic, so this
>> needs some love.
>>
>> But given the asynchronous nature of the fault, I want to be sure we
>> have decent accounting to avoid bug reports like "silent MCE kills the
>> guest" ;)  At least this way, we can log the fault string somewhere to
>> get a clue.
>>   
>
> I see.
>
> This raises a point I've been thinking of - the symmetrical nature of
> the API vs the assymetrical nature of guest/host or user/kernel
> interfaces.  This is most pronounced in ->inject(); in the host->guest
> direction this is async (host can continue processing while the guest
> is handling the interrupt), whereas in the guest->host direction it is
> synchronous (the guest is blocked while the host is processing the
> call, unless the host explicitly hands off work to a different thread).

Note that this is exactly what I do (though it is device specific). 
venet-tap has a ioq_notifier registered on its "rx" ring (which is the
tx-ring for the guest) that simply calls ioq_notify_disable() (which
calls shm_signal_disable() under the covers) and it wakes its
rx-thread.  This all happens in the context of the hypercall, which then
returns and allows the vcpu to re-enter guest mode immediately.


>
>
Avi Kivity April 1, 2009, 12:24 p.m. UTC | #5
Gregory Haskins wrote:
> Note that this is exactly what I do (though it is device specific). 
> venet-tap has a ioq_notifier registered on its "rx" ring (which is the
> tx-ring for the guest) that simply calls ioq_notify_disable() (which
> calls shm_signal_disable() under the covers) and it wakes its
> rx-thread.  This all happens in the context of the hypercall, which then
> returns and allows the vcpu to re-enter guest mode immediately.
>   
I think this is suboptimal.  The ring is likely to be cache hot on the 
current cpu, waking a thread will introduce scheduling latency + IPI 
+cache-to-cache transfers.

On a benchmark setup, host resources are likely to exceed guest 
requirements, so you can throw cpu at the problem and no one notices.  
But I think the bits/cycle figure will decrease, even if bits/sec increases.
Gregory Haskins April 1, 2009, 1:57 p.m. UTC | #6
Avi Kivity wrote:
> Gregory Haskins wrote:
>> Note that this is exactly what I do (though it is device specific).
>> venet-tap has a ioq_notifier registered on its "rx" ring (which is the
>> tx-ring for the guest) that simply calls ioq_notify_disable() (which
>> calls shm_signal_disable() under the covers) and it wakes its
>> rx-thread.  This all happens in the context of the hypercall, which then
>> returns and allows the vcpu to re-enter guest mode immediately.
>>   
> I think this is suboptimal.

Heh, yes I know this is your (well documented) position, but I
respectfully disagree. :)

CPUs are not getting much faster, but they are rapidly getting more
cores.  If we want to continue to make software run increasingly faster,
we need to actually use those cores IMO.  Generally this means split
workloads up into as many threads as possible as long as you can keep
pipelines filled.

>   The ring is likely to be cache hot on the current cpu, waking a
> thread will introduce scheduling latency + IPI
This part is a valid criticism, though note that Linux is very adept at
scheduling so we are talking mere ns/us range here, which is dwarfed by
the latency of something like your typical IO device (e.g. 36us for a
rtt packet on 10GE baremetal, etc).  The benefit, of course, is the
potential for increased parallelism which I have plenty of data to show
we are very much taking advantage of here (I can saturate two cores
almost completely according to LTT traces, one doing vcpu work, and the
other running my "rx" thread which schedules the packet on the hardware)

> +cache-to-cache transfers.
This one I take exception to.  While it is perfectly true that splitting
the work between two cores has a greater cache impact than staying on
one, you cannot look at this one metric alone and say "this is bad". 
Its also a function of how efficiently the second (or more) cores are
utilized.  There will be a point in the curve where the cost of cache
coherence will become marginalized by the efficiency added by the extra
compute power.  Some workloads will invariably be on the bad end of that
curve, and therefore doing the work on one core is better.  However, we
cant ignore that there will others that are on the good end of this
spectrum either.  Otherwise, we risk performance stagnation on our
effectively uniprocessor box ;).  In addition, the task-scheduler will
attempt to co-locate tasks that are sharing data according to a best-fit
within the cache hierarchy.  Therefore, we will still be sharing as much
as possible (perhaps only L2, L3, or a local NUMA domain, but this is
still better than nothing)

The way I have been thinking about these issues is something I have been
calling "soft-asics".  In the early days, we had things like a simple
uniprocessor box with a simple dumb ethernet.  People figured out that
if you put more processing power into the NIC, you could offload that
work from the cpu and do more in parallel.   So things like checksum
computation and segmentation duties were a good fit.  More recently, we
see even more advanced hardware where you can do L2 or even L4 packet
classification right in the hardware, etc.  All of these things are
effectively parallel computation, and it occurs in a completely foreign
cache domain!

So a lot of my research has been around the notion of trying to use some
of our cpu cores to do work like some of the advanced asic based offload
engines do.  The cores are often under utilized anyway, and this will
bring some of the features of advanced silicon to commodity resources. 
They also have the added flexibility that its just software, so you can
change or enhance the system at will.

So if you think about it, by using threads like this in venet-tap, I am
effectively using other cores to do csum/segmentation (if the physical
hardware doesn't support it), layer 2 classification (linux bridging),
filtering (iptables in the bridge), queuing, etc as if it was some
"smart" device out on the PCI bus.  The guest just queues up packets
independently in its own memory, while the device just "dma's" the data
on its own (after the initial kick).  The vcpu is keeping the pipeline
filled on its side independently.

>
> On a benchmark setup, host resources are likely to exceed guest
> requirements, so you can throw cpu at the problem and no one notices.
Sure, but with the type of design I have presented this still sorts
itself out naturally even if the host doesn't have the resources.  For
instance, if there is a large number of threads competing for a small
number of cores, we will simply see things like the rx-thread stalling
and going to sleep, or the vcpu thread backpressuring and going idle
(and therefore sleeping).  All of these things are self throttling.  If
you don't have enough resources to run a workload at a desirable
performance level, the system wasn't sized right to begin with. ;)

>   But I think the bits/cycle figure will decrease, even if bits/sec
> increases.
>
Note that this isn't necessarily a bad thing.  I think studies show that
most machines are generally idle a significant percentage of the time,
and this will likely only get worse as we get more and more cores.  So
if I have to consume more cycles to get more bits on the wire, that's
probably ok with most of my customers.   If its not, it would be trivial
to make the venet threading policy a tunable parameter.

-Greg
diff mbox

Patch

diff --git a/include/linux/shm_signal.h b/include/linux/shm_signal.h
new file mode 100644
index 0000000..a65e54e
--- /dev/null
+++ b/include/linux/shm_signal.h
@@ -0,0 +1,188 @@ 
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ *      Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_SHM_SIGNAL_H
+#define _LINUX_SHM_SIGNAL_H
+
+#include <asm/types.h>
+
+/*
+ *---------
+ * The following structures represent data that is shared across boundaries
+ * which may be quite disparate from one another (e.g. Windows vs Linux,
+ * 32 vs 64 bit, etc).  Therefore, care has been taken to make sure they
+ * present data in a manner that is independent of the environment.
+ *-----------
+ */
+
+#define SHM_SIGNAL_MAGIC 0x58fa39df
+#define SHM_SIGNAL_VER   1
+
+/*
+ * Per-locale signal state, visible to both sides of the shared memory.
+ * Explicitly padded to 8 bytes so the layout is identical regardless of
+ * the compiler/ABI on either side of the boundary.
+ */
+struct shm_signal_irq {
+	__u8                  enabled;	/* 0 = notifications masked locally */
+	__u8                  pending;	/* a remote notification is in flight */
+	__u8                  dirty;	/* shm touched since the last wakeup */
+	__u8                  pad[5];	/* explicit padding - do not rely on ABI */
+};
+
+enum shm_signal_locality {
+	shm_locality_north,	/* e.g. the guest/userspace side (see shm_signal_inject doc) */
+	shm_locality_south,	/* e.g. the host/kernel side */
+};
+
+struct shm_signal_desc {
+	__u32                 magic;	/* must be SHM_SIGNAL_MAGIC */
+	__u32                 ver;	/* must be SHM_SIGNAL_VER */
+	struct shm_signal_irq irq[2];	/* indexed by enum shm_signal_locality */
+};	/* NOTE(review): total size should be 0 (mod 8) across ABIs - confirm */
+
+/* --- END SHARED STRUCTURES --- */
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+
+struct shm_signal_notifier {
+	void (*signal)(struct shm_signal_notifier *);	/* invoked (from _shm_signal_wakeup) when the remote side signals us */
+};
+
+struct shm_signal;
+
+struct shm_signal_ops {
+	int      (*inject)(struct shm_signal *s);	/* deliver a signal across the locale boundary */
+	void     (*fault)(struct shm_signal *s, const char *fmt, ...);	/* report a fault; SHM_SIGNAL_FAULT falls back to panic() if NULL */
+	void     (*release)(struct shm_signal *s);	/* free resources on last shm_signal_put() */
+};
+
+enum {
+	shm_signal_in_wakeup,	/* set while _shm_signal_wakeup() is draining events */
+};
+
+struct shm_signal {
+	atomic_t                    refs;	/* lifetime: shm_signal_get/put */
+	spinlock_t                  lock;	/* protects irq state transitions in enable/wakeup */
+	enum shm_signal_locality    locale;	/* selects "our" desc->irq[] entry */
+	unsigned long               flags;	/* shm_signal_in_wakeup, etc. */
+	struct shm_signal_ops      *ops;	/* boundary-specific implementation */
+	struct shm_signal_desc     *desc;	/* the shared-memory region itself */
+	struct shm_signal_notifier *notifier;	/* local consumer callback (may be NULL) */
+	struct tasklet_struct       deferred_notify;	/* defers wakeup out of shm_signal_enable() context */
+};
+
+#define SHM_SIGNAL_FAULT(s, fmt, args...)  \
+  ((s)->ops->fault ? (s)->ops->fault((s), fmt, ## args) : panic(fmt, ## args))	/* default fault policy is panic() */
+
+ /*
+  * These functions should only be used internally
+  */
+void _shm_signal_release(struct shm_signal *s);	/* called by shm_signal_put() on the final reference */
+void _shm_signal_wakeup(struct shm_signal *s);	/* drains dirty/pending state; runs from the tasklet */
+
+/**
+ * shm_signal_init() - initialize an SHM_SIGNAL
+ * @s:        SHM_SIGNAL context
+ *
+ * Initializes SHM_SIGNAL context before first use
+ *
+ **/
+void shm_signal_init(struct shm_signal *s);
+
+/**
+ * shm_signal_get() - acquire an SHM_SIGNAL context reference
+ * @s:        SHM_SIGNAL context
+ * Returns: @s, for convenient call chaining.
+ **/
+static inline struct shm_signal *shm_signal_get(struct shm_signal *s)
+{
+	atomic_inc(&s->refs);
+
+	return s;
+}
+
+/**
+ * shm_signal_put() - release an SHM_SIGNAL context reference
+ * @s:        SHM_SIGNAL context
+ * The final put invokes ops->release() via _shm_signal_release().
+ **/
+static inline void shm_signal_put(struct shm_signal *s)
+{
+	if (atomic_dec_and_test(&s->refs))
+		_shm_signal_release(s);
+}
+
+/**
+ * shm_signal_enable() - enables local notifications on an SHM_SIGNAL
+ * @s:        SHM_SIGNAL context
+ * @flags:      Reserved for future use, must be 0
+ *
+ * Enables/unmasks the registered notifier (if applicable) to receive wakeups
+ * whenever the remote side performs an shm_signal() operation. A notification
+ * will be dispatched immediately if any pending signals have already been
+ * issued prior to invoking this call.
+ *
+ * This is synonymous with unmasking an interrupt.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int shm_signal_enable(struct shm_signal *s, int flags);
+
+/**
+ * shm_signal_disable() - disable local notifications on an SHM_SIGNAL
+ * @s:        SHM_SIGNAL context
+ * @flags:      Reserved for future use, must be 0
+ *
+ * Disables/masks the registered shm_signal_notifier (if applicable) from
+ * receiving any further notifications.  Any subsequent calls to shm_signal()
+ * by the remote side will update the shm as dirty, but will not traverse the
+ * locale boundary and will not invoke the notifier callback.  Signals
+ * delivered while masked will be deferred until shm_signal_enable() is
+ * invoked.
+ *
+ * This is synonymous with masking an interrupt
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int shm_signal_disable(struct shm_signal *s, int flags);
+
+/**
+ * shm_signal_inject() - notify the remote side about shm changes
+ * @s:        SHM_SIGNAL context
+ * @flags:      Reserved for future use, must be 0
+ *
+ * Marks the shm state as "dirty" and, if enabled, will traverse
+ * a locale boundary to inject a remote notification.  The remote
+ * side controls whether the notification should be delivered via
+ * the shm_signal_enable/disable() interface.
+ *
+ * The specifics of how to traverse a locale boundary are abstracted
+ * by the shm_signal_ops->signal() interface and provided by a particular
+ * implementation.  However, typically going north to south would be
+ * something like a syscall/hypercall, and going south to north would be
+ * something like a posix-signal/guest-interrupt.
+ *
+ * Returns: success = 0, <0 = ERRNO
+ *
+ **/
+int shm_signal_inject(struct shm_signal *s, int flags);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_SHM_SIGNAL_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 03c2c24..32d82fe 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -174,4 +174,14 @@  config DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
        bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
        depends on EXPERIMENTAL && BROKEN
 
+config SHM_SIGNAL
+	boolean "SHM Signal - Generic shared-memory signaling mechanism"
+	default n
+	help
	 Provides a shared-memory based signaling mechanism to indicate
	 memory-dirty notifications between two end-points.
+
+	 If unsure, say N
+
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 32b0e64..bc36327 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -71,6 +71,7 @@  obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 obj-$(CONFIG_SMP) += percpu_counter.o
 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
+obj-$(CONFIG_SHM_SIGNAL) += shm_signal.o
 
 obj-$(CONFIG_SWIOTLB) += swiotlb.o
 obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
diff --git a/lib/shm_signal.c b/lib/shm_signal.c
new file mode 100644
index 0000000..fa1770c
--- /dev/null
+++ b/lib/shm_signal.c
@@ -0,0 +1,186 @@ 
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * See include/linux/shm_signal.h for documentation
+ *
+ * Author:
+ *      Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/shm_signal.h>
+
+int shm_signal_enable(struct shm_signal *s, int flags)
+{
+	struct shm_signal_irq *irq = &s->desc->irq[s->locale];	/* our own side's state */
+	unsigned long iflags;
+
+	spin_lock_irqsave(&s->lock, iflags);
+
+	irq->enabled = 1;
+	wmb();	/* NOTE(review): wmb() orders stores only; the following are loads, so a full mb() may be needed here - confirm */
+
+	if ((irq->dirty || irq->pending)
+	    && !test_bit(shm_signal_in_wakeup, &s->flags)) {	/* skip if a wakeup is already draining */
+		rmb();
+		tasklet_schedule(&s->deferred_notify);	/* deliver signals that arrived while masked */
+	}
+
+	spin_unlock_irqrestore(&s->lock, iflags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(shm_signal_enable);
+
+int shm_signal_disable(struct shm_signal *s, int flags)
+{
+	struct shm_signal_irq *irq = &s->desc->irq[s->locale];
+
+	/* NOTE(review): unlike shm_signal_enable(), s->lock is not taken here - confirm this is intentional */
+	irq->enabled = 0;
+	wmb();	/* make the mask visible to the remote side promptly */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(shm_signal_disable);
+
+/*
+ * signaling protocol:
+ *
+ * each side of the shm_signal has an "irq" structure with the following
+ * fields:
+ *
+ *    - enabled: controlled by shm_signal_enable/disable() to mask/unmask
+ *               the notification locally
+ *    - dirty:   indicates if the shared-memory is dirty or clean.  This
+ *               is updated regardless of the enabled/pending state so that
+ *               the state is always accurately tracked.
+ *    - pending: indicates if a signal is pending to the remote locale.
+ *               This allows us to determine if a remote-notification is
+ *               already in flight to optimize spurious notifications away.
+ */
+int shm_signal_inject(struct shm_signal *s, int flags)
+{
+	/* Load the irq structure from the other locale */
+	struct shm_signal_irq *irq = &s->desc->irq[!s->locale];
+
+	/*
+	 * We always mark the remote side as dirty regardless of whether
+	 * they need to be notified.
+	 */
+	irq->dirty = 1;
+	wmb();   /* dirty must be visible before we test the pending state */
+
+	if (irq->enabled && !irq->pending) {	/* remote unmasked and no notification in flight */
+		rmb();
+
+		/*
+		 * If the remote side has enabled notifications, and we do
+		 * not see a notification pending, we must inject a new one.
+		 */
+		irq->pending = 1;
+		wmb(); /* make it visible before we do the injection */
+
+		s->ops->inject(s);	/* cross the locale boundary (e.g. hypercall/interrupt) */
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(shm_signal_inject);
+
+void _shm_signal_wakeup(struct shm_signal *s)	/* drain dirty/pending, dispatching to s->notifier */
+{
+	struct shm_signal_irq *irq = &s->desc->irq[s->locale];
+	int dirty;
+	unsigned long flags;
+
+	spin_lock_irqsave(&s->lock, flags);
+
+	__set_bit(shm_signal_in_wakeup, &s->flags);	/* keeps shm_signal_enable() from re-scheduling the tasklet */
+
+	/*
+	 * The outer loop protects against race conditions between
+	 * irq->dirty and irq->pending updates
+	 */
+	while (irq->enabled && (irq->dirty || irq->pending)) {
+
+		/*
+		 * Run until we completely exhaust irq->dirty (it may
+		 * be re-dirtied by the remote side while we are in the
+		 * callback).  We let "pending" remain untouched until we have
+		 * processed them all so that the remote side knows we do not
+		 * need a new notification (yet).
+		 */
+		do {
+			irq->dirty = 0;
+			/* the unlock is an implicit wmb() for dirty = 0 */
+			spin_unlock_irqrestore(&s->lock, flags);
+
+			if (s->notifier)
+				s->notifier->signal(s->notifier);	/* callback runs with the lock dropped */
+
+			spin_lock_irqsave(&s->lock, flags);
+			dirty = irq->dirty;
+			rmb();
+
+		} while (irq->enabled && dirty);
+
+		barrier();
+
+		/*
+		 * We can finally acknowledge the notification by clearing
+		 * "pending" after all of the dirty memory has been processed
+		 * Races against this clearing are handled by the outer loop.
+		 * Subsequent iterations of this loop will execute with
+		 * pending=0 potentially leading to future spurious
+		 * notifications, but this is an acceptable tradeoff as this
+		 * will be rare and harmless.
+		 */
+		irq->pending = 0;
+		wmb();
+
+	}
+
+	__clear_bit(shm_signal_in_wakeup, &s->flags);	/* allow shm_signal_enable() to schedule wakeups again */
+	spin_unlock_irqrestore(&s->lock, flags);
+
+}
+EXPORT_SYMBOL_GPL(_shm_signal_wakeup);
+
+void _shm_signal_release(struct shm_signal *s)	/* invoked by shm_signal_put() when the last reference drops */
+{
+	s->ops->release(s);	/* implementation-specific teardown */
+}
+EXPORT_SYMBOL_GPL(_shm_signal_release);
+
+static void
+deferred_notify(unsigned long data)	/* tasklet body: runs the wakeup outside the caller's context */
+{
+	struct shm_signal *s = (struct shm_signal *)data;
+
+	_shm_signal_wakeup(s);
+}
+
+void shm_signal_init(struct shm_signal *s)
+{
+	memset(s, 0, sizeof(*s));	/* zeroes locale/ops/desc/notifier; caller must assign them before use */
+	atomic_set(&s->refs, 1);	/* caller holds the initial reference */
+	spin_lock_init(&s->lock);
+	tasklet_init(&s->deferred_notify,
+		     deferred_notify,
+		     (unsigned long)s);
+}
+EXPORT_SYMBOL_GPL(shm_signal_init);