diff mbox

[RFC,1/2] PCI-Express Non-Transparent Bridge Support

Message ID 1342215900-3358-1-git-send-email-jon.mason@intel.com
State Not Applicable
Headers show

Commit Message

Jon Mason July 13, 2012, 9:44 p.m. UTC
A PCI-Express non-transparent bridge (NTB) is a point-to-point PCIe bus
connecting 2 systems, providing electrical isolation between the two subsystems.
A non-transparent bridge is functionally similar to a transparent bridge except
that both sides of the bridge have their own independent address domains.  The
host on one side of the bridge will not have the visibility of the complete
memory or I/O space on the other side of the bridge.  To communicate across the
non-transparent bridge, each NTB endpoint has one (or more) apertures exposed to
the local system.  Writes to these apertures are mirrored to memory on the
remote system.  Communications can also occur through the use of doorbell
registers that initiate interrupts to the alternate domain, and scratch-pad
registers accessible from both sides.

The NTB device driver is needed to configure these memory windows, doorbell, and
scratch-pad registers as well as use them in such a way as they can be turned
into a viable communication channel to the remote system.  ntb_hw.[ch]
determines the usage model (NTB to NTB or NTB to Root Port) and abstracts away
the underlying hardware to provide access and a common interface to the doorbell
registers, scratch pads, and memory windows.  These hardware interfaces are
exported so that other, non-mainlined kernel drivers can access these.
ntb_transport.[ch] also uses the exported interfaces in ntb_hw.[ch] to setup a
communication channel(s) and provide a reliable way of transferring data from
one side to the other, which it then exports so that "client" drivers can access
them.  These client drivers are used to provide a standard kernel interface
(i.e., Ethernet device) to NTB, such that Linux can transfer data from one
system to the other in a standard way.

Signed-off-by: Jon Mason <jon.mason@intel.com>
---
 MAINTAINERS                 |    6 +
 drivers/Kconfig             |    2 +
 drivers/Makefile            |    1 +
 drivers/ntb/Kconfig         |   13 +
 drivers/ntb/Makefile        |    3 +
 drivers/ntb/ntb_hw.c        | 1283 +++++++++++++++++++++++++++++++++++++++++++
 drivers/ntb/ntb_hw.h        |  115 ++++
 drivers/ntb/ntb_regs.h      |  150 +++++
 drivers/ntb/ntb_transport.c | 1283 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/ntb.h         |   78 +++
 10 files changed, 2934 insertions(+), 0 deletions(-)
 create mode 100644 drivers/ntb/Kconfig
 create mode 100644 drivers/ntb/Makefile
 create mode 100644 drivers/ntb/ntb_hw.c
 create mode 100644 drivers/ntb/ntb_hw.h
 create mode 100644 drivers/ntb/ntb_regs.h
 create mode 100644 drivers/ntb/ntb_transport.c
 create mode 100644 include/linux/ntb.h

Comments

stephen hemminger July 14, 2012, midnight UTC | #1
On Fri, 13 Jul 2012 14:44:59 -0700
Jon Mason <jon.mason@intel.com> wrote:

> A PCI-Express non-transparent bridge (NTB) is a point-to-point PCIe bus
> connecting 2 systems, providing electrical isolation between the two subsystems.
> A non-transparent bridge is functionally similar to a transparent bridge except
> that both sides of the bridge have their own independent address domains.  The
> host on one side of the bridge will not have the visibility of the complete
> memory or I/O space on the other side of the bridge.  To communicate across the
> non-transparent bridge, each NTB endpoint has one (or more) apertures exposed to
> the local system.  Writes to these apertures are mirrored to memory on the
> remote system.  Communications can also occur through the use of doorbell
> registers that initiate interrupts to the alternate domain, and scratch-pad
> registers accessible from both sides.
> 
> The NTB device driver is needed to configure these memory windows, doorbell, and
> scratch-pad registers as well as use them in such a way as they can be turned
> into a viable communication channel to the remote system.  ntb_hw.[ch]
> determines the usage model (NTB to NTB or NTB to Root Port) and abstracts away
> the underlying hardware to provide access and a common interface to the doorbell
> registers, scratch pads, and memory windows.  These hardware interfaces are
> exported so that other, non-mainlined kernel drivers can access these.
> ntb_transport.[ch] also uses the exported interfaces in ntb_hw.[ch] to setup a
> communication channel(s) and provide a reliable way of transferring data from
> one side to the other, which it then exports so that "client" drivers can access
> them.  These client drivers are used to provide a standard kernel interface
> (i.e., Ethernet device) to NTB, such that Linux can transfer data from one
> system to the other in a standard way.
> 
> Signed-off-by: Jon Mason <jon.mason@intel.com>

> +
> +static int max_num_cbs = 2;
> +module_param(max_num_cbs, uint, 0644);
> +MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");

Rather than making it a fixed size, could you dynamically set these up
with rtnl_link_ops?

> +static struct ntb_device *ntbdev;

What about multiple boards in system?

> +/**
> + * ntb_hw_link_status() - return the hardware link status
> + * @ndev: pointer to ntb_device instance
> + *
> + * Returns true if the hardware is connected to the remote system
> + *
> + * RETURNS: true or false based on the hardware link state
> + */
> +bool ntb_hw_link_status(struct ntb_device *ndev)
> +{
> +	return ndev->link_status == NTB_LINK_UP;
> +}
> +EXPORT_SYMBOL(ntb_hw_link_status);

Why isn't this inline in some header?

> +/**
> + * ntb_query_pdev() - return the pci_dev pointer
> + * @ndev: pointer to ntb_device instance
> + *
> + * Given the ntb pointer, return the pci_dev pointer for the NTB hardware device
> + *
> + * RETURNS: a pointer to the ntb pci_dev
> + */
> +struct pci_dev *ntb_query_pdev(struct ntb_device *ndev)
> +{
> +	return ndev->pdev;
> +}
> +EXPORT_SYMBOL(ntb_query_pdev);
> +
> +/**
> + * ntb_query_max_cbs() - return the maximum number of callback tuples
> + * @ndev: pointer to ntb_device instance
> + *
> + * The number of callbacks can vary depending on the platform and MSI-X/MSI
> + * enablement
> + *
> + * RETURNS: the maximum number of callback tuples (3, 15, or 33)
> + */
> +unsigned int ntb_query_max_cbs(struct ntb_device *ndev)
> +{
> +	return ndev->max_cbs > max_num_cbs ? max_num_cbs : ndev->max_cbs;
> +}
> +EXPORT_SYMBOL(ntb_query_max_cbs);
> +
> +/**
> + * ntb_register_event_callback() - register event callback
> + * @ndev: pointer to ntb_device instance
> + * @func: callback function to register
> + *
> + * This function registers a callback for any HW driver events such as link
> + * up/down, power management notices, etc.
> + *
> + * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
> + */
> +int ntb_register_event_callback(struct ntb_device *ndev,
> +				void (*func)(void *handle, unsigned int event))
> +{
> +	if (ndev->event_cb)
> +		return -EINVAL;
> +
> +	ndev->event_cb = func;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(ntb_register_event_callback);
> +
> +/**
> + * ntb_unregister_event_callback() - unregisters the event callback
> + * @ndev: pointer to ntb_device instance
> + *
> + * This function unregisters the existing callback from transport
> + */
> +void ntb_unregister_event_callback(struct ntb_device *ndev)
> +{
> +	ndev->event_cb = NULL;
> +}
> +EXPORT_SYMBOL(ntb_unregister_event_callback);
> +

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger July 14, 2012, 12:13 a.m. UTC | #2
On Fri, 13 Jul 2012 14:44:59 -0700
Jon Mason <jon.mason@intel.com> wrote:

> A PCI-Express non-transparent bridge (NTB) is a point-to-point PCIe bus
> connecting 2 systems, providing electrical isolation between the two subsystems.
> A non-transparent bridge is functionally similar to a transparent bridge except
> that both sides of the bridge have their own independent address domains.  The
> host on one side of the bridge will not have the visibility of the complete
> memory or I/O space on the other side of the bridge.  To communicate across the
> non-transparent bridge, each NTB endpoint has one (or more) apertures exposed to
> the local system.  Writes to these apertures are mirrored to memory on the
> remote system.  Communications can also occur through the use of doorbell
> registers that initiate interrupts to the alternate domain, and scratch-pad
> registers accessible from both sides.
> 
> The NTB device driver is needed to configure these memory windows, doorbell, and
> scratch-pad registers as well as use them in such a way as they can be turned
> into a viable communication channel to the remote system.  ntb_hw.[ch]
> determines the usage model (NTB to NTB or NTB to Root Port) and abstracts away
> the underlying hardware to provide access and a common interface to the doorbell
> registers, scratch pads, and memory windows.  These hardware interfaces are
> exported so that other, non-mainlined kernel drivers can access these.
> ntb_transport.[ch] also uses the exported interfaces in ntb_hw.[ch] to setup a
> communication channel(s) and provide a reliable way of transferring data from
> one side to the other, which it then exports so that "client" drivers can access
> them.  These client drivers are used to provide a standard kernel interface
> (i.e., Ethernet device) to NTB, such that Linux can transfer data from one
> system to the other in a standard way.
> 
> Signed-off-by: Jon Mason <jon.mason@intel.com>

This driver does some reimplementing of standard type operations.  Is this
because you are trying to use the same code on multiple platforms?

Example:
+
+static void ntb_list_add_head(spinlock_t *lock, struct list_head *entry,
+			      struct list_head *list)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(lock, flags);
+	list_add(entry, list);
+	spin_unlock_irqrestore(lock, flags);
+}
+
+static void ntb_list_add_tail(spinlock_t *lock, struct list_head *entry,
+			      struct list_head *list)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(lock, flags);
+	list_add_tail(entry, list);
+	spin_unlock_irqrestore(lock, flags);
+}

Which are used on skb's and yet we already have sk_buff_head with locking?

I know you probably are committed to this API, but is there some way to
reuse existing shared memory used by virtio-net between two ports?


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Mason July 14, 2012, 6:19 a.m. UTC | #3
On Fri, Jul 13, 2012 at 05:13:44PM -0700, Stephen Hemminger wrote:
> On Fri, 13 Jul 2012 14:44:59 -0700
> Jon Mason <jon.mason@intel.com> wrote:
> 
> > A PCI-Express non-transparent bridge (NTB) is a point-to-point PCIe bus
> > connecting 2 systems, providing electrical isolation between the two subsystems.
> > A non-transparent bridge is functionally similar to a transparent bridge except
> > that both sides of the bridge have their own independent address domains.  The
> > host on one side of the bridge will not have the visibility of the complete
> > memory or I/O space on the other side of the bridge.  To communicate across the
> > non-transparent bridge, each NTB endpoint has one (or more) apertures exposed to
> > the local system.  Writes to these apertures are mirrored to memory on the
> > remote system.  Communications can also occur through the use of doorbell
> > registers that initiate interrupts to the alternate domain, and scratch-pad
> > registers accessible from both sides.
> > 
> > The NTB device driver is needed to configure these memory windows, doorbell, and
> > scratch-pad registers as well as use them in such a way as they can be turned
> > into a viable communication channel to the remote system.  ntb_hw.[ch]
> > determines the usage model (NTB to NTB or NTB to Root Port) and abstracts away
> > the underlying hardware to provide access and a common interface to the doorbell
> > registers, scratch pads, and memory windows.  These hardware interfaces are
> > exported so that other, non-mainlined kernel drivers can access these.
> > ntb_transport.[ch] also uses the exported interfaces in ntb_hw.[ch] to setup a
> > communication channel(s) and provide a reliable way of transferring data from
> > one side to the other, which it then exports so that "client" drivers can access
> > them.  These client drivers are used to provide a standard kernel interface
> > (i.e., Ethernet device) to NTB, such that Linux can transfer data from one
> > system to the other in a standard way.
> > 
> > Signed-off-by: Jon Mason <jon.mason@intel.com>
> 
> This driver does some reimplementing of standard type operations.  Is this
> because you are trying to use the same code on multiple platforms?
> 
> Example:
> +
> +static void ntb_list_add_head(spinlock_t *lock, struct list_head *entry,
> +			      struct list_head *list)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(lock, flags);
> +	list_add(entry, list);
> +	spin_unlock_irqrestore(lock, flags);
> +}
> +
> +static void ntb_list_add_tail(spinlock_t *lock, struct list_head *entry,
> +			      struct list_head *list)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(lock, flags);
> +	list_add_tail(entry, list);
> +	spin_unlock_irqrestore(lock, flags);
> +}
> 
> Which are used on skb's and yet we already have sk_buff_head with locking?
> 
> I know you probably are committed to this API, but is there some way to
> reuse existing shared memory used by virtio-net between two ports?
> 
> 
The intention is to be able to have multiple client drivers/virtual devices that are able to use NTB as the transport to the remote system.  This is the reason why a void* is passed into the transport instead of skb*, making all of the extra bookkeeping necessary.  Currently, only the virtual Ethernet has been done, which may be part of the confusion.  I'd like to be able to find a way to have the virtio devices use ntb (and save me the work of reinventing the wheel), but step one is getting this code accepted :)

Thanks,
Jon
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH July 14, 2012, 5:04 p.m. UTC | #4
On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> The NTB device driver is needed to configure these memory windows, doorbell, and
> scratch-pad registers as well as use them in such a way as they can be turned
> into a viable communication channel to the remote system.  ntb_hw.[ch]
> determines the usage model (NTB to NTB or NTB to Root Port) and abstracts away
> the underlying hardware to provide access and a common interface to the doorbell
> registers, scratch pads, and memory windows.  These hardware interfaces are
> exported so that other, non-mainlined kernel drivers can access these.

Why would you have non-mainlined drivers?

Can you submit the drivers at the same time so we see how you are using
these new interfaces?

> +++ b/drivers/ntb/ntb_hw.c
> @@ -0,0 +1,1283 @@
> +/*
> + * This file is provided under a dual BSD/GPLv2 license.  When using or
> + *   redistributing this file, you may do so under either license.
> + *
> + *   GPL LICENSE SUMMARY
> + *
> + *   Copyright(c) 2012 Intel Corporation. All rights reserved.
> + *
> + *   This program is free software; you can redistribute it and/or modify
> + *   it under the terms of version 2 of the GNU General Public License as
> + *   published by the Free Software Foundation.
> + *
> + *   This program is distributed in the hope that it will be useful, but
> + *   WITHOUT ANY WARRANTY; without even the implied warranty of
> + *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + *   General Public License for more details.
> + *
> + *   You should have received a copy of the GNU General Public License
> + *   along with this program; if not, write to the Free Software
> + *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
> + *   The full GNU General Public License is included in this distribution
> + *   in the file called LICENSE.GPL.

You really want to track the office movements of the FSF for the next 40
years?  You should ask your lawyers if you can remove this paragraph.

> +/**
> + * ntb_hw_link_status() - return the hardware link status
> + * @ndev: pointer to ntb_device instance
> + *
> + * Returns true if the hardware is connected to the remote system
> + *
> + * RETURNS: true or false based on the hardware link state
> + */
> +bool ntb_hw_link_status(struct ntb_device *ndev)
> +{
> +	return ndev->link_status == NTB_LINK_UP;
> +}
> +EXPORT_SYMBOL(ntb_hw_link_status);

EXPORT_SYMBOL_GPL() perhaps, for these, and the other symbols you are
creating?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Kroah-Hartman July 14, 2012, 5:10 p.m. UTC | #5
On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> +static int max_num_cbs = 2;
> +module_param(max_num_cbs, uint, 0644);
> +MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");
> +
> +static bool no_msix;
> +module_param(no_msix, bool, 0644);
> +MODULE_PARM_DESC(no_msix, "Do not allow MSI-X interrupts to be selected");

How would a user, or a distro, know to set these options?  Why are they
even options at all?


> +struct ntb_device {
> +	struct pci_dev *pdev;
> +	struct msix_entry *msix_entries;
> +	void __iomem *reg_base;
> +	struct ntb_mw mw[NTB_NUM_MW];
> +	struct {
> +		unsigned int max_spads;
> +		unsigned int max_db_bits;
> +		unsigned int msix_cnt;
> +	} limits;
> +	struct {
> +		void __iomem *pdb;
> +		void __iomem *pdb_mask;
> +		void __iomem *sdb;
> +		void __iomem *sbar2_xlat;
> +		void __iomem *sbar4_xlat;
> +		void __iomem *spad_write;
> +		void __iomem *spad_read;
> +		void __iomem *lnk_cntl;
> +		void __iomem *lnk_stat;
> +		void __iomem *spci_cmd;
> +	} reg_ofs;
> +	void *ntb_transport;
> +	void (*event_cb)(void *handle, unsigned int event);

Shouldn't the event be an enum?

> +	struct ntb_db_cb *db_cb;
> +	unsigned char hw_type;
> +	unsigned char conn_type;
> +	unsigned char dev_type;
> +	unsigned char num_msix;
> +	unsigned char bits_per_vector;
> +	unsigned char max_cbs;
> +	unsigned char link_status;
> +	struct delayed_work hb_timer;
> +	unsigned long last_ts;
> +};

Why isn't this either a 'struct device' itself, or why isn't the 'struct
pci_device' embedded within it?  What controls the lifetime of this
device?  Why doesn't it show up in sysfs?  Don't you want it to show up
in the global device tree?

> +static DEFINE_PCI_DEVICE_TABLE(ntb_pci_tbl) = {
> +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
> +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
> +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_CLASSIC_JSF)},
> +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_RP_JSF)},
> +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_RP_SNB)},
> +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
> +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_CLASSIC_SNB)},
> +	{0}
> +};
> +MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
> +
> +static struct ntb_device *ntbdev;

You can really only have just one of these in the whole system?  Is that
wise?  Why isn't it dynamic and tied to the pci device itself as a
child?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Hagood July 15, 2012, 12:37 p.m. UTC | #6
Have you looked at any of the work that the PXI group has done on NTB
support within PXI?
http://www.ni.com/white-paper/12523/en

I was on that working group, and one of the first capabilities I
suggested for it was IP over NTB - I was going to implement this at my
employer, but the project took a different turn and this idea got put on
the back burner. I am glad somebody else has seen fit to take it up.

Also, a stupid question but one I have to ask: have you taken into
account the idea that the CPUs on each side of the NTB might have
different endian-ness? I was looking at a case with a PPC on one side of
the bridge and an X86 on the other.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Mason July 15, 2012, 11:50 p.m. UTC | #7
On Sat, Jul 14, 2012 at 10:04:11AM -0700, Greg KH wrote:
> On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> > The NTB device driver is needed to configure these memory windows, doorbell, and
> > scratch-pad registers as well as use them in such a way as they can be turned
> > into a viable communication channel to the remote system.  ntb_hw.[ch]
> > determines the usage model (NTB to NTB or NTB to Root Port) and abstracts away
> > the underlying hardware to provide access and a common interface to the doorbell
> > registers, scratch pads, and memory windows.  These hardware interfaces are
> > exported so that other, non-mainlined kernel drivers can access these.
> 
> Why would you have non-mainlined drivers?
> 
> Can you submit the drivers at the same time so we see how you are using
> these new interfaces?

There are none at this time.  In the near future, the transport will be modified to use IOAT instead of the CPU copy to improve throughput performance, and it may be beneficial to have that separate.  If you wish for me to remove the hooks until it is necessary for that, then I can.

> 
> > +++ b/drivers/ntb/ntb_hw.c
> > @@ -0,0 +1,1283 @@
> > +/*
> > + * This file is provided under a dual BSD/GPLv2 license.  When using or
> > + *   redistributing this file, you may do so under either license.
> > + *
> > + *   GPL LICENSE SUMMARY
> > + *
> > + *   Copyright(c) 2012 Intel Corporation. All rights reserved.
> > + *
> > + *   This program is free software; you can redistribute it and/or modify
> > + *   it under the terms of version 2 of the GNU General Public License as
> > + *   published by the Free Software Foundation.
> > + *
> > + *   This program is distributed in the hope that it will be useful, but
> > + *   WITHOUT ANY WARRANTY; without even the implied warranty of
> > + *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + *   General Public License for more details.
> > + *
> > + *   You should have received a copy of the GNU General Public License
> > + *   along with this program; if not, write to the Free Software
> > + *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
> > + *   The full GNU General Public License is included in this distribution
> > + *   in the file called LICENSE.GPL.
> 
> You really want to track the office movements of the FSF for the next 40
> years?  You should ask your lawyers if you can remove this paragraph.

Standard boilerplate junk, but it never hurts to ask.
 
> > +/**
> > + * ntb_hw_link_status() - return the hardware link status
> > + * @ndev: pointer to ntb_device instance
> > + *
> > + * Returns true if the hardware is connected to the remote system
> > + *
> > + * RETURNS: true or false based on the hardware link state
> > + */
> > +bool ntb_hw_link_status(struct ntb_device *ndev)
> > +{
> > +	return ndev->link_status == NTB_LINK_UP;
> > +}
> > +EXPORT_SYMBOL(ntb_hw_link_status);
> 
> EXPORT_SYMBOL_GPL() perhaps, for these, and the other symbols you are
> creating?

Will do.

> 
> thanks,
> 
> greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH July 15, 2012, 11:53 p.m. UTC | #8
On Sun, Jul 15, 2012 at 04:50:41PM -0700, Jon Mason wrote:
> On Sat, Jul 14, 2012 at 10:04:11AM -0700, Greg KH wrote:
> > On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> > > The NTB device driver is needed to configure these memory windows, doorbell, and
> > > scratch-pad registers as well as use them in such a way as they can be turned
> > > into a viable communication channel to the remote system.  ntb_hw.[ch]
> > > determines the usage model (NTB to NTB or NTB to Root Port) and abstracts away
> > > the underlying hardware to provide access and a common interface to the doorbell
> > > registers, scratch pads, and memory windows.  These hardware interfaces are
> > > exported so that other, non-mainlined kernel drivers can access these.
> > 
> > Why would you have non-mainlined drivers?
> > 
> > Can you submit the drivers at the same time so we see how you are using
> > these new interfaces?
> 
> There are none at this time.  In the near future, the transport will
> be modified to use IOAT instead of the CPU copy to improve throughput
> performance, and it may be beneficial to have that separate.  If you
> wish for me to remove the hooks until it is necessary for that, then I
> can.

Yes, please do so, we don't add apis for things that are not in-kernel
as they almost always need to change once we actually get a user of
them, as you know.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Mason July 15, 2012, 11:55 p.m. UTC | #9
On Sat, Jul 14, 2012 at 10:10:15AM -0700, Greg KH wrote:
> On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> > +static int max_num_cbs = 2;
> > +module_param(max_num_cbs, uint, 0644);
> > +MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");
> > +
> > +static bool no_msix;
> > +module_param(no_msix, bool, 0644);
> > +MODULE_PARM_DESC(no_msix, "Do not allow MSI-X interrupts to be selected");
> 
> How would a user, or a distro, know to set these options?  Why are they
> even options at all?

Good question.  There is actually a potential benefit to disabling MSI-X.  The NTB device on one of our platforms only has 3 MSI-X vectors.  In the current driver design, that would limit them to 3 client/virtual devices.  However, there are 15 bits in the ISR that can be used for the same purpose.  So, if you disable MSI-X, you can have 15 instead of 3.

> 
> 
> > +struct ntb_device {
> > +	struct pci_dev *pdev;
> > +	struct msix_entry *msix_entries;
> > +	void __iomem *reg_base;
> > +	struct ntb_mw mw[NTB_NUM_MW];
> > +	struct {
> > +		unsigned int max_spads;
> > +		unsigned int max_db_bits;
> > +		unsigned int msix_cnt;
> > +	} limits;
> > +	struct {
> > +		void __iomem *pdb;
> > +		void __iomem *pdb_mask;
> > +		void __iomem *sdb;
> > +		void __iomem *sbar2_xlat;
> > +		void __iomem *sbar4_xlat;
> > +		void __iomem *spad_write;
> > +		void __iomem *spad_read;
> > +		void __iomem *lnk_cntl;
> > +		void __iomem *lnk_stat;
> > +		void __iomem *spci_cmd;
> > +	} reg_ofs;
> > +	void *ntb_transport;
> > +	void (*event_cb)(void *handle, unsigned int event);
> 
> Shouldn't the event be an enum?

No, that would be too smart.

> 
> > +	struct ntb_db_cb *db_cb;
> > +	unsigned char hw_type;
> > +	unsigned char conn_type;
> > +	unsigned char dev_type;
> > +	unsigned char num_msix;
> > +	unsigned char bits_per_vector;
> > +	unsigned char max_cbs;
> > +	unsigned char link_status;
> > +	struct delayed_work hb_timer;
> > +	unsigned long last_ts;
> > +};
> 
> Why isn't this either a 'struct device' itself, or why isn't the 'struct
> pci_device' embedded within it?  What controls the lifetime of this
> device?  Why doesn't it show up in sysfs?  Don't you want it to show up
> in the global device tree?
> 
> > +static DEFINE_PCI_DEVICE_TABLE(ntb_pci_tbl) = {
> > +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
> > +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
> > +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_CLASSIC_JSF)},
> > +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_RP_JSF)},
> > +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_RP_SNB)},
> > +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
> > +	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_CLASSIC_SNB)},
> > +	{0}
> > +};
> > +MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
> > +
> > +static struct ntb_device *ntbdev;
> 
> You can really only have just one of these in the whole system?  Is that
> wise?  Why isn't it dynamic and tied to the pci device itself as a
> child?

Good point, I will fix that up.

Thanks for the review!

> 
> thanks,
> 
> greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Kroah-Hartman July 16, 2012, 12:19 a.m. UTC | #10
On Sun, Jul 15, 2012 at 04:55:48PM -0700, Jon Mason wrote:
> On Sat, Jul 14, 2012 at 10:10:15AM -0700, Greg KH wrote:
> > On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> > > +static int max_num_cbs = 2;
> > > +module_param(max_num_cbs, uint, 0644);
> > > +MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");
> > > +
> > > +static bool no_msix;
> > > +module_param(no_msix, bool, 0644);
> > > +MODULE_PARM_DESC(no_msix, "Do not allow MSI-X interrupts to be selected");
> > 
> > How would a user, or a distro, know to set these options?  Why are they
> > even options at all?
> 
> Good question.  There is actually a potential benefit to disabling
> MSI-X.  The NTB device on one of our platforms only has 3 MSI-X
> vectors.  In the current driver design, that would limit them to 3
> > client/virtual devices.  However, there are 15 bits in the ISR that can
> be used for the same purpose.  So, if you disable MSI-X, you can have
> 15 instead of 3.  

But again, how would a user, or a distro, know to set these?  Where is
the documentation describing it?  Why really have these options at all
and not just fix the platform issues (only 3 MSI-X vectors?  Really?)

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
chetan L July 16, 2012, 4:49 p.m. UTC | #11
Hi Jon,

On Fri, Jul 13, 2012 at 5:44 PM, Jon Mason <jon.mason@intel.com> wrote:

Just a few minor comments/questions:

....

> +struct ntb_transport_qp {
> +       struct ntb_device *ndev;
> +
> +       bool client_ready;
> +       bool qp_link;
> +       u8 qp_num;      /* Only 64 QP's are allowed.  0-63 */
> +
> +       void (*tx_handler) (struct ntb_transport_qp *qp);
> +       struct tasklet_struct tx_work;

Is it ok to rename the following vars for convenience sake?

> +       struct list_head txq;
tx_pend_q - (pending_queue) or tx_out_q - (outstanding_queue) - or
pick any new string you like - other than a mono-syllable definition

> +       struct list_head txc;
tx_compl_q - completion queue

> +       struct list_head txe;
tx_avail_e - available entry queue


> +       spinlock_t txq_lock;
> +       spinlock_t txc_lock;
> +       spinlock_t txe_lock;

then match the variants accordingly.

> +       struct list_head rxq;
> +       struct list_head rxc;
> +       struct list_head rxe;
> +       spinlock_t rxq_lock;
> +       spinlock_t rxc_lock;
> +       spinlock_t rxe_lock;

similarly the rx-counterpart


..................

> +static void ntb_tx_copy_task(struct ntb_transport_qp *qp,
> +                            struct ntb_queue_entry *entry,
> +                            void *offset)
> +{
> +       struct ntb_payload_header *hdr = offset;
> +       int rc;
> +
> +       offset += sizeof(struct ntb_payload_header);
> +       memcpy_toio(offset, entry->buf, entry->len);
> +
> +       hdr->len = entry->len;
> +       hdr->ver = qp->tx_pkts;
> +
> +       /* Ensure that the data is fully copied out before setting the flag */
> +       wmb();
> +       hdr->flags = entry->flags | DESC_DONE_FLAG;
> +
> +       rc = ntb_ring_sdb(qp->ndev, qp->qp_num);
> +       if (rc)
> +               pr_err("%s: error ringing db %d\n", __func__, qp->qp_num);
> +
> +       if (entry->len > 0) {

how do you interpret this len variable and decide if it's a good/bad completion?

> +               qp->tx_bytes += entry->len;
> +
> +               /* Add fully transmitted data to completion queue */
> +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> +
> +               if (qp->tx_handler)
> +                       qp->tx_handler(qp);
> +       } else
> +               ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);

I could be wrong but how is the original skb handled if the code path
goes in the else clause?
Also, in the else clause, how about a ntb_list_add_head rather than a _tail.

> +
> +static int ntb_process_tx(struct ntb_transport_qp *qp,
> +                         struct ntb_queue_entry *entry)
> +{
> +       struct ntb_payload_header *hdr;
> +       void *offset;
> +
> +       offset = qp->tx_offset;
> +       hdr = offset;
> +
> +       pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
> +                qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
> +                entry->buf);
> +       if (hdr->flags) {
> +               ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
> +               qp->tx_ring_full++;
> +               return -EAGAIN;
> +       }
> +
> +       if (entry->len > transport_mtu) {
> +               pr_err("Trying to send pkt size of %d\n", entry->len);
> +               entry->flags = HW_ERROR_FLAG;
> +
> +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> +
> +               if (qp->tx_handler)
> +                       qp->tx_handler(qp);
> +
> +               return 0;
> +       }
> +
> +       ntb_tx_copy_task(qp, entry, offset);

what happens when ntb_ring_sdb returns an error? would you still want
to increment tx_pkts below?

> +
> +       qp->tx_offset =
> +           (qp->tx_offset +
> +            ((transport_mtu + sizeof(struct ntb_payload_header)) * 2) >=
> +            qp->tx_mw_end) ? qp->tx_mw_begin : qp->tx_offset + transport_mtu +
> +           sizeof(struct ntb_payload_header);
> +
> +       qp->tx_pkts++;
> +
> +       return 0;
> +}
> +

........................


> +void *ntb_transport_tx_dequeue(struct ntb_transport_qp *qp, unsigned int *len)
> +{
> +       struct ntb_queue_entry *entry;
> +       void *buf;
> +
> +       if (!qp)
> +               return NULL;
> +
> +       entry = ntb_list_rm_head(&qp->txc_lock, &qp->txc);
> +       if (!entry)
> +               return NULL;
> +
> +       buf = entry->callback_data;
> +       if (entry->flags != HW_ERROR_FLAG)
> +               *len = entry->len;
> +       else
> +               *len = -EIO;
> +
> +       ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);

how about a ntb_list_add_head?


Chetan Loke
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Mason July 16, 2012, 5:55 p.m. UTC | #12
On Sun, Jul 15, 2012 at 05:19:21PM -0700, Greg KH wrote:
> On Sun, Jul 15, 2012 at 04:55:48PM -0700, Jon Mason wrote:
> > On Sat, Jul 14, 2012 at 10:10:15AM -0700, Greg KH wrote:
> > > On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> > > > +static int max_num_cbs = 2;
> > > > +module_param(max_num_cbs, uint, 0644);
> > > > +MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");
> > > > +
> > > > +static bool no_msix;
> > > > +module_param(no_msix, bool, 0644);
> > > > +MODULE_PARM_DESC(no_msix, "Do not allow MSI-X interrupts to be selected");
> > > 
> > > How would a user, or a distro, know to set these options?  Why are they
> > > even options at all?
> > 
> > Good question.  There is actually a potential benefit to disabling
> > MSI-X.  The NTB device on one of our platforms only has 3 MSI-X
> > vectors.  In the current driver design, that would limit them to 3
> > client/virtual devices.  However, there are 15bits in the ISR that can
> > be used for the same purpose.  So, if you disable MSI-X, you can have
> > 15 instead of 3.  
> 
> But again, how would a user, or a distro, know to set these?  Where is
> the documentation describing it?  Why really have these options at all
> and not just fix the platform issues (only 3 MSI-X vectors?  Really?)

I believe we'll want multiple clients (or have multiqueue Ethernet).  I'm happy to add something to /Documentation to describe it and why it would be useful, or I can remove it and re-introduce it when I add multiqueue Ethernet.

3 MSI-X vectors (plus one for PCI-E link up/down) on Xeon NTB, and 33 for Atom NTB.  Yeah, really.

> 
> thanks,
> 
> greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
chetan L July 16, 2012, 6:26 p.m. UTC | #13
Jon,

On Fri, Jul 13, 2012 at 5:44 PM, Jon Mason <jon.mason@intel.com> wrote:

..............

> +/**
> + * ntb_ring_sdb() - Set the doorbell on the secondary/external side
> + * @ndev: pointer to ntb_device instance
> + * @db: doorbell to ring
> + *
> + * This function allows triggering of a doorbell on the secondary/external
> + * side that will initiate an interrupt on the remote host
> + *
> + * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
> + */
> +int ntb_ring_sdb(struct ntb_device *ndev, unsigned int db)
> +{
> +       dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
> +

> +       if (db >= ndev->max_cbs)
> +               return -EINVAL;

How about moving this max_cbs error check into the upper-level
callers (for example, ntb_process_tx)?
That way you won't have to defer handling some negative cases all the
way till the end.

So ntb_process_tx could now look like:

.....
error=0;
if (entry->len > transport_mtu) {
...
error=1;
}
else if (qp->qp_num >= qp->ndev->max_cbs) {
...
error=1;
}

if (unlikely(error)) {
      ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
      if (qp->tx_handler)
           qp->tx_handler(qp);

      return 0;
}
.................

No further comments below

> +
> +static int ntb_process_tx(struct ntb_transport_qp *qp,
> +                         struct ntb_queue_entry *entry)
> +{
> +       struct ntb_payload_header *hdr;
> +       void *offset;
> +
> +       offset = qp->tx_offset;
> +       hdr = offset;
> +
> +       pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
> +                qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
> +                entry->buf);
> +       if (hdr->flags) {
> +               ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
> +               qp->tx_ring_full++;
> +               return -EAGAIN;
> +       }
> +
> +       if (entry->len > transport_mtu) {
> +               pr_err("Trying to send pkt size of %d\n", entry->len);
> +               entry->flags = HW_ERROR_FLAG;
> +
> +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> +
> +               if (qp->tx_handler)
> +                       qp->tx_handler(qp);
> +
> +               return 0;
> +       }
> +
> +       ntb_tx_copy_task(qp, entry, offset);
> +
> +       qp->tx_offset =
> +           (qp->tx_offset +
> +            ((transport_mtu + sizeof(struct ntb_payload_header)) * 2) >=
> +            qp->tx_mw_end) ? qp->tx_mw_begin : qp->tx_offset + transport_mtu +
> +           sizeof(struct ntb_payload_header);
> +
> +       qp->tx_pkts++;
> +
> +       return 0;
> +}
> +


Chetan Loke
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg Kroah-Hartman July 16, 2012, 6:30 p.m. UTC | #14
On Mon, Jul 16, 2012 at 10:55:06AM -0700, Jon Mason wrote:
> On Sun, Jul 15, 2012 at 05:19:21PM -0700, Greg KH wrote:
> > On Sun, Jul 15, 2012 at 04:55:48PM -0700, Jon Mason wrote:
> > > On Sat, Jul 14, 2012 at 10:10:15AM -0700, Greg KH wrote:
> > > > On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> > > > > +static int max_num_cbs = 2;
> > > > > +module_param(max_num_cbs, uint, 0644);
> > > > > +MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");
> > > > > +
> > > > > +static bool no_msix;
> > > > > +module_param(no_msix, bool, 0644);
> > > > > +MODULE_PARM_DESC(no_msix, "Do not allow MSI-X interrupts to be selected");
> > > > 
> > > > How would a user, or a distro, know to set these options?  Why are they
> > > > even options at all?
> > > 
> > > Good question.  There is actually a potential benefit to disabling
> > > MSI-X.  The NTB device on one of our platforms only has 3 MSI-X
> > > vectors.  In the current driver design, that would limit them to 3
> > > client/virtual devices.  However, there are 15bits in the ISR that can
> > > be used for the same purpose.  So, if you disable MSI-X, you can have
> > > 15 instead of 3.  
> > 
> > But again, how would a user, or a distro, know to set these?  Where is
> > the documentation describing it?  Why really have these options at all
> > and not just fix the platform issues (only 3 MSI-X vectors?  Really?)
> 
> I believe we'll want multiple clients (or have multiqueue Ethernet).
> I'm happy to add something to /Documentation to describe it and why it
> would be useful, or I can remove it and re-introduce it when I add
> multiqueue Ethernet.

I'd suggest waiting and adding it later if really needed (see previous
comment about not adding code/features before they are actually needed.)

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Mason July 16, 2012, 6:38 p.m. UTC | #15
On Mon, Jul 16, 2012 at 12:49:39PM -0400, chetan loke wrote:
> Hi Jon,
> 
> On Fri, Jul 13, 2012 at 5:44 PM, Jon Mason <jon.mason@intel.com> wrote:
> 
> Just a few minor comments/questions:
> 
> ....
> 
> > +struct ntb_transport_qp {
> > +       struct ntb_device *ndev;
> > +
> > +       bool client_ready;
> > +       bool qp_link;
> > +       u8 qp_num;      /* Only 64 QP's are allowed.  0-63 */
> > +
> > +       void (*tx_handler) (struct ntb_transport_qp *qp);
> > +       struct tasklet_struct tx_work;
> 
> Is it ok to rename the following vars for convenience sake?
> 
> > +       struct list_head txq;
> tx_pend_q - (pending_queue) or tx_out_q - (outstanding_queue) - or
> pick any new string you like - other than a mono-syllable definition
> 
> > +       struct list_head txc;
> tx_compl_q - completion queue
> 
> > +       struct list_head txe;
> tx_avail_e - available entry queue
> 
> 
> > +       spinlock_t txq_lock;
> > +       spinlock_t txc_lock;
> > +       spinlock_t txe_lock;
> 
> then match the variants accordingly.
> 
> > +       struct list_head rxq;
> > +       struct list_head rxc;
> > +       struct list_head rxe;
> > +       spinlock_t rxq_lock;
> > +       spinlock_t rxc_lock;
> > +       spinlock_t rxe_lock;
> 
> similarly the rx-counterpart

Are they difficult to understand?  I can change them, but it seems rather moot since you seemed to immediately understand the logic behind the names.

> 
> 
> ..................
> 
> > +static void ntb_tx_copy_task(struct ntb_transport_qp *qp,
> > +                            struct ntb_queue_entry *entry,
> > +                            void *offset)
> > +{
> > +       struct ntb_payload_header *hdr = offset;
> > +       int rc;
> > +
> > +       offset += sizeof(struct ntb_payload_header);
> > +       memcpy_toio(offset, entry->buf, entry->len);
> > +
> > +       hdr->len = entry->len;
> > +       hdr->ver = qp->tx_pkts;
> > +
> > +       /* Ensure that the data is fully copied out before setting the flag */
> > +       wmb();
> > +       hdr->flags = entry->flags | DESC_DONE_FLAG;
> > +
> > +       rc = ntb_ring_sdb(qp->ndev, qp->qp_num);
> > +       if (rc)
> > +               pr_err("%s: error ringing db %d\n", __func__, qp->qp_num);
> > +
> > +       if (entry->len > 0) {
> 
> how do you interpret this len variable and decide if it's a good/bad completion?

The length of 0 is for messages from the remote system, which currently only consists of a "link down" message notifying the local system to no longer send any messages to the remote side.

> 
> > +               qp->tx_bytes += entry->len;
> > +
> > +               /* Add fully transmitted data to completion queue */
> > +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> > +
> > +               if (qp->tx_handler)
> > +                       qp->tx_handler(qp);
> > +       } else
> > +               ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
> 
> I could be wrong but how is the original skb handled if the code path
> goes in the else clause?

There is no skb if the length is zero.  Since the only client is virtual ethernet, it will always be > 60.  However, I should add a sanity check for 0 length in tx_enqueue.

> Also, in the else clause, how about a ntb_list_add_head rather than a _tail.

Why add to the head, it was just used?

> > +
> > +static int ntb_process_tx(struct ntb_transport_qp *qp,
> > +                         struct ntb_queue_entry *entry)
> > +{
> > +       struct ntb_payload_header *hdr;
> > +       void *offset;
> > +
> > +       offset = qp->tx_offset;
> > +       hdr = offset;
> > +
> > +       pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
> > +                qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
> > +                entry->buf);
> > +       if (hdr->flags) {
> > +               ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
> > +               qp->tx_ring_full++;
> > +               return -EAGAIN;
> > +       }
> > +
> > +       if (entry->len > transport_mtu) {
> > +               pr_err("Trying to send pkt size of %d\n", entry->len);
> > +               entry->flags = HW_ERROR_FLAG;
> > +
> > +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> > +
> > +               if (qp->tx_handler)
> > +                       qp->tx_handler(qp);
> > +
> > +               return 0;
> > +       }
> > +
> > +       ntb_tx_copy_task(qp, entry, offset);
> 
> what happens when ntb_sdb_ring returns an error? would you still want
> to increment tx_pkts below?

It's not fatal if the remote side isn't notified.  It will hurt latency, since the next packet would be the one that triggers the next interrupt.  Also, the only way it could ever fail would be if it was an invalid interrupt bit being set, which should never happen.

> > +
> > +       qp->tx_offset =
> > +           (qp->tx_offset +
> > +            ((transport_mtu + sizeof(struct ntb_payload_header)) * 2) >=
> > +            qp->tx_mw_end) ? qp->tx_mw_begin : qp->tx_offset + transport_mtu +
> > +           sizeof(struct ntb_payload_header);
> > +
> > +       qp->tx_pkts++;
> > +
> > +       return 0;
> > +}
> > +
> 
> ........................
> 
> 
> > +void *ntb_transport_tx_dequeue(struct ntb_transport_qp *qp, unsigned int *len)
> > +{
> > +       struct ntb_queue_entry *entry;
> > +       void *buf;
> > +
> > +       if (!qp)
> > +               return NULL;
> > +
> > +       entry = ntb_list_rm_head(&qp->txc_lock, &qp->txc);
> > +       if (!entry)
> > +               return NULL;
> > +
> > +       buf = entry->callback_data;
> > +       if (entry->flags != HW_ERROR_FLAG)
> > +               *len = entry->len;
> > +       else
> > +               *len = -EIO;
> > +
> > +       ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
> 
> how about a ntb_list_add_head?

Is there a benefit to adding to the head versus tail?  It makes more sense to me to pull from the head and add to the tail.

> 
> 
> Chetan Loke
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
chetan L July 16, 2012, 7:27 p.m. UTC | #16
On Mon, Jul 16, 2012 at 2:38 PM, Jon Mason <jon.mason@intel.com> wrote:
> On Mon, Jul 16, 2012 at 12:49:39PM -0400, chetan loke wrote:

....

>> Is it ok to rename the following vars for convenience sake?
>>
>> > +       struct list_head txq;
>> tx_pend_q - (pending_queue) or tx_out_q - (outstanding_queue) - or
>> pick any new string you like - other than a mono-syllable definition
>>
>> > +       struct list_head txc;
>> tx_compl_q - completion queue
>>
>> > +       struct list_head txe;
>> tx_avail_e - available entry queue
>>
>>
>> > +       spinlock_t txq_lock;
>> > +       spinlock_t txc_lock;
>> > +       spinlock_t txe_lock;
>>
>> then match the variants accordingly.
>>
>> > +       struct list_head rxq;
>> > +       struct list_head rxc;
>> > +       struct list_head rxe;
>> > +       spinlock_t rxq_lock;
>> > +       spinlock_t rxc_lock;
>> > +       spinlock_t rxe_lock;
>>
>> similarly the rx-counterpart
>
> Are they difficult to understand?  I can change them, but it seems rather moot since you seemed to immediately understand the logic behind the names.
>

Immediately understand? Not at first glance. I had to re-read the
functions. It really is a minor change, and the variables will then become
self-explanatory. I can almost feel that a developer who works on this
code for the first time might end up choosing the wrong 'syllable' and
locking an entirely different lock.

In fact, add a prefix 'ntb' to all the rx/tx locks. That way grep'ing
the output of lockstat also becomes easier.

It now reads: ntb_tx_pend_lock

>>
>>
>> ..................
>>
>> > +static void ntb_tx_copy_task(struct ntb_transport_qp *qp,
>> > +                            struct ntb_queue_entry *entry,
>> > +                            void *offset)
>> > +{
>> > +       struct ntb_payload_header *hdr = offset;
>> > +       int rc;
>> > +
>> > +       offset += sizeof(struct ntb_payload_header);
>> > +       memcpy_toio(offset, entry->buf, entry->len);
>> > +
>> > +       hdr->len = entry->len;
>> > +       hdr->ver = qp->tx_pkts;
>> > +
>> > +       /* Ensure that the data is fully copied out before setting the flag */
>> > +       wmb();
>> > +       hdr->flags = entry->flags | DESC_DONE_FLAG;
>> > +
>> > +       rc = ntb_ring_sdb(qp->ndev, qp->qp_num);
>> > +       if (rc)
>> > +               pr_err("%s: error ringing db %d\n", __func__, qp->qp_num);
>> > +
>> > +       if (entry->len > 0) {
>>
>> how do you interpret this len variable and decide if it's a good/bad completion?
>
> The length of 0 is for messages from the remote system, which currently only consists of a "link down" message notifying the local system to no longer send any messages to the remote side.
>

Maybe I didn't read the code properly. Is there a length-comment that
explains the above? If not, then just by pure code inspection it would
seem that a skb was leaked. So add the above comment; that way we can
save time for other netdev folks too.


>>
>> > +               qp->tx_bytes += entry->len;
>> > +
>> > +               /* Add fully transmitted data to completion queue */
>> > +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
>> > +
>> > +               if (qp->tx_handler)
>> > +                       qp->tx_handler(qp);
>> > +       } else
>> > +               ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
>>
>> I could be wrong but how is the original skb handled if the code path
>> goes in the else clause?
>
> There is no skb is the length is zero.  Since the only client is virtual ethernet, it will always be > 60.  However, I should add a sanity check for 0 length in tx_enqueue.
>
>> Also, in the else clause, how about a ntb_list_add_head rather than a _tail.
>
> Why add to the head, it was just used?

Yes, just re-use what's hot(best guess).

>
>> > +
>> > +static int ntb_process_tx(struct ntb_transport_qp *qp,
>> > +                         struct ntb_queue_entry *entry)
>> > +{
>> > +       struct ntb_payload_header *hdr;
>> > +       void *offset;
>> > +
>> > +       offset = qp->tx_offset;
>> > +       hdr = offset;
>> > +
>> > +       pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
>> > +                qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
>> > +                entry->buf);
>> > +       if (hdr->flags) {
>> > +               ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
>> > +               qp->tx_ring_full++;
>> > +               return -EAGAIN;
>> > +       }
>> > +
>> > +       if (entry->len > transport_mtu) {
>> > +               pr_err("Trying to send pkt size of %d\n", entry->len);
>> > +               entry->flags = HW_ERROR_FLAG;
>> > +
>> > +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
>> > +
>> > +               if (qp->tx_handler)
>> > +                       qp->tx_handler(qp);
>> > +
>> > +               return 0;
>> > +       }
>> > +
>> > +       ntb_tx_copy_task(qp, entry, offset);
>>
>> what happens when ntb_sdb_ring returns an error? would you still want
>> to increment tx_pkts below?
>
> It's not fatal if the remote side isn't notified.  It will hurt latency, since the next packet would be the one that triggers the next interrupt.  Also, the only way it could ever fail would be if it was an invalid interrupt bit being set, which should never happen.
>

What happens when the 'db >= ndev->max_cbs' check fails? Under what
circumstances will that happen? When it does happen, how does the
remote side then get notified, or should it even get notified?

'which should never happen'? FYI - I have seen and debugged (not this
one, but doorbells and what not) weirdness while working on CLARiiON +
PCIe interconnects. Board bring-up is a PITA. So you get the idea ...

>>
>> > +void *ntb_transport_tx_dequeue(struct ntb_transport_qp *qp, unsigned int *len)
>> > +{
>> > +       struct ntb_queue_entry *entry;
>> > +       void *buf;
>> > +
>> > +       if (!qp)
>> > +               return NULL;
>> > +
>> > +       entry = ntb_list_rm_head(&qp->txc_lock, &qp->txc);
>> > +       if (!entry)
>> > +               return NULL;
>> > +
>> > +       buf = entry->callback_data;
>> > +       if (entry->flags != HW_ERROR_FLAG)
>> > +               *len = entry->len;
>> > +       else
>> > +               *len = -EIO;
>> > +
>> > +       ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
>>
>> how about a ntb_list_add_head?
>
> Is there a benefit to adding to the head versus tail?  It makes more sense to me to pull from the head and add to the tail.
>

Yes, explained above. Today there are 100(..DEF_NUM...) entries.
Tomorrow there could be more. So why not re-use what's hot? You could
also think of this as yet another way of forcing detection of
double-use corruption/bug. I'm not saying that there's a bug here but
you get the idea.

Chetan Loke
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Mason July 17, 2012, 12:23 a.m. UTC | #17
On Mon, Jul 16, 2012 at 03:27:48PM -0400, chetan loke wrote:
> On Mon, Jul 16, 2012 at 2:38 PM, Jon Mason <jon.mason@intel.com> wrote:
> > On Mon, Jul 16, 2012 at 12:49:39PM -0400, chetan loke wrote:
> 
> ....
> 
> >> Is it ok to rename the following vars for convenience sake?
> >>
> >> > +       struct list_head txq;
> >> tx_pend_q - (pending_queue) or tx_out_q - (outstanding_queue) - or
> >> pick any new string you like - other than a mono-syllable definition
> >>
> >> > +       struct list_head txc;
> >> tx_compl_q - completion queue
> >>
> >> > +       struct list_head txe;
> >> tx_avail_e - available entry queue
> >>
> >>
> >> > +       spinlock_t txq_lock;
> >> > +       spinlock_t txc_lock;
> >> > +       spinlock_t txe_lock;
> >>
> >> then match the variants accordingly.
> >>
> >> > +       struct list_head rxq;
> >> > +       struct list_head rxc;
> >> > +       struct list_head rxe;
> >> > +       spinlock_t rxq_lock;
> >> > +       spinlock_t rxc_lock;
> >> > +       spinlock_t rxe_lock;
> >>
> >> similarly the rx-counterpart
> >
> > Are they difficult to understand?  I can change them, but it seems rather moot since you seemed to immediately understand the logic behind the names.
> >
> 
> Immediately understand? Not at first glance. I had to re-read the
> functions. Its really is a minor change and variables will then become
> self-explanatory. I can almost feel that a developer who works on this
> code for the first time might end up choosing the wrong 'syllable' and
> locking an entirely different lock.
> 
> Infact add a prefix 'ntb' to all the rx/tx locks. That way grep'ing
> output of lockstat also becomes easier.
> 
> It now reads: ntb_tx_pend_lock

Makes sense

> 
> >>
> >>
> >> ..................
> >>
> >> > +static void ntb_tx_copy_task(struct ntb_transport_qp *qp,
> >> > +                            struct ntb_queue_entry *entry,
> >> > +                            void *offset)
> >> > +{
> >> > +       struct ntb_payload_header *hdr = offset;
> >> > +       int rc;
> >> > +
> >> > +       offset += sizeof(struct ntb_payload_header);
> >> > +       memcpy_toio(offset, entry->buf, entry->len);
> >> > +
> >> > +       hdr->len = entry->len;
> >> > +       hdr->ver = qp->tx_pkts;
> >> > +
> >> > +       /* Ensure that the data is fully copied out before setting the flag */
> >> > +       wmb();
> >> > +       hdr->flags = entry->flags | DESC_DONE_FLAG;
> >> > +
> >> > +       rc = ntb_ring_sdb(qp->ndev, qp->qp_num);
> >> > +       if (rc)
> >> > +               pr_err("%s: error ringing db %d\n", __func__, qp->qp_num);
> >> > +
> >> > +       if (entry->len > 0) {
> >>
> >> how do you interpret this len variable and decide if it's a good/bad completion?
> >
> > The length of 0 is for messages from the remote system, which currently only consists of a "link down" message notifying the local system to no longer send any messages to the remote side.
> >
> 
> May be I didn't read the code properly. Is there a length-comment that
> explains the above? If not then just by pure code inspection it would
> seem that a skb was leaked. So add the above comment that way we can
> save time for other netdev folks too.

I'll add a comment.

> 
> 
> >>
> >> > +               qp->tx_bytes += entry->len;
> >> > +
> >> > +               /* Add fully transmitted data to completion queue */
> >> > +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> >> > +
> >> > +               if (qp->tx_handler)
> >> > +                       qp->tx_handler(qp);
> >> > +       } else
> >> > +               ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
> >>
> >> I could be wrong but how is the original skb handled if the code path
> >> goes in the else clause?
> >
> > There is no skb is the length is zero.  Since the only client is virtual ethernet, it will always be > 60.  However, I should add a sanity check for 0 length in tx_enqueue.
> >
> >> Also, in the else clause, how about a ntb_list_add_head rather than a _tail.
> >
> > Why add to the head, it was just used?
> 
> Yes, just re-use what's hot(best guess).
> 
> >
> >> > +
> >> > +static int ntb_process_tx(struct ntb_transport_qp *qp,
> >> > +                         struct ntb_queue_entry *entry)
> >> > +{
> >> > +       struct ntb_payload_header *hdr;
> >> > +       void *offset;
> >> > +
> >> > +       offset = qp->tx_offset;
> >> > +       hdr = offset;
> >> > +
> >> > +       pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
> >> > +                qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
> >> > +                entry->buf);
> >> > +       if (hdr->flags) {
> >> > +               ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
> >> > +               qp->tx_ring_full++;
> >> > +               return -EAGAIN;
> >> > +       }
> >> > +
> >> > +       if (entry->len > transport_mtu) {
> >> > +               pr_err("Trying to send pkt size of %d\n", entry->len);
> >> > +               entry->flags = HW_ERROR_FLAG;
> >> > +
> >> > +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> >> > +
> >> > +               if (qp->tx_handler)
> >> > +                       qp->tx_handler(qp);
> >> > +
> >> > +               return 0;
> >> > +       }
> >> > +
> >> > +       ntb_tx_copy_task(qp, entry, offset);
> >>
> >> what happens when ntb_sdb_ring returns an error? would you still want
> >> to increment tx_pkts below?
> >
> > It's not fatal if the remote side isn't notified.  It will hurt latency, since the next packet would be the one that triggers the next interrupt.  Also, the only way it could ever fail would be if it was an invalid interrupt bit being set, which should never happen.
> >
> 
> What happens when the 'db >= ndev->max_cbs' check fails? Under what
> circumstances will that happen? When it does happen how does the
> remote side then gets notified or should it even get notified?

It should never happen, as there are checks when enqueuing that the transport queue is valid.  It is more of a sanity check than anything else.

> 'which should never happen'? FYI - I have seen and debugged(not this
> one but doorbells and what not) weirdness while working on CLARiiON +
> PCie-interconnects. Board bring-up is a PITA. So you get the idea ...

I understand, that is why I had the sanity check code.  Paranoia can be a good thing.
 
> >>
> >> > +void *ntb_transport_tx_dequeue(struct ntb_transport_qp *qp, unsigned int *len)
> >> > +{
> >> > +       struct ntb_queue_entry *entry;
> >> > +       void *buf;
> >> > +
> >> > +       if (!qp)
> >> > +               return NULL;
> >> > +
> >> > +       entry = ntb_list_rm_head(&qp->txc_lock, &qp->txc);
> >> > +       if (!entry)
> >> > +               return NULL;
> >> > +
> >> > +       buf = entry->callback_data;
> >> > +       if (entry->flags != HW_ERROR_FLAG)
> >> > +               *len = entry->len;
> >> > +       else
> >> > +               *len = -EIO;
> >> > +
> >> > +       ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
> >>
> >> how about a ntb_list_add_head?
> >
> > Is there a benefit to adding to the head versus tail?  It makes more sense to me to pull from the head and add to the tail.
> >
> 
> Yes, explained above. Today there are 100(..DEF_NUM...) entries.
> Tomorrow there could be more. So why not re-use what's hot? You could
> also think of this as yet another way of forcing detection of
> double-use corruption/bug. I'm not saying that there's a bug here but
> you get the idea.

I'm fairly sure the list is suboptimal performance-wise, but it is much easier to use when developing.  Perhaps I should use a fixed size array instead.

Thanks for the reviews!

> 
> Chetan Loke
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index d1d9ae6..70d7e0d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4818,6 +4818,12 @@  S:	Maintained
 F:	Documentation/scsi/NinjaSCSI.txt
 F:	drivers/scsi/nsp32*
 
+NTB DRIVER
+M:	Jon Mason <jon.mason@intel.com>
+S:	Supported
+F:	drivers/ntb/
+F:	include/linux/ntb.h
+
 NTFS FILESYSTEM
 M:	Anton Altaparmakov <anton@tuxera.com>
 L:	linux-ntfs-dev@lists.sourceforge.net
diff --git a/drivers/Kconfig b/drivers/Kconfig
index bfc9186..ebc16d3 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -148,4 +148,6 @@  source "drivers/iio/Kconfig"
 
 source "drivers/vme/Kconfig"
 
+source "drivers/ntb/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 2ba29ff..39bba94 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -137,3 +137,4 @@  obj-$(CONFIG_EXTCON)		+= extcon/
 obj-$(CONFIG_MEMORY)		+= memory/
 obj-$(CONFIG_IIO)		+= iio/
 obj-$(CONFIG_VME_BUS)		+= vme/
+obj-$(CONFIG_NTB)		+= ntb/
diff --git a/drivers/ntb/Kconfig b/drivers/ntb/Kconfig
new file mode 100644
index 0000000..f69df793
--- /dev/null
+++ b/drivers/ntb/Kconfig
@@ -0,0 +1,13 @@ 
+config NTB
+       tristate "Intel Non-Transparent Bridge support"
+       depends on PCI
+       depends on X86
+       help
+        The PCI-E Non-transparent bridge hardware is a point-to-point PCI-E bus
+        connecting 2 systems.  When configured, writes to the device's PCI
+        mapped memory will be mirrored to a buffer on the remote system.  The
+        ntb Linux driver uses this point-to-point communication as a method to
+        transfer data from one system to the other.
+
+        If unsure, say N.
+
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
new file mode 100644
index 0000000..0b53393
--- /dev/null
+++ b/drivers/ntb/Makefile
@@ -0,0 +1,3 @@ 
+obj-$(CONFIG_NTB) += ntb.o 
+
+ntb-objs := ntb_hw.o ntb_transport.o
diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c
new file mode 100644
index 0000000..8f46317
--- /dev/null
+++ b/drivers/ntb/ntb_hw.c
@@ -0,0 +1,1283 @@ 
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *   The full GNU General Public License is included in this distribution
+ *   in the file called LICENSE.GPL.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "ntb_hw.h"
+#include "ntb_regs.h"
+
+#define NTB_NAME	"Intel(R) PCI-E Non-Transparent Bridge Driver"
+#define NTB_VER		"0.20"
+
+MODULE_DESCRIPTION(NTB_NAME);
+MODULE_VERSION(NTB_VER);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+/* Upper bound on callback tuples reported by ntb_query_max_cbs().  The
+ * variable type must match the "uint" type given to module_param().
+ */
+static unsigned int max_num_cbs = 2;
+module_param(max_num_cbs, uint, 0644);
+MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");
+
+static bool no_msix;
+module_param(no_msix, bool, 0644);
+MODULE_PARM_DESC(no_msix, "Do not allow MSI-X interrupts to be selected");
+
+/* Connection topology, compared against the connection-type field read from
+ * the PPD config register in the *_setup() routines.  Only NTB_CONN_B2B is
+ * accepted by this file.
+ */
+enum {
+	NTB_CONN_CLASSIC = 0,
+	NTB_CONN_B2B,
+	NTB_CONN_RP,
+};
+
+/* Device type, selected from the PPD device-type bit in the *_setup() routines.
+ * NOTE(review): presumably upstream/downstream device - confirm against the
+ * hardware documentation.
+ */
+enum {
+	NTB_DEV_USD = 0,
+	NTB_DEV_DSD,
+};
+
+/* Hardware family; selects register widths and the interrupt scheme used
+ * throughout this file.
+ */
+enum {
+	SNB_HW = 0,
+	BWD_HW,
+};
+
+/* State of one memory window; window N is backed by BAR MW_TO_BAR(N) */
+struct ntb_mw {
+	/* address last programmed through ntb_set_mw_addr() */
+	dma_addr_t phys_addr;
+	/* write-combined mapping of the window BAR, created at probe time */
+	void __iomem *vbase;
+	/* length of the window BAR as reported by pci_resource_len() */
+	resource_size_t bar_sz;
+};
+
+/* One doorbell callback slot.  A pointer to this structure is used as the
+ * dev_id cookie for the per-doorbell MSI-X handlers.
+ */
+struct ntb_db_cb {
+	/* client handler, NULL while the slot is unregistered */
+	void (*callback) (int db_num);
+	/* index of this slot in ntb_device.db_cb[] */
+	unsigned int db_num;
+	/* owning device, so the handlers can reach the registers */
+	struct ntb_device *ndev;
+};
+
+/* Per-device driver state, allocated zeroed in ntb_pci_probe() */
+struct ntb_device {
+	struct pci_dev *pdev;
+	/* MSI-X vector table, allocated by ntb_setup_msix() */
+	struct msix_entry *msix_entries;
+	/* mapping of the register BAR (NTB_BAR_MMIO) */
+	void __iomem *reg_base;
+	struct ntb_mw mw[NTB_NUM_MW];
+	/* hardware limits, filled in by the *_setup() routines */
+	struct {
+		unsigned int max_spads;
+		unsigned int max_db_bits;
+		unsigned int msix_cnt;
+	} limits;
+	/* precomputed register addresses (reg_base + offset), filled in by the
+	 * *_setup() routines; all NULL until one of them has succeeded
+	 */
+	struct {
+		void __iomem *pdb;
+		void __iomem *pdb_mask;
+		void __iomem *sdb;
+		void __iomem *sbar2_xlat;
+		void __iomem *sbar4_xlat;
+		void __iomem *spad_write;
+		void __iomem *spad_read;
+		void __iomem *lnk_cntl;
+		void __iomem *lnk_stat;
+		void __iomem *spci_cmd;
+	} reg_ofs;
+	/* opaque cookie of the registered transport, passed to event_cb */
+	void *ntb_transport;
+	/* link up/down notification hook, see ntb_link_event() */
+	void (*event_cb)(void *handle, unsigned int event);
+
+	/* array of limits.max_db_bits callback slots */
+	struct ntb_db_cb *db_cb;
+	unsigned char hw_type;
+	unsigned char conn_type;
+	unsigned char dev_type;
+	/* number of MSI-X vectors in use, 0 when MSI or INTx is used */
+	unsigned char num_msix;
+	/* doorbell bits acknowledged per callback slot */
+	unsigned char bits_per_vector;
+	/* number of usable callback slots */
+	unsigned char max_cbs;
+	/* NTB_LINK_UP or NTB_LINK_DOWN, maintained by ntb_link_event() */
+	unsigned char link_status;
+	/* BWD only: periodic link poll, see ntb_handle_heartbeat() */
+	struct delayed_work hb_timer;
+	/* BWD only: jiffies value of the most recent doorbell interrupt */
+	unsigned long last_ts;
+};
+
+/* Translate memory window 0,1 to BAR 2,4.  The argument is parenthesized so
+ * that expressions such as MW_TO_BAR(i + 1) expand correctly.
+ */
+#define MW_TO_BAR(mw)	((mw) * 2 + 2)
+
+/* PCI IDs this driver binds to; the zeroed entry terminates the table.  The
+ * per-ID hardware setup is chosen in ntb_device_setup().
+ */
+static DEFINE_PCI_DEVICE_TABLE(ntb_pci_tbl) = {
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_CLASSIC_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_RP_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_RP_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_CLASSIC_SNB)},
+	{0}
+};
+MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
+
+/* Most recently probed device; this is what ntb_register_transport() hands
+ * out.  NULL until ntb_pci_probe() has run.
+ */
+static struct ntb_device *ntbdev;
+
+/**
+ * ntb_hw_link_status() - return the hardware link status
+ * @ndev: pointer to ntb_device instance
+ *
+ * Returns true if the hardware is connected to the remote system
+ *
+ * RETURNS: true or false based on the hardware link state
+ */
+bool ntb_hw_link_status(struct ntb_device *ndev)
+{
+	/* Reports the cached state kept up to date by ntb_link_event() */
+	if (ndev->link_status != NTB_LINK_UP)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL(ntb_hw_link_status);
+
+/**
+ * ntb_query_pdev() - return the pci_dev pointer
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the pci_dev pointer for the NTB hardware device
+ *
+ * RETURNS: a pointer to the ntb pci_dev
+ */
+struct pci_dev *ntb_query_pdev(struct ntb_device *ndev)
+{
+	struct pci_dev *pci = ndev->pdev;
+
+	return pci;
+}
+EXPORT_SYMBOL(ntb_query_pdev);
+
+/**
+ * ntb_query_max_cbs() - return the maximum number of callback tuples
+ * @ndev: pointer to ntb_device instance
+ *
+ * The number of callbacks can vary depending on the platform and MSI-X/MSI
+ * enablement
+ *
+ * RETURNS: the maximum number of callback tuples (3, 15, or 33)
+ */
+unsigned int ntb_query_max_cbs(struct ntb_device *ndev)
+{
+	/* Clamp what the hardware offers to the max_num_cbs module parameter */
+	if (ndev->max_cbs > max_num_cbs)
+		return max_num_cbs;
+
+	return ndev->max_cbs;
+}
+EXPORT_SYMBOL(ntb_query_max_cbs);
+
+/**
+ * ntb_register_event_callback() - register event callback
+ * @ndev: pointer to ntb_device instance
+ * @func: callback function to register
+ *
+ * This function registers a callback for any HW driver events such as link
+ * up/down, power management notices, etc.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_register_event_callback(struct ntb_device *ndev,
+				void (*func)(void *handle, unsigned int event))
+{
+	/* A single listener is supported; never overwrite an existing one */
+	if (!ndev->event_cb) {
+		ndev->event_cb = func;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ntb_register_event_callback);
+
+/**
+ * ntb_unregister_event_callback() - unregisters the event callback
+ * @ndev: pointer to ntb_device instance
+ *
+ * This function unregisters the existing callback from transport
+ */
+void ntb_unregister_event_callback(struct ntb_device *ndev)
+{
+	/* ntb_link_event() checks this pointer before calling, so clearing it
+	 * is all that is needed to stop further notifications
+	 */
+	ndev->event_cb = NULL;
+}
+EXPORT_SYMBOL(ntb_unregister_event_callback);
+
+/**
+ * ntb_register_db_callback() - register a callback for doorbell interrupt
+ * @ndev: pointer to ntb_device instance
+ * @idx: doorbell index to register callback, zero based
+ * @func: callback function to register
+ *
+ * This function registers a callback function for the doorbell interrupt
+ * on the primary side. The function will unmask the doorbell as well to
+ * allow interrupt.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
+			     void (*func) (int db_num))
+{
+	unsigned long mask;
+
+	/* Reject out-of-range slots and slots that already have a handler */
+	if (idx >= ndev->max_cbs || ndev->db_cb[idx].callback) {
+		dev_warn(&ndev->pdev->dev, "Invalid Index.\n");
+		return -EINVAL;
+	}
+
+	/* Install the handler before unmasking, so the first interrupt
+	 * already finds it
+	 */
+	ndev->db_cb[idx].callback = func;
+
+	/* unmask interrupt */
+	/* NOTE(review): the mask register is accessed 16 bits wide here, while
+	 * the BWD paths elsewhere in this file use writeq() on it - confirm
+	 * that this covers every doorbell bit on BWD.
+	 */
+	mask = readw(ndev->reg_ofs.pdb_mask);
+	clear_bit(idx * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.pdb_mask);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_register_db_callback);
+
+/**
+ * ntb_unregister_db_callback() - unregister a callback for doorbell interrupt
+ * @ndev: pointer to ntb_device instance
+ * @idx: doorbell index of the callback to unregister, zero based
+ *
+ * This function unregisters a callback function for the doorbell interrupt
+ * on the primary side. The function will also mask the said doorbell.
+ */
+void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx)
+{
+	unsigned long mask;
+
+	/* Nothing to do for out-of-range or already empty slots */
+	if (idx >= ndev->max_cbs || !ndev->db_cb[idx].callback)
+		return;
+
+	/* Mask the doorbell first, then drop the handler.
+	 * NOTE(review): 16-bit access, see the matching note on the width of
+	 * this register for BWD - confirm.
+	 */
+	mask = readw(ndev->reg_ofs.pdb_mask);
+	set_bit(idx * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.pdb_mask);
+
+	ndev->db_cb[idx].callback = NULL;
+}
+EXPORT_SYMBOL(ntb_unregister_db_callback);
+
+/**
+ * ntb_register_transport() - Register NTB transport with NTB HW driver
+ * @transport: transport identifier
+ *
+ * This function allows a transport to reserve the hardware driver for
+ * NTB usage.
+ *
+ * RETURNS: pointer to ntb_device, NULL on error.
+ */
+struct ntb_device *ntb_register_transport(void *transport)
+{
+	struct ntb_device *ndev = ntbdev;
+
+	/* ntbdev is only set once a device has been probed; without one there
+	 * is nothing to hand out, so fail instead of dereferencing NULL
+	 */
+	if (!ndev)
+		return NULL;
+
+	/* Only one transport may own the hardware at a time */
+	if (ndev->ntb_transport)
+		return NULL;
+
+	ndev->ntb_transport = transport;
+	return ndev;
+}
+EXPORT_SYMBOL(ntb_register_transport);
+
+/**
+ * ntb_unregister_transport() - Unregister the transport with the NTB HW driver
+ * @ndev: ntb_device of the transport to be freed
+ *
+ * This function unregisters the transport from the HW driver and performs any
+ * necessary cleanups.
+ */
+void ntb_unregister_transport(struct ntb_device *ndev)
+{
+	int db;
+
+	if (ndev->ntb_transport) {
+		/* Drop (and mask) every doorbell the transport may have used */
+		for (db = 0; db < ndev->max_cbs; db++)
+			ntb_unregister_db_callback(ndev, db);
+
+		ntb_unregister_event_callback(ndev);
+
+		/* Release the hardware for the next transport */
+		ndev->ntb_transport = NULL;
+	}
+}
+EXPORT_SYMBOL(ntb_unregister_transport);
+
+/**
+ * ntb_get_max_spads() - get the total scratch regs usable
+ * @ndev: pointer to ntb_device instance
+ *
+ * This function returns the max 32bit scratchpad registers usable by the
+ * upper layer.
+ *
+ * RETURNS: total number of scratch pad registers available
+ */
+int ntb_get_max_spads(struct ntb_device *ndev)
+{
+	/* Set by the hardware specific setup routine */
+	int nr_spads = ndev->limits.max_spads;
+
+	return nr_spads;
+}
+EXPORT_SYMBOL(ntb_get_max_spads);
+
+/**
+ * ntb_write_local_spad() - write to the secondary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register. The register resides on the secondary (external) side.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+{
+	void __iomem *reg;
+
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	/* Scratchpads are consecutive 32-bit registers */
+	reg = ndev->reg_ofs.spad_read + idx * 4;
+
+	dev_dbg(&ndev->pdev->dev, "Writing %x to local scratch pad index %d\n",
+		val, idx);
+	writel(val, reg);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_write_local_spad);
+
+/**
+ * ntb_read_local_spad() - read from the primary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the 32bit scratchpad register on
+ * the primary (internal) side.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+{
+	void __iomem *reg;
+
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	/* Scratchpads are consecutive 32-bit registers */
+	reg = ndev->reg_ofs.spad_write + idx * 4;
+
+	*val = readl(reg);
+	dev_dbg(&ndev->pdev->dev,
+		"Reading %x from local scratch pad index %d\n", *val, idx);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_read_local_spad);
+
+/**
+ * ntb_write_remote_spad() - write to the secondary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register. The register resides on the secondary (external) side.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+{
+	void __iomem *reg;
+
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	/* Scratchpads are consecutive 32-bit registers */
+	reg = ndev->reg_ofs.spad_write + idx * 4;
+
+	dev_dbg(&ndev->pdev->dev, "Writing %x to remote scratch pad index %d\n",
+		val, idx);
+	writel(val, reg);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_write_remote_spad);
+
+/**
+ * ntb_read_remote_spad() - read from the primary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the 32bit scratchpad register on
+ * the primary (internal) side.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+{
+	void __iomem *reg;
+
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	/* Scratchpads are consecutive 32-bit registers */
+	reg = ndev->reg_ofs.spad_read + idx * 4;
+
+	*val = readl(reg);
+	dev_dbg(&ndev->pdev->dev,
+		"Reading %x from remote scratch pad index %d\n", *val, idx);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_read_remote_spad);
+
+/**
+ * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the base virtual address of the memory window
+ * specified.
+ *
+ * RETURNS: pointer to virtual address, or NULL on error.
+ */
+void *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw)
+{
+	/* mw[] has NTB_NUM_MW entries, so NTB_NUM_MW itself is out of range */
+	if (mw >= NTB_NUM_MW)
+		return NULL;
+
+	return ndev->mw[mw].vbase;
+}
+EXPORT_SYMBOL(ntb_get_mw_vbase);
+
+/**
+ * ntb_get_mw_size() - return size of NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the physical size of the memory window specified
+ *
+ * RETURNS: the size of the memory window or zero on error
+ */
+resource_size_t ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw)
+{
+	/* mw[] has NTB_NUM_MW entries, so NTB_NUM_MW itself is out of range */
+	if (mw >= NTB_NUM_MW)
+		return 0;
+
+	return ndev->mw[mw].bar_sz;
+}
+EXPORT_SYMBOL(ntb_get_mw_size);
+
+/**
+ * ntb_set_mw_addr - set the memory window address
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ * @addr: base address for data
+ *
+ * This function sets the base physical address of the memory window.  This
+ * memory address is where data from the remote system will be transferred into
+ * or out of depending on how the transport is configured.
+ */
+void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr)
+{
+	/* mw[] has NTB_NUM_MW entries, so NTB_NUM_MW itself is out of range */
+	if (mw >= NTB_NUM_MW)
+		return;
+
+	dev_dbg(&ndev->pdev->dev, "Writing addr %Lx to BAR %d\n", addr,
+		MW_TO_BAR(mw));
+
+	/* Remember the address and program the matching translation register */
+	ndev->mw[mw].phys_addr = addr;
+
+	switch (MW_TO_BAR(mw)) {
+	case NTB_BAR_23:
+		writeq(addr, ndev->reg_ofs.sbar2_xlat);
+		break;
+	case NTB_BAR_45:
+		writeq(addr, ndev->reg_ofs.sbar4_xlat);
+		break;
+	}
+}
+EXPORT_SYMBOL(ntb_set_mw_addr);
+
+/**
+ * ntb_ring_sdb() - Set the doorbell on the secondary/external side
+ * @ndev: pointer to ntb_device instance
+ * @db: doorbell to ring
+ *
+ * This function allows triggering of a doorbell on the secondary/external
+ * side that will initiate an interrupt on the remote host
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_ring_sdb(struct ntb_device *ndev, unsigned int db)
+{
+	dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
+
+	if (db >= ndev->max_cbs)
+		return -EINVAL;
+
+	if (ndev->hw_type != BWD_HW) {
+		/* Set the whole group of bits that belongs to this doorbell */
+		unsigned int shift = db * ndev->bits_per_vector;
+
+		writew(((1 << ndev->bits_per_vector) - 1) << shift,
+		       ndev->reg_ofs.sdb);
+	} else {
+		/* BWD: one bit per doorbell in a 64-bit register */
+		writeq((u64) 1 << db, ndev->reg_ofs.sdb);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_ring_sdb);
+
+/* Record a link state change and notify the registered event callback.
+ * Does nothing when @link_state matches the cached state.
+ */
+static void ntb_link_event(struct ntb_device *ndev, int link_state)
+{
+	unsigned int event;
+
+	if (ndev->link_status == link_state)
+		return;
+
+	if (link_state == NTB_LINK_UP) {
+		u16 status;
+		int rc = 0;
+
+		dev_info(&ndev->pdev->dev, "Link Up\n");
+		ndev->link_status = NTB_LINK_UP;
+		event = NTB_EVENT_HW_LINK_UP;
+
+		if (ndev->hw_type == BWD_HW)
+			status = readw(ndev->reg_ofs.lnk_stat);
+		else
+			rc = pci_read_config_word(ndev->pdev,
+						  SNB_LINK_STATUS_OFFSET,
+						  &status);
+
+		/* Width and speed are informational only.  The cached state
+		 * has already been switched to up, so a failed read must not
+		 * suppress the notification below; otherwise the upper layer
+		 * would never learn about this transition.
+		 */
+		if (rc)
+			dev_warn(&ndev->pdev->dev,
+				 "Unable to read link width and speed\n");
+		else
+			dev_info(&ndev->pdev->dev,
+				 "Link Width %d, Link Speed %d\n",
+				 (status & NTB_LINK_WIDTH_MASK) >> 4,
+				 (status & NTB_LINK_SPEED_MASK));
+	} else {
+		dev_info(&ndev->pdev->dev, "Link Down\n");
+		ndev->link_status = NTB_LINK_DOWN;
+		event = NTB_EVENT_HW_LINK_DOWN;
+	}
+
+	/* notify the upper layer if we have an event change */
+	if (ndev->event_cb)
+		ndev->event_cb(ndev->ntb_transport, event);
+}
+
+/* Sample the hardware link state and feed it to ntb_link_event().
+ * Returns zero, or the error from the config space read.
+ */
+static int ntb_link_status(struct ntb_device *ndev)
+{
+	int link_state;
+
+	if (ndev->hw_type != BWD_HW) {
+		u16 status;
+		int rc = pci_read_config_word(ndev->pdev,
+					      SNB_LINK_STATUS_OFFSET, &status);
+
+		if (rc)
+			return rc;
+
+		link_state = (status & NTB_LINK_STATUS_ACTIVE) ?
+				NTB_LINK_UP : NTB_LINK_DOWN;
+	} else {
+		u32 ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
+
+		link_state = (ntb_cntl & BWD_CNTL_LINK_DOWN) ?
+				NTB_LINK_DOWN : NTB_LINK_UP;
+	}
+
+	ntb_link_event(ndev, link_state);
+
+	return 0;
+}
+
+/* BWD doesn't have link status interrupt, poll on that platform.  The work
+ * item re-arms itself every NTB_HB_TIMEOUT jiffies.
+ */
+static void ntb_handle_heartbeat(struct work_struct *work)
+{
+	struct ntb_device *ndev = container_of(work, struct ntb_device,
+					       hb_timer.work);
+
+	/* If we haven't gotten an interrupt in a while, check the BWD link
+	 * status bit.  time_after() keeps the comparison correct when jiffies
+	 * wraps around.
+	 */
+	if (time_after(jiffies, ndev->last_ts + NTB_HB_TIMEOUT)) {
+		int rc = ntb_link_status(ndev);
+		if (rc)
+			dev_err(&ndev->pdev->dev,
+				"Error determining link status\n");
+	}
+
+	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+}
+
+/* Hardware setup for the SNB_HW family: decode the PPD config byte, then
+ * fill in the register addresses and limits.
+ * Returns zero, a config space read error, or -EINVAL for any topology
+ * other than B2B.
+ */
+static int ntb_xeon_setup(struct ntb_device *ndev)
+{
+	int rc;
+	u8 val;
+
+	ndev->hw_type = SNB_HW;
+
+	rc = pci_read_config_byte(ndev->pdev, NTB_PPD_OFFSET, &val);
+	if (rc)
+		return rc;
+
+	switch (val & SNB_PPD_CONN_TYPE) {
+	case NTB_CONN_B2B:
+		ndev->conn_type = NTB_CONN_B2B;
+		break;
+	case NTB_CONN_CLASSIC:
+	case NTB_CONN_RP:
+	default:
+		dev_err(&ndev->pdev->dev, "Only B2B supported at this time\n");
+		return -EINVAL;
+	}
+
+	if (val & SNB_PPD_DEV_TYPE)
+		ndev->dev_type = NTB_DEV_DSD;
+	else
+		ndev->dev_type = NTB_DEV_USD;
+
+	ndev->reg_ofs.pdb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
+	ndev->reg_ofs.pdb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
+	ndev->reg_ofs.sbar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
+	ndev->reg_ofs.sbar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
+	ndev->reg_ofs.lnk_cntl = ndev->reg_base + SNB_NTBCNTL_OFFSET;
+	ndev->reg_ofs.lnk_stat = ndev->reg_base + SNB_LINK_STATUS_OFFSET;
+	ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
+	ndev->reg_ofs.spci_cmd = ndev->reg_base + SNB_PCICMD_OFFSET;
+
+	/* The switch above rejects everything but B2B, so the else branch is
+	 * currently unreachable
+	 */
+	if (ndev->conn_type == NTB_CONN_B2B) {
+		ndev->reg_ofs.sdb = ndev->reg_base + SNB_B2B_DOORBELL_OFFSET;
+		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_B2B_SPAD_OFFSET;
+		ndev->limits.max_spads = SNB_MAX_SPADS;
+	} else {
+		ndev->reg_ofs.sdb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
+		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET;
+		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS;
+	}
+
+	ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
+	ndev->limits.msix_cnt = SNB_MSIX_CNT;
+	ndev->bits_per_vector = SNB_DB_BITS_PER_VEC;
+
+	return 0;
+}
+
+/* Hardware setup for the BWD_HW family: decode the PPD config dword, start
+ * link training, fill in the register addresses and limits, and arm the
+ * heartbeat work that polls the link state.
+ * Returns zero, a config space access error, or -EINVAL for any topology
+ * other than B2B.
+ */
+static int ntb_bwd_setup(struct ntb_device *ndev)
+{
+	int rc;
+	u32 val;
+
+	ndev->hw_type = BWD_HW;
+
+	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &val);
+	if (rc)
+		return rc;
+
+	switch ((val & BWD_PPD_CONN_TYPE) >> 8) {
+	case NTB_CONN_B2B:
+		ndev->conn_type = NTB_CONN_B2B;
+		break;
+	case NTB_CONN_RP:
+	default:
+		dev_err(&ndev->pdev->dev, "Only B2B supported at this time\n");
+		return -EINVAL;
+	}
+
+	if (val & BWD_PPD_DEV_TYPE)
+		ndev->dev_type = NTB_DEV_DSD;
+	else
+		ndev->dev_type = NTB_DEV_USD;
+
+	/* Initiate PCI-E link training */
+	rc = pci_write_config_dword(ndev->pdev, NTB_PPD_OFFSET,
+				    val | BWD_PPD_INIT_LINK);
+	if (rc)
+		return rc;
+
+	ndev->reg_ofs.pdb = ndev->reg_base + BWD_PDOORBELL_OFFSET;
+	ndev->reg_ofs.pdb_mask = ndev->reg_base + BWD_PDBMSK_OFFSET;
+	ndev->reg_ofs.sbar2_xlat = ndev->reg_base + BWD_SBAR2XLAT_OFFSET;
+	ndev->reg_ofs.sbar4_xlat = ndev->reg_base + BWD_SBAR4XLAT_OFFSET;
+	ndev->reg_ofs.lnk_cntl = ndev->reg_base + BWD_NTBCNTL_OFFSET;
+	ndev->reg_ofs.lnk_stat = ndev->reg_base + BWD_LINK_STATUS_OFFSET;
+	ndev->reg_ofs.spad_read = ndev->reg_base + BWD_SPAD_OFFSET;
+	ndev->reg_ofs.spci_cmd = ndev->reg_base + BWD_PCICMD_OFFSET;
+
+	/* The switch above rejects everything but B2B, so the else branch is
+	 * currently unreachable
+	 */
+	if (ndev->conn_type == NTB_CONN_B2B) {
+		ndev->reg_ofs.sdb = ndev->reg_base + BWD_B2B_DOORBELL_OFFSET;
+		ndev->reg_ofs.spad_write = ndev->reg_base + BWD_B2B_SPAD_OFFSET;
+		ndev->limits.max_spads = BWD_MAX_SPADS;
+	} else {
+		ndev->reg_ofs.sdb = ndev->reg_base + BWD_PDOORBELL_OFFSET;
+		ndev->reg_ofs.spad_write = ndev->reg_base + BWD_SPAD_OFFSET;
+		ndev->limits.max_spads = BWD_MAX_COMPAT_SPADS;
+	}
+
+	ndev->limits.max_db_bits = BWD_MAX_DB_BITS;
+	ndev->limits.msix_cnt = BWD_MSIX_CNT;
+	ndev->bits_per_vector = BWD_DB_BITS_PER_VEC;
+
+	/* Since bwd doesn't have a link interrupt, setup a heartbeat timer */
+	INIT_DELAYED_WORK(&ndev->hb_timer, ntb_handle_heartbeat);
+	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+
+	return 0;
+}
+
+/* Run the hardware specific setup that matches the PCI device ID and enable
+ * the secondary side of the bridge.
+ * Returns zero on success, -ENODEV for an unknown device ID, or the error
+ * from the hardware specific setup routine.
+ */
+static int __devinit ntb_device_setup(struct ntb_device *ndev)
+{
+	int rc;
+
+	switch (ndev->pdev->device) {
+	case PCI_DEVICE_ID_INTEL_NTB_2ND_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_RP_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_RP_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_CLASSIC_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_CLASSIC_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
+		rc = ntb_xeon_setup(ndev);
+		break;
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_BWD:
+		rc = ntb_bwd_setup(ndev);
+		break;
+	default:
+		rc = -ENODEV;
+		break;
+	}
+
+	/* reg_ofs is only filled in by a successful setup.  On any failure
+	 * spci_cmd is still NULL and must not be written through.
+	 */
+	if (rc)
+		return rc;
+
+	/* Enable Bus Master and Memory Space on the secondary side */
+	writew(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER, ndev->reg_ofs.spci_cmd);
+
+	return 0;
+}
+
+static void ntb_device_free(struct ntb_device *ndev)
+{
+	/* Only the BWD setup path arms the heartbeat work */
+	if (ndev->hw_type != BWD_HW)
+		return;
+
+	cancel_delayed_work_sync(&ndev->hb_timer);
+}
+
+/* Doorbell handler for BWD.  @data is the struct ntb_db_cb of the doorbell.
+ * Also called directly from ntb_interrupt() for each pending doorbell bit.
+ */
+static irqreturn_t bwd_callback_msix_irq(int irq, void *data)
+{
+	struct ntb_db_cb *db_cb = data;
+	struct ntb_device *ndev = db_cb->ndev;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
+		db_cb->db_num);
+
+	/* The slot may be unregistered; the doorbell is still acknowledged */
+	if (db_cb->callback)
+		db_cb->callback(db_cb->db_num);
+
+	/* No need to check for the specific HB irq, any interrupt means
+	 * we're connected.
+	 */
+	ndev->last_ts = jiffies;
+
+	/* Write this doorbell's bit back to the doorbell register */
+	writeq((u64) 1 << db_cb->db_num, ndev->reg_ofs.pdb);
+
+	return IRQ_HANDLED;
+}
+
+/* Doorbell handler for the non-BWD hardware.  @data is the struct ntb_db_cb
+ * of the doorbell.  Also called directly from ntb_interrupt() for each
+ * pending doorbell bit.
+ */
+static irqreturn_t xeon_callback_msix_irq(int irq, void *data)
+{
+	struct ntb_db_cb *db_cb = data;
+	struct ntb_device *ndev = db_cb->ndev;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
+		db_cb->db_num);
+
+	/* The slot may be unregistered; the doorbell is still acknowledged */
+	if (db_cb->callback)
+		db_cb->callback(db_cb->db_num);
+
+	/* On Sandybridge, there are 16 bits in the interrupt register
+	 * but only 4 vectors.  So, 5 bits are assigned to the first 3
+	 * vectors, with the 4th having a single bit for link
+	 * interrupts.
+	 */
+	writew(((1 << ndev->bits_per_vector) - 1) <<
+	       (db_cb->db_num * ndev->bits_per_vector), ndev->reg_ofs.pdb);
+
+	return IRQ_HANDLED;
+}
+
+/* Since we do not have a HW doorbell in BWD, this is only used in JF/JT */
+/* Link event handler.  @dev is the struct ntb_device.  Re-samples the link
+ * state and then writes the link bit back to the doorbell register.
+ */
+static irqreturn_t xeon_event_msix_irq(int irq, void *dev)
+{
+	struct ntb_device *ndev = dev;
+	int rc;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for Events\n", irq);
+
+	rc = ntb_link_status(ndev);
+	if (rc)
+		dev_err(&ndev->pdev->dev, "Error determining link status\n");
+
+	/* bit 15 is always the link bit */
+	writew(1 << ndev->limits.max_db_bits, ndev->reg_ofs.pdb);
+
+	return IRQ_HANDLED;
+}
+
+/* Handler used for MSI and INTx, where one interrupt covers all doorbells.
+ * Reads the pending doorbell bits and dispatches each one to the matching
+ * per-doorbell handler.
+ *
+ * Returns IRQ_HANDLED if at least one doorbell bit was pending, IRQ_NONE
+ * otherwise.  The INTx line is requested with IRQF_SHARED, so an interrupt
+ * raised by another device on the line must not be claimed.
+ */
+static irqreturn_t ntb_interrupt(int irq, void *dev)
+{
+	struct ntb_device *ndev = dev;
+	irqreturn_t ret = IRQ_NONE;
+	unsigned int i = 0;
+
+	if (ndev->hw_type == BWD_HW) {
+		u64 pdb = readq(ndev->reg_ofs.pdb);
+
+		dev_dbg(&ndev->pdev->dev, "irq %d - pdb = %Lx\n", irq, pdb);
+
+		/* Walk the set bits from lowest to highest */
+		while (pdb) {
+			i = __ffs(pdb);
+			pdb &= pdb - 1;
+			bwd_callback_msix_irq(irq, &ndev->db_cb[i]);
+			ret = IRQ_HANDLED;
+		}
+	} else {
+		u16 pdb = readw(ndev->reg_ofs.pdb);
+
+		dev_dbg(&ndev->pdev->dev, "irq %d - pdb = %x sdb %x\n", irq,
+			pdb, readw(ndev->reg_ofs.sdb));
+
+		/* The link bit is handled separately from the doorbells */
+		if (pdb & SNB_DB_HW_LINK) {
+			xeon_event_msix_irq(irq, dev);
+			pdb &= ~SNB_DB_HW_LINK;
+			ret = IRQ_HANDLED;
+		}
+
+		while (pdb) {
+			i = __ffs(pdb);
+			pdb &= pdb - 1;
+			xeon_callback_msix_irq(irq, &ndev->db_cb[i]);
+			ret = IRQ_HANDLED;
+		}
+	}
+
+	return ret;
+}
+
+/* Try to put the device into MSI-X mode and request one interrupt per
+ * vector.  On BWD every vector is a doorbell; on the other hardware the last
+ * vector is the link event and the rest are doorbells.
+ *
+ * On success num_msix and max_cbs are set and zero is returned.  On failure
+ * everything acquired here is released, num_msix is left at zero and a
+ * negative errno is returned so the caller can fall back to MSI or INTx.
+ */
+static int ntb_setup_msix(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	struct msix_entry *msix;
+	int msix_entries;
+	int rc, i, pos;
+	u16 val;
+
+	if (no_msix) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
+	if (!pos) {
+		rc = -EIO;
+		goto err1;
+	}
+
+	rc = pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &val);
+	if (rc)
+		goto err1;
+
+	msix_entries = msix_table_size(val);
+	if (msix_entries > ndev->limits.msix_cnt) {
+		rc = -EINVAL;
+		goto err1;
+	}
+
+	ndev->msix_entries = kcalloc(msix_entries, sizeof(struct msix_entry),
+				     GFP_KERNEL);
+	if (!ndev->msix_entries) {
+		rc = -ENOMEM;
+		goto err1;
+	}
+
+	for (i = 0; i < msix_entries; i++)
+		ndev->msix_entries[i].entry = i;
+
+	rc = pci_enable_msix(pdev, ndev->msix_entries, msix_entries);
+	if (rc < 0)
+		goto err2;
+	if (rc > 0) {
+		/* On SNB, the link interrupt is always tied to 4th vector.  If
+		 * we can't get all 4, then we can't use MSI-X.
+		 */
+		if (ndev->hw_type != BWD_HW) {
+			rc = -EIO;
+			goto err2;
+		}
+
+		dev_warn(&pdev->dev,
+			 "Only %d MSI-X vectors.  Limiting the number of queues to that number.\n",
+			 rc);
+		msix_entries = rc;
+
+		/* A positive return only reports how many vectors could have
+		 * been allocated; nothing has been enabled yet.  Ask again
+		 * for exactly that many.
+		 */
+		rc = pci_enable_msix(pdev, ndev->msix_entries, msix_entries);
+		if (rc) {
+			if (rc > 0)
+				rc = -EIO;
+			goto err2;
+		}
+	}
+
+	for (i = 0; i < msix_entries; i++) {
+		msix = &ndev->msix_entries[i];
+		WARN_ON(!msix->vector);
+
+		/* Use the last MSI-X vector for Link status */
+		if (ndev->hw_type == BWD_HW) {
+			rc = request_irq(msix->vector, bwd_callback_msix_irq, 0,
+					 "ntb-callback-msix", &ndev->db_cb[i]);
+			if (rc)
+				goto err3;
+		} else {
+			if (i == msix_entries - 1) {
+				rc = request_irq(msix->vector,
+						 xeon_event_msix_irq, 0,
+						 "ntb-event-msix", ndev);
+				if (rc)
+					goto err3;
+			} else {
+				rc = request_irq(msix->vector,
+						 xeon_callback_msix_irq, 0,
+						 "ntb-callback-msix",
+						 &ndev->db_cb[i]);
+				if (rc)
+					goto err3;
+			}
+		}
+	}
+
+	ndev->num_msix = msix_entries;
+	if (ndev->hw_type == BWD_HW)
+		ndev->max_cbs = msix_entries;
+	else
+		ndev->max_cbs = msix_entries - 1;
+
+	return 0;
+
+err3:
+	/* Release the vectors requested so far.  num_msix has not been set
+	 * yet, so the local count is used to pick the matching dev_id.
+	 */
+	while (--i >= 0) {
+		msix = &ndev->msix_entries[i];
+		if (ndev->hw_type != BWD_HW && i == msix_entries - 1)
+			free_irq(msix->vector, ndev);
+		else
+			free_irq(msix->vector, &ndev->db_cb[i]);
+	}
+	pci_disable_msix(pdev);
+err2:
+	kfree(ndev->msix_entries);
+	/* do not leave a dangling pointer behind */
+	ndev->msix_entries = NULL;
+err1:
+	dev_err(&pdev->dev, "Error allocating MSI-X interrupt\n");
+err:
+	ndev->num_msix = 0;
+	return rc;
+}
+
+/* Enable plain MSI and route it to ntb_interrupt().  MSI is switched off
+ * again if the interrupt cannot be requested.
+ */
+static int ntb_setup_msi(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int err;
+
+	err = pci_enable_msi(pdev);
+	if (err)
+		return err;
+
+	err = request_irq(pdev->irq, ntb_interrupt, 0, "ntb-msi", ndev);
+	if (!err)
+		return 0;
+
+	pci_disable_msi(pdev);
+	dev_err(&pdev->dev, "Error allocating MSI interrupt\n");
+
+	return err;
+}
+
+/* Last resort: legacy INTx, shared with other devices on the line */
+static int ntb_setup_intx(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+
+	pci_msi_off(pdev);
+
+	/* Verify intx is enabled */
+	pci_intx(pdev, 1);
+
+	return request_irq(pdev->irq, ntb_interrupt, IRQF_SHARED, "ntb-intx",
+			   ndev);
+}
+
+/* Mask the doorbells, then try MSI-X, MSI and INTx in that order.
+ * Returns zero once one of them works, or the INTx error if none does.
+ */
+static int __devinit ntb_setup_interrupts(struct ntb_device *ndev)
+{
+	int rc;
+
+	/* On BWD, disable all interrupts.  On SNB, disable all but Link
+	 * Interrupt.  The rest will be unmasked as callbacks are registered.
+	 */
+	if (ndev->hw_type != BWD_HW)
+		writew(~(1 << ndev->limits.max_db_bits),
+		       ndev->reg_ofs.pdb_mask);
+	else
+		writeq(~0, ndev->reg_ofs.pdb_mask);
+
+	if (!ntb_setup_msix(ndev))
+		return 0;
+
+	/* Without MSI-X a single interrupt serves every doorbell bit */
+	ndev->bits_per_vector = 1;
+	ndev->max_cbs = ndev->limits.max_db_bits;
+
+	if (!ntb_setup_msi(ndev))
+		return 0;
+
+	rc = ntb_setup_intx(ndev);
+	if (rc)
+		dev_err(&ndev->pdev->dev, "no usable interrupts\n");
+
+	return rc;
+}
+
+/* Mask all doorbells and release whichever interrupt scheme is in use */
+static void __devexit ntb_free_interrupts(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	u32 vec;
+
+	/* mask interrupts */
+	if (ndev->hw_type != BWD_HW)
+		writew(~0, ndev->reg_ofs.pdb_mask);
+	else
+		writeq(~0, ndev->reg_ofs.pdb_mask);
+
+	/* MSI or INTx: a single interrupt was requested with ndev as cookie */
+	if (!ndev->num_msix) {
+		free_irq(pdev->irq, ndev);
+
+		if (pci_dev_msi_enabled(pdev))
+			pci_disable_msi(pdev);
+		return;
+	}
+
+	/* MSI-X: the dev_id must match what ntb_setup_msix() passed in */
+	for (vec = 0; vec < ndev->num_msix; vec++) {
+		struct msix_entry *msix = &ndev->msix_entries[vec];
+
+		if (ndev->hw_type != BWD_HW && vec == ndev->num_msix - 1)
+			free_irq(msix->vector, ndev);
+		else
+			free_irq(msix->vector, &ndev->db_cb[vec]);
+	}
+	pci_disable_msix(pdev);
+}
+
+/* Allocate and initialise the doorbell callback slots.
+ * Returns zero, or -ENOMEM if the allocation fails.
+ */
+static int __devinit ntb_create_callbacks(struct ntb_device *ndev)
+{
+	struct ntb_db_cb *slots;
+	int db;
+
+	/* Chicken-and-egg issue.  We won't know how many callbacks are
+	 * necessary until we see how many MSI-X vectors we get, but these
+	 * pointers need to be passed into the MSI-X register function.  So,
+	 * we allocate the max, knowing that they might not all be used, to
+	 * work around this.
+	 */
+	slots = kcalloc(ndev->limits.max_db_bits, sizeof(struct ntb_db_cb),
+			GFP_KERNEL);
+	ndev->db_cb = slots;
+	if (!slots)
+		return -ENOMEM;
+
+	for (db = 0; db < ndev->limits.max_db_bits; db++) {
+		slots[db].ndev = ndev;
+		slots[db].db_num = db;
+	}
+
+	return 0;
+}
+
+/* Unregister every doorbell callback slot, then free the callback table
+ * allocated by ntb_create_callbacks().
+ */
+static void ntb_free_callbacks(struct ntb_device *ndev)
+{
+	int db = 0;
+
+	while (db < ndev->limits.max_db_bits) {
+		ntb_unregister_db_callback(ndev, db);
+		db++;
+	}
+
+	kfree(ndev->db_cb);
+}
+
+/* Probe an NTB PCI function.
+ *
+ * Allocates the device structure, enables the PCI device, claims and maps
+ * the register BAR and the memory-window BARs, selects a DMA mask (64-bit
+ * with 32-bit fallback), runs the hardware-specific setup, creates the
+ * doorbell callback table, sets up interrupts, clears the scratchpads and
+ * finally writes the link control register to bring the link up.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything acquired
+ * so far is released, and neither the global ntbdev pointer nor the PCI
+ * drvdata is left pointing at the freed structure.
+ */
+static int __devinit
+ntb_pci_probe(struct pci_dev *pdev,
+	      __attribute__((unused)) const struct pci_device_id *id)
+{
+	struct ntb_device *ndev;
+	int rc, i;
+
+	ndev = kzalloc(sizeof(struct ntb_device), GFP_KERNEL);
+	if (!ndev)
+		return -ENOMEM;
+
+	ntbdev = ndev;
+	ndev->pdev = pdev;
+	ndev->link_status = NTB_LINK_DOWN;
+	pci_set_drvdata(pdev, ndev);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		goto err;
+
+	pci_set_master(ndev->pdev);
+
+	rc = pci_request_selected_regions(pdev, NTB_BAR_MASK, KBUILD_MODNAME);
+	if (rc)
+		goto err1;
+
+	ndev->reg_base = pci_ioremap_bar(pdev, NTB_BAR_MMIO);
+	if (!ndev->reg_base) {
+		dev_warn(&pdev->dev, "Cannot remap BAR 0\n");
+		rc = -EIO;
+		goto err2;
+	}
+
+	/* Map each memory window write-combined.  If one mapping fails, "i"
+	 * is the index of the failed window, which is what the unwind loop
+	 * at err3 relies on.
+	 */
+	for (i = 0; i < NTB_NUM_MW; i++) {
+		ndev->mw[i].bar_sz = pci_resource_len(pdev, MW_TO_BAR(i));
+		ndev->mw[i].vbase =
+		    ioremap_wc(pci_resource_start(pdev, MW_TO_BAR(i)),
+			       ndev->mw[i].bar_sz);
+		/* BAR lengths may exceed 32 bits; print the full value */
+		dev_info(&pdev->dev, "MW %d size %llu\n", i,
+			 (unsigned long long) pci_resource_len(pdev,
+							       MW_TO_BAR(i)));
+		if (!ndev->mw[i].vbase) {
+			dev_warn(&pdev->dev, "Cannot remap BAR %d\n",
+				 MW_TO_BAR(i));
+			rc = -EIO;
+			goto err3;
+		}
+	}
+
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err3;
+
+		dev_warn(&pdev->dev, "Cannot DMA highmem\n");
+	}
+
+	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err3;
+
+		dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n");
+	}
+
+	rc = ntb_device_setup(ndev);
+	if (rc)
+		goto err3;
+
+	rc = ntb_create_callbacks(ndev);
+	if (rc)
+		goto err4;
+
+	rc = ntb_setup_interrupts(ndev);
+	if (rc)
+		goto err5;
+
+	/* The scratchpad registers keep the values between rmmod/insmod,
+	 * blast them now
+	 */
+	for (i = 0; i < ndev->limits.max_spads; i++) {
+		ntb_write_local_spad(ndev, i, 0);
+		ntb_write_remote_spad(ndev, i, 0);
+	}
+
+	/* Let's bring the NTB link up */
+	writel(NTB_CNTL_BAR23_SNOOP | NTB_CNTL_BAR45_SNOOP,
+	       ndev->reg_ofs.lnk_cntl);
+
+	return 0;
+
+err5:
+	ntb_free_callbacks(ndev);
+err4:
+	ntb_device_free(ndev);
+err3:
+	/* Unmap only the windows that were successfully mapped */
+	for (i--; i >= 0; i--)
+		iounmap(ndev->mw[i].vbase);
+	iounmap(ndev->reg_base);
+err2:
+	pci_release_selected_regions(pdev, NTB_BAR_MASK);
+err1:
+	pci_disable_device(pdev);
+err:
+	/* Do not leave stale pointers to the structure about to be freed */
+	if (ntbdev == ndev)
+		ntbdev = NULL;
+	pci_set_drvdata(pdev, NULL);
+	kfree(ndev);
+
+	dev_err(&pdev->dev, "Error loading %s module\n", KBUILD_MODNAME);
+	return rc;
+}
+
+/* Tear down an NTB PCI function.
+ *
+ * Disables the link, then releases resources in the reverse order of
+ * ntb_pci_probe(): interrupts, callbacks, hardware-specific state, BAR
+ * mappings, PCI regions and the device structure.  The global ntbdev
+ * pointer and the PCI drvdata are cleared so that nothing can reach the
+ * freed structure afterwards.
+ */
+static void __devexit ntb_pci_remove(struct pci_dev *pdev)
+{
+	struct ntb_device *ndev = pci_get_drvdata(pdev);
+	int i;
+	u32 ntb_cntl;
+
+	/* Bring NTB link down */
+	ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
+	ntb_cntl |= NTB_LINK_DISABLE;
+	writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+
+	ntb_free_interrupts(ndev);
+	ntb_free_callbacks(ndev);
+	ntb_device_free(ndev);
+
+	for (i = 0; i < NTB_NUM_MW; i++)
+		iounmap(ndev->mw[i].vbase);
+
+	iounmap(ndev->reg_base);
+	pci_release_selected_regions(pdev, NTB_BAR_MASK);
+	pci_disable_device(pdev);
+
+	/* Drop every reference to ndev before freeing it */
+	if (ntbdev == ndev)
+		ntbdev = NULL;
+	pci_set_drvdata(pdev, NULL);
+	kfree(ndev);
+}
+
+/* PCI driver description: binds ntb_pci_probe()/ntb_pci_remove() to the
+ * devices listed in ntb_pci_tbl, under the module's own name.
+ */
+static struct pci_driver ntb_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = ntb_pci_tbl,
+	.probe = ntb_pci_probe,
+	.remove = __devexit_p(ntb_pci_remove),
+};
+
+/* Module entry point: log the driver name and version, then register the
+ * PCI driver.  Returns whatever pci_register_driver() returns.
+ */
+static int __init ntb_init_module(void)
+{
+	int rc;
+
+	pr_info("%s: %s, version %s\n", KBUILD_MODNAME, NTB_NAME, NTB_VER);
+
+	rc = pci_register_driver(&ntb_pci_driver);
+
+	return rc;
+}
+module_init(ntb_init_module);
+
+/* Module exit point: unregister the PCI driver and log the removal. */
+static void __exit ntb_exit_module(void)
+{
+	pci_unregister_driver(&ntb_pci_driver);
+
+	pr_info("%s: Driver removed\n", KBUILD_MODNAME);
+}
+module_exit(ntb_exit_module);
diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h
new file mode 100644
index 0000000..4cad371
--- /dev/null
+++ b/drivers/ntb/ntb_hw.h
@@ -0,0 +1,115 @@ 
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *   The full GNU General Public License is included in this distribution
+ *   in the file called LICENSE.GPL.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+
+/* PCI device IDs of Intel NTB hardware, named after hardware family
+ * (JSF, SNB, BWD) and connection type (B2B, CLASSIC, RP, 2ND).
+ * NOTE(review): presumably these populate the driver's PCI id table -
+ * confirm against ntb_pci_tbl.
+ */
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF		0x3725
+#define PCI_DEVICE_ID_INTEL_NTB_CLASSIC_JSF	0x3726
+#define PCI_DEVICE_ID_INTEL_NTB_RP_JSF		0x3727
+#define PCI_DEVICE_ID_INTEL_NTB_RP_SNB		0x3C08
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB		0x3C0D
+#define PCI_DEVICE_ID_INTEL_NTB_CLASSIC_SNB	0x3C0E
+#define PCI_DEVICE_ID_INTEL_NTB_2ND_SNB		0x3C0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD		0x0C4E
+
+/* Number of MSI-X table entries encoded in an MSI-X control word: the
+ * PCI_MSIX_FLAGS_QSIZE field plus one.  The argument is parenthesised so
+ * that compound expressions are masked as a whole.
+ */
+#define msix_table_size(control)	(((control) & PCI_MSIX_FLAGS_QSIZE) + 1)
+
+/* BAR numbers used by the driver.  NTB_BAR_MMIO is the register BAR mapped
+ * at probe time; NTB_BAR_MASK is the bitmask of all three BARs handed to
+ * pci_request_selected_regions()/pci_release_selected_regions().
+ */
+#define NTB_BAR_MMIO		0
+#define NTB_BAR_23		2
+#define NTB_BAR_45		4
+#define NTB_BAR_MASK		((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
+				 (1 << NTB_BAR_45))
+
+/* Link state values, used for both the hardware and the transport link */
+#define NTB_LINK_DOWN		0
+#define NTB_LINK_UP		1
+
+/* One second, expressed in jiffies.
+ * NOTE(review): the name suggests a heartbeat timeout - confirm at the user.
+ */
+#define NTB_HB_TIMEOUT		msecs_to_jiffies(1000)
+
+/* Number of memory windows handled per device */
+#define NTB_NUM_MW		2
+
+/* Opaque to users of this header; defined by the hardware layer */
+struct ntb_device;
+
+/* Event codes passed as the "event" argument of the callback registered
+ * with ntb_register_event_callback().  Each code occupies its own bit.
+ */
+enum {
+	NTB_EVENT_SW_EVENT0	= (1 << 0),
+	NTB_EVENT_SW_EVENT1	= (1 << 1),
+	NTB_EVENT_SW_EVENT2	= (1 << 2),
+	NTB_EVENT_HW_ERROR	= (1 << 3),
+	NTB_EVENT_HW_LINK_UP	= (1 << 4),
+	NTB_EVENT_HW_LINK_DOWN	= (1 << 5),
+};
+
+/* Interface exported by the hardware layer (ntb_hw.c) to transport code.
+ *
+ * It covers: link status and device queries, transport registration,
+ * memory-window programming and lookup, doorbell and event callback
+ * registration, local/remote scratchpad access, and ringing a doorbell.
+ * NOTE(review): the int-returning helpers are treated by callers as
+ * "0 on success, non-zero on error" - confirm against the implementation.
+ */
+bool ntb_hw_link_status(struct ntb_device *ndev);
+struct pci_dev *ntb_query_pdev(struct ntb_device *ndev);
+unsigned int ntb_query_max_cbs(struct ntb_device *ndev);
+struct ntb_device *ntb_register_transport(void *transport);
+void ntb_unregister_transport(struct ntb_device *ndev);
+void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr);
+int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
+			     void (*db_cb_func) (int db_num));
+void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx);
+int ntb_register_event_callback(struct ntb_device *ndev,
+				void (*event_cb_func) (void *handle,
+						       unsigned int event));
+void ntb_unregister_event_callback(struct ntb_device *ndev);
+int ntb_get_max_spads(struct ntb_device *ndev);
+int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
+int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
+int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
+int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
+void *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
+resource_size_t ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
+int ntb_ring_sdb(struct ntb_device *ndev, unsigned int idx);
diff --git a/drivers/ntb/ntb_regs.h b/drivers/ntb/ntb_regs.h
new file mode 100644
index 0000000..c7b8a24
--- /dev/null
+++ b/drivers/ntb/ntb_regs.h
@@ -0,0 +1,150 @@ 
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *   The full GNU General Public License is included in this distribution
+ *   in the file called LICENSE.GPL.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+
+/* Link control and link status bit values.  NTB_LINK_DISABLE is OR-ed into
+ * the link control register when the driver is removed.
+ * NOTE(review): the STATUS/SPEED/WIDTH masks presumably apply to the link
+ * status register - confirm at their users.
+ */
+#define NTB_LINK_ENABLE		0x0000
+#define NTB_LINK_DISABLE	0x0002
+#define NTB_LINK_STATUS_ACTIVE	0x2000
+#define NTB_LINK_SPEED_MASK	0x000f
+#define NTB_LINK_WIDTH_MASK	0x03f0
+
+/* Limits for SNB-type hardware: MSI-X vector count, scratchpad counts and
+ * doorbell layout.
+ */
+#define SNB_MSIX_CNT		4
+#define SNB_MAX_SPADS		16
+#define SNB_MAX_COMPAT_SPADS	8
+/* Reserve the uppermost bit for link interrupt */
+#define SNB_MAX_DB_BITS		15
+#define SNB_DB_BITS_PER_VEC	5
+
+/* Bit 15 of the doorbell: the bit reserved above for the link interrupt */
+#define SNB_DB_HW_LINK		0x8000
+
+/* SNB register offsets.
+ * NOTE(review): presumably relative to the mapped register BAR - confirm
+ * where reg_ofs is filled in.
+ */
+#define SNB_PCICMD_OFFSET	0x0504
+#define SNB_DEVCTRL_OFFSET	0x0598
+#define SNB_LINK_STATUS_OFFSET	0x01A2
+
+#define SNB_PBAR2LMT_OFFSET	0x0000
+#define SNB_PBAR4LMT_OFFSET	0x0008
+#define SNB_PBAR2XLAT_OFFSET	0x0010
+#define SNB_PBAR4XLAT_OFFSET	0x0018
+#define SNB_SBAR2LMT_OFFSET	0x0020
+#define SNB_SBAR4LMT_OFFSET	0x0028
+#define SNB_SBAR2XLAT_OFFSET	0x0030
+#define SNB_SBAR4XLAT_OFFSET	0x0038
+#define SNB_SBAR0BASE_OFFSET	0x0040
+#define SNB_SBAR2BASE_OFFSET	0x0048
+#define SNB_SBAR4BASE_OFFSET	0x0050
+#define SNB_NTBCNTL_OFFSET	0x0058
+#define SNB_SBDF_OFFSET		0x005C
+#define SNB_PDOORBELL_OFFSET	0x0060
+#define SNB_PDBMSK_OFFSET	0x0062
+#define SNB_SDOORBELL_OFFSET	0x0064
+#define SNB_SDBMSK_OFFSET	0x0066
+#define SNB_USMEMMISS		0x0070
+#define SNB_SPAD_OFFSET		0x0080
+#define SNB_SPADSEMA4_OFFSET	0x00c0
+#define SNB_WCCNTRL_OFFSET	0x00e0
+#define SNB_B2B_SPAD_OFFSET	0x0100
+#define SNB_B2B_DOORBELL_OFFSET	0x0140
+#define SNB_B2B_XLAT_OFFSET	0x0144
+
+/* Limits for BWD-type hardware: MSI-X vector count, scratchpad counts and
+ * doorbell layout (one doorbell bit per vector).
+ */
+#define BWD_MSIX_CNT		34
+#define BWD_MAX_SPADS		16
+#define BWD_MAX_COMPAT_SPADS	16
+#define BWD_MAX_DB_BITS		34
+#define BWD_DB_BITS_PER_VEC	1
+
+/* BWD register offsets.
+ * NOTE(review): presumably relative to the mapped register BAR - confirm
+ * where reg_ofs is filled in.
+ */
+#define BWD_PCICMD_OFFSET	0xb004
+#define BWD_MBAR23_OFFSET	0xb018
+#define BWD_MBAR45_OFFSET	0xb020
+#define BWD_DEVCTRL_OFFSET	0xb048
+#define BWD_LINK_STATUS_OFFSET	0xb052
+
+#define BWD_SBAR2XLAT_OFFSET	0x0008
+#define BWD_SBAR4XLAT_OFFSET	0x0010
+#define BWD_PDOORBELL_OFFSET	0x0020
+#define BWD_PDBMSK_OFFSET	0x0028
+#define BWD_NTBCNTL_OFFSET	0x0060
+#define BWD_EBDF_OFFSET		0x0064
+#define BWD_SPAD_OFFSET		0x0080
+#define BWD_SPADSEMA_OFFSET	0x00c0
+#define BWD_STKYSPAD_OFFSET	0x00c4
+#define BWD_PBAR2XLAT_OFFSET	0x8008
+#define BWD_PBAR4XLAT_OFFSET	0x8010
+#define BWD_B2B_DOORBELL_OFFSET	0x8020
+#define BWD_B2B_SPAD_OFFSET	0x8080
+#define BWD_B2B_SPADSEMA_OFFSET	0x80c0
+#define BWD_B2B_STKYSPAD_OFFSET	0x80c4
+
+/* Bits of the NTB control register.  The two SNOOP bits are what the probe
+ * routine writes to the link control register to bring the link up.
+ */
+#define NTB_CNTL_BAR23_SNOOP	(1 << 2)
+#define NTB_CNTL_BAR45_SNOOP	(1 << 6)
+#define BWD_CNTL_LINK_DOWN	(1 << 16)
+
+/* PPD register offset and field masks.
+ * NOTE(review): judging by the names these select connection type, device
+ * type and link init - confirm where the PPD register is decoded.
+ */
+#define NTB_PPD_OFFSET		0x00D4
+#define SNB_PPD_CONN_TYPE	0x0003
+#define SNB_PPD_DEV_TYPE	0x0010
+#define BWD_PPD_INIT_LINK	0x0004
+#define BWD_PPD_CONN_TYPE	0x0300
+#define BWD_PPD_DEV_TYPE	0x1000
+
+/* 64-bit BWD translation and BAR address constants for the USD and DSD
+ * sides (as named by the macros).  The ULL suffix makes each constant an
+ * unsigned 64-bit value on every architecture instead of depending on the
+ * width of long.
+ */
+#define BWD_PBAR2XLAT_USD_ADDR	0x0000004000000000ULL
+#define BWD_PBAR4XLAT_USD_ADDR	0x0000008000000000ULL
+#define BWD_MBAR23_USD_ADDR	0x000000410000000CULL
+#define BWD_MBAR45_USD_ADDR	0x000000810000000CULL
+#define BWD_PBAR2XLAT_DSD_ADDR	0x0000004100000000ULL
+#define BWD_PBAR4XLAT_DSD_ADDR	0x0000008100000000ULL
+#define BWD_MBAR23_DSD_ADDR	0x000000400000000CULL
+#define BWD_MBAR45_DSD_ADDR	0x000000800000000CULL
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
new file mode 100644
index 0000000..88ab23a
--- /dev/null
+++ b/drivers/ntb/ntb_transport.c
@@ -0,0 +1,1283 @@ 
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *   The full GNU General Public License is included in this distribution
+ *   in the file called LICENSE.GPL.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include "ntb_hw.h"
+
+/* Maximum payload size, in bytes, of one transport packet.  Together with
+ * the payload header it is the stride between packet slots in a receive
+ * buffer.  Declared unsigned so that the variable's type matches the "uint"
+ * type given to module_param().
+ */
+static unsigned int transport_mtu = 0x4014;
+module_param(transport_mtu, uint, 0644);
+MODULE_PARM_DESC(transport_mtu, "Maximum size of NTB transport packets");
+
+/* One buffer descriptor, linked into one of a queue pair's rx/tx lists */
+struct ntb_queue_entry {
+	/* ntb_queue list reference */
+	struct list_head entry;
+	/* pointers to data to be transferred */
+	void *callback_data;
+	void *buf;
+	/* on receive, set to the length taken from the payload header */
+	unsigned int len;
+	unsigned int flags;
+};
+
+/* State of a single transport queue pair (QP): link state, the tx and rx
+ * halves with their lists and locks, the window slices the QP uses, debugfs
+ * handles and statistics.
+ */
+struct ntb_transport_qp {
+	struct ntb_device *ndev;
+
+	/* checked before QP link work is scheduled and before rx_handler runs */
+	bool client_ready;
+	/* NTB_LINK_UP / NTB_LINK_DOWN for this QP */
+	bool qp_link;
+	u8 qp_num;	/* Only 64 QP's are allowed.  0-63 */
+
+	/* Transmit side.
+	 * NOTE(review): the q/c/e list suffixes are not explained in the
+	 * visible code; presumably queued/completed/empty - confirm.
+	 */
+	void (*tx_handler) (struct ntb_transport_qp *qp);
+	struct tasklet_struct tx_work;
+	struct list_head txq;
+	struct list_head txc;
+	struct list_head txe;
+	spinlock_t txq_lock;
+	spinlock_t txc_lock;
+	spinlock_t txe_lock;
+	/* this QP's slice of the mapped memory window; tx_offset is the cursor */
+	void *tx_mw_begin;
+	void *tx_mw_end;
+	void *tx_offset;
+
+	/* Receive side; received entries are appended to rxc */
+	void (*rx_handler) (struct ntb_transport_qp *qp);
+	struct tasklet_struct rx_work;
+	struct list_head rxq;
+	struct list_head rxc;
+	struct list_head rxe;
+	spinlock_t rxq_lock;
+	spinlock_t rxc_lock;
+	spinlock_t rxe_lock;
+	/* this QP's slice of the local receive buffer; rx_offset is the cursor */
+	void *rx_buff_begin;
+	void *rx_buff_end;
+	void *rx_offset;
+
+	/* called with NTB_LINK_UP / NTB_LINK_DOWN on QP link changes */
+	void (*event_handler) (int status);
+	/* runs ntb_qp_link_work() */
+	struct delayed_work link_work;
+
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_stats;
+
+	/* Stats */
+	u64 rx_bytes;
+	u64 rx_pkts;
+	u64 rx_ring_empty;
+	u64 rx_err_no_buf;
+	u64 rx_err_oflow;
+	u64 rx_err_ver;
+	u64 tx_bytes;
+	u64 tx_pkts;
+	u64 tx_ring_full;
+};
+
+/* Local receive buffer backing one memory window: its size and the CPU and
+ * DMA addresses returned by dma_alloc_coherent().
+ */
+struct ntb_transport_mw {
+	size_t size;
+	void *virt_addr;
+	dma_addr_t dma_addr;
+};
+
+/* Top-level transport state; a single instance is kept in the file-scope
+ * "transport" pointer.
+ */
+struct ntb_transport {
+	struct ntb_device *ndev;
+	struct ntb_transport_mw mw[NTB_NUM_MW];
+	/* array of max_qps queue pairs */
+	struct ntb_transport_qp *qps;
+	unsigned int max_qps;
+	/* starts with one bit set per QP; a cleared bit marks a QP that gets
+	 * link-down notifications.
+	 * NOTE(review): presumably set = free, cleared = in use - confirm.
+	 */
+	unsigned long qp_bitmap;
+	/* NTB_LINK_UP / NTB_LINK_DOWN for the transport as a whole */
+	bool transport_link;
+	/* runs ntb_transport_link_work() */
+	struct delayed_work link_work;
+	struct dentry *debugfs_dir;
+};
+
+/* Single-bit flag values.
+ * NOTE(review): presumably stored in the "flags" field of the payload
+ * header and/or queue entries - confirm at the users.
+ */
+enum {
+	DESC_DONE_FLAG = 1 << 0,
+	LINK_DOWN_FLAG = 1 << 1,
+	HW_ERROR_FLAG = 1 << 2,
+};
+
+/* Header that precedes each packet's payload in a receive buffer.  "len" is
+ * the payload length copied out on receive; "flags" is cleared once the
+ * payload has been copied.
+ */
+struct ntb_payload_header {
+	u64 ver;
+	unsigned int len;
+	unsigned int flags;
+};
+
+/* Scratchpad register indices used for the transport handshake: the two
+ * memory-window sizes, the number of QPs and the per-QP link bits.
+ * MAX_SPAD is the number of scratchpads used.
+ */
+enum {
+	MW0_SZ = 0,
+	MW1_SZ,
+	NUM_QPS,
+	QP_LINKS,
+	MAX_SPAD,
+};
+
+/* QPs are spread round-robin over the memory windows */
+#define QP_TO_MW(qp)		((qp) % NTB_NUM_MW)
+#define NTB_QP_DEF_NUM_ENTRIES	100
+/* Delay, in milliseconds, before link negotiation is retried */
+#define NTB_LINK_DOWN_TIMEOUT	10
+
+/* The single transport instance; NULL after ntb_transport_free() */
+static struct ntb_transport *transport;
+
+/* debugfs open handler: hand the inode's private pointer (the QP given to
+ * debugfs_create_file()) to the file so that the read handler can use it.
+ */
+static int debugfs_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->i_private;
+	return 0;
+}
+
+/* debugfs read handler: format the queue pair's statistics as text and copy
+ * the requested part to user space.
+ *
+ * scnprintf() is used rather than snprintf() because it returns the number
+ * of characters actually stored, so the running offset can never move past
+ * the end of the buffer.  Returns the result of simple_read_from_buffer().
+ */
+static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
+			    loff_t *offp)
+{
+	struct ntb_transport_qp *qp = filp->private_data;
+	char buf[512];
+	const ssize_t out_count = sizeof(buf);
+	ssize_t out_offset = 0;
+
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"NTB Transport stats\n");
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"rx_bytes - %llu\n", qp->rx_bytes);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"rx_pkts - %llu\n", qp->rx_pkts);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"rx_ring_empty - %llu\n", qp->rx_ring_empty);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"rx_err_no_buf - %llu\n", qp->rx_err_no_buf);
+	/* label now matches the field name (was "rx_er_oflow") */
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"rx_err_oflow - %llu\n", qp->rx_err_oflow);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"rx_err_ver - %llu\n", qp->rx_err_ver);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"rx_offset - %p\n", qp->rx_offset);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"tx_bytes - %llu\n", qp->tx_bytes);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"tx_pkts - %llu\n", qp->tx_pkts);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"tx_ring_full - %llu\n", qp->tx_ring_full);
+	out_offset += scnprintf(buf + out_offset, out_count - out_offset,
+				"tx_offset - %p\n", qp->tx_offset);
+
+	return simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+}
+
+/* File operations of the per-QP debugfs "stats" file */
+static const struct file_operations ntb_qp_debugfs_stats = {
+	.owner = THIS_MODULE,
+	.open = debugfs_open,
+	.read = debugfs_read,
+};
+
+/* Insert @entry at the front of @list while holding @lock with local
+ * interrupts disabled.
+ */
+static void ntb_list_add_head(spinlock_t *lock, struct list_head *entry,
+			      struct list_head *list)
+{
+	unsigned long irq_state;
+
+	spin_lock_irqsave(lock, irq_state);
+	list_add(entry, list);
+	spin_unlock_irqrestore(lock, irq_state);
+}
+
+/* Append @entry to the end of @list while holding @lock with local
+ * interrupts disabled.
+ */
+static void ntb_list_add_tail(spinlock_t *lock, struct list_head *entry,
+			      struct list_head *list)
+{
+	unsigned long irq_state;
+
+	spin_lock_irqsave(lock, irq_state);
+	list_add_tail(entry, list);
+	spin_unlock_irqrestore(lock, irq_state);
+}
+
+/* Detach and return the first queue entry of @list, or NULL when the list
+ * is empty.  The list is accessed under @lock with local interrupts
+ * disabled.
+ */
+static struct ntb_queue_entry *ntb_list_rm_head(spinlock_t *lock,
+						struct list_head *list)
+{
+	struct ntb_queue_entry *head = NULL;
+	unsigned long irq_state;
+
+	spin_lock_irqsave(lock, irq_state);
+	if (!list_empty(list)) {
+		head = list_first_entry(list, struct ntb_queue_entry, entry);
+		list_del(&head->entry);
+	}
+	spin_unlock_irqrestore(lock, irq_state);
+
+	return head;
+}
+
+/* Carve out the slices of memory window QP_TO_MW(qp_num) used by queue pair
+ * @qp_num.
+ *
+ * The window is divided evenly between the QPs mapped onto it; window 0
+ * takes the remainder when max_qps is not a multiple of NTB_NUM_MW.  The
+ * receive slice lies in the local buffer, the transmit slice at the same
+ * offset in the mapped window.  Both cursors and packet counters are reset.
+ * Always returns 0.
+ */
+static int ntb_transport_setup_qp_mw(unsigned int qp_num)
+{
+	struct ntb_transport_qp *qp = &transport->qps[qp_num];
+	u8 mw_num = QP_TO_MW(qp_num);
+	unsigned int qps_in_mw, slice, slice_off;
+
+	WARN_ON(transport->mw[mw_num].virt_addr == 0);
+
+	/* Number of QPs sharing this memory window */
+	qps_in_mw = transport->max_qps / NTB_NUM_MW;
+	if (transport->max_qps % NTB_NUM_MW && !mw_num)
+		qps_in_mw += transport->max_qps % NTB_NUM_MW - mw_num;
+
+	slice = transport->mw[mw_num].size / qps_in_mw;
+	pr_debug("orig size = %d, num qps = %d, size = %d\n",
+		 (int) transport->mw[mw_num].size, transport->max_qps, slice);
+
+	/* Offset of this QP's slice inside the window */
+	slice_off = qp_num / NTB_NUM_MW * slice;
+
+	qp->rx_buff_begin = transport->mw[mw_num].virt_addr + slice_off;
+	qp->rx_buff_end = qp->rx_buff_begin + slice;
+	pr_info("QP %d - RX Buff start %p end %p\n", qp->qp_num,
+		qp->rx_buff_begin, qp->rx_buff_end);
+	qp->rx_offset = qp->rx_buff_begin;
+
+	qp->tx_mw_begin = ntb_get_mw_vbase(transport->ndev, mw_num) +
+	    slice_off;
+	qp->tx_mw_end = qp->tx_mw_begin + slice;
+	pr_info("QP %d - TX MW start %p end %p\n", qp->qp_num, qp->tx_mw_begin,
+		qp->tx_mw_end);
+	qp->tx_offset = qp->tx_mw_begin;
+
+	qp->rx_pkts = 0;
+	qp->tx_pkts = 0;
+
+	return 0;
+}
+
+/* Set up the local receive buffer behind memory window @num_mw.
+ *
+ * @size is rounded up to a 4k multiple.  This function runs again whenever
+ * link negotiation is retried, so a buffer of the right size that already
+ * exists is reused rather than leaked.  If the size changed, a new buffer
+ * is allocated and programmed into the hardware before the old one is
+ * freed.  The packet headers are zeroed and the buffer's DMA address is
+ * handed to the hardware.
+ *
+ * Returns 0 on success or -ENOMEM if the buffer cannot be allocated, in
+ * which case any previous buffer is left in place.
+ */
+static int ntb_set_mw(int num_mw, unsigned int size)
+{
+	struct ntb_transport_mw *mw = &transport->mw[num_mw];
+	struct pci_dev *pdev = ntb_query_pdev(transport->ndev);
+	struct ntb_transport_mw old = { .virt_addr = NULL };
+	/* Alloc memory for receiving data.  Must be 4k aligned */
+	size_t aligned = ALIGN(size, 4096);
+	void *offset;
+
+	if (!mw->virt_addr || mw->size != aligned) {
+		dma_addr_t dma_addr;
+		void *virt_addr;
+
+		virt_addr = dma_alloc_coherent(&pdev->dev, aligned, &dma_addr,
+					       GFP_KERNEL);
+		if (!virt_addr) {
+			pr_err("Unable to allocate MW buffer of size %d\n",
+			       (int) aligned);
+			return -ENOMEM;
+		}
+
+		/* Remember any previous buffer; it is released below */
+		old = *mw;
+
+		mw->size = aligned;
+		mw->virt_addr = virt_addr;
+		mw->dma_addr = dma_addr;
+	}
+
+	/* setup the hdr offsets with 0's */
+	for (offset = mw->virt_addr;
+	     offset + sizeof(struct ntb_payload_header) < mw->virt_addr + size;
+	     offset += transport_mtu + sizeof(struct ntb_payload_header))
+		memset(offset, 0, sizeof(struct ntb_payload_header));
+
+	/* Notify HW the memory location of the receive buffer */
+	ntb_set_mw_addr(transport->ndev, num_mw, mw->dma_addr);
+
+	/* The hardware no longer targets the old buffer; release it */
+	if (old.virt_addr)
+		dma_free_coherent(&pdev->dev, old.size, old.virt_addr,
+				  old.dma_addr);
+
+	return 0;
+}
+
+/* Hardware event handler registered with the NTB hardware layer.
+ *
+ * @data is the struct ntb_transport, @event one of the NTB_EVENT_* codes.
+ * A hardware error is fatal (BUG).  Link-up kicks off link negotiation.
+ * Link-down marks the transport and the QPs whose bitmap bit is clear as
+ * down, notifies their clients, and clears the handshake scratchpads.
+ */
+static void ntb_transport_event_callback(void *data, unsigned int event)
+{
+	struct ntb_transport *nt = data;
+	int i;
+
+	switch (event) {
+	case NTB_EVENT_HW_ERROR:
+		BUG();
+		break;
+
+	case NTB_EVENT_HW_LINK_UP:
+		schedule_delayed_work(&nt->link_work, 0);
+		break;
+
+	case NTB_EVENT_HW_LINK_DOWN:
+		nt->transport_link = NTB_LINK_DOWN;
+
+		/* Pass along the info to any clients */
+		for (i = 0; i < nt->max_qps; i++) {
+			struct ntb_transport_qp *qp;
+
+			if (test_bit(i, &nt->qp_bitmap))
+				continue;
+
+			qp = &nt->qps[i];
+
+			/* Only notify clients that were not already down */
+			if (qp->event_handler && qp->qp_link != NTB_LINK_DOWN)
+				qp->event_handler(NTB_LINK_DOWN);
+
+			qp->qp_link = NTB_LINK_DOWN;
+		}
+
+		/* The scratchpad registers keep the values if the remote side
+		 * goes down, blast them now to give them a sane value the next
+		 * time they are accessed
+		 */
+		for (i = 0; i < MAX_SPAD; i++) {
+			ntb_write_local_spad(transport->ndev, i, 0);
+			ntb_write_remote_spad(transport->ndev, i, 0);
+		}
+		break;
+
+	default:
+		break;
+	}
+}
+
+/* Delayed work that negotiates the transport link with the remote side.
+ *
+ * It publishes the local memory-window sizes and QP count through the
+ * remote scratchpads, reads the peer's values back, sets up both receive
+ * buffers, lays out every QP and finally marks the transport link up.  On
+ * any failure or mismatch the work re-arms itself after
+ * NTB_LINK_DOWN_TIMEOUT ms for as long as the hardware link is up; the
+ * whole sequence, including ntb_set_mw(), then runs again from the start.
+ *
+ * NOTE(review): memory-window sizes are resource_size_t but travel through
+ * 32-bit scratchpads, so larger values are truncated.
+ */
+static void ntb_transport_link_work(struct work_struct *work)
+{
+	struct ntb_transport *nt = container_of(work, struct ntb_transport,
+						link_work.work);
+	struct ntb_device *ndev = nt->ndev;
+	u32 val;
+	int rc, i;
+
+	/* send the local info */
+	rc = ntb_write_remote_spad(ndev, MW0_SZ, ntb_get_mw_size(ndev, 0));
+	if (rc) {
+		pr_err("Error writing %x to remote spad %d\n",
+		       (u32) ntb_get_mw_size(ndev, 0), MW0_SZ);
+		goto out;
+	}
+
+	rc = ntb_write_remote_spad(ndev, MW1_SZ, ntb_get_mw_size(ndev, 1));
+	if (rc) {
+		pr_err("Error writing %x to remote spad %d\n",
+		       (u32) ntb_get_mw_size(ndev, 1), MW1_SZ);
+		goto out;
+	}
+
+	rc = ntb_write_remote_spad(ndev, NUM_QPS, nt->max_qps);
+	if (rc) {
+		pr_err("Error writing %x to remote spad %d\n",
+		       nt->max_qps, NUM_QPS);
+		goto out;
+	}
+
+	/* start with no QP marked ready */
+	rc = ntb_write_remote_spad(ndev, QP_LINKS, 0);
+	if (rc) {
+		pr_err("Error writing %x to remote spad %d\n", 0, QP_LINKS);
+		goto out;
+	}
+
+	/* Query the remote side for its info */
+	rc = ntb_read_remote_spad(ndev, NUM_QPS, &val);
+	if (rc) {
+		pr_err("Error reading remote spad %d\n", NUM_QPS);
+		goto out;
+	}
+
+	/* both sides must agree on the number of QPs; otherwise retry later */
+	if (val != nt->max_qps)
+		goto out;
+	pr_info("Remote max number of qps = %d\n", val);
+
+	rc = ntb_read_remote_spad(ndev, MW0_SZ, &val);
+	if (rc) {
+		pr_err("Error reading remote spad %d\n", MW0_SZ);
+		goto out;
+	}
+
+	/* zero means the value has not been published yet */
+	if (!val)
+		goto out;
+	pr_info("Remote MW0 size = %d\n", val);
+
+	rc = ntb_set_mw(0, val);
+	if (rc)
+		goto out;
+
+	rc = ntb_read_remote_spad(ndev, MW1_SZ, &val);
+	if (rc) {
+		pr_err("Error reading remote spad %d\n", MW1_SZ);
+		goto out;
+	}
+
+	if (!val)
+		goto out;
+	pr_info("Remote MW1 size = %d\n", val);
+
+	rc = ntb_set_mw(1, val);
+	if (rc)
+		goto out;
+
+	/* lay out every QP and start link work for clients already waiting */
+	for (i = 0; i < nt->max_qps; i++) {
+		struct ntb_transport_qp *qp = &nt->qps[i];
+
+		rc = ntb_transport_setup_qp_mw(i);
+		if (rc)
+			goto out;
+
+		if (qp->client_ready)
+			schedule_delayed_work(&qp->link_work, 0);
+	}
+
+	nt->transport_link = NTB_LINK_UP;
+
+	return;
+
+out:
+	/* retry only while the hardware link is still up */
+	if (ntb_hw_link_status(ndev))
+		schedule_delayed_work(&nt->link_work,
+				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
+}
+
+/* Delayed work that negotiates the link of one queue pair.
+ *
+ * The QP's bit is OR-ed into the locally read QP_LINKS value and written to
+ * the remote QP_LINKS scratchpad; the remote QP_LINKS scratchpad is then
+ * read back.  If the QP's bit is set there, the QP link is up and the
+ * client is notified.  Otherwise, or if the read-back fails, the work
+ * re-arms itself after NTB_LINK_DOWN_TIMEOUT ms for as long as the
+ * hardware link is up.
+ *
+ * The scratchpad value is a u32, matching the scratchpad accessors.
+ * NOTE(review): the scratchpad is 32 bits wide, so only QP numbers 0-31
+ * can be represented; larger qp_num values need another scheme.
+ */
+static void ntb_qp_link_work(struct work_struct *work)
+{
+	struct ntb_transport_qp *qp;
+	u32 val, qp_bit;
+	int rc;
+
+	qp = container_of(work, struct ntb_transport_qp, link_work.work);
+	/* unsigned shift: bit 31 must not overflow a signed int */
+	qp_bit = 1U << qp->qp_num;
+
+	WARN_ON(transport->transport_link != NTB_LINK_UP);
+
+	rc = ntb_read_local_spad(transport->ndev, QP_LINKS, &val);
+	if (rc) {
+		pr_err("Error reading spad %d\n", QP_LINKS);
+		return;
+	}
+
+	rc = ntb_write_remote_spad(transport->ndev, QP_LINKS, val | qp_bit);
+	if (rc)
+		pr_err("Error writing %x to remote spad %d\n",
+		       val | qp_bit, QP_LINKS);
+
+	/* query remote spad for qp ready bits */
+	rc = ntb_read_remote_spad(transport->ndev, QP_LINKS, &val);
+	if (rc) {
+		pr_err("Error reading remote spad %d\n", QP_LINKS);
+		/* val still holds the local value; do not judge by it */
+		goto retry;
+	}
+
+	pr_debug("Remote QP link status = %x\n", val);
+
+	/* See if the remote side is up */
+	if (qp_bit & val) {
+		qp->qp_link = NTB_LINK_UP;
+
+		if (qp->event_handler)
+			qp->event_handler(NTB_LINK_UP);
+		return;
+	}
+
+retry:
+	if (ntb_hw_link_status(transport->ndev))
+		schedule_delayed_work(&qp->link_work,
+				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
+}
+
+/* Initialise the software state of queue pair @qp_num: identity, link state
+ * (down), debugfs entries, link work, and the rx/tx locks and lists.
+ *
+ * The debugfs directory is named "qp<N>".  The name buffer is sized for any
+ * unsigned value, so that two-digit QP numbers are not truncated into
+ * names that collide (e.g. "qp10" becoming "qp1").
+ */
+static void ntb_transport_init_queue(unsigned int qp_num)
+{
+	struct ntb_transport_qp *qp;
+
+	qp = &transport->qps[qp_num];
+	qp->qp_num = qp_num;
+	qp->ndev = transport->ndev;
+	qp->qp_link = NTB_LINK_DOWN;
+
+	if (transport->debugfs_dir) {
+		char debugfs_name[16];
+
+		snprintf(debugfs_name, sizeof(debugfs_name), "qp%u", qp_num);
+		qp->debugfs_dir = debugfs_create_dir(debugfs_name,
+						     transport->debugfs_dir);
+
+		qp->debugfs_stats = debugfs_create_file("stats", S_IRUSR,
+							qp->debugfs_dir, qp,
+							&ntb_qp_debugfs_stats);
+	}
+
+	INIT_DELAYED_WORK(&qp->link_work, ntb_qp_link_work);
+
+	spin_lock_init(&qp->rxc_lock);
+	spin_lock_init(&qp->rxq_lock);
+	spin_lock_init(&qp->rxe_lock);
+	spin_lock_init(&qp->txc_lock);
+	spin_lock_init(&qp->txq_lock);
+	spin_lock_init(&qp->txe_lock);
+
+	INIT_LIST_HEAD(&qp->rxq);
+	INIT_LIST_HEAD(&qp->rxc);
+	INIT_LIST_HEAD(&qp->rxe);
+	INIT_LIST_HEAD(&qp->txq);
+	INIT_LIST_HEAD(&qp->txc);
+	INIT_LIST_HEAD(&qp->txe);
+}
+
+/* Create the single transport instance.
+ *
+ * Allocates the transport, creates its debugfs directory, registers with
+ * the hardware layer, sizes and initialises the QP array, registers the
+ * hardware event callback and, if the hardware link is already up, starts
+ * link negotiation.
+ *
+ * The link work is initialised before the event callback is registered,
+ * because the callback schedules that work and may run as soon as it is
+ * registered.  Returns 0 on success or a negative errno; on failure
+ * everything is released and the global "transport" pointer is reset to
+ * NULL.
+ */
+static int ntb_transport_init(void)
+{
+	int rc, i;
+
+	transport = kzalloc(sizeof(struct ntb_transport), GFP_KERNEL);
+	if (!transport)
+		return -ENOMEM;
+
+	if (debugfs_initialized())
+		transport->debugfs_dir = debugfs_create_dir(KBUILD_MODNAME,
+							    NULL);
+	else
+		transport->debugfs_dir = NULL;
+
+	transport->ndev = ntb_register_transport(transport);
+	if (!transport->ndev) {
+		rc = -EIO;
+		goto err;
+	}
+
+	transport->max_qps = ntb_query_max_cbs(transport->ndev);
+	if (!transport->max_qps) {
+		rc = -EIO;
+		goto err1;
+	}
+
+	transport->qps = kcalloc(transport->max_qps,
+				 sizeof(struct ntb_transport_qp), GFP_KERNEL);
+	if (!transport->qps) {
+		rc = -ENOMEM;
+		goto err1;
+	}
+
+	/* One bit set per QP.
+	 * NOTE(review): qp_bitmap is an unsigned long, so on 32-bit builds
+	 * bits above 31 are lost.
+	 */
+	transport->qp_bitmap = ((u64) 1 << transport->max_qps) - 1;
+
+	for (i = 0; i < transport->max_qps; i++)
+		ntb_transport_init_queue(i);
+
+	/* Must be ready before the event callback can schedule it */
+	INIT_DELAYED_WORK(&transport->link_work, ntb_transport_link_work);
+
+	rc = ntb_register_event_callback(transport->ndev,
+					 ntb_transport_event_callback);
+	if (rc)
+		goto err2;
+
+	if (ntb_hw_link_status(transport->ndev))
+		schedule_delayed_work(&transport->link_work, 0);
+
+	return 0;
+
+err2:
+	kfree(transport->qps);
+err1:
+	ntb_unregister_transport(transport->ndev);
+err:
+	debugfs_remove_recursive(transport->debugfs_dir);
+	kfree(transport);
+	/* ntb_transport_free() and others test this pointer */
+	transport = NULL;
+	return rc;
+}
+
+/*
+ * Tear down the global transport instance, if there is one: mark the link
+ * down, stop the link work, drop debugfs and the event callback, release the
+ * coherent memory behind each memory window, then free the queue array and
+ * the transport itself and clear the global pointer.
+ */
+static void ntb_transport_free(void)
+{
+	struct pci_dev *pdev;
+	int idx;
+
+	if (!transport)
+		return;
+
+	transport->transport_link = NTB_LINK_DOWN;
+	cancel_delayed_work_sync(&transport->link_work);
+	debugfs_remove_recursive(transport->debugfs_dir);
+	ntb_unregister_event_callback(transport->ndev);
+
+	pdev = ntb_query_pdev(transport->ndev);
+
+	for (idx = 0; idx < NTB_NUM_MW; idx++) {
+		/* Windows that never got a buffer have nothing to free */
+		if (!transport->mw[idx].virt_addr)
+			continue;
+
+		dma_free_coherent(&pdev->dev, transport->mw[idx].size,
+				  transport->mw[idx].virt_addr,
+				  transport->mw[idx].dma_addr);
+	}
+
+	kfree(transport->qps);
+	ntb_unregister_transport(transport->ndev);
+	kfree(transport);
+	transport = NULL;
+}
+
+/*
+ * Copy the payload that follows the header at @offset into @entry's buffer,
+ * clear the header flags, move @entry to the receive-complete list and, when
+ * a receive handler is set and the client is ready, invoke the handler.
+ */
+static void ntb_rx_copy_task(struct ntb_transport_qp *qp,
+			     struct ntb_queue_entry *entry, void *offset)
+{
+	struct ntb_payload_header *hdr = offset;
+	void *payload = offset + sizeof(*hdr);
+
+	entry->len = hdr->len;
+	memcpy(entry->buf, payload, entry->len);
+
+	/* The copy must be complete before the flags are cleared */
+	wmb();
+	hdr->flags = 0;
+
+	ntb_list_add_tail(&qp->rxc_lock, &entry->entry, &qp->rxc);
+
+	if (!qp->rx_handler || !qp->client_ready)
+		return;
+
+	qp->rx_handler(qp);
+}
+
+/*
+ * Consume the current slot of @qp's receive ring.
+ *
+ * Returns 0 when the slot was consumed (payload delivered, payload dropped
+ * because the receive buffer was too small, or a link-down message handled),
+ * -ENOMEM when no receive buffer is queued, -EAGAIN when the slot is not
+ * marked done, and -EIO when the slot's version does not match the number
+ * of packets received so far.  Only a return of 0 advances rx_offset.
+ */
+static int ntb_process_rxc(struct ntb_transport_qp *qp)
+{
+	struct ntb_payload_header *hdr;
+	struct ntb_queue_entry *entry;
+	size_t slot_size;
+	void *offset;
+
+	entry = ntb_list_rm_head(&qp->rxq_lock, &qp->rxq);
+	if (!entry) {
+		hdr = qp->rx_offset;
+		pr_debug("no buffer - HDR ver %llu, len %d, flags %x\n",
+			hdr->ver, hdr->len, hdr->flags);
+		qp->rx_err_no_buf++;
+		return -ENOMEM;
+	}
+
+	offset = qp->rx_offset;
+	hdr = offset;
+
+	if (!(hdr->flags & DESC_DONE_FLAG)) {
+		ntb_list_add_tail(&qp->rxq_lock, &entry->entry, &qp->rxq);
+		qp->rx_ring_empty++;
+		return -EAGAIN;
+	}
+
+	if (hdr->ver != qp->rx_pkts) {
+		pr_debug("qp %d: version mismatch, expected %llu - got %llu\n",
+			 qp->qp_num, qp->rx_pkts, hdr->ver);
+		ntb_list_add_tail(&qp->rxq_lock, &entry->entry, &qp->rxq);
+		qp->rx_err_ver++;
+		return -EIO;
+	}
+
+	/* The sender tags its link-down message with LINK_DOWN_FLAG in the
+	 * header flags.  NTB_LINK_DOWN is a link state value, not a flag
+	 * bit, so it must not be used as the mask here.
+	 */
+	if (hdr->flags & LINK_DOWN_FLAG) {
+		pr_info("qp %d: Link Down\n", qp->qp_num);
+		qp->qp_link = NTB_LINK_DOWN;
+		schedule_delayed_work(&qp->link_work,
+				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
+
+		if (qp->event_handler)
+			qp->event_handler(NTB_LINK_DOWN);
+
+		/* No payload to deliver; the buffer stays queued */
+		ntb_list_add_tail(&qp->rxq_lock, &entry->entry, &qp->rxq);
+
+		/* Order the accesses above before releasing the slot */
+		wmb();
+		hdr->flags = 0;
+		goto out;
+	}
+
+	pr_debug("rx offset %p, ver %llu - %d payload received, "
+		 "buf size %d\n", qp->rx_offset, hdr->ver, hdr->len,
+		 entry->len);
+
+	/* Do the accounting while the slot is still held: the transmit path
+	 * treats a slot with non-zero flags as busy, so once flags are
+	 * cleared below hdr->len should no longer be relied upon.
+	 */
+	qp->rx_bytes += hdr->len;
+	qp->rx_pkts++;
+
+	if (hdr->len <= entry->len)
+		ntb_rx_copy_task(qp, entry, offset);
+	else {
+		/* Payload does not fit: drop it and keep the buffer queued */
+		pr_err("RX overflow! Wanted %d got %d\n", hdr->len, entry->len);
+		qp->rx_err_oflow++;
+
+		ntb_list_add_tail(&qp->rxq_lock, &entry->entry, &qp->rxq);
+
+		/* Order the accesses above before releasing the slot */
+		wmb();
+		hdr->flags = 0;
+	}
+
+out:
+	/* Step to the next slot; wrap to the start of the buffer when a
+	 * further full slot would not fit before rx_buff_end.
+	 */
+	slot_size = transport_mtu + sizeof(struct ntb_payload_header);
+	if (qp->rx_offset + slot_size * 2 >= qp->rx_buff_end)
+		qp->rx_offset = qp->rx_buff_begin;
+	else
+		qp->rx_offset += slot_size;
+
+	return 0;
+}
+
+/*
+ * Receive tasklet body: @data carries the queue pair pointer.  Keeps
+ * processing receive slots until ntb_process_rxc() reports a non-zero status.
+ */
+static void ntb_transport_rx(unsigned long data)
+{
+	struct ntb_transport_qp *qp = (struct ntb_transport_qp *)data;
+
+	for (;;) {
+		if (ntb_process_rxc(qp))
+			break;
+	}
+}
+
+/*
+ * Doorbell callback: doorbell @db_num indexes the queue pair whose receive
+ * tasklet gets scheduled.
+ */
+static void ntb_transport_rxc_db(int db_num)
+{
+	struct ntb_transport_qp *qp;
+
+	qp = &transport->qps[db_num];
+
+	pr_debug("%s: doorbell %d received\n", __func__, db_num);
+
+	tasklet_schedule(&qp->rx_work);
+}
+
+/*
+ * Write @entry into the transmit slot at @offset: payload first, then the
+ * header length and version, and finally the flags with DESC_DONE_FLAG set.
+ * Rings the queue's doorbell afterwards.  An entry with a non-zero length is
+ * moved to the transmit-complete list and the tx handler (if any) is called;
+ * a zero length entry goes straight back to the free transmit entries.
+ */
+static void ntb_tx_copy_task(struct ntb_transport_qp *qp,
+			     struct ntb_queue_entry *entry,
+			     void *offset)
+{
+	struct ntb_payload_header *hdr = offset;
+	void *payload = offset + sizeof(*hdr);
+
+	memcpy_toio(payload, entry->buf, entry->len);
+
+	hdr->len = entry->len;
+	hdr->ver = qp->tx_pkts;
+
+	/* Payload and header must be written before the flags are set */
+	wmb();
+	hdr->flags = entry->flags | DESC_DONE_FLAG;
+
+	if (ntb_ring_sdb(qp->ndev, qp->qp_num))
+		pr_err("%s: error ringing db %d\n", __func__, qp->qp_num);
+
+	if (entry->len > 0) {
+		qp->tx_bytes += entry->len;
+
+		/* Hand the sent buffer to the completion list */
+		ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
+
+		if (qp->tx_handler)
+			qp->tx_handler(qp);
+	} else {
+		ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
+	}
+}
+
+/*
+ * Try to transmit @entry through the current transmit slot of @qp.
+ *
+ * Returns -EAGAIN (after putting @entry back at the head of the transmit
+ * queue) when the slot's flags are still set, otherwise 0.  An entry larger
+ * than transport_mtu is not sent: it is flagged with HW_ERROR_FLAG and moved
+ * to the transmit-complete list without advancing the slot.
+ */
+static int ntb_process_tx(struct ntb_transport_qp *qp,
+			  struct ntb_queue_entry *entry)
+{
+	void *offset = qp->tx_offset;
+	struct ntb_payload_header *hdr = offset;
+	size_t slot_size;
+
+	pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
+		 qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
+		 entry->buf);
+
+	/* Slot still in use: requeue and let the caller retry later */
+	if (hdr->flags) {
+		ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
+		qp->tx_ring_full++;
+		return -EAGAIN;
+	}
+
+	if (entry->len > transport_mtu) {
+		pr_err("Trying to send pkt size of %d\n", entry->len);
+		entry->flags = HW_ERROR_FLAG;
+
+		ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
+
+		if (qp->tx_handler)
+			qp->tx_handler(qp);
+
+		return 0;
+	}
+
+	ntb_tx_copy_task(qp, entry, offset);
+
+	/* Step to the next slot; wrap to the start of the window when a
+	 * further full slot would not fit before tx_mw_end.
+	 */
+	slot_size = transport_mtu + sizeof(struct ntb_payload_header);
+	if (qp->tx_offset + slot_size * 2 >= qp->tx_mw_end)
+		qp->tx_offset = qp->tx_mw_begin;
+	else
+		qp->tx_offset += slot_size;
+
+	qp->tx_pkts++;
+
+	return 0;
+}
+
+/*
+ * Transmit tasklet body: @data carries the queue pair pointer.  Drains the
+ * transmit queue until it is empty or ntb_process_tx() reports a non-zero
+ * status.
+ */
+static void ntb_transport_tx(unsigned long data)
+{
+	struct ntb_transport_qp *qp = (struct ntb_transport_qp *)data;
+	struct ntb_queue_entry *entry;
+
+	for (;;) {
+		entry = ntb_list_rm_head(&qp->txq_lock, &qp->txq);
+		if (!entry)
+			break;
+
+		if (ntb_process_tx(qp, entry))
+			break;
+	}
+}
+
+/*
+ * Queue a zero length message flagged LINK_DOWN_FLAG on @qp's transmit
+ * queue and kick the transmit tasklet.  Does nothing if the queue link is
+ * already down.
+ *
+ * Waits for a free transmit entry, sleeping 100ms between attempts for up
+ * to NTB_LINK_DOWN_TIMEOUT attempts.  If none becomes available the message
+ * is not sent; the local link state is still left at NTB_LINK_DOWN.
+ */
+static void ntb_send_link_down(struct ntb_transport_qp *qp)
+{
+	struct ntb_queue_entry *entry = NULL;
+	int i;
+
+	if (qp->qp_link == NTB_LINK_DOWN)
+		return;
+
+	qp->qp_link = NTB_LINK_DOWN;
+
+	for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
+		entry = ntb_list_rm_head(&qp->txe_lock, &qp->txe);
+		if (entry)
+			break;
+		msleep(100);
+	}
+
+	/* Timed out without a free entry: nothing to dereference below */
+	if (!entry) {
+		pr_err("qp %d: no free tx entry for link down message\n",
+		       qp->qp_num);
+		return;
+	}
+
+	entry->callback_data = NULL;
+	entry->buf = NULL;
+	entry->len = 0;
+	entry->flags = LINK_DOWN_FLAG;
+
+	ntb_list_add_tail(&qp->txq_lock, &entry->entry, &qp->txq);
+	tasklet_schedule(&qp->tx_work);
+}
+
+/**
+ * ntb_transport_create_queue - Create a new NTB transport layer queue
+ * @rx_handler: receive callback function
+ * @tx_handler: transmit callback function
+ * @event_handler: event callback function
+ *
+ * Create a new NTB transport layer queue and provide the queue with a callback
+ * routine for both transmit and receive.  The receive callback routine will be
+ * used to pass up data when the transport has received it on the queue.   The
+ * transmit callback routine will be called when the transport has completed the
+ * transmission of the data on the queue and the data is ready to be freed.
+ *
+ * The transport itself is initialised on demand by the first call.  If queue
+ * setup fails and no queue is in use afterwards, the transport is torn down
+ * again so that a failed call leaves nothing behind.
+ *
+ * RETURNS: pointer to newly created ntb_queue, NULL on error.
+ */
+struct ntb_transport_qp *
+ntb_transport_create_queue(void (*rx_handler) (struct ntb_transport_qp *qp),
+			   void (*tx_handler) (struct ntb_transport_qp *qp),
+			   void (*event_handler)(int status))
+{
+	struct ntb_queue_entry *entry;
+	struct ntb_transport_qp *qp;
+	unsigned int free_queue;
+	int rc, i;
+
+	if (!transport) {
+		rc = ntb_transport_init();
+		if (rc)
+			return NULL;
+	}
+
+	/* A set bit in qp_bitmap marks a free queue; ffs() is one based and
+	 * returns 0 when no bit is set.
+	 */
+	free_queue = ffs(transport->qp_bitmap);
+	if (!free_queue)
+		goto err;
+
+	/* decrement free_queue to make it zero based */
+	free_queue--;
+
+	clear_bit(free_queue, &transport->qp_bitmap);
+
+	qp = &transport->qps[free_queue];
+	qp->rx_handler = rx_handler;
+	qp->tx_handler = tx_handler;
+	qp->event_handler = event_handler;
+
+	/* Pre-allocate the pool of free receive entries */
+	for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
+		entry = kzalloc(sizeof(struct ntb_queue_entry), GFP_ATOMIC);
+		if (!entry)
+			goto err1;
+
+		ntb_list_add_tail(&qp->rxe_lock, &entry->entry, &qp->rxe);
+	}
+
+	/* Pre-allocate the pool of free transmit entries */
+	for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
+		entry = kzalloc(sizeof(struct ntb_queue_entry), GFP_ATOMIC);
+		if (!entry)
+			goto err2;
+
+		ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
+	}
+
+	tasklet_init(&qp->rx_work, ntb_transport_rx, (unsigned long) qp);
+	tasklet_init(&qp->tx_work, ntb_transport_tx, (unsigned long) qp);
+
+	rc = ntb_register_db_callback(qp->ndev, free_queue,
+				      ntb_transport_rxc_db);
+	if (rc)
+		goto err3;
+
+	pr_info("NTB Transport QP %d created\n", qp->qp_num);
+
+	return qp;
+
+err3:
+	tasklet_disable(&qp->rx_work);
+	tasklet_disable(&qp->tx_work);
+err2:
+	while ((entry = ntb_list_rm_head(&qp->txe_lock, &qp->txe)))
+		kfree(entry);
+err1:
+	while ((entry = ntb_list_rm_head(&qp->rxe_lock, &qp->rxe)))
+		kfree(entry);
+	set_bit(free_queue, &transport->qp_bitmap);
+
+	/* Same check as ntb_transport_free_queue(): with every queue free
+	 * again nobody owns the transport, so release it instead of leaving
+	 * it allocated and registered.
+	 */
+	if (transport->qp_bitmap == ((u64) 1 << transport->max_qps) - 1)
+		ntb_transport_free();
+err:
+	return NULL;
+}
+EXPORT_SYMBOL(ntb_transport_create_queue);
+
+/*
+ * Remove and kfree() every entry on @list, taking @lock through
+ * ntb_list_rm_head().  When @warn is true a warning is logged per entry.
+ */
+static void ntb_qp_drain_list(spinlock_t *lock, struct list_head *list,
+			      bool warn)
+{
+	struct ntb_queue_entry *entry;
+
+	while ((entry = ntb_list_rm_head(lock, list))) {
+		if (warn)
+			pr_warn("Freeing item from a non-empty queue\n");
+		kfree(entry);
+	}
+}
+
+/**
+ * ntb_transport_free_queue - Frees NTB transport queue
+ * @qp: NTB queue to be freed
+ *
+ * Stops the queue's link work, doorbell callback and tasklets, frees every
+ * queue entry still held on its lists (warning for entries found on the
+ * pending or completion lists) and marks the queue as available again.  When
+ * this leaves no queue in use, the transport itself is freed.
+ */
+void ntb_transport_free_queue(struct ntb_transport_qp *qp)
+{
+	if (!qp)
+		return;
+
+	cancel_delayed_work_sync(&qp->link_work);
+
+	ntb_unregister_db_callback(qp->ndev, qp->qp_num);
+	tasklet_disable(&qp->rx_work);
+	tasklet_disable(&qp->tx_work);
+
+	/* Receive side: free pool first, then pending and completed */
+	ntb_qp_drain_list(&qp->rxe_lock, &qp->rxe, false);
+	ntb_qp_drain_list(&qp->rxq_lock, &qp->rxq, true);
+	ntb_qp_drain_list(&qp->rxc_lock, &qp->rxc, true);
+
+	/* Transmit side, same order */
+	ntb_qp_drain_list(&qp->txe_lock, &qp->txe, false);
+	ntb_qp_drain_list(&qp->txq_lock, &qp->txq, true);
+	ntb_qp_drain_list(&qp->txc_lock, &qp->txc, true);
+
+	set_bit(qp->qp_num, &transport->qp_bitmap);
+
+	pr_info("NTB Transport QP %d freed\n", qp->qp_num);
+
+	/* Every queue free again: release the transport as well */
+	if (transport->qp_bitmap == ((u64) 1 << transport->max_qps) - 1)
+		ntb_transport_free();
+}
+EXPORT_SYMBOL(ntb_transport_free_queue);
+
+/**
+ * ntb_transport_rx_remove - Dequeues enqueued rx packet
+ * @qp: NTB transport layer queue to remove the buffer from
+ * @len: pointer to variable to write enqueued buffers length
+ *
+ * Dequeues unused buffers from receive queue.  Should only be used during
+ * shutdown of qp; the call is refused while the client is marked ready.
+ *
+ * RETURNS: the callback pointer stored with the buffer, or NULL when @qp is
+ * NULL, the client is still ready, or the receive queue is empty.
+ */
+void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len)
+{
+	struct ntb_queue_entry *entry;
+	void *cb_data;
+
+	if (!qp)
+		return NULL;
+
+	if (qp->client_ready == NTB_LINK_UP)
+		return NULL;
+
+	entry = ntb_list_rm_head(&qp->rxq_lock, &qp->rxq);
+	if (!entry)
+		return NULL;
+
+	cb_data = entry->callback_data;
+	*len = entry->len;
+
+	/* The entry itself goes back to the free receive entries */
+	ntb_list_add_tail(&qp->rxe_lock, &entry->entry, &qp->rxe);
+
+	return cb_data;
+}
+EXPORT_SYMBOL(ntb_transport_rx_remove);
+
+/**
+ * ntb_transport_rx_enqueue - Enqueue a new NTB queue entry
+ * @qp: NTB transport layer queue the entry is to be enqueued on
+ * @cb: per buffer pointer for callback function to use
+ * @data: pointer to data buffer that incoming packets will be copied into
+ * @len: length of the data buffer
+ *
+ * Takes a free receive entry, attaches the caller's buffer to it and places
+ * it on the receive queue so that an incoming NTB payload can be copied
+ * into it.
+ *
+ * RETURNS: 0 on success, -EINVAL if @qp is NULL, -ENOMEM if no free receive
+ * entry is available.
+ */
+int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+			     unsigned int len)
+{
+	struct ntb_queue_entry *slot;
+
+	if (!qp)
+		return -EINVAL;
+
+	slot = ntb_list_rm_head(&qp->rxe_lock, &qp->rxe);
+	if (!slot)
+		return -ENOMEM;
+
+	slot->callback_data = cb;
+	slot->buf = data;
+	slot->len = len;
+
+	ntb_list_add_tail(&qp->rxq_lock, &slot->entry, &qp->rxq);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_transport_rx_enqueue);
+
+/**
+ * ntb_transport_tx_enqueue - Enqueue a new NTB queue entry
+ * @qp: NTB transport layer queue the entry is to be enqueued on
+ * @cb: per buffer pointer for callback function to use
+ * @data: pointer to data buffer that will be sent
+ * @len: length of the data buffer
+ *
+ * Takes a free transmit entry, attaches the caller's buffer to it, places it
+ * on the transmit queue and schedules the transmit tasklet.
+ *
+ * RETURNS: 0 on success, -EINVAL if @qp is NULL or its link is not up,
+ * -ENOMEM if no free transmit entry is available (the transmit tasklet is
+ * scheduled in that case as well).
+ */
+int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+			     unsigned int len)
+{
+	struct ntb_queue_entry *slot;
+
+	if (!qp)
+		return -EINVAL;
+
+	if (qp->qp_link != NTB_LINK_UP)
+		return -EINVAL;
+
+	slot = ntb_list_rm_head(&qp->txe_lock, &qp->txe);
+	if (slot) {
+		slot->callback_data = cb;
+		slot->buf = data;
+		slot->len = len;
+		slot->flags = 0;
+
+		ntb_list_add_tail(&qp->txq_lock, &slot->entry, &qp->txq);
+	}
+
+	/* Kick the tasklet in both cases: to send the new entry, or to
+	 * make progress when all entries are in use.
+	 */
+	tasklet_schedule(&qp->tx_work);
+
+	return slot ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(ntb_transport_tx_enqueue);
+
+/**
+ * ntb_transport_tx_dequeue - Dequeue a NTB queue entry
+ * @qp: NTB transport layer queue to be dequeued from
+ * @len: length of the data buffer
+ *
+ * This function will dequeue a buffer from the transmit complete queue.
+ * Entries will only be enqueued on this queue after having been
+ * transferred to the remote side, or after having been rejected with
+ * HW_ERROR_FLAG; in the latter case -EIO is stored in @len (which is
+ * unsigned) instead of the buffer length.
+ *
+ * RETURNS: callback pointer of the buffer from the transport queue, or NULL
+ * on empty
+ */
+void *ntb_transport_tx_dequeue(struct ntb_transport_qp *qp, unsigned int *len)
+{
+	struct ntb_queue_entry *done;
+	void *cb_data;
+
+	if (!qp)
+		return NULL;
+
+	done = ntb_list_rm_head(&qp->txc_lock, &qp->txc);
+	if (!done)
+		return NULL;
+
+	cb_data = done->callback_data;
+
+	if (done->flags == HW_ERROR_FLAG)
+		*len = -EIO;
+	else
+		*len = done->len;
+
+	/* The entry itself goes back to the free transmit entries */
+	ntb_list_add_tail(&qp->txe_lock, &done->entry, &qp->txe);
+
+	return cb_data;
+}
+EXPORT_SYMBOL(ntb_transport_tx_dequeue);
+
+/**
+ * ntb_transport_rx_dequeue - Dequeue a NTB queue entry
+ * @qp: NTB transport layer queue to be dequeued from
+ * @len: length of the data buffer
+ *
+ * This function will dequeue a buffer from the receive complete queue and
+ * store its length in @len.  Entries are only placed on that queue after
+ * their payload has been fully received.
+ *
+ * RETURNS: callback pointer of the buffer from the transport queue, or NULL
+ * on empty
+ */
+void *ntb_transport_rx_dequeue(struct ntb_transport_qp *qp, unsigned int *len)
+{
+	struct ntb_queue_entry *done;
+	void *cb_data;
+
+	if (!qp)
+		return NULL;
+
+	done = ntb_list_rm_head(&qp->rxc_lock, &qp->rxc);
+	if (!done)
+		return NULL;
+
+	cb_data = done->callback_data;
+	*len = done->len;
+
+	/* The entry itself goes back to the free receive entries */
+	ntb_list_add_tail(&qp->rxe_lock, &done->entry, &qp->rxe);
+
+	return cb_data;
+}
+EXPORT_SYMBOL(ntb_transport_rx_dequeue);
+
+/**
+ * ntb_transport_link_up - Notify NTB transport of client readiness to use queue
+ * @qp: NTB transport layer queue to be enabled
+ *
+ * Marks the client as ready on @qp and, if the transport link is already
+ * up, schedules the queue's link work immediately.
+ */
+void ntb_transport_link_up(struct ntb_transport_qp *qp)
+{
+	if (!qp)
+		return;
+
+	qp->client_ready = NTB_LINK_UP;
+
+	/* Nothing more to do here while the transport link is not up */
+	if (transport->transport_link != NTB_LINK_UP)
+		return;
+
+	schedule_delayed_work(&qp->link_work, 0);
+}
+EXPORT_SYMBOL(ntb_transport_link_up);
+
+/**
+ * ntb_transport_link_down - Notify NTB transport to no longer enqueue data
+ * @qp: NTB transport layer queue to be disabled
+ *
+ * Notify NTB transport layer of client's desire to no longer receive data on
+ * transport queue specified.  It is the client's responsibility to ensure all
+ * entries on queue are purged or otherwise handled appropriately.
+ *
+ * Marks the client as not ready, stops the queue's link work, clears this
+ * queue's bit in the QP_LINKS scratchpad value written to the remote side
+ * and, if the transport link is up, sends a link-down message.
+ */
+void ntb_transport_link_down(struct ntb_transport_qp *qp)
+{
+	int rc, val, links;
+
+	if (!qp)
+		return;
+
+	qp->client_ready = NTB_LINK_DOWN;
+
+	cancel_delayed_work_sync(&qp->link_work);
+	qp->qp_link = NTB_LINK_DOWN;
+
+	rc = ntb_read_local_spad(transport->ndev, QP_LINKS, &val);
+	if (rc) {
+		pr_err("Error reading spad %d\n", QP_LINKS);
+		return;
+	}
+
+	/* Current link bits with this queue's bit cleared */
+	links = val & ~(1 << qp->qp_num);
+
+	rc = ntb_write_remote_spad(transport->ndev, QP_LINKS, links);
+	if (rc)
+		pr_err("Error writing %x to remote spad %d\n",
+		       links, QP_LINKS);
+
+	if (transport->transport_link == NTB_LINK_UP)
+		ntb_send_link_down(qp);
+}
+EXPORT_SYMBOL(ntb_transport_link_down);
+
+/**
+ * ntb_transport_link_query - Query transport link state
+ * @qp: NTB transport layer queue to be queried
+ *
+ * Query connectivity to the remote system of the NTB transport queue
+ *
+ * RETURNS: true for link up or false for link down; a NULL @qp is reported
+ * as link down.
+ */
+bool ntb_transport_link_query(struct ntb_transport_qp *qp)
+{
+	/* Same NULL tolerance as the other exported queue operations */
+	if (!qp)
+		return false;
+
+	return qp->qp_link == NTB_LINK_UP;
+}
+EXPORT_SYMBOL(ntb_transport_link_query);
+
+/**
+ * ntb_transport_qp_num - Query the qp number
+ * @qp: NTB transport layer queue to be queried; must not be NULL
+ *
+ * Query qp number of the NTB transport queue
+ *
+ * RETURNS: a zero based number specifying the qp number
+ */
+unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp)
+{
+	unsigned char num = qp->qp_num;
+
+	return num;
+}
+EXPORT_SYMBOL(ntb_transport_qp_num);
+
+/**
+ * ntb_transport_max_size - Query the max payload size of a qp
+ * @qp: NTB transport layer queue to be queried (not used by the lookup)
+ *
+ * Query the maximum payload size permissible on the given qp.  The value is
+ * the transport-wide transport_mtu.
+ *
+ * RETURNS: the max payload size of a qp
+ */
+unsigned int
+ntb_transport_max_size(__attribute__((unused)) struct ntb_transport_qp *qp)
+{
+	unsigned int max_payload = transport_mtu;
+
+	return max_payload;
+}
+EXPORT_SYMBOL(ntb_transport_max_size);
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
new file mode 100644
index 0000000..4d0efc3
--- /dev/null
+++ b/include/linux/ntb.h
@@ -0,0 +1,78 @@ 
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *   The full GNU General Public License is included in this distribution
+ *   in the file called LICENSE.GPL.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+
+struct ntb_transport_qp;
+
+unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
+struct ntb_transport_qp *
+ntb_transport_create_queue(void (*rx_handler)(struct ntb_transport_qp *qp),
+			   void (*tx_handler)(struct ntb_transport_qp *qp),
+			   void (*event_handler)(int status));
+void ntb_transport_free_queue(struct ntb_transport_qp *qp);
+int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+			     unsigned int len);
+int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+			     unsigned int len);
+void *ntb_transport_tx_dequeue(struct ntb_transport_qp *qp, unsigned int *len);
+void *ntb_transport_rx_dequeue(struct ntb_transport_qp *qp, unsigned int *len);
+void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
+void ntb_transport_link_up(struct ntb_transport_qp *qp);
+void ntb_transport_link_down(struct ntb_transport_qp *qp);
+bool ntb_transport_link_query(struct ntb_transport_qp *qp);