diff mbox

[v2,01/17] IB/Verbs: Implement new callback query_transport() for each HW

Message ID 5523D098.3020007@profitbricks.com
State Not Applicable, archived
Delegated to: David Miller
Headers show

Commit Message

Michael Wang April 7, 2015, 12:42 p.m. UTC
Add new callback query_transport() and implement for each HW.

Mapping List:
		node-type	link-layer	old-transport	new-transport
nes		RNIC		ETH		IWARP		IWARP
amso1100	RNIC		ETH		IWARP		IWARP
cxgb3   	RNIC		ETH		IWARP		IWARP
cxgb4   	RNIC		ETH		IWARP		IWARP
usnic   	USNIC_UDP	ETH		USNIC_UDP	USNIC_UDP
ocrdma  	IB_CA		ETH		IB		IBOE
mlx4    	IB_CA		IB/ETH		IB		IB/IBOE
mlx5    	IB_CA		IB		IB		IB
ehca    	IB_CA		IB		IB		IB
ipath   	IB_CA		IB		IB		IB
mthca   	IB_CA		IB		IB		IB
qib     	IB_CA		IB		IB		IB

Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Michael Wang <yun.wang@profitbricks.com>
---
 drivers/infiniband/core/device.c             |  1 +
 drivers/infiniband/core/verbs.c              |  4 +++-
 drivers/infiniband/hw/amso1100/c2_provider.c |  7 +++++++
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |  7 +++++++
 drivers/infiniband/hw/cxgb4/provider.c       |  7 +++++++
 drivers/infiniband/hw/ehca/ehca_hca.c        |  6 ++++++
 drivers/infiniband/hw/ehca/ehca_iverbs.h     |  3 +++
 drivers/infiniband/hw/ehca/ehca_main.c       |  1 +
 drivers/infiniband/hw/ipath/ipath_verbs.c    |  7 +++++++
 drivers/infiniband/hw/mlx4/main.c            | 10 ++++++++++
 drivers/infiniband/hw/mlx5/main.c            |  7 +++++++
 drivers/infiniband/hw/mthca/mthca_provider.c |  7 +++++++
 drivers/infiniband/hw/nes/nes_verbs.c        |  6 ++++++
 drivers/infiniband/hw/ocrdma/ocrdma_main.c   |  1 +
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |  6 ++++++
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |  3 +++
 drivers/infiniband/hw/qib/qib_verbs.c        |  7 +++++++
 drivers/infiniband/hw/usnic/usnic_ib_main.c  |  1 +
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |  6 ++++++
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h |  2 ++
 include/rdma/ib_verbs.h                      |  7 ++++++-
 21 files changed, 104 insertions(+), 2 deletions(-)

Comments

Doug Ledford April 8, 2015, 6:29 p.m. UTC | #1
On Tue, 2015-04-07 at 14:42 +0200, Michael Wang wrote:
> Add new callback query_transport() and implement for each HW.

My response here is going to be a long email, but that's because it's
easier to respond to the various patches all in one response in order to
preserve context.  So, while I'm responding to patch 1 of 17, my
response will cover all 17 patches in whole.

> Mapping List:
> 		node-type	link-layer	old-transport	new-transport
> nes		RNIC		ETH		IWARP		IWARP
> amso1100	RNIC		ETH		IWARP		IWARP
> cxgb3   	RNIC		ETH		IWARP		IWARP
> cxgb4   	RNIC		ETH		IWARP		IWARP
> usnic   	USNIC_UDP	ETH		USNIC_UDP	USNIC_UDP
> ocrdma  	IB_CA		ETH		IB		IBOE
> mlx4    	IB_CA		IB/ETH		IB		IB/IBOE
> mlx5    	IB_CA		IB		IB		IB
> ehca    	IB_CA		IB		IB		IB
> ipath   	IB_CA		IB		IB		IB
> mthca   	IB_CA		IB		IB		IB
> qib     	IB_CA		IB		IB		IB
> 
> Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> Cc: Doug Ledford <dledford@redhat.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Sean Hefty <sean.hefty@intel.com>
> Signed-off-by: Michael Wang <yun.wang@profitbricks.com>
> ---
>  drivers/infiniband/core/device.c             |  1 +
>  drivers/infiniband/core/verbs.c              |  4 +++-
>  drivers/infiniband/hw/amso1100/c2_provider.c |  7 +++++++
>  drivers/infiniband/hw/cxgb3/iwch_provider.c  |  7 +++++++
>  drivers/infiniband/hw/cxgb4/provider.c       |  7 +++++++
>  drivers/infiniband/hw/ehca/ehca_hca.c        |  6 ++++++
>  drivers/infiniband/hw/ehca/ehca_iverbs.h     |  3 +++
>  drivers/infiniband/hw/ehca/ehca_main.c       |  1 +
>  drivers/infiniband/hw/ipath/ipath_verbs.c    |  7 +++++++
>  drivers/infiniband/hw/mlx4/main.c            | 10 ++++++++++
>  drivers/infiniband/hw/mlx5/main.c            |  7 +++++++
>  drivers/infiniband/hw/mthca/mthca_provider.c |  7 +++++++
>  drivers/infiniband/hw/nes/nes_verbs.c        |  6 ++++++
>  drivers/infiniband/hw/ocrdma/ocrdma_main.c   |  1 +
>  drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |  6 ++++++
>  drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |  3 +++
>  drivers/infiniband/hw/qib/qib_verbs.c        |  7 +++++++
>  drivers/infiniband/hw/usnic/usnic_ib_main.c  |  1 +
>  drivers/infiniband/hw/usnic/usnic_ib_verbs.c |  6 ++++++
>  drivers/infiniband/hw/usnic/usnic_ib_verbs.h |  2 ++
>  include/rdma/ib_verbs.h                      |  7 ++++++-
>  21 files changed, 104 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 18c1ece..a9587c4 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -76,6 +76,7 @@ static int ib_device_check_mandatory(struct ib_device *device)
>  	} mandatory_table[] = {
>  		IB_MANDATORY_FUNC(query_device),
>  		IB_MANDATORY_FUNC(query_port),
> +		IB_MANDATORY_FUNC(query_transport),
>  		IB_MANDATORY_FUNC(query_pkey),
>  		IB_MANDATORY_FUNC(query_gid),
>  		IB_MANDATORY_FUNC(alloc_pd),

I'm concerned about the performance implications of this.  The size of
this patchset already points out just how many places in the code we
have to check for various aspects of the device transport in order to do
the right thing.  Without going through the entire list to see how many
are on critical hot paths, I'm sure some of them are on at least
partially critical hot paths (like creation of new connections).  I
would prefer to see this change be implemented via a device attribute,
not a functional call query.  That adds a needless function call in
these paths.

> diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> index f93eb8d..83370de 100644
> --- a/drivers/infiniband/core/verbs.c
> +++ b/drivers/infiniband/core/verbs.c
> @@ -133,14 +133,16 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_
>  	if (device->get_link_layer)
>  		return device->get_link_layer(device, port_num);
>  
> -	switch (rdma_node_get_transport(device->node_type)) {
> +	switch (device->query_transport(device, port_num)) {
>  	case RDMA_TRANSPORT_IB:
> +	case RDMA_TRANSPORT_IBOE:
>  		return IB_LINK_LAYER_INFINIBAND;

If we are perserving ABI, then this looks wrong.  Currently, IBOE
returnsi transport IB and link layer Ethernet.  It should not return
link layer IB, it does not support IB link layer operations (such as MAD
access).

>  	case RDMA_TRANSPORT_IWARP:
>  	case RDMA_TRANSPORT_USNIC:
>  	case RDMA_TRANSPORT_USNIC_UDP:
>  		return IB_LINK_LAYER_ETHERNET;
>  	default:
> +		BUG();
>  		return IB_LINK_LAYER_UNSPECIFIED;
>  	}
>  }

[ snip ]

> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 65994a1..d54f91e 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -75,10 +75,13 @@ enum rdma_node_type {
>  };
>  
>  enum rdma_transport_type {
> +	/* legacy for users */
>  	RDMA_TRANSPORT_IB,
>  	RDMA_TRANSPORT_IWARP,
>  	RDMA_TRANSPORT_USNIC,
> -	RDMA_TRANSPORT_USNIC_UDP
> +	RDMA_TRANSPORT_USNIC_UDP,
> +	/* new transport */
> +	RDMA_TRANSPORT_IBOE,
>  };

I'm also concerned about this.  I would like to see this enum
essentially turned into a bitmap.  One that is constructed in such a way
that we can always get the specific test we need with only one compare
against the overall value.  In order to do so, we need to break it down
into the essential elements that are part of each of the transports.
So, for instance, we can define the two link layers we have so far, plus
reserve one for OPA which we know is coming:

RDMA_LINK_LAYER_IB       = 0x00000001,
RDMA_LINK_LAYER_ETH      = 0x00000002,
RDMA_LINK_LAYER_OPA      = 0x00000004,
RDMA_LINK_LAYER_MASK     = 0x0000000f,

We can then define the currently known high level transport types:

RDMA_TRANSPORT_IB        = 0x00000010,
RDMA_TRANSPORT_IWARP     = 0x00000020,
RDMA_TRANSPORT_USNIC     = 0x00000040,
RDMA_TRANSPORT_USNIC_UDP = 0x00000080,
RDMA_TRANSPORT_MASK      = 0x000000f0,

We could then define bits for the IB management types:

RDMA_MGMT_IB             = 0x00000100,
RDMA_MGMT_OPA            = 0x00000200,
RDMA_MGMT_MASK           = 0x00000f00,

Then we have space to define specific quirks:

RDMA_SEPARATE_READ_SGE   = 0x00001000,
RDMA_QUIRKS_MASK         = 0xfffff000

Once those are defined, a few definitions for device drivers to use when
they initialize a device to set the bitmap to the right values:

#define IS_IWARP (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_IWARP |
RDMA_SEPARATE_READ_SGE)
#define IS_IB (RDMA_LINK_LAYER_IB | RDMA_TRANSPORT_IB | RDMA_MGMT_IB)
#define IS_IBOE (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_IB)
#define IS_USNIC (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_USNIC)
#define IS_OPA (RDMA_LINK_LAYER_OPA | RDMA_TRANSPORT_IB | RDMA_MGMT_IB |
RDMA_MGMT_OPA)

Then you need to define the tests:

static inline bool
rdma_transport_is_iwarp(struct ib_device *dev, u8 port)
{
	return dev->port[port]->transport & RDMA_TRANSPORT_IWARP;
}

/* Note: this intentionally covers IB, IBOE, and OPA...use
   rdma_dev_is_ib if you want to only get physical IB devices */
static inline bool
rdma_transport_is_ib(struct ibdev *dev)
{
	return dev->port[port]->transport & RDMA_TRANSPORT_IB;
}

rdma_port_is_ib(struct ib_device *dev, u8 port)
{
	return dev->port[port]->transport & RDMA_LINK_LAYER_IB;
}

rdma_port_is_iboe(struct ib_device *dev, u8 port)
{
	return dev->port[port]->transport & IS_IBOE == IS_IBOE;
}

rdma_port_is_usnic(struct ib_device *dev, u8 port)
{
	return dev->port[port]->transport & RDMA_TRANSPORT_USNIC;
}

rdma_port_is_opa(struct ib_device *dev, u8 port)
{
	return dev->port[port]->transport & RDMA_LINK_LAYER_OPA;
}

rdma_port_is_iwarp(struct ib_device *dev, u8 port)
{
	return rdma_transport_is_iwarp(dev, port);
}

rdma_port_ib_fabric_mgmt(struct ibdev *dev, u8 port)
{
	return dev->port[port]->transport & RDMA_MGMT_IB;
}

rdma_port_opa_mgmt(struct ibdev *dev, u8 port)
{
	return dev->port[port]->transport & RDMA_MGMT_OPA;
}

Other things can be changed too.  Like rdma_port_get_link_layer can
become this:

{
	return dev->transport & RDMA_LINK_LAYER_MASK;
}

From patch 2/17:


> +static inline int rdma_ib_mgmt(struct ib_device *device, u8 port_num)
> +{
> +       enum rdma_transport_type tp = device->query_transport(device,
> port_num);
> +
> +       return (tp == RDMA_TRANSPORT_IB || tp == RDMA_TRANSPORT_IBOE);
> +}

This looks wrong.  IBOE doesn't have IB management.  At least it doesn't
have subnet management.

Actually, reading through the remainder of the patches, there is some
serious confusion taking place here.  In later patches, you use this as
a surrogate for cap_cm, which implies you are talking about connection
management.  This is very different than the rdma_dev_ib_mgmt() test
that I create above, which specifically refers to IB management tasks
unique to IB/OPA: MAD, SM, multicast.

The kernel connection management code is not really limited.  It
supports IB, IBOE, iWARP, and in the future it will support OPA.  There
are some places in the CM code were we test for just IB/IBOE currently,
but that's only because we split iWARP out higher up in the abstraction
hierarchy.  So, calling something rdma_ib_mgmt and meaning a rather
specialized tested in the CM is probably misleading.

To straighten all this out, lets break management out into the two
distinct types:

rdma_port_ib_fabric_mgmt() <- fabric specific management tasks: MAD, SM,
multicast.  The proper test for this with my bitmap above is a simple
transport & RDMA_MGMT_IB test.  If will be true for IB and OPA fabrics.

rdma_port_conn_mgmt() <- connection management, which we currently
support everything except USNIC (correct Sean?), so a test would be
something like !(transport & RDMA_TRANSPORT_USNIC).  This is then split
out into two subgroups, IB style and iWARP stype connection management
(aka, rdma_port_iw_conn_mgmt() and rdma_port_ib_conn_mgmt()).  In my
above bitmap, since I didn't give IBOE its own transport type, these
subgroups still boil down to the simple tests transport & iWARP and
transport & IB like they do today.

From patch 3/17:


> +/**
> + * cap_ib_mad - Check if the port of device has the capability
> Infiniband
> + * Management Datagrams.
> + *
> + * @device: Device to be checked
> + * @port_num: Port number of the device
> + *
> + * Return 0 when port of the device don't support Infiniband
> + * Management Datagrams.
> + */
> +static inline int cap_ib_mad(struct ib_device *device, u8 port_num)
> +{
> +       return rdma_ib_mgmt(device, port_num);
> +}
> +

Why add cap_ib_mad?  It's nothing more than rdma_port_ib_fabric_mgmt
with a new name.  Just use rdma_port_ib_fabric_mgmt() everywhere you
have cap_ib_mad.

From patch 4/17:


> +/**
> + * cap_ib_smi - Check if the port of device has the capability
> Infiniband
> + * Subnet Management Interface.
> + *
> + * @device: Device to be checked
> + * @port_num: Port number of the device
> + *
> + * Return 0 when port of the device don't support Infiniband
> + * Subnet Management Interface.
> + */
> +static inline int cap_ib_smi(struct ib_device *device, u8 port_num)
> +{
> +       return rdma_transport_ib(device, port_num);
> +}
> +

Same as the previous patch.  This is needless indirection.  Just use
rdma_port_ib_fabric_mgmt directly.

Patch 5/17:

Again, just use rdma_port_ib_conn_mgmt() directly.

Patch 6/17:

Again, just use rdma_port_ib_fabric_mgmt() directly.

Patch 7/17:

Again, just use rdma_port_ib_fabric_mgmt() directly.  It's perfectly
applicable to the IB mcast registration requirements.

Patch 8/17:

Here we can create a new test if we are using the bitmap I created
above:

rdma_port_ipoib(struct ib_device *dev, u8 port)
{
	return !(dev->port[port]->transport & RDMA_LINK_LAYER_ETH);
}

This is presuming that OPA will need ipoib devices.  This will cause all
non-Ethernet link layer devices to return true, and right now, that is
all IB and all OPA devices.

Patch 9/17:

Most of the other comments on this patch stand as they are.  I would add
the test:

rdma_port_separate_read_sge(dev, port)
{
	return dev->port[port]->transport & RDMA_SEPERATE_READ_SGE;
}

and add the helper function:

rdma_port_get_read_sge(dev, port)
{
	if (rdma_transport_is_iwarp)
		return 1;
	return dev->port[port]->max_sge;
}

Then, as Jason points out, if at some point in the future the kernel is
modified to support devices with assymetrical read/write SGE sizes, this
function can be modified to support those devices.

Patch 10/17:

As Sean pointed out, force_grh should be rdma_dev_is_iboe().  The cm
handles iw devices, but you notice all of the functions you modify here
start with ib_.  The iwarp connections are funneled through iw_ specific
function variants, and so even though the cm handles iwarp, ib, and roce
devices, you never see anything other than ib/iboe (and opa in the
future) get to the ib_ variants of the functions.  So, they wrote the
original tests as tests against the link layer being ethernet and used
that to differentiate between ib and iboe devices.  It works, but can
confuse people.  So, everyplace that has !rdma_transport_ib should
really be rdma_dev_is_iboe instead.  If we ever merge the iw_ and ib_
functions in the future, having this right will help avoid problems.

Patch 11/17:

I wouldn't reform the link_layer_show except to make it compile with the
new defines I used above.  

Patch 12/17:

Go ahead and add a helper to check all ports on a dev, just make it
rdma_hca_ib_conn_mgmt() and have it loop through
rdma_port_ib_conn_mgmt() return codes.

Patch 13/17:

This patch is largely unneeded if we reworked the bitmap like I have
above.  A lot of the changes you made to switch from case statements to
multiple if statements can go back to being case statements because in
the bitmap IB and IBOE are still both transport IB, so you just do the
case on the transport bits and not on the link layer bits.

Patch 14/17:

Seems ok.

Patch 15/17:

If you implement the bitmap like I list above, then this code will need
fixed up to use the bitmap.  Otherwise it looks OK.

Patch 16/17:

OK.

Patch 17/17:

I would drop this patch.  In the future, the mlx5 driver will support
both Ethernet and IB like mlx4 does, and we would just need to pull this
code back to core instead of only in mlx4.
Jason Gunthorpe April 8, 2015, 7:35 p.m. UTC | #2
On Wed, Apr 08, 2015 at 06:41:22PM +0000, Hefty, Sean wrote:

> I too would like to see this as the end result, but I think it's
> possible to stage the changes by having the static inline calls
> being added convert to using these sort of attributes.

I agree as well, this patch set is already so big.

But Doug may be right, this conversion may need to be part of the
series that is applied in one go for performance reasons. But that is
just a patch at the end to optimize the inlines calls.

Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe April 8, 2015, 8:10 p.m. UTC | #3
On Wed, Apr 08, 2015 at 02:29:46PM -0400, Doug Ledford wrote:

> To straighten all this out, lets break management out into the two
> distinct types:
> 
> rdma_port_ib_fabric_mgmt() <- fabric specific management tasks: MAD, SM,
> multicast.  The proper test for this with my bitmap above is a simple
> transport & RDMA_MGMT_IB test.  If will be true for IB and OPA fabrics.

> rdma_port_conn_mgmt() <- connection management, which we currently
> support everything except USNIC (correct Sean?), so a test would be
> something like !(transport & RDMA_TRANSPORT_USNIC).  This is then split
> out into two subgroups, IB style and iWARP stype connection management
> (aka, rdma_port_iw_conn_mgmt() and rdma_port_ib_conn_mgmt()).  In my
> above bitmap, since I didn't give IBOE its own transport type, these
> subgroups still boil down to the simple tests transport & iWARP and
> transport & IB like they do today.

There is a lot more variation here than just these two tests, and those
two tests won't scale to include OPA.

        IB ROCEE OPA
SMI     Y  N     Y    (though the OPA smi looked a bit different)
IB SMP  Y  N     N
OPA SMP N  N     Y
GMP     Y  Y     Y
SA      Y  N     Y
PM      Y  Y     Y    (? guessing for OPA)
CM      Y  Y     Y
GMP needs GRH N Y N

It may be unrealistic, but I was hoping we could largely scrub the
opaque 'is spec iWARP, is spec ROCEE' kinds of tests because they
don't tell anyone what it is the code cares about.

Maybe what is needed is a more precise language for the functions:

> > + * cap_ib_mad - Check if the port of device has the capability
> > Infiniband
> > + * Management Datagrams.

As used this seems to mean:

True if the port can do IB/OPA SMP, or GMP management packets on QP0 or
QP1. (Y Y Y) ie: Do we need the MAD layer at all.

ib_smi seems to be true if QP0 is supported  (Y N Y)

Maybe the above set would make a lot more sense as:
  cap_ib_qp0
  cap_ib_qp1
  cap_opa_qp0

ib_cm seems to mean that the CM protocol from the IBA is used on the
port (Y Y Y)

ib_sa means the IBA SA protocol is supported (Y Y Y)

ib_mcast true if the IBA SA protocol is used for multicast GIDs (Y N
Y)

ipoib means the port supports the ipoib protocol (Y N ?)

This seem reasonable and understandable, even if they are currently a
bit duplicating.

> Patch 9/17:
> 
> Most of the other comments on this patch stand as they are.  I would add
> the test:
> 
> rdma_port_separate_read_sge(dev, port)
> {
> 	return dev->port[port]->transport & RDMA_SEPERATE_READ_SGE;
> }
> 
> and add the helper function:
> 
> rdma_port_get_read_sge(dev, port)
> {
> 	if (rdma_transport_is_iwarp)
> 		return 1;
> 	return dev->port[port]->max_sge;
> }

Hum, that is nice, but it doesn't quite fit with how the ULP needs to
work. The max limit when creating a WR is the value passed into the
qp_cap, not the device maximum limit.

To do this properly we need to extend the qp_cap, and that is just too
big a change. A one bit iWarp quirk is OK for now.

> As Sean pointed out, force_grh should be rdma_dev_is_iboe().  The cm

I actually really prefer cap_mandatory_grh - that is what is going on
here. ie based on that name (as a reviewer) I'd expect to see the mad
layer check that the mandatory GRH is always present, or blow up.

Some of the other checks in this file revolve around pkey, I'm not
sure what rocee does there? cap_pkey_supported ?

Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey April 8, 2015, 8:55 p.m. UTC | #4
On 4/8/2015 4:10 PM, Jason Gunthorpe wrote:
> On Wed, Apr 08, 2015 at 02:29:46PM -0400, Doug Ledford wrote:
>
>...
>>
>> rdma_port_get_read_sge(dev, port)
>> {
>> 	if (rdma_transport_is_iwarp)
>> 		return 1;
>> 	return dev->port[port]->max_sge;
>> }
>
> Hum, that is nice, but it doesn't quite fit with how the ULP needs to
> work. The max limit when creating a WR is the value passed into the
> qp_cap, not the device maximum limit.

Agreed, and I will again say that not all devices necessarily support
the same max_sge for all WR types. The current one-size-fits-all API
may make the upper layer think so, but it's possibly being lied to.

> To do this properly we need to extend the qp_cap, and that is just too
> big a change. A one bit iWarp quirk is OK for now.

Yes, it would be a large-ish change, and I like Doug's choice of word
"quirk" to capture these as exceptions, until the means for addressing
them is decided.

Overall, I like Doug's proposals, especially from an upper layer
perspective. I might suggest further refining them into categories, 
perhaps "management", primarily of interest to kernel and the
plumbing of connections; and actual "RDMA semantics", of interest
to RDMA consumers.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Wang April 9, 2015, 9:45 a.m. UTC | #5
On 04/08/2015 10:10 PM, Jason Gunthorpe wrote:
[snip]
> 
>> As Sean pointed out, force_grh should be rdma_dev_is_iboe().  The cm
> 
> I actually really prefer cap_mandatory_grh - that is what is going on
> here. ie based on that name (as a reviewer) I'd expect to see the mad
> layer check that the mandatory GRH is always present, or blow up.

Sounds good, will be in next version :-)

Regards,
Michael Wang

> 
> Some of the other checks in this file revolve around pkey, I'm not
> sure what rocee does there? cap_pkey_supported ?
> 
> Jason
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Wang April 9, 2015, 12:42 p.m. UTC | #6
On 04/08/2015 10:10 PM, Jason Gunthorpe wrote:
[snip]
> 
> Some of the other checks in this file revolve around pkey, I'm not
> sure what rocee does there? cap_pkey_supported ?

I'm not sure if this count in capability... how shall we describe it?

Regards,
Michael Wang

> 
> Jason
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford April 9, 2015, 2:34 p.m. UTC | #7
On Wed, 2015-04-08 at 14:10 -0600, Jason Gunthorpe wrote:
> On Wed, Apr 08, 2015 at 02:29:46PM -0400, Doug Ledford wrote:
> 
> > To straighten all this out, lets break management out into the two
> > distinct types:
> > 
> > rdma_port_ib_fabric_mgmt() <- fabric specific management tasks: MAD, SM,
> > multicast.  The proper test for this with my bitmap above is a simple
> > transport & RDMA_MGMT_IB test.  If will be true for IB and OPA fabrics.
> 
> > rdma_port_conn_mgmt() <- connection management, which we currently
> > support everything except USNIC (correct Sean?), so a test would be
> > something like !(transport & RDMA_TRANSPORT_USNIC).  This is then split
> > out into two subgroups, IB style and iWARP stype connection management
> > (aka, rdma_port_iw_conn_mgmt() and rdma_port_ib_conn_mgmt()).  In my
> > above bitmap, since I didn't give IBOE its own transport type, these
> > subgroups still boil down to the simple tests transport & iWARP and
> > transport & IB like they do today.
> 
> There is a lot more variation here than just these two tests, and those
> two tests won't scale to include OPA.
> 
>         IB ROCEE OPA
> SMI     Y  N     Y    (though the OPA smi looked a bit different)
> IB SMP  Y  N     N
> OPA SMP N  N     Y
> GMP     Y  Y     Y
> SA      Y  N     Y
> PM      Y  Y     Y    (? guessing for OPA)
> CM      Y  Y     Y
> GMP needs GRH N Y N
> 

You can still break this down to a manageable bitmap.

SMI, SMP, and SA are all essentially the same and can be combined to one
bitmap that is

IB_SM  0x1
OPA_SM 0x2

and the defines are such that IB devices define IB_SM, and OPA devices
define IB_SM and OPA_SM.  Any minor differences between OPA and IB can
be handled by testing just the OPA_SM bit.  This will exclude all IBOE
devices and iWARP devices.

GMP, PM, and CM are all the same, and are all identical to transport ==
INFINIBAND.

GMP needs GRH happens to be precisely the same as ib_dev_is_iboe.

These are exactly the tests I proposed Jason.  I'm not sure I see your
point here.  I guess my point is that although the scenario of all the
different items seems complex, it really does boil down to needing only
exactly what I proposed earlier to fulfill the entire test matrix.
Jason Gunthorpe April 9, 2015, 4 p.m. UTC | #8
On Thu, Apr 09, 2015 at 02:42:24PM +0200, Michael Wang wrote:
> On 04/08/2015 10:10 PM, Jason Gunthorpe wrote:
> [snip]
> > 
> > Some of the other checks in this file revolve around pkey, I'm not
> > sure what rocee does there? cap_pkey_supported ?
> 
> I'm not sure if this count in capability... how shall we describe it?

I'm not sure how rocee uses pkey, but maybe the the GRH and pkey thing
would work well together under a single 'cap_ethernet_ah' ?

Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe April 9, 2015, 4:01 p.m. UTC | #9
On Thu, Apr 09, 2015 at 10:34:30AM -0400, Doug Ledford wrote:

> These are exactly the tests I proposed Jason.  I'm not sure I see your
> point here.  I guess my point is that although the scenario of all the
> different items seems complex, it really does boil down to needing only
> exactly what I proposed earlier to fulfill the entire test matrix.

I have no problem with minimizing a bitmap, but I want the accessors
to make sense first.

My specific problem with your suggestion was combining cap_ib_mad,
cap_ib_sa, and cap_ib_smi into rdma_port_ib_fabric_mgmt.

Not only do the three cap things not return the same value for all
situations, the documentary knowledge is lost by the reduction.

I'd prefer we look at this from a 'what do the call sites need' view,
not a 'how do we minimize' view.

I've written this before: The mess here is that it is too hard to know
what the call sites are actually checking for when it is some baroque
conditional.

Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford April 9, 2015, 9:19 p.m. UTC | #10
On Thu, 2015-04-09 at 10:01 -0600, Jason Gunthorpe wrote:
> On Thu, Apr 09, 2015 at 10:34:30AM -0400, Doug Ledford wrote:
> 
> > These are exactly the tests I proposed Jason.  I'm not sure I see your
> > point here.  I guess my point is that although the scenario of all the
> > different items seems complex, it really does boil down to needing only
> > exactly what I proposed earlier to fulfill the entire test matrix.
> 
> I have no problem with minimizing a bitmap, but I want the accessors
> to make sense first.
> 
> My specific problem with your suggestion was combining cap_ib_mad,
> cap_ib_sa, and cap_ib_smi into rdma_port_ib_fabric_mgmt.
> 
> Not only do the three cap things not return the same value for all
> situations, the documentary knowledge is lost by the reduction.
> 
> I'd prefer we look at this from a 'what do the call sites need' view,
> not a 'how do we minimize' view.
> 
> I've written this before: The mess here is that it is too hard to know
> what the call sites are actually checking for when it is some baroque
> conditional.

The two goals: being specific about what the test is returning and
minimizing the bitmap footprint; are not necessarily opposed.  One can
do both at the same time.
Jason Gunthorpe April 9, 2015, 9:36 p.m. UTC | #11
On Thu, Apr 09, 2015 at 05:19:08PM -0400, Doug Ledford wrote:

> The two goals: being specific about what the test is returning and
> minimizing the bitmap footprint; are not necessarily opposed.  One can
> do both at the same time.

Agree

Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ira Weiny April 10, 2015, 6:16 a.m. UTC | #12
First off there are 2 separate issues here:

1) We need to communicate if a port supports or requires various management
   support from the ib_mad, ib_cm, and/or ib_sa modules.

2) We need to communicate how a addresses are formated and resolved for a
   particular port


In general I don't think we need to remove all uses of the Transport
or Link Layer.

Although we may be able to remove most of the transport uses.

On Wed, Apr 08, 2015 at 02:10:15PM -0600, Jason Gunthorpe wrote:
> On Wed, Apr 08, 2015 at 02:29:46PM -0400, Doug Ledford wrote:
> 
> > To straighten all this out, lets break management out into the two
> > distinct types:
> > 
> > rdma_port_ib_fabric_mgmt() <- fabric specific management tasks: MAD, SM,
> > multicast.  The proper test for this with my bitmap above is a simple
> > transport & RDMA_MGMT_IB test.  If will be true for IB and OPA fabrics.
> 
> > rdma_port_conn_mgmt() <- connection management, which we currently
> > support everything except USNIC (correct Sean?), so a test would be
> > something like !(transport & RDMA_TRANSPORT_USNIC).  This is then split
> > out into two subgroups, IB style and iWARP stype connection management
> > (aka, rdma_port_iw_conn_mgmt() and rdma_port_ib_conn_mgmt()).  In my
> > above bitmap, since I didn't give IBOE its own transport type, these
> > subgroups still boil down to the simple tests transport & iWARP and
> > transport & IB like they do today.
> 
> There is a lot more variation here than just these two tests, and those
> two tests won't scale to include OPA.
> 
>         IB ROCEE OPA
> SMI     Y  N     Y    (though the OPA smi looked a bit different)

Yes OPA is different but it is based on the class version of the individual
MADs not any particular device/port support.

> IB SMP  Y  N     N

Correction:

  IB SMP  Y  N     Y    (OPA supports the IB NodeInfo query)

> OPA SMP N  N     Y

How is this different from the SMI?

> GMP     Y  Y     Y
> SA      Y  N     Y
> PM      Y  Y     Y    (? guessing for OPA)
                        ^^^
			Yes

> CM      Y  Y     Y
> GMP needs GRH N Y N
> 
> It may be unrealistic, but I was hoping we could largely scrub the
> opaque 'is spec iWARP, is spec ROCEE' kinds of tests because they
> don't tell anyone what it is the code cares about.

I somewhat agree except for things like addressing.  In the area of addressing
I think we are likely to need to define something like "cap_addr_ib",
"cap_addr_iboe", "cap_addr_iwarp".  See below for more details.

> 
> Maybe what is needed is a more precise language for the functions:
> 
> > > + * cap_ib_mad - Check if the port of device has the capability
> > > Infiniband
> > > + * Management Datagrams.
> 
> As used this seems to mean:
> 
> True if the port can do IB/OPA SMP, or GMP management packets on QP0 or
> QP1. (Y Y Y) ie: Do we need the MAD layer at all.
> 
> ib_smi seems to be true if QP0 is supported  (Y N Y)
> 
> Maybe the above set would make a lot more sense as:
>   cap_ib_qp0
>   cap_ib_qp1
>   cap_opa_qp0

I disagree.

All we need right now is is cap_qp0.  All devices currently support QP1.

Then after all this is settled I can add:

               IB ROCEE OPA
OPA MAD Space   N  N    Y    Port is OPA MAD space.

> 
> ib_cm seems to mean that the CM protocol from the IBA is used on the
> port (Y Y Y)

Agree.

> 
> ib_sa means the IBA SA protocol is supported (Y Y Y)

I think this should be (Y N Y)

IBoE has no SA.  The IBoE code "fabricates" a Path Record it does not need to
interact with the SA.

> 
> ib_mcast true if the IBA SA protocol is used for multicast GIDs (Y N
> Y)

Given the above why can't we just have the "ib_sa" flag?

> 
> ipoib means the port supports the ipoib protocol (Y N ?)


OPA does IPoIB so...   (Y N Y)


However, I think checking the link layer is more appropriate here.  It does not
make sense to do IP over IB over Eth.  Even though the IBoE can do the "IB"
protocol.


Making flags in the driver to indicate which ULPs they support is a _bad_
_idea_.

FWIW: I don't consider the MAD and SA (multicast) modules ULPs.   Rather they
are helper modules which are built to share code amongst the drivers to process
things on behalf of the drivers themselves.  As such advertising a need for or
particular support within those modules make sense.

So although strictly speaking we could do IPoIBoEth, I think having IPoIB check
the LL and limiting itself to ports which are IB LL is appropriate.

> 
> This seem reasonable and understandable, even if they are currently a
> bit duplicating.
> 
> > Patch 9/17:
> > 
> > Most of the other comments on this patch stand as they are.  I would add
> > the test:
> > 
> > rdma_port_separate_read_sge(dev, port)
> > {
> > 	return dev->port[port]->transport & RDMA_SEPERATE_READ_SGE;
> > }
> > 
> > and add the helper function:
> > 
> > rdma_port_get_read_sge(dev, port)
> > {
> > 	if (rdma_transport_is_iwarp)
> > 		return 1;
> > 	return dev->port[port]->max_sge;
> > }
> 
> Hum, that is nice, but it doesn't quite fit with how the ULP needs to
> work. The max limit when creating a WR is the value passed into the
> qp_cap, not the device maximum limit.
> 
> To do this properly we need to extend the qp_cap, and that is just too
> big a change. A one bit iWarp quirk is OK for now.

I agree.  This is the one place we probably want to just keep the "Transport"
check.

> 
> > As Sean pointed out, force_grh should be rdma_dev_is_iboe().  The cm
> 
> I actually really prefer cap_mandatory_grh - that is what is going on
> here. ie based on that name (as a reviewer) I'd expect to see the mad
> layer check that the mandatory GRH is always present, or blow up.

While GRH mandatory (for the GMP) is what this is.  The function
ib_init_ah_from_path generically is really handling an "IBoE address" to send
to and therefore we need to force the GRH in the AH.

There is a whole slew of LL and Transport checks which are used to
format/resolve addresses.

Functions which need to know that the address format is "IBoE"

	-- cma.c: cma_acquire_dev
	-- cma.c: cma_modify_qp_rtr
	-- sa_query.c: ib_init_ah_from_path
	-- verbs.c: ib_resolve_eth_l2_attrs

What about a check rdma_port_req_iboe_addr()?

Functions where AF_IB is checked against LL because it does not make sense to
use AF_IB on anything but an IB LL

	-- cma.c: cma_listen_on_dev
	-- cma.c: cma_bind_loopback

These checks should be directly against the LL

Functions where the ARP type is checked against the link layer.

	-- cma.c: cma_acquire_dev
	-- cma.c: cma_bind_loopback

These checks should be directly against the LL

> 
> Some of the other checks in this file revolve around pkey, I'm not
> sure what rocee does there? cap_pkey_supported ?

It seems IBoE just hardcodes the pkey to 0xffff.  I don't see it used anywhere.

Function where port requires "real" PKey

	-- cma.c: cma_ib_init_qp_attr

Check rdma_port_req_pkey()?


Over all for the addressing choices:

The "Transport" (or protocol, or whatever) is Verbs.  The Layer below Verbs
(OPA/IB/Eth/TCP) defines how addressing, route, and connection information is
generated, communicated, and used.

As Jason and Doug have been saying sometimes we want to know when that requires
SA interaction or the use of the CM protocol (or neither).  Other times we just
need to know what the Address format or Link Layer is.


Ira

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Wang April 10, 2015, 7:46 a.m. UTC | #13
On 04/09/2015 11:19 PM, Doug Ledford wrote:
[snip]
>>
>> I've written this before: The mess here is that it is too hard to know
>> what the call sites are actually checking for when it is some baroque
>> conditional.
> 
> The two goals: being specific about what the test is returning and
> minimizing the bitmap footprint; are not necessarily opposed.  One can
> do both at the same time.

This could be internal reforming after the cap_XX() stuff works for core
layer, at that time we don't need to touch core layer anymore, just
introducing this bitmap stuff in verb layer, replacing the implementation
of these helpers with the bitmap check, and following the semantic (description).

Regards,
Michael Wang

> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ira Weiny April 10, 2015, 7:48 a.m. UTC | #14
> > diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> > index 18c1ece..a9587c4 100644
> > --- a/drivers/infiniband/core/device.c
> > +++ b/drivers/infiniband/core/device.c
> > @@ -76,6 +76,7 @@ static int ib_device_check_mandatory(struct ib_device *device)
> >  	} mandatory_table[] = {
> >  		IB_MANDATORY_FUNC(query_device),
> >  		IB_MANDATORY_FUNC(query_port),
> > +		IB_MANDATORY_FUNC(query_transport),
> >  		IB_MANDATORY_FUNC(query_pkey),
> >  		IB_MANDATORY_FUNC(query_gid),
> >  		IB_MANDATORY_FUNC(alloc_pd),
> 
> I'm concerned about the performance implications of this.  The size of
> this patchset already points out just how many places in the code we
> have to check for various aspects of the device transport in order to do
> the right thing.  Without going through the entire list to see how many
> are on critical hot paths, I'm sure some of them are on at least
> partially critical hot paths (like creation of new connections).  I
> would prefer to see this change be implemented via a device attribute,
> not a functional call query.  That adds a needless function call in
> these paths.

I like the idea of a query_transport but at the same time would like to see the
use of "transport" reduced.  A reduction in the use of this call could
eliminate most performance concerns.

So can we keep this abstraction if at the end of the series we limit its use?

> 
> > diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> > index f93eb8d..83370de 100644
> > --- a/drivers/infiniband/core/verbs.c
> > +++ b/drivers/infiniband/core/verbs.c
> > @@ -133,14 +133,16 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_
> >  	if (device->get_link_layer)
> >  		return device->get_link_layer(device, port_num);
> >  
> > -	switch (rdma_node_get_transport(device->node_type)) {
> > +	switch (device->query_transport(device, port_num)) {
> >  	case RDMA_TRANSPORT_IB:
> > +	case RDMA_TRANSPORT_IBOE:
> >  		return IB_LINK_LAYER_INFINIBAND;
> 
> If we are perserving ABI, then this looks wrong.  Currently, IBOE
> returnsi transport IB and link layer Ethernet.  It should not return
> link layer IB, it does not support IB link layer operations (such as MAD
> access).

I think the original code has the bug.

IBoE devices currently return a transport of IB but they probably never get
here because they support the get_link_layer callback used a few lines above.
So this "bug" was probably never hit.

> 
> >  	case RDMA_TRANSPORT_IWARP:
> >  	case RDMA_TRANSPORT_USNIC:
> >  	case RDMA_TRANSPORT_USNIC_UDP:
> >  		return IB_LINK_LAYER_ETHERNET;
> >  	default:
> > +		BUG();
> >  		return IB_LINK_LAYER_UNSPECIFIED;
> >  	}
> >  }
> 
> [ snip ]
> 
> > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> > index 65994a1..d54f91e 100644
> > --- a/include/rdma/ib_verbs.h
> > +++ b/include/rdma/ib_verbs.h
> > @@ -75,10 +75,13 @@ enum rdma_node_type {
> >  };
> >  
> >  enum rdma_transport_type {
> > +	/* legacy for users */
> >  	RDMA_TRANSPORT_IB,
> >  	RDMA_TRANSPORT_IWARP,
> >  	RDMA_TRANSPORT_USNIC,
> > -	RDMA_TRANSPORT_USNIC_UDP
> > +	RDMA_TRANSPORT_USNIC_UDP,
> > +	/* new transport */
> > +	RDMA_TRANSPORT_IBOE,
> >  };
> 
> I'm also concerned about this.  I would like to see this enum
> essentially turned into a bitmap.  One that is constructed in such a way
> that we can always get the specific test we need with only one compare
> against the overall value.  In order to do so, we need to break it down
> into the essential elements that are part of each of the transports.
> So, for instance, we can define the two link layers we have so far, plus
> reserve one for OPA which we know is coming:
> 
> RDMA_LINK_LAYER_IB       = 0x00000001,
> RDMA_LINK_LAYER_ETH      = 0x00000002,
> RDMA_LINK_LAYER_OPA      = 0x00000004,
> RDMA_LINK_LAYER_MASK     = 0x0000000f,

I would reserve more bits here.

> 
> We can then define the currently known high level transport types:
> 
> RDMA_TRANSPORT_IB        = 0x00000010,
> RDMA_TRANSPORT_IWARP     = 0x00000020,
> RDMA_TRANSPORT_USNIC     = 0x00000040,
> RDMA_TRANSPORT_USNIC_UDP = 0x00000080,
> RDMA_TRANSPORT_MASK      = 0x000000f0,

I would reserve more bits here.

> 
> We could then define bits for the IB management types:
> 
> RDMA_MGMT_IB             = 0x00000100,
> RDMA_MGMT_OPA            = 0x00000200,
> RDMA_MGMT_MASK           = 0x00000f00,

We at least need bits for SA / CM support.

I said previously all device types support QP1  I was wrong...  I forgot about
USNIC devices.  So the full management bit mask is.


RDMA_MGMT_IB_MAD         = 0x00000100,
RDMA_MGMT_QP0            = 0x00000200,
RDMA_MGMT_SA             = 0x00000400,
RDMA_MGMT_CM             = 0x00000800,
RDMA_MGMT_OPA_MAD        = 0x00001000,
RDMA_MGMT_MASK           = 0x000fff00,

With a couple of spares.

The MAD stack is pretty agnostic to the types of MADs passing through it so we
don't really need PM flags etc.

> 
> Then we have space to define specific quirks:
> 
> RDMA_SEPARATE_READ_SGE   = 0x00001000,
> RDMA_QUIRKS_MASK         = 0xfffff000

shift for spares...

RDMA_SEPARATE_READ_SGE   = 0x00100000,
RDMA_QUIRKS_MASK         = 0xfff00000

> 
> Once those are defined, a few definitions for device drivers to use when
> they initialize a device to set the bitmap to the right values:
> 
> #define IS_IWARP (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_IWARP |
> RDMA_SEPARATE_READ_SGE)
> #define IS_IB (RDMA_LINK_LAYER_IB | RDMA_TRANSPORT_IB | RDMA_MGMT_IB)
> #define IS_IBOE (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_IB)
> #define IS_USNIC (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_USNIC)
> #define IS_OPA (RDMA_LINK_LAYER_OPA | RDMA_TRANSPORT_IB | RDMA_MGMT_IB |
> RDMA_MGMT_OPA)
> 
> Then you need to define the tests:
> 
> static inline bool
> rdma_transport_is_iwarp(struct ib_device *dev, u8 port)
> {
> 	return dev->port[port]->transport & RDMA_TRANSPORT_IWARP;
> }
> 
> /* Note: this intentionally covers IB, IBOE, and OPA...use
>    rdma_dev_is_ib if you want to only get physical IB devices */
> static inline bool
> rdma_transport_is_ib(struct ibdev *dev)
> {
> 	return dev->port[port]->transport & RDMA_TRANSPORT_IB;
> }
> 
> rdma_port_is_ib(struct ib_device *dev, u8 port)

I prefer

rdma_port_link_layer_is_ib
rdma_port_link_layer_is_eth


> {
> 	return dev->port[port]->transport & RDMA_LINK_LAYER_IB;
> }
> 
> rdma_port_is_iboe(struct ib_device *dev, u8 port)

I'm not sure what this means.

rdma_port_req_iboe_addr seems more appropriate because what we really need to
know is that this device requires an IBoE address format.  (PKey is "fake",
PathRecord is fabricated rather than queried, GRH is required in the AH
conversion.)

> {
> 	return dev->port[port]->transport & IS_IBOE == IS_IBOE;
> }
> 
> rdma_port_is_usnic(struct ib_device *dev, u8 port)

rdma_transport_is_usnic

> {
> 	return dev->port[port]->transport & RDMA_TRANSPORT_USNIC;
> }
> 
> rdma_port_is_opa(struct ib_device *dev, u8 port)

rdma_port_link_layer_is_opa

> {
> 	return dev->port[port]->transport & RDMA_LINK_LAYER_OPA;
> }
> 
> rdma_port_is_iwarp(struct ib_device *dev, u8 port)
> {
> 	return rdma_transport_is_iwarp(dev, port);

Why not call rdma_transport_is_iwarp?

> }
> 
> rdma_port_ib_fabric_mgmt(struct ibdev *dev, u8 port)
> {
> 	return dev->port[port]->transport & RDMA_MGMT_IB;

I agree with Jason that this does not adequately describe the functionality we
are looking for.

> }
> 
> rdma_port_opa_mgmt(struct ibdev *dev, u8 port)

Agree.

> {
> 	return dev->port[port]->transport & RDMA_MGMT_OPA;
> }
> 
> Other things can be changed too.  Like rdma_port_get_link_layer can
> become this:
> 
> {
> 	return dev->transport & RDMA_LINK_LAYER_MASK;
> }
> 
> From patch 2/17:
> 
> 
> > +static inline int rdma_ib_mgmt(struct ib_device *device, u8 port_num)
> > +{
> > +       enum rdma_transport_type tp = device->query_transport(device,
> > port_num);
> > +
> > +       return (tp == RDMA_TRANSPORT_IB || tp == RDMA_TRANSPORT_IBOE);
> > +}
> 
> This looks wrong.  IBOE doesn't have IB management.  At least it doesn't
> have subnet management.

Right that is why we need a bit for CM vs SA capability.

> 
> Actually, reading through the remainder of the patches, there is some
> serious confusion taking place here.  In later patches, you use this as
> a surrogate for cap_cm, which implies you are talking about connection
> management.  This is very different than the rdma_dev_ib_mgmt() test
> that I create above, which specifically refers to IB management tasks
> unique to IB/OPA: MAD, SM, multicast.

multicast is part of SA so should be covered by the SA capability.

> 
> The kernel connection management code is not really limited.  It
> supports IB, IBOE, iWARP, and in the future it will support OPA.  There
> are some places in the CM code were we test for just IB/IBOE currently,
> but that's only because we split iWARP out higher up in the abstraction
> hierarchy.  So, calling something rdma_ib_mgmt and meaning a rather
> specialized tested in the CM is probably misleading.

Right!  So we should have a CM capability.

> 
> To straighten all this out, lets break management out into the two
> distinct types:
> 
> rdma_port_ib_fabric_mgmt() <- fabric specific management tasks: MAD, SM,
> multicast.  The proper test for this with my bitmap above is a simple
> transport & RDMA_MGMT_IB test.  If will be true for IB and OPA fabrics.

General management is covered by

RDMA_MGMT_IB_MAD         = 0x00000100,
RDMA_MGMT_QP0            = 0x00000200,
...
RDMA_MGMT_OPA_MAD        = 0x00001000,


> 
> rdma_port_conn_mgmt() <- connection management, which we currently
> support everything except USNIC (correct Sean?), so a test would be
> something like !(transport & RDMA_TRANSPORT_USNIC).  This is then split
> out into two subgroups, IB style and iWARP stype connection management
> (aka, rdma_port_iw_conn_mgmt() and rdma_port_ib_conn_mgmt()).  In my
> above bitmap, since I didn't give IBOE its own transport type, these
> subgroups still boil down to the simple tests transport & iWARP and
> transport & IB like they do today.

Specific management features CM, route resolution (SA), and special Multicast
management requirements (SA) are covered by:

RDMA_MGMT_SA             = 0x00000400,
RDMA_MGMT_CM             = 0x00000800,


> 
> From patch 3/17:
> 
> 
> > +/**
> > + * cap_ib_mad - Check if the port of device has the capability
> > Infiniband
> > + * Management Datagrams.
> > + *
> > + * @device: Device to be checked
> > + * @port_num: Port number of the device
> > + *
> > + * Return 0 when port of the device don't support Infiniband
> > + * Management Datagrams.
> > + */
> > +static inline int cap_ib_mad(struct ib_device *device, u8 port_num)
> > +{
> > +       return rdma_ib_mgmt(device, port_num);
> > +}
> > +
> 
> Why add cap_ib_mad?  It's nothing more than rdma_port_ib_fabric_mgmt
> with a new name.  Just use rdma_port_ib_fabric_mgmt() everywhere you
> have cap_ib_mad.

Because USNIC apparently does not support MADs at all.  So we end up needing a
"big flag" to turn on/off ib_mad.

RDMA_MGMT_IB_MAD         = 0x00000100,

> 
> From patch 4/17:
> 
> 
> > +/**
> > + * cap_ib_smi - Check if the port of device has the capability
> > Infiniband
> > + * Subnet Management Interface.
> > + *
> > + * @device: Device to be checked
> > + * @port_num: Port number of the device
> > + *
> > + * Return 0 when port of the device don't support Infiniband
> > + * Subnet Management Interface.
> > + */
> > +static inline int cap_ib_smi(struct ib_device *device, u8 port_num)
> > +{
> > +       return rdma_transport_ib(device, port_num);
> > +}
> > +
> 
> Same as the previous patch.  This is needless indirection.  Just use
> rdma_port_ib_fabric_mgmt directly.

No this is not the same...  You said:

<quote>
   rdma_port_ib_fabric_mgmt() <- fabric specific management tasks: MAD, SM,
   multicast.  The proper test for this with my bitmap above is a simple
   transport & RDMA_MGMT_IB test.  If will be true for IB and OPA fabrics.
</quote>

But what we are looking for here is "does the port support QP0"  Previously
flagged as "smi".  Cover this with this flag.

RDMA_MGMT_QP0            = 0x00000200,

So the optimized version of the above is:

static inline int cap_ib_smi(struct ib_device *device, u8 port_num)
{
       return [<device> <port>]->flags & RDMA_MGMT_QP0;
}


> 
> Patch 5/17:
> 
> Again, just use rdma_port_ib_conn_mgmt() directly.

Agreed.

> 
> Patch 6/17:
> 
> Again, just use rdma_port_ib_fabric_mgmt() directly.

No this needs to be

rdma_port_requires_sa()

or something...

> 
> Patch 7/17:
> 
> Again, just use rdma_port_ib_fabric_mgmt() directly.  It's perfectly
> applicable to the IB mcast registration requirements.

No this needs to be

rdma_port_requires_sa()

or something...

> 
> Patch 8/17:
> 
> Here we can create a new test if we are using the bitmap I created
> above:
> 
> rdma_port_ipoib(struct ib_device *dev, u8 port)
> {
> 	return !(dev->port[port]->transport & RDMA_LINK_LAYER_ETH);

I would prefer a link layer (or generic) bit mask rather than '&' of
"transport" and "link layer".  That just seems wrong.

> }
> 
> This is presuming that OPA will need ipoib devices.  This will cause all
> non-Ethernet link layer devices to return true, and right now, that is
> all IB and all OPA devices.

Agreed.

> 
> Patch 9/17:
> 
> Most of the other comments on this patch stand as they are.  I would add
> the test:
> 
> rdma_port_separate_read_sge(dev, port)
> {
> 	return dev->port[port]->transport & RDMA_SEPERATE_READ_SGE;
> }
> 
> and add the helper function:
> 
> rdma_port_get_read_sge(dev, port)
> {
> 	if (rdma_transport_is_iwarp)
> 		return 1;
> 	return dev->port[port]->max_sge;
> }
> 
> Then, as Jason points out, if at some point in the future the kernel is
> modified to support devices with assymetrical read/write SGE sizes, this
> function can be modified to support those devices.
> 
> Patch 10/17:
> 
> As Sean pointed out, force_grh should be rdma_dev_is_iboe().  The cm
> handles iw devices, but you notice all of the functions you modify here
> start with ib_.  The iwarp connections are funneled through iw_ specific
> function variants, and so even though the cm handles iwarp, ib, and roce
> devices, you never see anything other than ib/iboe (and opa in the
> future) get to the ib_ variants of the functions.  So, they wrote the
> original tests as tests against the link layer being ethernet and used
> that to differentiate between ib and iboe devices.  It works, but can
> confuse people.  So, everyplace that has !rdma_transport_ib should
> really be rdma_dev_is_iboe instead.  If we ever merge the iw_ and ib_
> functions in the future, having this right will help avoid problems.

I guess rdma_dev_is_iboe is ok.  But it seems like we are keying off the
addresses not necessarily the devices.

> 
> Patch 11/17:
> 
> I wouldn't reform the link_layer_show except to make it compile with the
> new defines I used above.  
> 
> Patch 12/17:
> 
> Go ahead and add a helper to check all ports on a dev, just make it
> rdma_hca_ib_conn_mgmt() and have it loop through
> rdma_port_ib_conn_mgmt() return codes.
> 
> Patch 13/17:
> 
> This patch is largely unneeded if we reworked the bitmap like I have
> above.  A lot of the changes you made to switch from case statements to
> multiple if statements can go back to being case statements because in
> the bitmap IB and IBOE are still both transport IB, so you just do the
> case on the transport bits and not on the link layer bits.
> 
> Patch 14/17:
> 
> Seems ok.
> 
> Patch 15/17:
> 
> If you implement the bitmap like I list above, then this code will need
> fixed up to use the bitmap.  Otherwise it looks OK.
> 
> Patch 16/17:
> 
> OK.
> 
> Patch 17/17:
> 
> I would drop this patch.  In the future, the mlx5 driver will support
> both Ethernet and IB like mlx4 does, and we would just need to pull this
> code back to core instead of only in mlx4.

Agreed.

Ira

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Wang April 10, 2015, 8:19 a.m. UTC | #15
On 04/09/2015 06:00 PM, Jason Gunthorpe wrote:
> On Thu, Apr 09, 2015 at 02:42:24PM +0200, Michael Wang wrote:
>> On 04/08/2015 10:10 PM, Jason Gunthorpe wrote:
>> [snip]
>>>
>>> Some of the other checks in this file revolve around pkey, I'm not
>>> sure what rocee does there? cap_pkey_supported ?
>>
>> I'm not sure if this count in capability... how shall we describe it?
> 
> I'm not sure how rocee uses pkey, but maybe the the GRH and pkey thing
> would work well together under a single 'cap_ethernet_ah' ?

Sounds better, we can use this in all the case that handling address
for eth-link-layer :-)

Regards,
Michael Wang

> 
> Jason
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Wang April 10, 2015, 8:25 a.m. UTC | #16
On 04/10/2015 08:16 AM, ira.weiny wrote:
> First off there are 2 separate issues here:
> 
> 1) We need to communicate if a port supports or requires various management
>    support from the ib_mad, ib_cm, and/or ib_sa modules.
> 
> 2) We need to communicate how a addresses are formated and resolved for a
>    particular port
> 
> 
> In general I don't think we need to remove all uses of the Transport
> or Link Layer.
> 
> Although we may be able to remove most of the transport uses.
> 
> On Wed, Apr 08, 2015 at 02:10:15PM -0600, Jason Gunthorpe wrote:
[snip]
> 
>>
>> Some of the other checks in this file revolve around pkey, I'm not
>> sure what rocee does there? cap_pkey_supported ?
> 
> It seems IBoE just hardcodes the pkey to 0xffff.  I don't see it used anywhere.
> 
> Function where port requires "real" PKey
> 
> 	-- cma.c: cma_ib_init_qp_attr
> 
> Check rdma_port_req_pkey()?

What about cap_eth_ah() for all the cases need eth addressing handling?

> 
> 
> Over all for the addressing choices:
> 
> The "Transport" (or protocol, or whatever) is Verbs.  The Layer below Verbs
> (OPA/IB/Eth/TCP) defines how addressing, route, and connection information is
> generated, communicated, and used.
> 
> As Jason and Doug have been saying sometimes we want to know when that requires
> SA interaction or the use of the CM protocol (or neither).  Other times we just
> need to know what the Address format or Link Layer is.

Till now it seems like we could be able to eliminate the link layer helper in core
layer, but I'll reserve that helper in next version, if later we do not need it anymore,
let's erase it then ;-)

Regards,
Michael Wang

> 
> 
> Ira
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ira Weiny April 10, 2015, 2:56 p.m. UTC | #17
On Fri, Apr 10, 2015 at 10:25:21AM +0200, Michael Wang wrote:
> On 04/10/2015 08:16 AM, ira.weiny wrote:
> > First off there are 2 separate issues here:
> > 
> > 1) We need to communicate if a port supports or requires various management
> >    support from the ib_mad, ib_cm, and/or ib_sa modules.
> > 
> > 2) We need to communicate how a addresses are formated and resolved for a
> >    particular port
> > 
> > 
> > In general I don't think we need to remove all uses of the Transport
> > or Link Layer.
> > 
> > Although we may be able to remove most of the transport uses.
> > 
> > On Wed, Apr 08, 2015 at 02:10:15PM -0600, Jason Gunthorpe wrote:
> [snip]
> > 
> >>
> >> Some of the other checks in this file revolve around pkey, I'm not
> >> sure what rocee does there? cap_pkey_supported ?
> > 
> > It seems IBoE just hardcodes the pkey to 0xffff.  I don't see it used anywhere.
> > 
> > Function where port requires "real" PKey
> > 
> > 	-- cma.c: cma_ib_init_qp_attr
> > 
> > Check rdma_port_req_pkey()?
> 
> What about cap_eth_ah() for all the cases need eth addressing handling?

That works.

> 
> > 
> > 
> > Over all for the addressing choices:
> > 
> > The "Transport" (or protocol, or whatever) is Verbs.  The Layer below Verbs
> > (OPA/IB/Eth/TCP) defines how addressing, route, and connection information is
> > generated, communicated, and used.
> > 
> > As Jason and Doug have been saying sometimes we want to know when that requires
> > SA interaction or the use of the CM protocol (or neither).  Other times we just
> > need to know what the Address format or Link Layer is.
> 
> Till now it seems like we could be able to eliminate the link layer helper in core
> layer, but I'll reserve that helper in next version, if later we do not need it anymore,
> let's erase it then ;-)
 
Eliminating Link Layer is fine if we can do it, but I still think that
something like IPoIB should check the link layer.

After sleeping on it the driver exporting cap_ipoib() does not seem _so_ bad
but I still see a distinction between ULPs like IPoIB and the other modules we
have been discussing.

Ira

> Regards,
> Michael Wang
> 
> > 
> > 
> > Ira
> > 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe April 10, 2015, 4:15 p.m. UTC | #18
On Fri, Apr 10, 2015 at 02:16:11AM -0400, ira.weiny wrote:

> >         IB ROCEE OPA
> > SMI     Y  N     Y    (though the OPA smi looked a bit different)
> 
> Yes OPA is different but it is based on the class version of the individual
> MADs not any particular device/port support.

> > OPA SMP N  N     Y
> 
> How is this different from the SMI?

Any code that generates SMPs and SMIs is going to need to know what
format to generate them in. It seems we have a sort of weird world
where IB SMPs are supported on OPA but not the IB SMI.

Not sure any users exist though..

> > Maybe the above set would make a lot more sense as:
> >   cap_ib_qp0
> >   cap_ib_qp1
> >   cap_opa_qp0
> 
> I disagree.
> 
> All we need right now is is cap_qp0.  All devices currently support QP1.

I didn't list iWarp in the table because everything is no, but it
doesn't support QP1.

> > ib_sa means the IBA SA protocol is supported (Y Y Y)
> 
> I think this should be (Y N Y)
> 
> IBoE has no SA.  The IBoE code "fabricates" a Path Record it does not need to
> interact with the SA.

I was wondering why there are so many checks in the SA code, I know
RoCEE doesn't use it, but why are there there?
 
> > ib_mcast true if the IBA SA protocol is used for multicast GIDs (Y N
> > Y)
> 
> Given the above why can't we just have the "ib_sa" flag?

Maybe I got it wrong, but yes, if it really means 'IBA SA protocol for
multicast then it can just be cap_sa.

But there is also the idea that some devices can't do multicast at all
(iWarp), we must care about that at some point?

> However, I think checking the link layer is more appropriate here.
> It does not make sense to do IP over IB over Eth.  Even though the
> IBoE can do the "IB" protocol.

Yes, it is ugly.

I think if we look closely we'll find that IPoIB today has a hard
requirement on cap_sa being true, so lets use that?

In fact any ULP that unconditionally uses the SA can use that.

> > I actually really prefer cap_mandatory_grh - that is what is going on
> > here. ie based on that name (as a reviewer) I'd expect to see the mad
> > layer check that the mandatory GRH is always present, or blow up.
> 
> While GRH mandatory (for the GMP) is what this is.  The function
> ib_init_ah_from_path generically is really handling an "IBoE address" to send
> to and therefore we need to force the GRH in the AH.

This make sense to me.

It appears we have at least rocee, rocee v2 (udp?), tcp, ib and opa
address and AH formats? opa would support ib addresses too I guess.

A 
 bool rdma_port_addr_is_XXX()

along with a

  enum AddrType rdma_port_addr_type()

Might be the thing? The latter should only be used with switch()

Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford April 10, 2015, 4:48 p.m. UTC | #19
On Fri, 2015-04-10 at 09:46 +0200, Michael Wang wrote:
> On 04/09/2015 11:19 PM, Doug Ledford wrote:
> [snip]
> >>
> >> I've written this before: The mess here is that it is too hard to know
> >> what the call sites are actually checking for when it is some baroque
> >> conditional.
> > 
> > The two goals: being specific about what the test is returning and
> > minimizing the bitmap footprint; are not necessarily opposed.  One can
> > do both at the same time.
> 
> This could be internal reforming after the cap_XX() stuff works for core
> layer, at that time we don't need to touch core layer anymore, just
> introducing this bitmap stuff in verb layer, replacing the implementation
> of these helpers with the bitmap check, and following the semantic (description).

Agreed.
Doug Ledford April 10, 2015, 5:10 p.m. UTC | #20
On Fri, 2015-04-10 at 03:48 -0400, ira.weiny wrote:
> > > diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> > > index 18c1ece..a9587c4 100644
> > > --- a/drivers/infiniband/core/device.c
> > > +++ b/drivers/infiniband/core/device.c
> > > @@ -76,6 +76,7 @@ static int ib_device_check_mandatory(struct ib_device *device)
> > >  	} mandatory_table[] = {
> > >  		IB_MANDATORY_FUNC(query_device),
> > >  		IB_MANDATORY_FUNC(query_port),
> > > +		IB_MANDATORY_FUNC(query_transport),
> > >  		IB_MANDATORY_FUNC(query_pkey),
> > >  		IB_MANDATORY_FUNC(query_gid),
> > >  		IB_MANDATORY_FUNC(alloc_pd),
> > 
> > I'm concerned about the performance implications of this.  The size of
> > this patchset already points out just how many places in the code we
> > have to check for various aspects of the device transport in order to do
> > the right thing.  Without going through the entire list to see how many
> > are on critical hot paths, I'm sure some of them are on at least
> > partially critical hot paths (like creation of new connections).  I
> > would prefer to see this change be implemented via a device attribute,
> > not a functional call query.  That adds a needless function call in
> > these paths.
> 
> I like the idea of a query_transport but at the same time would like to see the
> use of "transport" reduced.  A reduction in the use of this call could
> eliminate most performance concerns.
> 
> So can we keep this abstraction if at the end of the series we limit its use?

The reason I don't like a query is because the transport type isn't
changing.  It's a static device attribute.  The only devices that *can*
change their transport are mlx4 or mlx5 devices, and they tear down and
deregister their current device and bring up a new one when they need to
change transports or link layers.  So, this really isn't something we
should query, this should be part of our static device attributes.
Every other query in the list above is for something that changes.  This
is not.

> > 
> > > diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
> > > index f93eb8d..83370de 100644
> > > --- a/drivers/infiniband/core/verbs.c
> > > +++ b/drivers/infiniband/core/verbs.c
> > > @@ -133,14 +133,16 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_
> > >  	if (device->get_link_layer)
> > >  		return device->get_link_layer(device, port_num);
> > >  
> > > -	switch (rdma_node_get_transport(device->node_type)) {
> > > +	switch (device->query_transport(device, port_num)) {
> > >  	case RDMA_TRANSPORT_IB:
> > > +	case RDMA_TRANSPORT_IBOE:
> > >  		return IB_LINK_LAYER_INFINIBAND;
> > 
> > If we are perserving ABI, then this looks wrong.  Currently, IBOE
> > returnsi transport IB and link layer Ethernet.  It should not return
> > link layer IB, it does not support IB link layer operations (such as MAD
> > access).
> 
> I think the original code has the bug.
> 
> IBoE devices currently return a transport of IB but they probably never get
> here because they support the get_link_layer callback used a few lines above.
> So this "bug" was probably never hit.
> 
> > 
> > >  	case RDMA_TRANSPORT_IWARP:
> > >  	case RDMA_TRANSPORT_USNIC:
> > >  	case RDMA_TRANSPORT_USNIC_UDP:
> > >  		return IB_LINK_LAYER_ETHERNET;
> > >  	default:
> > > +		BUG();
> > >  		return IB_LINK_LAYER_UNSPECIFIED;
> > >  	}
> > >  }
> > 
> > [ snip ]
> > 
> > > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> > > index 65994a1..d54f91e 100644
> > > --- a/include/rdma/ib_verbs.h
> > > +++ b/include/rdma/ib_verbs.h
> > > @@ -75,10 +75,13 @@ enum rdma_node_type {
> > >  };
> > >  
> > >  enum rdma_transport_type {
> > > +	/* legacy for users */
> > >  	RDMA_TRANSPORT_IB,
> > >  	RDMA_TRANSPORT_IWARP,
> > >  	RDMA_TRANSPORT_USNIC,
> > > -	RDMA_TRANSPORT_USNIC_UDP
> > > +	RDMA_TRANSPORT_USNIC_UDP,
> > > +	/* new transport */
> > > +	RDMA_TRANSPORT_IBOE,
> > >  };
> > 
> > I'm also concerned about this.  I would like to see this enum
> > essentially turned into a bitmap.  One that is constructed in such a way
> > that we can always get the specific test we need with only one compare
> > against the overall value.  In order to do so, we need to break it down
> > into the essential elements that are part of each of the transports.
> > So, for instance, we can define the two link layers we have so far, plus
> > reserve one for OPA which we know is coming:
> > 
> > RDMA_LINK_LAYER_IB       = 0x00000001,
> > RDMA_LINK_LAYER_ETH      = 0x00000002,
> > RDMA_LINK_LAYER_OPA      = 0x00000004,
> > RDMA_LINK_LAYER_MASK     = 0x0000000f,
> 
> I would reserve more bits here.

Sure.  I didn't mean to imply that this was as large is these bit fields
would ever need to be, I just typed it up quickly at the time.

> > 
> > We can then define the currently known high level transport types:
> > 
> > RDMA_TRANSPORT_IB        = 0x00000010,
> > RDMA_TRANSPORT_IWARP     = 0x00000020,
> > RDMA_TRANSPORT_USNIC     = 0x00000040,
> > RDMA_TRANSPORT_USNIC_UDP = 0x00000080,
> > RDMA_TRANSPORT_MASK      = 0x000000f0,
> 
> I would reserve more bits here.
> 
> > 
> > We could then define bits for the IB management types:
> > 
> > RDMA_MGMT_IB             = 0x00000100,
> > RDMA_MGMT_OPA            = 0x00000200,
> > RDMA_MGMT_MASK           = 0x00000f00,
> 
> We at least need bits for SA / CM support.
> 
> I said previously all device types support QP1  I was wrong...  I forgot about
> USNIC devices.  So the full management bit mask is.
> 
> 
> RDMA_MGMT_IB_MAD         = 0x00000100,
> RDMA_MGMT_QP0            = 0x00000200,
> RDMA_MGMT_SA             = 0x00000400,
> RDMA_MGMT_CM             = 0x00000800,
> RDMA_MGMT_OPA_MAD        = 0x00001000,
> RDMA_MGMT_MASK           = 0x000fff00,
> 
> With a couple of spares.
> 
> The MAD stack is pretty agnostic to the types of MADs passing through it so we
> don't really need PM flags etc.
> 
> > 
> > Then we have space to define specific quirks:
> > 
> > RDMA_SEPARATE_READ_SGE   = 0x00001000,
> > RDMA_QUIRKS_MASK         = 0xfffff000
> 
> shift for spares...
> 
> RDMA_SEPARATE_READ_SGE   = 0x00100000,
> RDMA_QUIRKS_MASK         = 0xfff00000
> 
> > 
> > Once those are defined, a few definitions for device drivers to use when
> > they initialize a device to set the bitmap to the right values:
> > 
> > #define IS_IWARP (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_IWARP |
> > RDMA_SEPARATE_READ_SGE)
> > #define IS_IB (RDMA_LINK_LAYER_IB | RDMA_TRANSPORT_IB | RDMA_MGMT_IB)
> > #define IS_IBOE (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_IB)
> > #define IS_USNIC (RDMA_LINK_LAYER_ETH | RDMA_TRANSPORT_USNIC)
> > #define IS_OPA (RDMA_LINK_LAYER_OPA | RDMA_TRANSPORT_IB | RDMA_MGMT_IB |
> > RDMA_MGMT_OPA)
> > 
> > Then you need to define the tests:
> > 
> > static inline bool
> > rdma_transport_is_iwarp(struct ib_device *dev, u8 port)
> > {
> > 	return dev->port[port]->transport & RDMA_TRANSPORT_IWARP;
> > }
> > 
> > /* Note: this intentionally covers IB, IBOE, and OPA...use
> >    rdma_dev_is_ib if you want to only get physical IB devices */
> > static inline bool
> > rdma_transport_is_ib(struct ibdev *dev)
> > {
> > 	return dev->port[port]->transport & RDMA_TRANSPORT_IB;
> > }
> > 
> > rdma_port_is_ib(struct ib_device *dev, u8 port)
> 
> I prefer
> 
> rdma_port_link_layer_is_ib
> rdma_port_link_layer_is_eth

In my quick example, anything that started with rdma_transport* was
testing the high level transport attribute of any given port of any
given device, and anything that started with rdma_port* was testing the
link layer on a specific port of any given device.

> 
> > {
> > 	return dev->port[port]->transport & RDMA_LINK_LAYER_IB;
> > }
> > 
> > rdma_port_is_iboe(struct ib_device *dev, u8 port)
> 
> I'm not sure what this means.
> 
> rdma_port_req_iboe_addr seems more appropriate because what we really need to
> know is that this device requires an IBoE address format.  (PKey is "fake",
> PathRecord is fabricated rather than queried, GRH is required in the AH
> conversion.)

TomAto, Tomato...port_is_ib implicity means port_req_iboe_addr.
> 
> > {
> > 	return dev->port[port]->transport & IS_IBOE == IS_IBOE;
> > }
> > 
> > rdma_port_is_usnic(struct ib_device *dev, u8 port)
> 
> rdma_transport_is_usnic
> 
> > {
> > 	return dev->port[port]->transport & RDMA_TRANSPORT_USNIC;
> > }
> > 
> > rdma_port_is_opa(struct ib_device *dev, u8 port)
> 
> rdma_port_link_layer_is_opa
> 
> > {
> > 	return dev->port[port]->transport & RDMA_LINK_LAYER_OPA;
> > }
> > 
> > rdma_port_is_iwarp(struct ib_device *dev, u8 port)
> > {
> > 	return rdma_transport_is_iwarp(dev, port);
> 
> Why not call rdma_transport_is_iwarp?

As per my above statement, rdma_transport* tests were testing the high
level transport type, rdma_port* types were testing link layers.  iWARP
has an Eth link layer, so technically port_is_iwarp makes no sense.  But
since all the other types had a check too, I included port_is_iwarp just
to be complete, and if you are going to ask if a specific port is iwarp
as a link layer, it makes sense to say yes if the transport is iwarp,
not if the link layer is eth.

[ snip lots of stuff that is all correct ]

> > Patch 8/17:
> > 
> > Here we can create a new test if we are using the bitmap I created
> > above:
> > 
> > rdma_port_ipoib(struct ib_device *dev, u8 port)
> > {
> > 	return !(dev->port[port]->transport & RDMA_LINK_LAYER_ETH);
> 
> I would prefer a link layer (or generic) bit mask rather than '&' of
> "transport" and "link layer".  That just seems wrong.

I kept the name transport, but it's really a device attribute bitmap.
And of all the link layer types, eth is the only one for which IPoIB
makes no sense (even if it's possible to do).  So, as long as the ETH
bit isn't set, we're good to go.  But, calling it
dev->port[port]->attributes instead of transport would make it more
clear what it is.

> I guess rdma_dev_is_iboe is ok.  But it seems like we are keying off the
> addresses not necessarily the devices.

They're one and the same.  The addresses go with the device, the device
goes with the addresses.  You never have one without the other.  The
name of the check is not really important, just as long as it's clearly
documented.  I get why you link the address variant, because it pops out
all the things that are special about IBoE addressing and calls out that
the issues need to be handled.  However, saying requires_iboe_addr(),
while foreshadowing the work that needs done, doesn't actually document
the work that needs done.  Whether we call is dev_is_iboe() or
requires_iboe_addr(), it would be good if the documentation spelled out
those specific requirements for reference sake.
Jason Gunthorpe April 10, 2015, 5:36 p.m. UTC | #21
On Fri, Apr 10, 2015 at 01:10:43PM -0400, Doug Ledford wrote:

> documented.  I get why you link the address variant, because it pops out
> all the things that are special about IBoE addressing and calls out that
> the issues need to be handled.  However, saying requires_iboe_addr(),
> while foreshadowing the work that needs done, doesn't actually document
> the work that needs done.  Whether we call is dev_is_iboe() or
> requires_iboe_addr(), it would be good if the documentation spelled out
> those specific requirements for reference sake.

My deep hope for this, was that the test 'requires_iboe_addr' or
whatever we call it would have a *really good* kdoc.

List all the ways iboe_addr's work, how they differ from IB addresses,
refer to the specs people should read to understand it, etc.

The patches don't do this, and maybe Michael is the wrong person to
fill that in, but we can get it done..

Jason

BTW: Michael, next time you post the series, please trim the CC
list...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ira Weiny April 10, 2015, 5:38 p.m. UTC | #22
On Fri, Apr 10, 2015 at 10:15:51AM -0600, Jason Gunthorpe wrote:
> On Fri, Apr 10, 2015 at 02:16:11AM -0400, ira.weiny wrote:
> 
> > >         IB ROCEE OPA
> > > SMI     Y  N     Y    (though the OPA smi looked a bit different)
> > 
> > Yes OPA is different but it is based on the class version of the individual
> > MADs not any particular device/port support.
> 
> > > OPA SMP N  N     Y
> > 
> > How is this different from the SMI?
> 
> Any code that generates SMPs and SMIs is going to need to know what
> format to generate them in. It seems we have a sort of weird world
> where IB SMPs are supported on OPA but not the IB SMI.

My mistake it was late.  The MAD stack needs to know if it should implement the
IB SMI or the IB & OPA SMI.  That also implies the use of QP0 or vise versa.

In my email to Doug I suggested "OPA MAD" which covers the need to implement
the OPA SMI on that device for MADs which have that class version.

> 
> Not sure any users exist though..

I think all the "users" are either userspace or the drivers themselves.  It is
really just the common SMI processing which needs to be turned on/off.  Again I
think this can be covered with the QP0 flag.

RDMA_MGMT_QP0            = 0x00000200,

> 
> > > Maybe the above set would make a lot more sense as:
> > >   cap_ib_qp0
> > >   cap_ib_qp1
> > >   cap_opa_qp0
> > 
> > I disagree.
> > 
> > All we need right now is is cap_qp0.  All devices currently support QP1.
> 
> I didn't list iWarp in the table because everything is no, but it
> doesn't support QP1.

Isn't ocrdma an iWarp device?

int ocrdma_process_mad(struct ib_device *ibdev,
                       int process_mad_flags,
                       u8 port_num,
                       struct ib_wc *in_wc,
                       struct ib_grh *in_grh,
                       struct ib_mad *in_mad, struct ib_mad *out_mad)
{
        int status;
        struct ocrdma_dev *dev;

        switch (in_mad->mad_hdr.mgmt_class) {
        case IB_MGMT_CLASS_PERF_MGMT:
                dev = get_ocrdma_dev(ibdev);
                if (!ocrdma_pma_counters(dev, out_mad))
                        status = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
                else
                        status = IB_MAD_RESULT_SUCCESS;
                break;
        default:
                status = IB_MAD_RESULT_SUCCESS;
                break;
        }
        return status;
}


Regardless I was wrong.  USNIC devices don't support MADs at all.  So we do
need the "supports MADs at all flag".

RDMA_MGMT_IB_MAD         = 0x00000100,

> 
> > > ib_sa means the IBA SA protocol is supported (Y Y Y)
> > 
> > I think this should be (Y N Y)
> > 
> > IBoE has no SA.  The IBoE code "fabricates" a Path Record it does not need to
> > interact with the SA.
> 
> I was wondering why there are so many checks in the SA code, I know
> RoCEE doesn't use it, but why are there there?

Which checks are you referring to?  I think there are separate calls to query
the SA when running on IB for both the Route resolution and the Multicast join
operations.  The choice of those calls could be made by a "cap_sa()" helper.

>  
> > > ib_mcast true if the IBA SA protocol is used for multicast GIDs (Y N
> > > Y)
> > 
> > Given the above why can't we just have the "ib_sa" flag?
> 
> Maybe I got it wrong, but yes, if it really means 'IBA SA protocol for
> multicast then it can just be cap_sa.
> 
> But there is also the idea that some devices can't do multicast at all
> (iWarp), we must care about that at some point?

I was not sure how to handle this either...  I guess you need a
cap_multicast()???

> 
> > However, I think checking the link layer is more appropriate here.
> > It does not make sense to do IP over IB over Eth.  Even though the
> > IBoE can do the "IB" protocol.
> 
> Yes, it is ugly.
> 
> I think if we look closely we'll find that IPoIB today has a hard
> requirement on cap_sa being true, so lets use that?

I don't think that is appropriate.  You have been advocating that the checks
be clear as to what support we need.  While currently the IPoIB layer does (for
IB and OPA) require an SA I think those checks are only appropriate when it is
attempting an SA query.

The choice to run IPoIB at all is a different matter.

> 
> In fact any ULP that unconditionally uses the SA can use that.

They _can_ use that but the point of this exercise (and additional checks going
forward) is that we don't "hide" meaning like this.

IPoIB should restrict itself to running on IB link layers.  Should additional
link layers be added which IPoIB works on then we add that check.

> 
> > > I actually really prefer cap_mandatory_grh - that is what is going on
> > > here. ie based on that name (as a reviewer) I'd expect to see the mad
> > > layer check that the mandatory GRH is always present, or blow up.
> > 
> > While GRH mandatory (for the GMP) is what this is.  The function
> > ib_init_ah_from_path generically is really handling an "IBoE address" to send
> > to and therefore we need to force the GRH in the AH.
> 
> This make sense to me.
> 
> It appears we have at least rocee, rocee v2 (udp?), tcp, ib and opa
> address and AH formats?

Seems that way.  But has the rocee v2 been accepted?

> opa would support ib addresses too I guess.

Yes opa address == ib addresses.  So there is no need to distinguish them.

> 
> A 
>  bool rdma_port_addr_is_XXX()
> 
> along with a
> 
>   enum AddrType rdma_port_addr_type()
> 
> Might be the thing? The latter should only be used with switch()

Sounds good to me.  But Doug has a point that the address type and the "port"
type go together.  So this could probably be the same call for both of those.

Ira

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford April 10, 2015, 5:49 p.m. UTC | #23
On Fri, 2015-04-10 at 13:38 -0400, ira.weiny wrote:

> Isn't ocrdma an iWarp device?

No, it's roce.  It and mlx4 roce devices currently interoperate.

> > I think if we look closely we'll find that IPoIB today has a hard
> > requirement on cap_sa being true, so lets use that?
> 
> I don't think that is appropriate.  You have been advocating that the checks
> be clear as to what support we need.  While currently the IPoIB layer does (for
> IB and OPA) require an SA I think those checks are only appropriate when it is
> attempting an SA query.
> 
> The choice to run IPoIB at all is a different matter.

Appropriately named or not, Jason's choice of words "has a hard
requirement" is correct ;-)  For IPoIB, the broadcast group of the fake
Ethernet fabric is a very specific IB multicast group per the IPoIB
spec.

> > 
> > In fact any ULP that unconditionally uses the SA can use that.
> 
> They _can_ use that but the point of this exercise (and additional checks going
> forward) is that we don't "hide" meaning like this.
> 
> IPoIB should restrict itself to running on IB link layers.  Should additional
> link layers be added which IPoIB works on then we add that check.

I think your right that checking the link layer is the right thing, and
for now, there is no need to check cap_sa because the link layer check
enforces it.  In the future, if there is a new link layer we want to use
this on, and it doesn't have an sa, then we have to enable sa checks and
alternate methods at that time.
Tom Talpey April 10, 2015, 5:50 p.m. UTC | #24
On 4/10/2015 1:10 PM, Doug Ledford wrote:
> As per my above statement, rdma_transport* tests were testing the high
> level transport type, rdma_port* types were testing link layers.  iWARP
> has an Eth link layer, so technically port_is_iwarp makes no sense.  But
> since all the other types had a check too, I included port_is_iwarp just
> to be complete, and if you are going to ask if a specific port is iwarp
> as a link layer, it makes sense to say yes if the transport is iwarp,
> not if the link layer is eth.

Not wanting to split hairs, but I would not rule out the possibility
of a future device supporting iWARP on one port and another RDMA
protocol on another. One could also imagine softiWARP and softROCE
co-existing atop a single ethernet NIC.

So, I disagree that port_is_iwarp() is a nonsequitur.

Tom.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe April 10, 2015, 6:04 p.m. UTC | #25
On Fri, Apr 10, 2015 at 01:38:38PM -0400, ira.weiny wrote:
 
> Isn't ocrdma an iWarp device?

No, it is RoCEE

> > I was wondering why there are so many checks in the SA code, I know
> > RoCEE doesn't use it, but why are there there?
> 
> Which checks are you referring to?  I think there are separate calls to query
> the SA when running on IB for both the Route resolution and the Multicast join
> operations.  The choice of those calls could be made by a "cap_sa()" helper.

I will point them out next time I look through the patches

> > I think if we look closely we'll find that IPoIB today has a hard
> > requirement on cap_sa being true, so lets use that?
> 
> I don't think that is appropriate.  You have been advocating that the checks
> be clear as to what support we need. 

Right, but this is narrow, and we are not hiding meaning.

Look at the IPoIB ULP, and look at the hard requiments of the code,
then translate those back to our new cap scheme. We see today's IPoIB
will not run without:
 - UD support
 - IB addressing
 - IB multicast
 - IB SA
 - CM (optional)

It seems perfectly correct for a ULP to say at the very start, I need
all these caps, or I will not run (how could it run?). This is true of
any ULP that has a hard need to use those APIs.

That would seem to be the very essance of the cap scheme. Declare what
you need, not what standard you think you need.

Hiding meaning is to say 'only run on IB or OPA': WHY are we limited
to those two?

> While currently the IPoIB layer does (for IB and OPA) require an SA
> I think those checks are only appropriate when it is attempting an
> SA query.

That doesn't make any sense unless someone also adds support for
handling the !SA case.

> > It appears we have at least rocee, rocee v2 (udp?), tcp, ib and opa
> > address and AH formats?
> 
> Seems that way.  But has the rocee v2 been accepted?

Don't know much about it yet, patches exist, it seems to have a
slightly different addressing format.

> > opa would support ib addresses too I guess.
> 
> Yes opa address == ib addresses.  So there is no need to distinguish them.

The patches you sent showed a different LRH format for OPA (eg 32 bit
LID), so someday we will need to know that the full 32 bit LID is
available.

We can see how this might work in future, lets say OPAv2 *requires* the
32 bit LID, for that case cap_ib_address = 0 cap_opa_address = 1. If
we don't update IPoIB and it uses the tests from above then it
immediately, and correctly, stops running on those OPAv2 devices.

Once patched to support cap_op_address then it will begin working
again. That seems very sane..

Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ira Weiny April 10, 2015, 6:11 p.m. UTC | #26
On Fri, Apr 10, 2015 at 01:49:32PM -0400, Doug Ledford wrote:
> On Fri, 2015-04-10 at 13:38 -0400, ira.weiny wrote:
> 
> 
> > > I think if we look closely we'll find that IPoIB today has a hard
> > > requirement on cap_sa being true, so lets use that?
> > 
> > I don't think that is appropriate.  You have been advocating that the checks
> > be clear as to what support we need.  While currently the IPoIB layer does (for
> > IB and OPA) require an SA I think those checks are only appropriate when it is
> > attempting an SA query.
> > 
> > The choice to run IPoIB at all is a different matter.
> 
> Appropriately named or not, Jason's choice of words "has a hard
> requirement" is correct ;-)

Agreed.  I meant that using "cap_sa" is not appropriate.  Not that IPoIB did
not have a hard requirement...  :-D

I actually think that _both_ the check for IB link layer and the "cap_sa" is
required.  Perhaps not at start up...

Ira

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford April 10, 2015, 6:17 p.m. UTC | #27
On Fri, 2015-04-10 at 13:50 -0400, Tom Talpey wrote:
> On 4/10/2015 1:10 PM, Doug Ledford wrote:
> > As per my above statement, rdma_transport* tests were testing the high
> > level transport type, rdma_port* types were testing link layers.  iWARP
> > has an Eth link layer, so technically port_is_iwarp makes no sense.  But
> > since all the other types had a check too, I included port_is_iwarp just
> > to be complete, and if you are going to ask if a specific port is iwarp
> > as a link layer, it makes sense to say yes if the transport is iwarp,
> > not if the link layer is eth.
> 
> Not wanting to split hairs, but I would not rule out the possibility
> of a future device supporting iWARP on one port and another RDMA
> protocol on another. One could also imagine softiWARP and softROCE
> co-existing atop a single ethernet NIC.
> 
> So, I disagree that port_is_iwarp() is a nonsequitur.

Agreed, but that wasn't what I was calling non-sense.  I was referring
to the fact that in my quick little write up, the rdma_port* functions
were all intended to test link layers, not high level transports.  There
is no such thing as an iWARP link layer.  It was still a port specific
test, and would work in all the situations you described, it's just that
asking if a port's link layer is iWARP makes no sense, so I returned
true if the transport was iWARP regardless of what the link layer
actually was.
Doug Ledford April 10, 2015, 6:24 p.m. UTC | #28
On Fri, 2015-04-10 at 12:04 -0600, Jason Gunthorpe wrote:
> On Fri, Apr 10, 2015 at 01:38:38PM -0400, ira.weiny wrote:
>  
> Hiding meaning is to say 'only run on IB or OPA': WHY are we limited
> to those two?

For something else, I might agree with this.  But, for the specific case
of IPoIB, it's pretty fair.  IPoIB is more than just an ULP.  It's a
spec.  And it's very IB specific.  It will only work with OPA because
OPA is imitating IB.  To run it on another fabric, you would need more
than just to make it work.  If the new fabric doesn't have a broadcast
group, or has multicast registration like IB does, you need the
equivalent of IBTA, whatever that may be for this new fabric, buy in on
the pre-defined multicast groups and you might need firmware support in
the switches.

> We can see how this might work in future, lets say OPAv2 *requires* the
> 32 bit LID, for that case cap_ib_address = 0 cap_opa_address = 1. If
> we don't update IPoIB and it uses the tests from above then it
> immediately, and correctly, stops running on those OPAv2 devices.
> 
> Once patched to support cap_op_address then it will begin working
> again. That seems very sane..

It is very sane from an implementation standpoint, but from the larger
interoperability standpoint, you need that spec to be extended to the
new fabric simultaneously.
Jason Gunthorpe April 10, 2015, 7:17 p.m. UTC | #29
On Fri, Apr 10, 2015 at 02:24:26PM -0400, Doug Ledford wrote:

> IPoIB is more than just an ULP.  It's a spec.  And it's very IB
> specific.  It will only work with OPA because OPA is imitating IB.
> To run it on another fabric, you would need more than just to make
> it work.  If the new fabric doesn't have a broadcast group, or has
> multicast registration like IB does, you need the equivalent of
> IBTA, whatever that may be for this new fabric, buy in on the
> pre-defined multicast groups and you might need firmware support in
> the switches.

It feels like the 'cap_ib_addressing' or whatever we call it captures
this very well. The IPoIB RFC is very much concerned with GID's and
MGID's and broadly requires the IBA addressing
scheme. cap_ib_addressing asserts the port uses that scheme. 

We wouldn't accept patches to IPoIB to add a new addressing scheme
without seeing proper diligence to the standards work.

Looking away from the stadards, using cap_XX seems very sane: We are
building a well defined system of invarients, You can't call into the
sa functions if cap_sa is not set, you can't call into the mcast
functions if cap_mcast is not set, you can't form a AH from IB
GIDs/MGID/LID without cap_ib_addressing.

I makes so much sense for the ULP to directly require the needed cap's
for the kernel APIs it intends to call, or not use the RDMA port at
all.

> > We can see how this might work in future, lets say OPAv2 *requires* the
> > 32 bit LID, for that case cap_ib_address = 0 cap_opa_address = 1. If
> > we don't update IPoIB and it uses the tests from above then it
> > immediately, and correctly, stops running on those OPAv2 devices.
> > 
> > Once patched to support cap_op_address then it will begin working
> > again. That seems very sane..
> 
> It is very sane from an implementation standpoint, but from the larger
> interoperability standpoint, you need that spec to be extended to the
> new fabric simultaneously.

I liked the OPAv2 hypothetical because it doesn't actually touch the
IPoIB spec. IPoIB spec has little to say about LIDs or LRHs it works
entirely at the GID/MGID/GRH level.

Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ira Weiny April 10, 2015, 8:38 p.m. UTC | #30
On Fri, Apr 10, 2015 at 12:04:55PM -0600, Jason Gunthorpe wrote:
> On Fri, Apr 10, 2015 at 01:38:38PM -0400, ira.weiny wrote:
>  
> > 
> > I don't think that is appropriate.  You have been advocating that the checks
> > be clear as to what support we need. 
> 
> Right, but this is narrow, and we are not hiding meaning.
> 
> Look at the IPoIB ULP, and look at the hard requiments of the code,
> then translate those back to our new cap scheme. We see today's IPoIB
> will not run without:
>  - UD support
>  - IB addressing
>  - IB multicast
>  - IB SA
>  - CM (optional)
> 
> It seems perfectly correct for a ULP to say at the very start, I need
> all these caps, or I will not run (how could it run?). This is true of
> any ULP that has a hard need to use those APIs.

Having IPoIB check for all of these and fail to start if not supported is good.
But the suggestion before was to have "cap_ipoib".  I don't think we want that.

> 
> That would seem to be the very essance of the cap scheme. Declare what
> you need, not what standard you think you need.

Right.

> 
> Hiding meaning is to say 'only run on IB or OPA': WHY are we limited
> to those two?

Because only those 2 support the list of capabilities above.

> 
> > While currently the IPoIB layer does (for IB and OPA) require an SA
> > I think those checks are only appropriate when it is attempting an
> > SA query.
> 
> That doesn't make any sense unless someone also adds support for
> handling the !SA case.

Fair enough but IPoIB does need to check for SA support now as well as IB
addressing which is currently IB Link Layer (although as Doug said the Port and
Address format go hand in hand.  So I'm happy calling it whatever.)

> 
> > > It appears we have at least rocee, rocee v2 (udp?), tcp, ib and opa
> > > address and AH formats?
> > 
> > Seems that way.  But has the rocee v2 been accepted?
> 
> Don't know much about it yet, patches exist, it seems to have a
> slightly different addressing format.
> 
> > > opa would support ib addresses too I guess.
> > 
> > Yes opa address == ib addresses.  So there is no need to distinguish them.
> 
> The patches you sent showed a different LRH format for OPA (eg 32 bit
> LID), so someday we will need to know that the full 32 bit LID is
> available.
> 
> We can see how this might work in future, lets say OPAv2 *requires* the
> 32 bit LID, for that case cap_ib_address = 0 cap_opa_address = 1. If
> we don't update IPoIB and it uses the tests from above then it
> immediately, and correctly, stops running on those OPAv2 devices.

For your hypothetical case, agreed.  But I want to make it clear for those who
may be casually reading this thread that OPA addresses are IB addresses right
now.

> 
> Once patched to support cap_op_address then it will begin working
> again. That seems very sane..

Agreed.

Ira

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ira Weiny April 10, 2015, 9:06 p.m. UTC | #31
On Fri, Apr 10, 2015 at 01:17:23PM -0600, Jason Gunthorpe wrote:
> On Fri, Apr 10, 2015 at 02:24:26PM -0400, Doug Ledford wrote:
> 
> > IPoIB is more than just an ULP.  It's a spec.  And it's very IB
> > specific.  It will only work with OPA because OPA is imitating IB.
> > To run it on another fabric, you would need more than just to make
> > it work.  If the new fabric doesn't have a broadcast group, or has
> > multicast registration like IB does, you need the equivalent of
> > IBTA, whatever that may be for this new fabric, buy in on the
> > pre-defined multicast groups and you might need firmware support in
> > the switches.
> 
> It feels like the 'cap_ib_addressing' or whatever we call it captures
> this very well. The IPoIB RFC is very much concerned with GID's and
> MGID's and broadly requires the IBA addressing
> scheme. cap_ib_addressing asserts the port uses that scheme. 
> 
> We wouldn't accept patches to IPoIB to add a new addressing scheme
> without seeing proper diligence to the standards work.
> 
> Looking away from the stadards, using cap_XX seems very sane: We are
> building a well defined system of invarients, You can't call into the
> sa functions if cap_sa is not set, you can't call into the mcast
> functions if cap_mcast is not set, you can't form a AH from IB
> GIDs/MGID/LID without cap_ib_addressing.

Yep.

> 
> I makes so much sense for the ULP to directly require the needed cap's
> for the kernel APIs it intends to call, or not use the RDMA port at
> all.

Yes.

So trying to sum up.

Have we settled on the following "capabilities"?  Helper function names aside.

/* legacy to communicate to userspace */
RDMA_LINK_LAYER_IB       = 0x0000000000000001,
RDMA_LINK_LAYER_ETH      = 0x0000000000000002,
RDMA_LINK_LAYER_MASK     = 0x000000000000000f, /* more bits? */
	/* I'm hoping we don't need more bits here */


/* legacy to communicate to userspace */
RDMA_TRANSPORT_IB        = 0x0000000000000010,
RDMA_TRANSPORT_IWARP     = 0x0000000000000020,
RDMA_TRANSPORT_USNIC     = 0x0000000000000040,
RDMA_TRANSPORT_USNIC_UDP = 0x0000000000000080,
RDMA_TRANSPORT_MASK      = 0x00000000000000f0, /* more bits? */
	/* I'm hoping we don't need more bits here */


/* New flags */

RDMA_MGMT_IB_MAD         = 0x0000000000000100, /* ib_mad module support */
RDMA_MGMT_QP0            = 0x0000000000000200, /* ib_mad QP0 support */
RDMA_MGMT_IB_SA          = 0x0000000000000400, /* ib_sa module support */
					       /* NOTE includes IB Mcast */
RDMA_MGMT_IB_CM          = 0x0000000000000800, /* ib_cm module support */
RDMA_MGMT_OPA_MAD        = 0x0000000000001000, /* ib_mad OPA MAD support */
RDMA_MGMT_MASK           = 0x00000000000fff00,

RDMA_ADDR_IB             = 0x0000000000100000, /* Port does IB AH, PR, Pkey */
RDMA_ADDR_IBoE           = 0x0000000000200000, /* Port does IBoE AH, PR, Pkey */
/* Do we need iWarp (TCP) here? */
RDMA_ADDR_IB_MASK        = 0x000000000ff00000,


RDMA_SEPARATE_READ_SGE   = 0x0000000010000000,
RDMA_QUIRKS_MASK         = 0x000000fff0000000


> 
> > > We can see how this might work in future, lets say OPAv2 *requires* the
> > > 32 bit LID, for that case cap_ib_address = 0 cap_opa_address = 1. If
> > > we don't update IPoIB and it uses the tests from above then it
> > > immediately, and correctly, stops running on those OPAv2 devices.
> > > 
> > > Once patched to support cap_op_address then it will begin working
> > > again. That seems very sane..
> > 
> > It is very sane from an implementation standpoint, but from the larger
> > interoperability standpoint, you need that spec to be extended to the
> > new fabric simultaneously.
> 
> I liked the OPAv2 hypothetical because it doesn't actually touch the
> IPoIB spec. IPoIB spec has little to say about LIDs or LRHs it works
> entirely at the GID/MGID/GRH level.

Agreed.

Ira

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Talpey April 11, 2015, 12:01 a.m. UTC | #32
On 4/10/2015 5:06 PM, ira.weiny wrote:
> On Fri, Apr 10, 2015 at 01:17:23PM -0600, Jason Gunthorpe wrote:
>...
> So trying to sum up.
>
> Have we settled on the following "capabilities"?  Helper function names aside.
>
> /* legacy to communicate to userspace */
> RDMA_LINK_LAYER_IB       = 0x0000000000000001,
> RDMA_LINK_LAYER_ETH      = 0x0000000000000002,
> RDMA_LINK_LAYER_MASK     = 0x000000000000000f, /* more bits? */
> 	/* I'm hoping we don't need more bits here */
>
>
> /* legacy to communicate to userspace */
> RDMA_TRANSPORT_IB        = 0x0000000000000010,
> RDMA_TRANSPORT_IWARP     = 0x0000000000000020,
> RDMA_TRANSPORT_USNIC     = 0x0000000000000040,
> RDMA_TRANSPORT_USNIC_UDP = 0x0000000000000080,
> RDMA_TRANSPORT_MASK      = 0x00000000000000f0, /* more bits? */
> 	/* I'm hoping we don't need more bits here */
>
>
> /* New flags */
>
> RDMA_MGMT_IB_MAD         = 0x0000000000000100, /* ib_mad module support */
> RDMA_MGMT_QP0            = 0x0000000000000200, /* ib_mad QP0 support */
> RDMA_MGMT_IB_SA          = 0x0000000000000400, /* ib_sa module support */
> 					       /* NOTE includes IB Mcast */
> RDMA_MGMT_IB_CM          = 0x0000000000000800, /* ib_cm module support */
> RDMA_MGMT_OPA_MAD        = 0x0000000000001000, /* ib_mad OPA MAD support */
> RDMA_MGMT_MASK           = 0x00000000000fff00,

You explicitly say "userspace" - why would an upper layer need to
know the link, transport and management details? These seem to be
mid-layer matters.

> RDMA_ADDR_IB             = 0x0000000000100000, /* Port does IB AH, PR, Pkey */
> RDMA_ADDR_IBoE           = 0x0000000000200000, /* Port does IBoE AH, PR, Pkey */
> /* Do we need iWarp (TCP) here? */
> RDMA_ADDR_IB_MASK        = 0x000000000ff00000,

I do see a ULP needing to know the address family needed to pass to
rdma_connect and rdma_listen, so I would add "IP", but not "iWARP".

> RDMA_SEPARATE_READ_SGE   = 0x0000000010000000,
> RDMA_QUIRKS_MASK         = 0x000000fff0000000

This is good, but it also needs an attribute to signal the need for a
remote-writable RDMA Read sink buffer, for today's iWARP.

Tom.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Wang April 13, 2015, 7:40 a.m. UTC | #33
On 04/10/2015 07:36 PM, Jason Gunthorpe wrote:
> On Fri, Apr 10, 2015 at 01:10:43PM -0400, Doug Ledford wrote:
> 
>> documented.  I get why you link the address variant, because it pops out
>> all the things that are special about IBoE addressing and calls out that
>> the issues need to be handled.  However, saying requires_iboe_addr(),
>> while foreshadowing the work that needs done, doesn't actually document
>> the work that needs done.  Whether we call is dev_is_iboe() or
>> requires_iboe_addr(), it would be good if the documentation spelled out
>> those specific requirements for reference sake.
> 
> My deep hope for this, was that the test 'requires_iboe_addr' or
> whatever we call it would have a *really good* kdoc.
> 
> List all the ways iboe_addr's work, how they differ from IB addresses,
> refer to the specs people should read to understand it, etc.
> 
> The patches don't do this, and maybe Michael is the wrong person to
> fill that in, but we can get it done..

That's exactly what I'm thinking ;-)

At first I'm just trying to save us some code but now it's becoming
a topic far above that purpose, I'd like to help commit whatever we already
settled and pass the internal reforming works to experts like you guys
, implement the bitmask stuff ;-)

And I can still help on review and may be testing with mlx4 if later I
got the access.

> 
> Jason
> 
> BTW: Michael, next time you post the series, please trim the CC
> list...

Thanks for the remind, I'll do trim in v3 :-)

Regards,
Michael Wang

> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 18c1ece..a9587c4 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -76,6 +76,7 @@  static int ib_device_check_mandatory(struct ib_device *device)
 	} mandatory_table[] = {
 		IB_MANDATORY_FUNC(query_device),
 		IB_MANDATORY_FUNC(query_port),
+		IB_MANDATORY_FUNC(query_transport),
 		IB_MANDATORY_FUNC(query_pkey),
 		IB_MANDATORY_FUNC(query_gid),
 		IB_MANDATORY_FUNC(alloc_pd),
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index f93eb8d..83370de 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -133,14 +133,16 @@  enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_
 	if (device->get_link_layer)
 		return device->get_link_layer(device, port_num);
 
-	switch (rdma_node_get_transport(device->node_type)) {
+	switch (device->query_transport(device, port_num)) {
 	case RDMA_TRANSPORT_IB:
+	case RDMA_TRANSPORT_IBOE:
 		return IB_LINK_LAYER_INFINIBAND;
 	case RDMA_TRANSPORT_IWARP:
 	case RDMA_TRANSPORT_USNIC:
 	case RDMA_TRANSPORT_USNIC_UDP:
 		return IB_LINK_LAYER_ETHERNET;
 	default:
+		BUG();
 		return IB_LINK_LAYER_UNSPECIFIED;
 	}
 }
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
index bdf3507..d46bbb0 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -99,6 +99,12 @@  static int c2_query_port(struct ib_device *ibdev,
 	return 0;
 }
 
+static enum rdma_transport_type
+c2_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IWARP;
+}
+
 static int c2_query_pkey(struct ib_device *ibdev,
 			 u8 port, u16 index, u16 * pkey)
 {
@@ -801,6 +807,7 @@  int c2_register_device(struct c2_dev *dev)
 	dev->ibdev.dma_device = &dev->pcidev->dev;
 	dev->ibdev.query_device = c2_query_device;
 	dev->ibdev.query_port = c2_query_port;
+	dev->ibdev.query_transport = c2_query_transport;
 	dev->ibdev.query_pkey = c2_query_pkey;
 	dev->ibdev.query_gid = c2_query_gid;
 	dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 811b24a..09682e9e 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1232,6 +1232,12 @@  static int iwch_query_port(struct ib_device *ibdev,
 	return 0;
 }
 
+static enum rdma_transport_type
+iwch_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IWARP;
+}
+
 static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
 			char *buf)
 {
@@ -1385,6 +1391,7 @@  int iwch_register_device(struct iwch_dev *dev)
 	dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev);
 	dev->ibdev.query_device = iwch_query_device;
 	dev->ibdev.query_port = iwch_query_port;
+	dev->ibdev.query_transport = iwch_query_transport;
 	dev->ibdev.query_pkey = iwch_query_pkey;
 	dev->ibdev.query_gid = iwch_query_gid;
 	dev->ibdev.alloc_ucontext = iwch_alloc_ucontext;
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 66bd6a2..a445e0d 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -390,6 +390,12 @@  static int c4iw_query_port(struct ib_device *ibdev, u8 port,
 	return 0;
 }
 
+static enum rdma_transport_type
+c4iw_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IWARP;
+}
+
 static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
 			char *buf)
 {
@@ -506,6 +512,7 @@  int c4iw_register_device(struct c4iw_dev *dev)
 	dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev);
 	dev->ibdev.query_device = c4iw_query_device;
 	dev->ibdev.query_port = c4iw_query_port;
+	dev->ibdev.query_transport = c4iw_query_transport;
 	dev->ibdev.query_pkey = c4iw_query_pkey;
 	dev->ibdev.query_gid = c4iw_query_gid;
 	dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext;
diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c
index 9ed4d25..d5a34a6 100644
--- a/drivers/infiniband/hw/ehca/ehca_hca.c
+++ b/drivers/infiniband/hw/ehca/ehca_hca.c
@@ -242,6 +242,12 @@  query_port1:
 	return ret;
 }
 
+enum rdma_transport_type
+ehca_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IB;
+}
+
 int ehca_query_sma_attr(struct ehca_shca *shca,
 			u8 port, struct ehca_sma_attr *attr)
 {
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
index 22f79af..cec945f 100644
--- a/drivers/infiniband/hw/ehca/ehca_iverbs.h
+++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h
@@ -49,6 +49,9 @@  int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props);
 int ehca_query_port(struct ib_device *ibdev, u8 port,
 		    struct ib_port_attr *props);
 
+enum rdma_transport_type
+ehca_query_transport(struct ib_device *device, u8 port_num);
+
 int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
 			struct ehca_sma_attr *attr);
 
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index cd8d290..60e0a09 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -467,6 +467,7 @@  static int ehca_init_device(struct ehca_shca *shca)
 	shca->ib_device.dma_device          = &shca->ofdev->dev;
 	shca->ib_device.query_device        = ehca_query_device;
 	shca->ib_device.query_port          = ehca_query_port;
+	shca->ib_device.query_transport     = ehca_query_transport;
 	shca->ib_device.query_gid           = ehca_query_gid;
 	shca->ib_device.query_pkey          = ehca_query_pkey;
 	/* shca->in_device.modify_device    = ehca_modify_device    */
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 44ea939..58d36e3 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -1638,6 +1638,12 @@  static int ipath_query_port(struct ib_device *ibdev,
 	return 0;
 }
 
+static enum rdma_transport_type
+ipath_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IB;
+}
+
 static int ipath_modify_device(struct ib_device *device,
 			       int device_modify_mask,
 			       struct ib_device_modify *device_modify)
@@ -2140,6 +2146,7 @@  int ipath_register_ib_device(struct ipath_devdata *dd)
 	dev->query_device = ipath_query_device;
 	dev->modify_device = ipath_modify_device;
 	dev->query_port = ipath_query_port;
+	dev->query_transport = ipath_query_transport;
 	dev->modify_port = ipath_modify_port;
 	dev->query_pkey = ipath_query_pkey;
 	dev->query_gid = ipath_query_gid;
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 0b280b1..28100bd 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -413,6 +413,15 @@  static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
 	return __mlx4_ib_query_port(ibdev, port, props, 0);
 }
 
+static enum rdma_transport_type
+mlx4_ib_query_transport(struct ib_device *device, u8 port_num)
+{
+	struct mlx4_dev *dev = to_mdev(device)->dev;
+
+	return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ?
+		RDMA_TRANSPORT_IB : RDMA_TRANSPORT_IBOE;
+}
+
 int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 			union ib_gid *gid, int netw_view)
 {
@@ -2121,6 +2130,7 @@  static void *mlx4_ib_add(struct mlx4_dev *dev)
 
 	ibdev->ib_dev.query_device	= mlx4_ib_query_device;
 	ibdev->ib_dev.query_port	= mlx4_ib_query_port;
+	ibdev->ib_dev.query_transport	= mlx4_ib_query_transport;
 	ibdev->ib_dev.get_link_layer	= mlx4_ib_port_link_layer;
 	ibdev->ib_dev.query_gid		= mlx4_ib_query_gid;
 	ibdev->ib_dev.query_pkey	= mlx4_ib_query_pkey;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index cc4ac1e..209c796 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -351,6 +351,12 @@  out:
 	return err;
 }
 
+static enum rdma_transport_type
+mlx5_ib_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IB;
+}
+
 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 			     union ib_gid *gid)
 {
@@ -1336,6 +1342,7 @@  static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 
 	dev->ib_dev.query_device	= mlx5_ib_query_device;
 	dev->ib_dev.query_port		= mlx5_ib_query_port;
+	dev->ib_dev.query_transport	= mlx5_ib_query_transport;
 	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
 	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
 	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 415f8e1..67ac6a4 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -179,6 +179,12 @@  static int mthca_query_port(struct ib_device *ibdev,
 	return err;
 }
 
+static enum rdma_transport_type
+mthca_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IB;
+}
+
 static int mthca_modify_device(struct ib_device *ibdev,
 			       int mask,
 			       struct ib_device_modify *props)
@@ -1281,6 +1287,7 @@  int mthca_register_device(struct mthca_dev *dev)
 	dev->ib_dev.dma_device           = &dev->pdev->dev;
 	dev->ib_dev.query_device         = mthca_query_device;
 	dev->ib_dev.query_port           = mthca_query_port;
+	dev->ib_dev.query_transport      = mthca_query_transport;
 	dev->ib_dev.modify_device        = mthca_modify_device;
 	dev->ib_dev.modify_port          = mthca_modify_port;
 	dev->ib_dev.query_pkey           = mthca_query_pkey;
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index c0d0296..8df5b61 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -606,6 +606,11 @@  static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr
 	return 0;
 }
 
+static enum rdma_transport_type
+nes_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IWARP;
+}
 
 /**
  * nes_query_pkey
@@ -3879,6 +3884,7 @@  struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
 	nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev;
 	nesibdev->ibdev.query_device = nes_query_device;
 	nesibdev->ibdev.query_port = nes_query_port;
+	nesibdev->ibdev.query_transport = nes_query_transport;
 	nesibdev->ibdev.query_pkey = nes_query_pkey;
 	nesibdev->ibdev.query_gid = nes_query_gid;
 	nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 7a2b59a..9f4d182 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -244,6 +244,7 @@  static int ocrdma_register_device(struct ocrdma_dev *dev)
 	/* mandatory verbs. */
 	dev->ibdev.query_device = ocrdma_query_device;
 	dev->ibdev.query_port = ocrdma_query_port;
+	dev->ibdev.query_transport = ocrdma_query_transport;
 	dev->ibdev.modify_port = ocrdma_modify_port;
 	dev->ibdev.query_gid = ocrdma_query_gid;
 	dev->ibdev.get_link_layer = ocrdma_link_layer;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 8771755..73bace4 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -187,6 +187,12 @@  int ocrdma_query_port(struct ib_device *ibdev,
 	return 0;
 }
 
+enum rdma_transport_type
+ocrdma_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IBOE;
+}
+
 int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask,
 		       struct ib_port_modify *props)
 {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index b8f7853..4a81b63 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -41,6 +41,9 @@  int ocrdma_query_port(struct ib_device *, u8 port, struct ib_port_attr *props);
 int ocrdma_modify_port(struct ib_device *, u8 port, int mask,
 		       struct ib_port_modify *props);
 
+enum rdma_transport_type
+ocrdma_query_transport(struct ib_device *device, u8 port_num);
+
 void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
 int ocrdma_query_gid(struct ib_device *, u8 port,
 		     int index, union ib_gid *gid);
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 4a35998..caad665 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -1650,6 +1650,12 @@  static int qib_query_port(struct ib_device *ibdev, u8 port,
 	return 0;
 }
 
+static enum rdma_transport_type
+qib_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_IB;
+}
+
 static int qib_modify_device(struct ib_device *device,
 			     int device_modify_mask,
 			     struct ib_device_modify *device_modify)
@@ -2184,6 +2190,7 @@  int qib_register_ib_device(struct qib_devdata *dd)
 	ibdev->query_device = qib_query_device;
 	ibdev->modify_device = qib_modify_device;
 	ibdev->query_port = qib_query_port;
+	ibdev->query_transport = qib_query_transport;
 	ibdev->modify_port = qib_modify_port;
 	ibdev->query_pkey = qib_query_pkey;
 	ibdev->query_gid = qib_query_gid;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index 0d0f986..03ea9f3 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -360,6 +360,7 @@  static void *usnic_ib_device_add(struct pci_dev *dev)
 
 	us_ibdev->ib_dev.query_device = usnic_ib_query_device;
 	us_ibdev->ib_dev.query_port = usnic_ib_query_port;
+	us_ibdev->ib_dev.query_transport = usnic_ib_query_transport;
 	us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey;
 	us_ibdev->ib_dev.query_gid = usnic_ib_query_gid;
 	us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 53bd6a2..ff9a5f7 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -348,6 +348,12 @@  int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
 	return 0;
 }
 
+enum rdma_transport_type
+usnic_ib_query_transport(struct ib_device *device, u8 port_num)
+{
+	return RDMA_TRANSPORT_USNIC_UDP;
+}
+
 int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
 				int qp_attr_mask,
 				struct ib_qp_init_attr *qp_init_attr)
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index bb864f5..0b1633b 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -27,6 +27,8 @@  int usnic_ib_query_device(struct ib_device *ibdev,
 				struct ib_device_attr *props);
 int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
 				struct ib_port_attr *props);
+enum rdma_transport_type
+usnic_ib_query_transport(struct ib_device *device, u8 port_num);
 int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
 				int qp_attr_mask,
 				struct ib_qp_init_attr *qp_init_attr);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 65994a1..d54f91e 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -75,10 +75,13 @@  enum rdma_node_type {
 };
 
 enum rdma_transport_type {
+	/* legacy for users */
 	RDMA_TRANSPORT_IB,
 	RDMA_TRANSPORT_IWARP,
 	RDMA_TRANSPORT_USNIC,
-	RDMA_TRANSPORT_USNIC_UDP
+	RDMA_TRANSPORT_USNIC_UDP,
+	/* new transport */
+	RDMA_TRANSPORT_IBOE,
 };
 
 __attribute_const__ enum rdma_transport_type
@@ -1501,6 +1504,8 @@  struct ib_device {
 	int		           (*query_port)(struct ib_device *device,
 						 u8 port_num,
 						 struct ib_port_attr *port_attr);
+	enum rdma_transport_type   (*query_transport)(struct ib_device *device,
+						      u8 port_num);
 	enum rdma_link_layer	   (*get_link_layer)(struct ib_device *device,
 						     u8 port_num);
 	int		           (*query_gid)(struct ib_device *device,