Patchwork [v4,1/9] rdma/cm: define native IB address

login
register
mail settings
Submitter Sean Hefty
Date Jan. 22, 2013, 9:56 p.m.
Message ID <1358891797-14625-2-git-send-email-sean.hefty@intel.com>
Download mbox | patch
Permalink /patch/214673/
State Not Applicable
Delegated to: David Miller
Headers show

Comments

Sean Hefty - Jan. 22, 2013, 9:56 p.m.
From: Sean Hefty <sean.hefty@intel.com>

Define AF_IB and sockaddr_ib to allow the rdma_cm to use native IB
addressing.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
---
 include/linux/socket.h |    2 +
 include/rdma/ib.h      |   89 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 0 deletions(-)
 create mode 100644 include/rdma/ib.h
Sean Hefty - Feb. 11, 2013, 6:02 p.m.
> Define AF_IB and sockaddr_ib to allow the rdma_cm to use native IB
> addressing.
> 
> Signed-off-by: Sean Hefty <sean.hefty@intel.com>
> ---
>  include/linux/socket.h |    2 +
>  include/rdma/ib.h      |   89 ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 91 insertions(+), 0 deletions(-)
>  create mode 100644 include/rdma/ib.h
> 
> diff --git a/include/linux/socket.h b/include/linux/socket.h
> index 9a546ff..17a33f7 100644
> --- a/include/linux/socket.h
> +++ b/include/linux/socket.h
> @@ -167,6 +167,7 @@ struct ucred {
>  #define AF_PPPOX	24	/* PPPoX sockets		*/
>  #define AF_WANPIPE	25	/* Wanpipe API Sockets */
>  #define AF_LLC		26	/* Linux LLC			*/
> +#define AF_IB		27	/* Native InfiniBand address	*/

...

> diff --git a/include/rdma/ib.h b/include/rdma/ib.h

...

> +struct sockaddr_ib {
> +	unsigned short int	sib_family;	/* AF_IB */
> +	__be16			sib_pkey;
> +	__be32			sib_flowinfo;
> +	struct ib_addr		sib_addr;
> +	__be64			sib_sid;
> +	__be64			sib_sid_mask;
> +	__u64			sib_scope_id;
> +};

Dave/Roland/anyone, is there any feedback on this approach?

If there's hesitation to add new address families to socket.h, I could instead use definitions local to the rdma_cm.  This has the potential to result in conflicts if the rdma_cm is expanded for other address families, though such conflicts seem unlikely.

- Sean
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz - Feb. 13, 2013, 12:51 p.m.
On 11/02/2013 20:02, Hefty, Sean wrote:
>> Define AF_IB and sockaddr_ib to allow the rdma_cm to use native IB addressing.
>>
>> Signed-off-by: Sean Hefty <sean.hefty@intel.com>
>> ---
>>   include/linux/socket.h |    2 +
>>   include/rdma/ib.h      |   89 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 91 insertions(+), 0 deletions(-)
>>   create mode 100644 include/rdma/ib.h
>>
>> diff --git a/include/linux/socket.h b/include/linux/socket.h
>> index 9a546ff..17a33f7 100644
>> --- a/include/linux/socket.h
>> +++ b/include/linux/socket.h
>> @@ -167,6 +167,7 @@ struct ucred {
>>   #define AF_PPPOX	24	/* PPPoX sockets		*/
>>   #define AF_WANPIPE	25	/* Wanpipe API Sockets */
>>   #define AF_LLC		26	/* Linux LLC			*/
>> +#define AF_IB		27	/* Native InfiniBand address	*/
> ...
>
>> diff --git a/include/rdma/ib.h b/include/rdma/ib.h
> ...
>
>> +struct sockaddr_ib {
>> +	unsigned short int	sib_family;	/* AF_IB */
>> +	__be16			sib_pkey;
>> +	__be32			sib_flowinfo;
>> +	struct ib_addr		sib_addr;
>> +	__be64			sib_sid;
>> +	__be64			sib_sid_mask;
>> +	__u64			sib_scope_id;
>> +};
> Dave/Roland/anyone, is there any feedback on this approach?
>
> If there's hesitation to add new address families to socket.h, I could instead use definitions local to the rdma_cm.  This has the potential to result in conflicts if the rdma_cm is expanded for other address families, though such conflicts seem unlikely.
>
>

I don't see why we shouldn't add a new address family if it serves a
real-world use case, which seems to be the case from the description you
provided in the cover letter.

Or.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz - Feb. 13, 2013, 12:56 p.m.
On 22/01/2013 23:56, sean.hefty@intel.com wrote:
> +
> +struct sockaddr_ib {
> +	unsigned short int	sib_family;	/* AF_IB */
> +	__be16			sib_pkey;
> +	__be32			sib_flowinfo;
> +	struct ib_addr		sib_addr;
> +	__be64			sib_sid;
> +	__be64			sib_sid_mask;
> +	__u64			sib_scope_id;
> +};

Just a nit: maybe reorder the fields to better match their IPv6
counterparts (where there is one) in sockaddr_in6?

Also, I see that both the IPv6 header and the IB GRH have a traffic class
field, which is omitted from the related sockaddr_ structure in both
cases. I'm not sure why. Is this something the kernel stack decides on
and uses, but which is not available for applications to read/modify?

struct sockaddr_in6 {
         unsigned short int      sin6_family;    /* AF_INET6 */
         __be16                  sin6_port;      /* Transport layer port # */
         __be32                  sin6_flowinfo;  /* IPv6 flow information */
         struct in6_addr         sin6_addr;      /* IPv6 address */
         __u32                   sin6_scope_id;  /* scope id (new in RFC2553) */
};

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sean Hefty - March 1, 2013, 3:49 a.m.
> > Define AF_IB and sockaddr_ib to allow the rdma_cm to use native IB
> > addressing.
> >
> > Signed-off-by: Sean Hefty <sean.hefty@intel.com>
> > ---
> >  include/linux/socket.h |    2 +
> >  include/rdma/ib.h      |   89
> ++++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 91 insertions(+), 0 deletions(-)
> >  create mode 100644 include/rdma/ib.h
> >
> > diff --git a/include/linux/socket.h b/include/linux/socket.h
> > index 9a546ff..17a33f7 100644
> > --- a/include/linux/socket.h
> > +++ b/include/linux/socket.h
> > @@ -167,6 +167,7 @@ struct ucred {
> >  #define AF_PPPOX	24	/* PPPoX sockets		*/
> >  #define AF_WANPIPE	25	/* Wanpipe API Sockets */
> >  #define AF_LLC		26	/* Linux LLC			*/
> > +#define AF_IB		27	/* Native InfiniBand address	*/
> 
> ...
> 
> > diff --git a/include/rdma/ib.h b/include/rdma/ib.h
> 
> ...
> 
> > +struct sockaddr_ib {
> > +	unsigned short int	sib_family;	/* AF_IB */
> > +	__be16			sib_pkey;
> > +	__be32			sib_flowinfo;
> > +	struct ib_addr		sib_addr;
> > +	__be64			sib_sid;
> > +	__be64			sib_sid_mask;
> > +	__u64			sib_scope_id;
> > +};
> 
> Dave/Roland/anyone, is there any feedback on this approach?

ping...

Seriously, there is a need to establish connections using native IB GIDs.  It is preferable to add this functionality to the rdma_cm.  Although the ib_cm can exchange IB CM messages based on GIDs, the rdma_cm is the only interface that provides applications with dynamic service IDs, path record queries, and usable multicast support. 

The rdma_cm uses sockaddr for addressing information.  The sockaddr_ib structure is defined within the RDMA tree, but I believe that the address family value belongs in socket.h.  This helps to avoid any future conflicts.  However, I can change the rdma_cm to use an internal enum for address values if there is an objection.

sockaddr_ib/AF_IB is usable with any IB network card.  It allows an application to establish connections over IB without IBoIP being present, which is a real need.  It also allows an application to use out of band mechanisms for discovering remote GIDs.  Such mechanisms are needed to work through IP load balancing software and for MPI scalability.

I'm not trying to be impatient, but it's been 3 years... 

- Sean
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 9a546ff..17a33f7 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -167,6 +167,7 @@  struct ucred {
 #define AF_PPPOX	24	/* PPPoX sockets		*/
 #define AF_WANPIPE	25	/* Wanpipe API Sockets */
 #define AF_LLC		26	/* Linux LLC			*/
+#define AF_IB		27	/* Native InfiniBand address	*/
 #define AF_CAN		29	/* Controller Area Network      */
 #define AF_TIPC		30	/* TIPC sockets			*/
 #define AF_BLUETOOTH	31	/* Bluetooth sockets 		*/
@@ -210,6 +211,7 @@  struct ucred {
 #define PF_PPPOX	AF_PPPOX
 #define PF_WANPIPE	AF_WANPIPE
 #define PF_LLC		AF_LLC
+#define PF_IB		AF_IB
 #define PF_CAN		AF_CAN
 #define PF_TIPC		AF_TIPC
 #define PF_BLUETOOTH	AF_BLUETOOTH
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
new file mode 100644
index 0000000..cf8f9e7
--- /dev/null
+++ b/include/rdma/ib.h
@@ -0,0 +1,89 @@ 
+/*
+ * Copyright (c) 2010 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(_RDMA_IB_H)
+#define _RDMA_IB_H
+
+#include <linux/types.h>
+
+struct ib_addr {	/* 16-byte address whose storage can be viewed at several widths */
+	union {
+		__u8		uib_addr8[16];	/* view as 16 bytes */
+		__be16		uib_addr16[8];	/* view as 8 __be16 words */
+		__be32		uib_addr32[4];	/* view as 4 __be32 words */
+		__be64		uib_addr64[2];	/* view as 2 __be64 words */
+	} ib_u;	/* the macros below are shorthand for members of ib_u */
+#define sib_addr8		ib_u.uib_addr8
+#define sib_addr16		ib_u.uib_addr16
+#define sib_addr32		ib_u.uib_addr32
+#define sib_addr64		ib_u.uib_addr64
+#define sib_raw			ib_u.uib_addr8	/* same byte view as sib_addr8 */
+#define sib_subnet_prefix	ib_u.uib_addr64[0]	/* first 8 bytes of the address */
+#define sib_interface_id	ib_u.uib_addr64[1]	/* last 8 bytes of the address */
+};
+
+static inline int ib_addr_any(const struct ib_addr *a)
+{	/* Returns 1 when all 16 bytes of *a are zero, otherwise 0. */
+	return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0);	/* OR of both 64-bit halves is zero only if each is zero */
+}
+
+static inline int ib_addr_loopback(const struct ib_addr *a)
+{	/* Returns 1 when 32-bit words 0..2 are zero and word 3 equals htonl(1), otherwise 0. */
+	return ((a->sib_addr32[0] | a->sib_addr32[1] |
+		 a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0);	/* the XOR term is zero only on an exact match with htonl(1) */
+}	/* NOTE(review): this header includes only <linux/types.h>; confirm htonl() is declared for every includer */
+
+static inline void ib_addr_set(struct ib_addr *addr,	/* address to overwrite */
+			       __be32 w1, __be32 w2, __be32 w3, __be32 w4)	/* stored unchanged as 32-bit words 0..3 */
+{
+	addr->sib_addr32[0] = w1;
+	addr->sib_addr32[1] = w2;
+	addr->sib_addr32[2] = w3;
+	addr->sib_addr32[3] = w4;	/* all four words, i.e. the full 16 bytes, are now written */
+}
+
+static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2)
+{	/* Compares the two structures byte by byte and returns memcmp()'s result (0 when identical). */
+	return memcmp(a1, a2, sizeof(struct ib_addr));
+}	/* NOTE(review): this header includes only <linux/types.h>; confirm memcmp() is declared for every includer */
+
+struct sockaddr_ib {	/* socket address carrying a struct ib_addr, tagged with AF_IB */
+	unsigned short int	sib_family;	/* AF_IB */
+	__be16			sib_pkey;	/* presumably the IB partition key - TODO confirm */
+	__be32			sib_flowinfo;	/* presumably flow information, cf. sin6_flowinfo - TODO confirm */
+	struct ib_addr		sib_addr;	/* the 16-byte address itself */
+	__be64			sib_sid;	/* presumably the service ID - TODO confirm */
+	__be64			sib_sid_mask;	/* presumably a mask applied to sib_sid - TODO confirm */
+	__u64			sib_scope_id;	/* declared __u64, not a __be type like the fields above */
+};
+
+#endif /* _RDMA_IB_H */