diff mbox

[RFC,v3,02/11] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) interface

Message ID 1486498990-76562-3-git-send-email-niranjana.vishwanathapura@intel.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Vishwanathapura, Niranjana Feb. 7, 2017, 8:23 p.m. UTC
Add rdma netdev interface to ib device structure allowing rdma netdev
devices to be allocated by ib clients.
Define HFI VNIC interface between hardware independent VNIC
functionality and the hardware dependent VNIC functionality.

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
---
 include/rdma/ib_verbs.h |  27 +++++++++
 include/rdma/opa_hfi.h  | 147 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 include/rdma/opa_hfi.h

Comments

Jason Gunthorpe Feb. 7, 2017, 9:19 p.m. UTC | #1
On Tue, Feb 07, 2017 at 12:23:01PM -0800, Vishwanathapura, Niranjana wrote:
> Add rdma netdev interface to ib device structure allowing rdma netdev
> devices to be allocated by ib clients.
> Define HFI VNIC interface between hardware independent VNIC
> functionality and the hardware dependent VNIC functionality.

This commit message could be a bit clearer.

The alloc_rdma_netdev multiplexer is inteded as a new general
interface and this adds a protocol definition for ethernet VNIC on
OPA.

The hope is that ipoib can follow the same example and use the same
alloc_rdma_netdev entry point. Hopefully Mellanox will look at this
patch as I have talked to them in the past about doing this...

It looks like HFI turned out fairly well, the driver code and higher
level code have a reasonably nice split in my quick look.

>  	IB_DEVICE_RAW_SCATTER_FCS		= (1ULL << 34),
> +	IB_DEVICE_RDMA_NETDEV_HFI_VNIC		= (1ULL << 35),

What is this called HFI_VNIC anyhow? Shouldn't this be OPA_VNIC? There
is nothing really HFI specific, right?

> +/* hfi vnic rdma netdev's private data structure */
> +struct hfi_vnic_rdma_netdev {
> +	struct rdma_netdev rn;  /* keep this first */
> +	/* followed by device private data */
> +	char *dev_priv[0];
> +};
> +
> +static inline void *hfi_vnic_priv(const struct net_device *dev)
> +{
> +	struct rdma_netdev *rn = netdev_priv(dev);
> +
> +	return rn->clnt_priv;
> +}
> +
> +static inline void *hfi_vnic_dev_priv(const struct net_device *dev)
> +{
> +	struct rdma_netdev *rn = netdev_priv(dev);

Shouldn't this be hfi_vnic_rdma_netdev ?

> +	return rn + 1;

And this should be rn->dev_priv ?

Jason
Vishwanathapura, Niranjana Feb. 7, 2017, 10:06 p.m. UTC | #2
On Tue, Feb 07, 2017 at 02:19:01PM -0700, Jason Gunthorpe wrote:
>On Tue, Feb 07, 2017 at 12:23:01PM -0800, Vishwanathapura, Niranjana wrote:
>> Add rdma netdev interface to ib device structure allowing rdma netdev
>> devices to be allocated by ib clients.
>> Define HFI VNIC interface between hardware independent VNIC
>> functionality and the hardware dependent VNIC functionality.
>
>This commit message could be a bit clearer.
>
>The alloc_rdma_netdev multiplexer is inteded as a new general
>interface and this adds a protocol definition for ethernet VNIC on
>OPA.
>

Ok, will add the statement to the commit message in PATCH series.

>The hope is that ipoib can follow the same example and use the same
>alloc_rdma_netdev entry point. Hopefully Mellanox will look at this
>patch as I have talked to them in the past about doing this...
>
>It looks like HFI turned out fairly well, the driver code and higher
>level code have a reasonably nice split in my quick look.
>

Yes, HFI_VNIC design is leaner now with standard netdev interface.

>>  	IB_DEVICE_RAW_SCATTER_FCS		= (1ULL << 34),
>> +	IB_DEVICE_RDMA_NETDEV_HFI_VNIC		= (1ULL << 35),
>
>What is this called HFI_VNIC anyhow? Shouldn't this be OPA_VNIC? There
>is nothing really HFI specific, right?
>

Agreed, OPA_VNIC is more appropriate here. Will change it.

>> +/* hfi vnic rdma netdev's private data structure */
>> +struct hfi_vnic_rdma_netdev {
>> +	struct rdma_netdev rn;  /* keep this first */
>> +	/* followed by device private data */
>> +	char *dev_priv[0];
>> +};
>> +
>> +static inline void *hfi_vnic_priv(const struct net_device *dev)
>> +{
>> +	struct rdma_netdev *rn = netdev_priv(dev);
>> +
>> +	return rn->clnt_priv;
>> +}
>> +
>> +static inline void *hfi_vnic_dev_priv(const struct net_device *dev)
>> +{
>> +	struct rdma_netdev *rn = netdev_priv(dev);
>
>Shouldn't this be hfi_vnic_rdma_netdev ?
>
>> +	return rn + 1;
>
>And this should be rn->dev_priv ?
>

Yah, both will result in same behavior. But yah, what you are suggesting will 
remove any confusion. Will change in next PATCH series.

Niranjana

>Jason
Jason Gunthorpe Feb. 7, 2017, 10:19 p.m. UTC | #3
On Tue, Feb 07, 2017 at 02:06:30PM -0800, Vishwanathapura, Niranjana wrote:

> >> 	IB_DEVICE_RAW_SCATTER_FCS		= (1ULL << 34),
> >>+	IB_DEVICE_RDMA_NETDEV_HFI_VNIC		= (1ULL << 35),
> >
> >What is this called HFI_VNIC anyhow? Shouldn't this be OPA_VNIC? There
> >is nothing really HFI specific, right?
> 
> Agreed, OPA_VNIC is more appropriate here. Will change it.

And probably lots of other places too.. :)


> >And this should be rn->dev_priv ?
> 
> Yah, both will result in same behavior. But yah, what you are suggesting
> will remove any confusion. Will change in next PATCH series.

Only because the struct has no members, as soon as someone adds
something it would go booom.

Jason
Parav Pandit Feb. 8, 2017, 12:43 a.m. UTC | #4
Hi 

> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Vishwanathapura, Niranjana
> Sent: Tuesday, February 7, 2017 2:23 PM
> To: dledford@redhat.com
> Cc: linux-rdma@vger.kernel.org; netdev@vger.kernel.org;
> dennis.dalessandro@intel.com; ira.weiny@intel.com; Niranjana
> Vishwanathapura <niranjana.vishwanathapura@intel.com>
> Subject: [RFC v3 02/11] IB/hfi-vnic: Virtual Network Interface Controller
> (VNIC) interface
> 
> Add rdma netdev interface to ib device structure allowing rdma netdev
> devices to be allocated by ib clients.
> Define HFI VNIC interface between hardware independent VNIC
> functionality and the hardware dependent VNIC functionality.
> 
> Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Niranjana Vishwanathapura
> <niranjana.vishwanathapura@intel.com>
> ---
>  struct ib_device {
>  	struct device                *dma_device;
> 
> @@ -2096,6 +2114,15 @@ struct ib_device {
>  							   struct
> ib_rwq_ind_table_init_attr *init_attr,
>  							   struct ib_udata
> *udata);
>  	int                        (*destroy_rwq_ind_table)(struct ib_rwq_ind_table
> *wq_ind_table);
> +	/* rdma netdev operations */
> +	struct net_device *(*alloc_rdma_netdev)(
> +					struct ib_device *device,
> +					u8 port_num,
> +					enum rdma_netdev_t type,
> +					const char *name,
> +					unsigned char name_assign_type,
> +					void (*setup)(struct net_device *));
> +	void (*free_rdma_netdev)(struct net_device *netdev);
>  	struct ib_dma_mapping_ops   *dma_ops;
> 
>  	struct module               *owner;

As its clear from the cover letter and from the request to place this in drivers/infiniband/ulp,
Instead of increasing the ib_dev structure further,
Can you change the code to make use of ib_register_client() and friend functions to register vnic as ULP.
(similar to other ULP such as uverbs, srp, ipoib).
This will also allow you get to get notified for removing the vnic device when underlying rdma device gets removed.
Based on the property that gets exposed by the ibdev, vnic driver filters whether it needs to load its vnic to specific device or not.
This way modules are isolated between core and ULP little better.
Would it work for you?
Vishwanathapura, Niranjana Feb. 8, 2017, 1:04 a.m. UTC | #5
On Tue, Feb 07, 2017 at 03:19:25PM -0700, Jason Gunthorpe wrote:
>On Tue, Feb 07, 2017 at 02:06:30PM -0800, Vishwanathapura, Niranjana wrote:
>
>> >> 	IB_DEVICE_RAW_SCATTER_FCS		= (1ULL << 34),
>> >>+	IB_DEVICE_RDMA_NETDEV_HFI_VNIC		= (1ULL << 35),
>> >
>> >What is this called HFI_VNIC anyhow? Shouldn't this be OPA_VNIC? There
>> >is nothing really HFI specific, right?
>>
>> Agreed, OPA_VNIC is more appropriate here. Will change it.
>
>And probably lots of other places too.. :)
>

Well, our driver is called HFI1 and HFI_VNIC is in accordance with our naming 
convention. I will only change the above device attribute name to OPA_VNIC in 
the ib interface just to be consitant with other such defintions here.

>
>> >And this should be rn->dev_priv ?
>>
>> Yah, both will result in same behavior. But yah, what you are suggesting
>> will remove any confusion. Will change in next PATCH series.
>
>Only because the struct has no members, as soon as someone adds
>something it would go booom.
>

Agreed.

>Jason
Vishwanathapura, Niranjana Feb. 8, 2017, 1:12 a.m. UTC | #6
On Wed, Feb 08, 2017 at 12:43:40AM +0000, Parav Pandit wrote:
>> @@ -2096,6 +2114,15 @@ struct ib_device {
>>  							   struct
>> ib_rwq_ind_table_init_attr *init_attr,
>>  							   struct ib_udata
>> *udata);
>>  	int                        (*destroy_rwq_ind_table)(struct ib_rwq_ind_table
>> *wq_ind_table);
>> +	/* rdma netdev operations */
>> +	struct net_device *(*alloc_rdma_netdev)(
>> +					struct ib_device *device,
>> +					u8 port_num,
>> +					enum rdma_netdev_t type,
>> +					const char *name,
>> +					unsigned char name_assign_type,
>> +					void (*setup)(struct net_device *));
>> +	void (*free_rdma_netdev)(struct net_device *netdev);
>>  	struct ib_dma_mapping_ops   *dma_ops;
>>
>>  	struct module               *owner;
>
>As its clear from the cover letter and from the request to place this in drivers/infiniband/ulp,
>Instead of increasing the ib_dev structure further,
>Can you change the code to make use of ib_register_client() and friend functions to register vnic as ULP.
>(similar to other ULP such as uverbs, srp, ipoib).
>This will also allow you get to get notified for removing the vnic device when underlying rdma device gets removed.
>Based on the property that gets exposed by the ibdev, vnic driver filters whether it needs to load its vnic to specific device or not.
>This way modules are isolated between core and ULP little better.
>Would it work for you?

HFI_VNIC driver is using ib_register_client() and friend fucntions. Below patch 
in this series does that.
[RFC v3 08/11] IB/hfi-vnic: VNIC Ethernet Management Agent (VEMA) function

Niranjana

>
Leon Romanovsky Feb. 8, 2017, 6:54 a.m. UTC | #7
On Tue, Feb 07, 2017 at 02:19:01PM -0700, Jason Gunthorpe wrote:
> On Tue, Feb 07, 2017 at 12:23:01PM -0800, Vishwanathapura, Niranjana wrote:
> > Add rdma netdev interface to ib device structure allowing rdma netdev
> > devices to be allocated by ib clients.
> > Define HFI VNIC interface between hardware independent VNIC
> > functionality and the hardware dependent VNIC functionality.
>
> This commit message could be a bit clearer.
>
> The alloc_rdma_netdev multiplexer is inteded as a new general
> interface and this adds a protocol definition for ethernet VNIC on
> OPA.
>
> The hope is that ipoib can follow the same example and use the same
> alloc_rdma_netdev entry point. Hopefully Mellanox will look at this
> patch as I have talked to them in the past about doing this...

Jason,

We looked on it and it is useless for us, mainly because of the fact
that most  of the work is done in our net part of the driver.

So, as it looks for now, this ULP exercise will be for HFI only.

Thanks
Jason Gunthorpe Feb. 8, 2017, 5:29 p.m. UTC | #8
On Wed, Feb 08, 2017 at 08:54:37AM +0200, Leon Romanovsky wrote:
> On Tue, Feb 07, 2017 at 02:19:01PM -0700, Jason Gunthorpe wrote:
> > On Tue, Feb 07, 2017 at 12:23:01PM -0800, Vishwanathapura, Niranjana wrote:
> > > Add rdma netdev interface to ib device structure allowing rdma netdev
> > > devices to be allocated by ib clients.
> > > Define HFI VNIC interface between hardware independent VNIC
> > > functionality and the hardware dependent VNIC functionality.
> >
> > This commit message could be a bit clearer.
> >
> > The alloc_rdma_netdev multiplexer is inteded as a new general
> > interface and this adds a protocol definition for ethernet VNIC on
> > OPA.
> >
> > The hope is that ipoib can follow the same example and use the same
> > alloc_rdma_netdev entry point. Hopefully Mellanox will look at this
> > patch as I have talked to them in the past about doing this...
> 
> We looked on it and it is useless for us, mainly because of the fact
> that most  of the work is done in our net part of the driver.

I don't understand this comment.

At the last OFA conference the discussion with Mellanox was to provide
a way to implement ipoib ndo_start_xmit (and others) inside the lowest
level driver so the driver could do all offloads and optimizations
when pushing the ipoib SKB on the wire (and similar for rx). The ipoib
control plane would remain in the ipoib driver.

This is exactly the split that Vishwanathapura has created for
vnic. It has greatly simplified their code design from the earlier
patch series, so it seems like the right approach so far.

ipoib is not really any different from this vnic stuff. Both have
complex control planes that must be shared and need highly device
specific tx/rx flows.

I certainly don't want to see some other proposal to optimize ipoib ..

Jason
diff mbox

Patch

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b1ac973..b9897d9d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -55,6 +55,7 @@ 
 #include <net/ip.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/netdevice.h>
 
 #include <linux/if_link.h>
 #include <linux/atomic.h>
@@ -221,6 +222,7 @@  enum ib_device_cap_flags {
 	IB_DEVICE_SG_GAPS_REG			= (1ULL << 32),
 	IB_DEVICE_VIRTUAL_FUNCTION		= (1ULL << 33),
 	IB_DEVICE_RAW_SCATTER_FCS		= (1ULL << 34),
+	IB_DEVICE_RDMA_NETDEV_HFI_VNIC		= (1ULL << 35),
 };
 
 enum ib_signature_prot_cap {
@@ -1844,6 +1846,22 @@  struct ib_port_immutable {
 	u32                           max_mad_size;
 };
 
+/* rdma netdev type - specifies protocol type */
+enum rdma_netdev_t {
+	RDMA_NETDEV_HFI_VNIC
+};
+
+/**
+ * struct rdma_netdev - rdma netdev
+ * For cases where netstack interfacing is required.
+ */
+struct rdma_netdev {
+	void *clnt_priv;
+
+	/* control functions */
+	void (*set_id)(struct net_device *netdev, int id);
+};
+
 struct ib_device {
 	struct device                *dma_device;
 
@@ -2096,6 +2114,15 @@  struct ib_device {
 							   struct ib_rwq_ind_table_init_attr *init_attr,
 							   struct ib_udata *udata);
 	int                        (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table);
+	/* rdma netdev operations */
+	struct net_device *(*alloc_rdma_netdev)(
+					struct ib_device *device,
+					u8 port_num,
+					enum rdma_netdev_t type,
+					const char *name,
+					unsigned char name_assign_type,
+					void (*setup)(struct net_device *));
+	void (*free_rdma_netdev)(struct net_device *netdev);
 	struct ib_dma_mapping_ops   *dma_ops;
 
 	struct module               *owner;
diff --git a/include/rdma/opa_hfi.h b/include/rdma/opa_hfi.h
new file mode 100644
index 0000000..f357d04
--- /dev/null
+++ b/include/rdma/opa_hfi.h
@@ -0,0 +1,147 @@ 
+#ifndef _OPA_HFI_H
+#define _OPA_HFI_H
+/*
+ * Copyright(c) 2017 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains Intel Omni-Path (OPA) Host Fabric Interface (HFI)
+ * specific declarations.
+ */
+
+#include <rdma/ib_verbs.h>
+
+#define HFI_MAX_NUM_VNICS  255
+
+/* VNIC uses 16B header format */
+#define HFI_VNIC_L2_TYPE    0x2
+
+/* 16 header bytes + 2 reserved bytes */
+#define HFI_VNIC_L2_HDR_LEN   (16 + 2)
+
+#define HFI_VNIC_L4_HDR_LEN   2
+
+#define HFI_VNIC_HDR_LEN      (HFI_VNIC_L2_HDR_LEN + \
+			       HFI_VNIC_L4_HDR_LEN)
+
+#define HFI_VNIC_L4_ETHR  0x78
+
+#define HFI_VNIC_ICRC_LEN   4
+#define HFI_VNIC_TAIL_LEN   1
+#define HFI_VNIC_ICRC_TAIL_LEN  (HFI_VNIC_ICRC_LEN + HFI_VNIC_TAIL_LEN)
+
+/* VNIC configured and operational state values */
+#define HFI_VNIC_STATE_DROP_ALL        0x1
+#define HFI_VNIC_STATE_FORWARDING      0x3
+
+#define HFI_VNIC_SKB_MDATA_LEN         4
+#define HFI_VNIC_SKB_MDATA_ENCAP_ERR   0x1
+
+/* hfi vnic rdma netdev's private data structure */
+struct hfi_vnic_rdma_netdev {
+	struct rdma_netdev rn;  /* keep this first */
+	/* followed by device private data */
+	char *dev_priv[0];
+};
+
+static inline void *hfi_vnic_priv(const struct net_device *dev)
+{
+	struct rdma_netdev *rn = netdev_priv(dev);
+
+	return rn->clnt_priv;
+}
+
+static inline void *hfi_vnic_dev_priv(const struct net_device *dev)
+{
+	struct rdma_netdev *rn = netdev_priv(dev);
+
+	return rn + 1;
+}
+
+/* hfi_vnic skb meta data structrue */
+struct hfi_vnic_skb_mdata {
+	u8 vl;
+	u8 entropy;
+	u8 flags;
+	u8 rsvd;
+} __packed;
+
+/* HFI VNIC group statistics */
+struct hfi_vnic_grp_stats {
+	u64 unicast;
+	u64 mcastbcast;
+	u64 untagged;
+	u64 vlan;
+	u64 s_64;
+	u64 s_65_127;
+	u64 s_128_255;
+	u64 s_256_511;
+	u64 s_512_1023;
+	u64 s_1024_1518;
+	u64 s_1519_max;
+};
+
+struct hfi_vnic_stats {
+	/* standard netdev statistics */
+	struct rtnl_link_stats64 netstats;
+
+	/* HFI VNIC statistics */
+	struct hfi_vnic_grp_stats tx_grp;
+	struct hfi_vnic_grp_stats rx_grp;
+	u64 tx_dlid_zero;
+	u64 tx_drop_state;
+	u64 rx_drop_state;
+	u64 rx_runt;
+	u64 rx_oversize;
+};
+
+static inline bool rdma_cap_hfi_vnic(struct ib_device *device)
+{
+	return !!(device->attrs.device_cap_flags &
+		  IB_DEVICE_RDMA_NETDEV_HFI_VNIC);
+}
+
+#endif /* _OPA_HFI_H */