diff mbox

[RFC,2/2] be2net: Added functionality to support RoCE driver

Message ID 470f5ea2-9344-42d5-b130-130d90a42250@exht1.ad.emulex.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

parav.pandit@emulex.com March 7, 2012, 5:47 p.m. UTC
From: Parav Pandit <parav.pandit@emulex.com>

- Increased MSIX vectors by 5 for RoCE traffic.
- Added macro to check roce support on a device.
- Added device specific doorbell, msix vector fields shared with nic functionality.
- Provides RoCE driver registration and deregistration functions.
- Added support functions which will be invoked on adapter
  add/remove and port up/down events.
- Traverses through the list of adapters for invoking callback functions.

Signed-off-by: Parav Pandit <parav.pandit@emulex.com>
---
 drivers/net/ethernet/emulex/benet/Makefile  |    2 +-
 drivers/net/ethernet/emulex/benet/be.h      |   28 ++++-
 drivers/net/ethernet/emulex/benet/be_cmds.h |    1 +
 drivers/net/ethernet/emulex/benet/be_hw.h   |    4 +-
 drivers/net/ethernet/emulex/benet/be_main.c |   83 +++++++++++--
 drivers/net/ethernet/emulex/benet/be_roce.c |  185 +++++++++++++++++++++++++++
 drivers/net/ethernet/emulex/benet/be_roce.h |   75 +++++++++++
 7 files changed, 365 insertions(+), 13 deletions(-)
 create mode 100644 drivers/net/ethernet/emulex/benet/be_roce.c
 create mode 100644 drivers/net/ethernet/emulex/benet/be_roce.h

Comments

Francois Romieu March 8, 2012, 11:48 a.m. UTC | #1
parav.pandit@emulex.com <parav.pandit@emulex.com> :
[...]
> diff --git a/drivers/net/ethernet/emulex/benet/Makefile b/drivers/net/ethernet/emulex/benet/Makefile
> index a60cd80..1a91b27 100644
> --- a/drivers/net/ethernet/emulex/benet/Makefile
> +++ b/drivers/net/ethernet/emulex/benet/Makefile
[...]
> @@ -372,6 +374,14 @@ struct be_adapter {
>  	u8 transceiver;
>  	u8 autoneg;
>  	u8 generation;		/* BladeEngine ASIC generation */
> +	u32 if_type;
> +	u8 __iomem *roce_db;	/* Door Bell */
> +	u32 roce_db_size;
> +	u32 roce_db_total_size;
> +	u64 roce_db_io_addr;

There may be room for some 'struct roce_db'.

> +	void *ocrdma_dev;

Is there a reason why it could not be 'struct ocrdma_dev *o' ?

> +	struct list_head entry;
> +
>  	u32 flash_status;
>  	struct completion flash_compl;
>  
> @@ -398,6 +408,9 @@ struct be_adapter {
>  #define OFF				0
>  #define lancer_chip(adapter)	((adapter->pdev->device == OC_DEVICE_ID3) || \
>  				 (adapter->pdev->device == OC_DEVICE_ID4))
> +#define be_roce_supported(adapter) ((adapter->if_type == SLI_INTF_TYPE_3 || \
> +				adapter->sli_family == SKYHAWK_SLI_FAMILY) && \
> +				(adapter->function_mode & RDMA_ENABLED))

No need for a macro. Could be static inline.

>  
>  extern const struct ethtool_ops be_ethtool_ops;
>  
> @@ -544,4 +557,17 @@ extern void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm,
>  extern void be_link_status_update(struct be_adapter *adapter, u8 link_status);
>  extern void be_parse_stats(struct be_adapter *adapter);
>  extern int be_load_fw(struct be_adapter *adapter, u8 *func);
> +
> +/*
> + * internal function to initialize-cleanup roce device.
> + */
> +extern void be_roce_dev_add(struct be_adapter *adapter);
> +extern void be_roce_dev_remove(struct be_adapter *adapter);
> +
> +/*
> + * internal function to open-close roce device during ifup-ifdown.
> + */
> +extern void be_roce_dev_open(struct be_adapter *adapter);
> +extern void be_roce_dev_close(struct be_adapter *adapter);

Nit: the name of the argument does not matter much here.

[...]
> diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
> index e703d64..9554cc9 100644
> --- a/drivers/net/ethernet/emulex/benet/be_main.c
> +++ b/drivers/net/ethernet/emulex/benet/be_main.c
[...]
> @@ -3139,19 +3155,45 @@ static void be_unmap_pci_bars(struct be_adapter *adapter)
>  		iounmap(adapter->csr);
>  	if (adapter->db)
>  		iounmap(adapter->db);
> +	if (adapter->roce_db)
> +		iounmap(adapter->roce_db);
> +}
> +
> +static int lancer_roce_map_pci_bars(struct be_adapter *adapter)
> +{
> +	struct pci_dev *pdev = adapter->pdev;
> +	u8 __iomem *addr;
> +	addr = ioremap_nocache(pci_resource_start(pdev, 2),
> +				pci_resource_len(pdev, 2));

- missing empty line.
- use pci_iomap

> +	if (addr == NULL)
> +		return -ENOMEM;
> +
> +	adapter->roce_db = addr;
> +	adapter->roce_db_io_addr = pci_resource_start(pdev, 2);
> +	adapter->roce_db_size = 8192;
> +	adapter->roce_db_total_size = pci_resource_len(pdev, 2);
> +	return 0;
>  }
>  
>  static int be_map_pci_bars(struct be_adapter *adapter)
>  {
> +	struct pci_dev *pdev = adapter->pdev;
>  	u8 __iomem *addr;
>  	int db_reg;
>  
>  	if (lancer_chip(adapter)) {
> -		addr = ioremap_nocache(pci_resource_start(adapter->pdev, 0),
> -			pci_resource_len(adapter->pdev, 0));
> -		if (addr == NULL)
> -			return -ENOMEM;
> -		adapter->db = addr;
> +		if (adapter->if_type == SLI_INTF_TYPE_2 ||
> +		    adapter->if_type == SLI_INTF_TYPE_3) {
> +			addr = ioremap_nocache(pci_resource_start(pdev, 0),
> +			    pci_resource_len(adapter->pdev, 0));

pci_iomap

> +			if (addr == NULL)
> +				return -ENOMEM;
> +			adapter->db = addr;
> +		}
> +		if (adapter->if_type == SLI_INTF_TYPE_3) {
> +			if (lancer_roce_map_pci_bars(adapter))
> +				goto pci_map_err;
> +		}
>  		return 0;
>  	}
>  
[...]
> @@ -3353,7 +3400,21 @@ static int be_dev_family_check(struct be_adapter *adapter)
>  						SLI_INTF_IF_TYPE_SHIFT;
>  
>  		if (((sli_intf & SLI_INTF_VALID_MASK) != SLI_INTF_VALID) ||
> -			if_type != 0x02) {
> +			((if_type != SLI_INTF_TYPE_2) &&
> +			 (if_type != SLI_INTF_TYPE_3))) {

You can remove some parenthesis.

It may benefit from a 'bool be_type_2_3' (could be used twice).


[...]
> diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c
> new file mode 100644
> index 0000000..7002160
> --- /dev/null
> +++ b/drivers/net/ethernet/emulex/benet/be_roce.c
> @@ -0,0 +1,185 @@
[...]
> +static struct ocrdma_driver *ocrdma_drv;
> +static LIST_HEAD(be_adapter_list);
> +static DEFINE_MUTEX(be_adapter_list_lock);
> +
> +static void _be_roce_dev_add(struct be_adapter *adapter)
> +{
> +	struct be_dev_info dev_info;
> +	int i, roce_vect_cnt = 0;

roce_vect_cnt : excess visibility and useless init

> +	struct pci_dev *pdev = adapter->pdev;
> +
> +	if (!ocrdma_drv || !ocrdma_drv->add || adapter->ocrdma_dev)
> +		return;

The registration logic seems a bit convoluted.

Are there real use-cases where the device and the ocrdma_drv could be
loaded in different relative ordering ?

> +	if (pdev->device == OC_DEVICE_ID5) {
> +		/* only msix is supported on these devices */
> +		if (!msix_enabled(adapter))
> +			return;
> +		/* DPP region address and length */
> +		dev_info.dpp_unmapped_addr = pci_resource_start(pdev, 2);
> +		dev_info.dpp_unmapped_len = pci_resource_len(pdev, 2);
> +	} else {
> +		dev_info.dpp_unmapped_addr = 0;
> +		dev_info.dpp_unmapped_len = 0;
> +	}
> +	dev_info.pdev = adapter->pdev;
> +	if (adapter->sli_family == SKYHAWK_SLI_FAMILY)
> +		dev_info.db = adapter->db;
> +	else
> +		dev_info.db = adapter->roce_db;
> +	dev_info.unmapped_db = adapter->roce_db_io_addr;
> +	dev_info.db_page_size = adapter->roce_db_size;
> +	dev_info.db_total_size = adapter->roce_db_total_size;
> +	dev_info.netdev = adapter->netdev;
> +	memcpy(dev_info.mac_addr, adapter->netdev->dev_addr, ETH_ALEN);
> +	dev_info.dev_family = adapter->sli_family;
> +	dev_info.vendor_id = adapter->pdev->vendor;
> +	dev_info.device_id = adapter->pdev->device;

I wonder why you peel struct pci_dev so early.

> +	if (msix_enabled(adapter)) {
> +		/* provide all the vectors, so that EQ creation response
> +		 * can decide which one to use.
> +		 */
> +		roce_vect_cnt = adapter->num_msix_vec;
> +		dev_info.intr_mode = BE_INTERRUPT_MODE_MSIX;
> +		dev_info.msix.num_vectors =
> +		    min(roce_vect_cnt, MAX_ROCE_MSIX_VECTORS);
> +		/* provide start index of the vector,
> +		 * so in case of linear usage,
> +		 * it can use the base as starting point.
> +		 */
> +		dev_info.msix.start_vector = adapter->num_eqs;
> +		for (i = 0; i < dev_info.msix.num_vectors; i++)
> +			dev_info.msix.vector_list[i] =
> +			    adapter->msix_entries[i].vector;

Parenthesis around the for statement ?

[...]
> +void be_roce_dev_close(struct be_adapter *adapter)
> +{
> +	if (be_roce_supported(adapter)) {
> +		mutex_lock(&be_adapter_list_lock);
> +		_be_roce_dev_close(adapter);
> +		mutex_unlock(&be_adapter_list_lock);
> +	}
> +}

There should be no need to check for be_roce_supported() once the probe
is done.

> +
> +int be_roce_register_driver(struct ocrdma_driver *drv)
> +{
> +	struct be_adapter *dev;
> +	struct net_device *netdev = NULL;

netdev should be in the list_for_each_entry loop.

> +	int status = 0;
> +
> +	mutex_lock(&be_adapter_list_lock);
> +	if (ocrdma_drv) {
> +		mutex_unlock(&be_adapter_list_lock);
> +		return -EINVAL;

Either use goto or the status variable is not needed.

> +	}
> +	ocrdma_drv = drv;
> +	list_for_each_entry(dev, &be_adapter_list, entry) {
> +		_be_roce_dev_add(dev);
> +		netdev = dev->netdev;
> +		if (netif_running(netdev) && netif_oper_up(netdev))
> +			_be_roce_dev_open(dev);
> +	}
> +	mutex_unlock(&be_adapter_list_lock);
> +	return status;
> +}
> +EXPORT_SYMBOL(be_roce_register_driver);
> +
> +int be_roce_unregister_driver(struct ocrdma_driver *drv)

It always return 0 : make it void.

> +{
> +	struct be_adapter *dev;
> +	mutex_lock(&be_adapter_list_lock);

Missing empty line.

[...]
> diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h
> new file mode 100644
> index 0000000..0bb3634
> --- /dev/null
> +++ b/drivers/net/ethernet/emulex/benet/be_roce.h
[...]
> +struct pci_dev;
> +struct net_device;

Why not a proper #include ?

> +
> +enum be_interrupt_mode {
> +	BE_INTERRUPT_MODE_MSIX = 0,
> +	BE_INTERRUPT_MODE_INTX = 1,
> +	BE_INTERRUPT_MODE_MSI = 2,

Please <tab>= instead of <space>=

> +};
> +
> +#define MAX_ROCE_MSIX_VECTORS   16
> +struct be_dev_info {
> +	u16 vendor_id;
> +	u16 device_id;

See remark above. I'd rather keep the relevant pci_dev around.

[...]
> +/* ocrdma driver register's the callback functions with nic driver. */
> +struct ocrdma_driver {
> +	unsigned char name[32];
> +	void *(*add) (struct be_dev_info *dev_info);
> +	void (*remove) (void *device_handle);
> +	void (*state_change_handler) (void *roce_handle, u32 new_state);

Please don't abuse void * like that. :o(
parav.pandit@emulex.com March 8, 2012, 1:04 p.m. UTC | #2
Inline.
Parav

-----Original Message-----
From: Francois Romieu [mailto:romieu@fr.zoreil.com] 
Sent: Thursday, March 08, 2012 5:19 PM
To: Pandit, Parav
Cc: netdev@vger.kernel.org
Subject: Re: [RFC 2/2] be2net: Added functionality to support RoCE driver

parav.pandit@emulex.com <parav.pandit@emulex.com> :
[...]
> diff --git a/drivers/net/ethernet/emulex/benet/Makefile 
> b/drivers/net/ethernet/emulex/benet/Makefile
> index a60cd80..1a91b27 100644
> --- a/drivers/net/ethernet/emulex/benet/Makefile
> +++ b/drivers/net/ethernet/emulex/benet/Makefile
[...]
> @@ -372,6 +374,14 @@ struct be_adapter {
>  	u8 transceiver;
>  	u8 autoneg;
>  	u8 generation;		/* BladeEngine ASIC generation */
> +	u32 if_type;
> +	u8 __iomem *roce_db;	/* Door Bell */
> +	u32 roce_db_size;
> +	u32 roce_db_total_size;
> +	u64 roce_db_io_addr;

There may be room for some 'struct roce_db'.
[Parav] o.k. I'll move them to structure.

> +	void *ocrdma_dev;

Is there a reason why it could not be 'struct ocrdma_dev *o' ?
[Parav] Yes. Reason is NIC driver publishes interface to RoCE driver. NIC driver is not aware of ocrdma_dev structure.
So its void pointer.

> +	struct list_head entry;
> +
>  	u32 flash_status;
>  	struct completion flash_compl;
>  
> @@ -398,6 +408,9 @@ struct be_adapter {
>  #define OFF				0
>  #define lancer_chip(adapter)	((adapter->pdev->device == OC_DEVICE_ID3) || \
>  				 (adapter->pdev->device == OC_DEVICE_ID4))
> +#define be_roce_supported(adapter) ((adapter->if_type == SLI_INTF_TYPE_3 || \
> +				adapter->sli_family == SKYHAWK_SLI_FAMILY) && \
> +				(adapter->function_mode & RDMA_ENABLED))

No need for a macro. Could be static inline.

>  
>  extern const struct ethtool_ops be_ethtool_ops;
>  
> @@ -544,4 +557,17 @@ extern void be_cq_notify(struct be_adapter 
> *adapter, u16 qid, bool arm,  extern void be_link_status_update(struct 
> be_adapter *adapter, u8 link_status);  extern void 
> be_parse_stats(struct be_adapter *adapter);  extern int 
> be_load_fw(struct be_adapter *adapter, u8 *func);
> +
> +/*
> + * internal function to initialize-cleanup roce device.
> + */
> +extern void be_roce_dev_add(struct be_adapter *adapter); extern void 
> +be_roce_dev_remove(struct be_adapter *adapter);
> +
> +/*
> + * internal function to open-close roce device during ifup-ifdown.
> + */
> +extern void be_roce_dev_open(struct be_adapter *adapter); extern void 
> +be_roce_dev_close(struct be_adapter *adapter);

Nit: the name of the argument does not matter much here.

[...]
> diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
> b/drivers/net/ethernet/emulex/benet/be_main.c
> index e703d64..9554cc9 100644
> --- a/drivers/net/ethernet/emulex/benet/be_main.c
> +++ b/drivers/net/ethernet/emulex/benet/be_main.c
[...]
> @@ -3139,19 +3155,45 @@ static void be_unmap_pci_bars(struct be_adapter *adapter)
>  		iounmap(adapter->csr);
>  	if (adapter->db)
>  		iounmap(adapter->db);
> +	if (adapter->roce_db)
> +		iounmap(adapter->roce_db);
> +}
> +
> +static int lancer_roce_map_pci_bars(struct be_adapter *adapter) {
> +	struct pci_dev *pdev = adapter->pdev;
> +	u8 __iomem *addr;
> +	addr = ioremap_nocache(pci_resource_start(pdev, 2),
> +				pci_resource_len(pdev, 2));

- missing empty line.
- use pci_iomap

> +	if (addr == NULL)
> +		return -ENOMEM;
> +
> +	adapter->roce_db = addr;
> +	adapter->roce_db_io_addr = pci_resource_start(pdev, 2);
> +	adapter->roce_db_size = 8192;
> +	adapter->roce_db_total_size = pci_resource_len(pdev, 2);
> +	return 0;
>  }
>  
>  static int be_map_pci_bars(struct be_adapter *adapter)  {
> +	struct pci_dev *pdev = adapter->pdev;
>  	u8 __iomem *addr;
>  	int db_reg;
>  
>  	if (lancer_chip(adapter)) {
> -		addr = ioremap_nocache(pci_resource_start(adapter->pdev, 0),
> -			pci_resource_len(adapter->pdev, 0));
> -		if (addr == NULL)
> -			return -ENOMEM;
> -		adapter->db = addr;
> +		if (adapter->if_type == SLI_INTF_TYPE_2 ||
> +		    adapter->if_type == SLI_INTF_TYPE_3) {
> +			addr = ioremap_nocache(pci_resource_start(pdev, 0),
> +			    pci_resource_len(adapter->pdev, 0));

pci_iomap

> +			if (addr == NULL)
> +				return -ENOMEM;
> +			adapter->db = addr;
> +		}
> +		if (adapter->if_type == SLI_INTF_TYPE_3) {
> +			if (lancer_roce_map_pci_bars(adapter))
> +				goto pci_map_err;
> +		}
>  		return 0;
>  	}
>  
[...]
> @@ -3353,7 +3400,21 @@ static int be_dev_family_check(struct be_adapter *adapter)
>  						SLI_INTF_IF_TYPE_SHIFT;
>  
>  		if (((sli_intf & SLI_INTF_VALID_MASK) != SLI_INTF_VALID) ||
> -			if_type != 0x02) {
> +			((if_type != SLI_INTF_TYPE_2) &&
> +			 (if_type != SLI_INTF_TYPE_3))) {

You can remove some parenthesis.

It may benefit from a 'bool be_type_2_3' (could be used twice).


[...]
> diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c 
> b/drivers/net/ethernet/emulex/benet/be_roce.c
> new file mode 100644
> index 0000000..7002160
> --- /dev/null
> +++ b/drivers/net/ethernet/emulex/benet/be_roce.c
> @@ -0,0 +1,185 @@
[...]
> +static struct ocrdma_driver *ocrdma_drv; static 
> +LIST_HEAD(be_adapter_list); static 
> +DEFINE_MUTEX(be_adapter_list_lock);
> +
> +static void _be_roce_dev_add(struct be_adapter *adapter) {
> +	struct be_dev_info dev_info;
> +	int i, roce_vect_cnt = 0;

roce_vect_cnt : excess visibility and useless init

> +	struct pci_dev *pdev = adapter->pdev;
> +
> +	if (!ocrdma_drv || !ocrdma_drv->add || adapter->ocrdma_dev)
> +		return;

The registration logic seems a bit convoluted.

Are there real use-cases where the device and the ocrdma_drv could be loaded in different relative ordering ?

[Parav] Above checks whether its already registered or not, whether it has registered proper callback function or not.
I'll add comment for same.

> +	if (pdev->device == OC_DEVICE_ID5) {
> +		/* only msix is supported on these devices */
> +		if (!msix_enabled(adapter))
> +			return;
> +		/* DPP region address and length */
> +		dev_info.dpp_unmapped_addr = pci_resource_start(pdev, 2);
> +		dev_info.dpp_unmapped_len = pci_resource_len(pdev, 2);
> +	} else {
> +		dev_info.dpp_unmapped_addr = 0;
> +		dev_info.dpp_unmapped_len = 0;
> +	}
> +	dev_info.pdev = adapter->pdev;
> +	if (adapter->sli_family == SKYHAWK_SLI_FAMILY)
> +		dev_info.db = adapter->db;
> +	else
> +		dev_info.db = adapter->roce_db;
> +	dev_info.unmapped_db = adapter->roce_db_io_addr;
> +	dev_info.db_page_size = adapter->roce_db_size;
> +	dev_info.db_total_size = adapter->roce_db_total_size;
> +	dev_info.netdev = adapter->netdev;
> +	memcpy(dev_info.mac_addr, adapter->netdev->dev_addr, ETH_ALEN);
> +	dev_info.dev_family = adapter->sli_family;
> +	dev_info.vendor_id = adapter->pdev->vendor;
> +	dev_info.device_id = adapter->pdev->device;

I wonder why you peel struct pci_dev so early.

> +	if (msix_enabled(adapter)) {
> +		/* provide all the vectors, so that EQ creation response
> +		 * can decide which one to use.
> +		 */
> +		roce_vect_cnt = adapter->num_msix_vec;
> +		dev_info.intr_mode = BE_INTERRUPT_MODE_MSIX;
> +		dev_info.msix.num_vectors =
> +		    min(roce_vect_cnt, MAX_ROCE_MSIX_VECTORS);
> +		/* provide start index of the vector,
> +		 * so in case of linear usage,
> +		 * it can use the base as starting point.
> +		 */
> +		dev_info.msix.start_vector = adapter->num_eqs;
> +		for (i = 0; i < dev_info.msix.num_vectors; i++)
> +			dev_info.msix.vector_list[i] =
> +			    adapter->msix_entries[i].vector;

Parenthesis around the for statement ?

[...]
> +void be_roce_dev_close(struct be_adapter *adapter) {
> +	if (be_roce_supported(adapter)) {
> +		mutex_lock(&be_adapter_list_lock);
> +		_be_roce_dev_close(adapter);
> +		mutex_unlock(&be_adapter_list_lock);
> +	}
> +}

There should be no need to check for be_roce_supported() once the probe is done.
[Parav] I think there is a need to check for same. Because NIC driver can be loaded for few device which supported NIC and few which are NIC+ROCE.
If we don't check it will incur the cost of taking and releasing mutex lock, which can be easily avoid with this check.

> +
> +int be_roce_register_driver(struct ocrdma_driver *drv) {
> +	struct be_adapter *dev;
> +	struct net_device *netdev = NULL;

netdev should be in the list_for_each_entry loop.

> +	int status = 0;
> +
> +	mutex_lock(&be_adapter_list_lock);
> +	if (ocrdma_drv) {
> +		mutex_unlock(&be_adapter_list_lock);
> +		return -EINVAL;

Either use goto or the status variable is not needed.

> +	}
> +	ocrdma_drv = drv;
> +	list_for_each_entry(dev, &be_adapter_list, entry) {
> +		_be_roce_dev_add(dev);
> +		netdev = dev->netdev;
> +		if (netif_running(netdev) && netif_oper_up(netdev))
> +			_be_roce_dev_open(dev);
> +	}
> +	mutex_unlock(&be_adapter_list_lock);
> +	return status;
> +}
> +EXPORT_SYMBOL(be_roce_register_driver);
> +
> +int be_roce_unregister_driver(struct ocrdma_driver *drv)

It always return 0 : make it void.

> +{
> +	struct be_adapter *dev;
> +	mutex_lock(&be_adapter_list_lock);

Missing empty line.

[...]
> diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h 
> b/drivers/net/ethernet/emulex/benet/be_roce.h
> new file mode 100644
> index 0000000..0bb3634
> --- /dev/null
> +++ b/drivers/net/ethernet/emulex/benet/be_roce.h
[...]
> +struct pci_dev;
> +struct net_device;

Why not a proper #include ?

> +
> +enum be_interrupt_mode {
> +	BE_INTERRUPT_MODE_MSIX = 0,
> +	BE_INTERRUPT_MODE_INTX = 1,
> +	BE_INTERRUPT_MODE_MSI = 2,

Please <tab>= instead of <space>=

> +};
> +
> +#define MAX_ROCE_MSIX_VECTORS   16
> +struct be_dev_info {
> +	u16 vendor_id;
> +	u16 device_id;

See remark above. I'd rather keep the relevant pci_dev around.

[...]
> +/* ocrdma driver register's the callback functions with nic driver. 
> +*/ struct ocrdma_driver {
> +	unsigned char name[32];
> +	void *(*add) (struct be_dev_info *dev_info);
> +	void (*remove) (void *device_handle);
> +	void (*state_change_handler) (void *roce_handle, u32 new_state);

Please don't abuse void * like that. :o(
[Parav] This interface provided by NIC doesn't have to know about the structure of RoCE driver as its provide the service to upper layer driver.
So void* pointer helps to achieve that. I may be lacking the disadvantage of how void* could badly affect it.
Other little heavy option is to use a shared structure between RoCE and NIC driver. This just increase the roce_device structure by another 8 bytes and stores redundant info.
Something like below,
be_roce.h
struct nic_roce_common_struct { 
	struct netdev *dev;
};
*/ struct ocrdma_driver {
	unsigned char name[32];
	struct nic_roce_common_struct *(*add) (struct be_dev_info *dev_info);
	void (*remove) (struct nic_roce_common_struct *);
	void (*state_change_handler) (struct nic_roce_common_struct *, u32 new_state);

roce_driver_main.h
struct roce_device { 
	struct nic_roce_common_struct common_struct;
	....
	....
};
Any other simpler solution/suggestion?

--
Ueimor
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight March 8, 2012, 1:38 p.m. UTC | #3
> > > +	void *ocrdma_dev;
> 
> > > Is there a reason why it could not be 'struct ocrdma_dev *o' ?
> [Parav] Yes. Reason is NIC driver publishes interface to RoCE 
> driver. NIC driver is not aware of ocrdma_dev structure.
> So its void pointer.

If the argument can only ever be 'struct ocrdma_dev *'
then used the named structure pointer - you don't need to
expose the actual definition to the other drivers.

Makes it much more difficult to pass the address of an
inappropriate structure.

	David


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Francois Romieu March 8, 2012, 4:27 p.m. UTC | #4
Parav.Pandit@Emulex.Com <Parav.Pandit@Emulex.Com> :
> From: Francois Romieu [mailto:romieu@fr.zoreil.com] 
[...]
> > +	void *ocrdma_dev;
> 
> Is there a reason why it could not be 'struct ocrdma_dev *o' ?
> [Parav] Yes. Reason is NIC driver publishes interface to RoCE driver. NIC driver is not aware of ocrdma_dev structure.
> So its void pointer.

Thanks for the explanation.

See below.

[...]
> > +	struct pci_dev *pdev = adapter->pdev;
> > +
> > +	if (!ocrdma_drv || !ocrdma_drv->add || adapter->ocrdma_dev)
> > +		return;
> 
> The registration logic seems a bit convoluted.
> 
> Are there real use-cases where the device and the ocrdma_drv could be loaded in different relative ordering ?
> 
> [Parav] Above checks whether its already registered or not, whether it has registered proper callback function or not.
> I'll add comment for same.

I'm dense but it does not answer my concern.

- I'll admit that there is a good reason for a loose coupling between
  the ocrdma_driver and the device driver. Life would be imho simpler
  if the availability of ocrdma_driver was a prerequisite for a proper
  use of the methods exported by be_roce.c, especially as "prerequisite"
  does not forbid to request said code on the fly.

- I'm curious to know why ocrdma_drv could have a NULL .add() method.
  Imho it's the duty of register_ocrdma_drv() to enforce
  ocrdma_drv->add != NULL as soon as ocrdma_drv != NULL.

- the adapter->ocrdma_dev test is useless. It would need _be_roce_dev_add()
  to be called both from be_roce_dev_add() and be_roce_register_driver().
  As be_roce_dev_add() depends on be_roce_register_driver() for ocrdma_drv
  to be set and be_roce_register_driver() depends on be_roce_dev_add()
  for including the device in be_adapter_list and everything is performed
  with be_adapter_list_lock held, it can't happen (TM).

[...]
> > +void be_roce_dev_close(struct be_adapter *adapter) {
> > +	if (be_roce_supported(adapter)) {
> > +		mutex_lock(&be_adapter_list_lock);
> > +		_be_roce_dev_close(adapter);
> > +		mutex_unlock(&be_adapter_list_lock);
> > +	}
> > +}
> 
> There should be no need to check for be_roce_supported() once the probe is done.
> [Parav] I think there is a need to check for same. Because NIC driver can be loaded for few device which supported NIC and few which are NIC+ROCE.
> If we don't check it will incur the cost of taking and releasing mutex lock,
> which can be easily avoid with this check.

You can init the adequate operations set at dev_add / registration time so
that the remaining be_roce_dev_xyz methods turn into noop for NIC without
ROCE.

be_adapter_list_lock is not performance critical, is it ?

[...]
> > +/* ocrdma driver register's the callback functions with nic driver. 
> > +*/ struct ocrdma_driver {
> > +	unsigned char name[32];
> > +	void *(*add) (struct be_dev_info *dev_info);
> > +	void (*remove) (void *device_handle);
> > +	void (*state_change_handler) (void *roce_handle, u32 new_state);
> 
> Please don't abuse void * like that. :o(
> [Parav] This interface provided by NIC doesn't have to know about the
> structure of RoCE driver as its provide the service to upper layer driver.
> So void* pointer helps to achieve that.

Please see David Laight's suggestion and consider it seriously.

> I may be lacking the disadvantage of how void* could badly affect it.

It comes with a big flashing "maintenance nightmare" sign.

[...]
> Any other simpler solution/suggestion?

As long as the code of the driver does not need to access the innards
of the memory area behind a struct ocrdma_driver *, it can go along with
a 'struct ocrdma_driver;' forward declaration.

struct ocrdma_driver would be fully described in drivers/net/.../be_roce.c.

I won't worry if the driver can see the innards of struct ocrdma_driver
but the type checks should not be damaged.

--
Ueimor
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
parav.pandit@emulex.com March 8, 2012, 5:49 p.m. UTC | #5

Roland Dreier March 8, 2012, 6:08 p.m. UTC | #6
On Thu, Mar 8, 2012 at 3:48 AM, Francois Romieu <romieu@fr.zoreil.com> wrote:
> Are there real use-cases where the device and the ocrdma_drv could be
> loaded in different relative ordering ?

Yes -- two cases are pretty standard:

 - be2net driver loads, probes all devices, then roce driver loads
 - be2net and roce driver loads, then a device is probed later
   (PCI hotplug, new SR-IOV virtual function, etc)

so we need to deal with the case where both the core be2net driver
is probing devices and the roce driver loads later, and also the case
where the roce driver loads first and then gets notified as the be2net
driver probes a device.

(This is pretty much the same as mlx4_core / mlx4_ib BTW)

 - R.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
parav.pandit@emulex.com March 8, 2012, 6:11 p.m. UTC | #7
Inline.
Parav
Francois Romieu March 8, 2012, 7:41 p.m. UTC | #8
Parav.Pandit@Emulex.Com <Parav.Pandit@Emulex.Com> :
> From: Francois Romieu [romieu@fr.zoreil.com]
[...]
> You can init the adequate operations set at dev_add / registration time so
> that the remaining be_roce_dev_xyz methods turn into noop for NIC without
> ROCE.
> [Parav] What do we gain by executing NO_OP function, where there is no need to do that?

We do not execute anything for NIC without ROCE.

For the ROCE case, it costs a new set of net_device_ops where the .open() and
.close() callbacks are different from the ones in be_netdev_ops. The former
.open() and .close() call into the latter. The current code does the opposite.

> What is the problem in performing this check and not executing those function? I find it more simpler and clean.

It is not a big problem. ROCE depends on NIC. The implementation exhibits a
dependency in the opposite order.

> Also, add(), remove() etc operations are at driver level and not at device level. So with proposed implementation there is no need to store those 3 fn_pointers to device structure.

The functions pointers are already there. The driver .remove() handler can
not be replaced but it can exchange the be_roce_supported() test for a
!ocrdma_dev test. It should allow to remove if_type from be_adapter as
it won't be used beyond .probe() (I only did a quick check).

It is up to you to weigh the net_device_ops cost. If more methods need to be
modified for ROCE, the extra layer could be worth it. Your call.

Off for jogging.
parav.pandit@emulex.com March 9, 2012, 6:16 a.m. UTC | #9
Inline.

-----Original Message-----
From: Francois Romieu [mailto:romieu@fr.zoreil.com] 
Sent: Friday, March 09, 2012 1:11 AM
To: Pandit, Parav
Cc: netdev@vger.kernel.org
Subject: Re: [RFC 2/2] be2net: Added functionality to support RoCE driver

Parav.Pandit@Emulex.Com <Parav.Pandit@Emulex.Com> :
> From: Francois Romieu [romieu@fr.zoreil.com]
[...]
> You can init the adequate operations set at dev_add / registration 
> time so that the remaining be_roce_dev_xyz methods turn into noop for 
> NIC without ROCE.
> [Parav] What do we gain by executing NO_OP function, where there is no need to do that?

We do not execute anything for NIC without ROCE.

For the ROCE case, it costs a new set of net_device_ops where the .open() and
.close() callbacks are different from the ones in be_netdev_ops. The former
.open() and .close() call into the latter. The current code does the opposite.

> What is the problem in performing this check and not executing those function? I find it more simpler and clean.

It is not a big problem. ROCE depends on NIC. The implementation exhibits a dependency in the opposite order.

> Also, add(), remove() etc operations are at driver level and not at device level. So with proposed implementation there is no need to store those 3 fn_pointers to device structure.

The functions pointers are already there. The driver .remove() handler can not be replaced but it can exchange the be_roce_supported() test for a !ocrdma_dev test. It should allow to remove if_type from be_adapter as it won't be used beyond .probe() (I only did a quick check).

It is up to you to weigh the net_device_ops cost. If more methods need to be modified for ROCE, the extra layer could be worth it. Your call.

[Parav] I understand your proposed design and benefit if we have more such differences with ROCE vs non ROCE NICs.
In near future those device_specific checks in either way doesn't differ much in either implementation.  So I would like to keep it current way. When we add more features it will be more logical to adopt your newly proposed approach and use different netdev_ops structure.
I'll be addressing other review comments and one specifically to change void* to ocrdma_dev* with forward declaration.

Off for jogging.

--
Ueimor
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/ethernet/emulex/benet/Makefile b/drivers/net/ethernet/emulex/benet/Makefile
index a60cd80..1a91b27 100644
--- a/drivers/net/ethernet/emulex/benet/Makefile
+++ b/drivers/net/ethernet/emulex/benet/Makefile
@@ -4,4 +4,4 @@ 
 
 obj-$(CONFIG_BE2NET) += be2net.o
 
-be2net-y :=  be_main.o be_cmds.o be_ethtool.o
+be2net-y :=  be_main.o be_cmds.o be_ethtool.o be_roce.o
diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index cbdec25..767f41f 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -32,6 +32,7 @@ 
 #include <linux/u64_stats_sync.h>
 
 #include "be_hw.h"
+#include "be_roce.h"
 
 #define DRV_VER			"4.0.100u"
 #define DRV_NAME		"be2net"
@@ -92,7 +93,7 @@  static inline char *nic_name(struct pci_dev *pdev)
 #define MAX_RSS_QS		4	/* BE limit is 4 queues/port */
 #define MAX_RX_QS		(MAX_RSS_QS + 1) /* RSS qs + 1 def Rx */
 #define MAX_TX_QS		8
-#define BE_MAX_MSIX_VECTORS	(MAX_RX_QS + 1)/* RX + TX */
+#define BE_MAX_MSIX_VECTORS	(MAX_RX_QS + 1 + 5)/* RX + TX  + 5 RoCE */
 #define BE_NAPI_WEIGHT		64
 #define MAX_RX_POST 		BE_NAPI_WEIGHT /* Frags posted at a time */
 #define RX_FRAGS_REFILL_WM	(RX_Q_LEN - MAX_RX_POST)
@@ -320,6 +321,7 @@  struct be_adapter {
 
 	struct msix_entry msix_entries[BE_MAX_MSIX_VECTORS];
 	u32 num_msix_vec;
+	u32 num_eqs;
 	bool isr_registered;
 
 	/* TX Rings */
@@ -372,6 +374,14 @@  struct be_adapter {
 	u8 transceiver;
 	u8 autoneg;
 	u8 generation;		/* BladeEngine ASIC generation */
+	u32 if_type;
+	u8 __iomem *roce_db;	/* Door Bell */
+	u32 roce_db_size;
+	u32 roce_db_total_size;
+	u64 roce_db_io_addr;
+	void *ocrdma_dev;
+	struct list_head entry;
+
 	u32 flash_status;
 	struct completion flash_compl;
 
@@ -398,6 +408,9 @@  struct be_adapter {
 #define OFF				0
 #define lancer_chip(adapter)	((adapter->pdev->device == OC_DEVICE_ID3) || \
 				 (adapter->pdev->device == OC_DEVICE_ID4))
+#define be_roce_supported(adapter) ((adapter->if_type == SLI_INTF_TYPE_3 || \
+				adapter->sli_family == SKYHAWK_SLI_FAMILY) && \
+				(adapter->function_mode & RDMA_ENABLED))
 
 extern const struct ethtool_ops be_ethtool_ops;
 
@@ -544,4 +557,17 @@  extern void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm,
 extern void be_link_status_update(struct be_adapter *adapter, u8 link_status);
 extern void be_parse_stats(struct be_adapter *adapter);
 extern int be_load_fw(struct be_adapter *adapter, u8 *func);
+
+/*
+ * internal function to initialize-cleanup roce device.
+ */
+extern void be_roce_dev_add(struct be_adapter *adapter);
+extern void be_roce_dev_remove(struct be_adapter *adapter);
+
+/*
+ * internal function to open-close roce device during ifup-ifdown.
+ */
+extern void be_roce_dev_open(struct be_adapter *adapter);
+extern void be_roce_dev_close(struct be_adapter *adapter);
+
 #endif				/* BE_H */
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h
index dca8924..d7ad03b 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.h
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.h
@@ -1057,6 +1057,7 @@  struct be_cmd_resp_modify_eq_delay {
 #define FLEX10_MODE				0x400
 #define VNIC_MODE				0x20000
 #define UMC_ENABLED				0x1000000
+#define RDMA_ENABLED				0x4
 struct be_cmd_req_query_fw_cfg {
 	struct be_cmd_req_hdr hdr;
 	u32 rsvd[31];
diff --git a/drivers/net/ethernet/emulex/benet/be_hw.h b/drivers/net/ethernet/emulex/benet/be_hw.h
index f2c89e3..0aa6509 100644
--- a/drivers/net/ethernet/emulex/benet/be_hw.h
+++ b/drivers/net/ethernet/emulex/benet/be_hw.h
@@ -98,11 +98,13 @@ 
 #define SLI_INTF_REV_SHIFT			4
 #define SLI_INTF_FT_MASK			0x00000001
 
+#define SLI_INTF_TYPE_2		2
+#define SLI_INTF_TYPE_3		3
 
 /* SLI family */
 #define BE_SLI_FAMILY		0x0
 #define LANCER_A0_SLI_FAMILY	0xA
-
+#define SKYHAWK_SLI_FAMILY	0x2
 
 /********* ISR0 Register offset **********/
 #define CEV_ISR0_OFFSET 			0xC18
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index e703d64..9554cc9 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1697,8 +1697,10 @@  static void be_tx_queues_destroy(struct be_adapter *adapter)
 	be_eq_clean(adapter, &adapter->tx_eq);
 
 	q = &adapter->tx_eq.q;
-	if (q->created)
+	if (q->created) {
 		be_cmd_q_destroy(adapter, q, QTYPE_EQ);
+		adapter->num_eqs -= 1;
+	}
 	be_queue_free(adapter, q);
 }
 
@@ -1740,6 +1742,7 @@  static int be_tx_queues_create(struct be_adapter *adapter)
 	if (be_cmd_eq_create(adapter, eq, adapter->tx_eq.cur_eqd))
 		goto err;
 	adapter->tx_eq.eq_idx = adapter->eq_next_idx++;
+	adapter->num_eqs += 1;
 
 	for_all_tx_queues(adapter, txo, i) {
 		cq = &txo->cq;
@@ -1777,8 +1780,10 @@  static void be_rx_queues_destroy(struct be_adapter *adapter)
 		be_queue_free(adapter, q);
 
 		q = &rxo->rx_eq.q;
-		if (q->created)
+		if (q->created) {
 			be_cmd_q_destroy(adapter, q, QTYPE_EQ);
+			adapter->num_eqs -= 1;
+		}
 		be_queue_free(adapter, q);
 	}
 }
@@ -1824,6 +1829,7 @@  static int be_rx_queues_create(struct be_adapter *adapter)
 		rc = be_cmd_eq_create(adapter, eq, rxo->rx_eq.cur_eqd);
 		if (rc)
 			goto err;
+		adapter->num_eqs += 1;
 
 		rxo->rx_eq.eq_idx = adapter->eq_next_idx++;
 
@@ -2120,6 +2126,12 @@  static void be_msix_enable(struct be_adapter *adapter)
 
 	num_vec = be_num_rxqs_want(adapter) + 1;
 
+	if (be_roce_supported(adapter)) {
+		num_vec += min_t(u32, MAX_ROCE_MSIX_VECTORS,
+					(num_online_cpus() + 1));
+	}
+	num_vec = min(num_vec, BE_MAX_MSIX_VECTORS);
+
 	for (i = 0; i < num_vec; i++)
 		adapter->msix_entries[i].entry = i;
 
@@ -2330,6 +2342,8 @@  static int be_close(struct net_device *netdev)
 	struct be_eq_obj *tx_eq = &adapter->tx_eq;
 	int vec, i;
 
+	be_roce_dev_close(adapter);
+
 	be_async_mcc_disable(adapter);
 
 	if (!lancer_chip(adapter))
@@ -2442,6 +2456,8 @@  static int be_open(struct net_device *netdev)
 	if (!status)
 		be_link_status_update(adapter, link_status);
 
+	be_roce_dev_open(adapter);
+
 	return 0;
 err:
 	be_close(adapter->netdev);
@@ -3139,19 +3155,45 @@  static void be_unmap_pci_bars(struct be_adapter *adapter)
 		iounmap(adapter->csr);
 	if (adapter->db)
 		iounmap(adapter->db);
+	if (adapter->roce_db)
+		iounmap(adapter->roce_db);
+}
+
+static int lancer_roce_map_pci_bars(struct be_adapter *adapter)
+{
+	struct pci_dev *pdev = adapter->pdev;
+	u8 __iomem *addr;
+	addr = ioremap_nocache(pci_resource_start(pdev, 2),
+				pci_resource_len(pdev, 2));
+	if (addr == NULL)
+		return -ENOMEM;
+
+	adapter->roce_db = addr;
+	adapter->roce_db_io_addr = pci_resource_start(pdev, 2);
+	adapter->roce_db_size = 8192;
+	adapter->roce_db_total_size = pci_resource_len(pdev, 2);
+	return 0;
 }
 
 static int be_map_pci_bars(struct be_adapter *adapter)
 {
+	struct pci_dev *pdev = adapter->pdev;
 	u8 __iomem *addr;
 	int db_reg;
 
 	if (lancer_chip(adapter)) {
-		addr = ioremap_nocache(pci_resource_start(adapter->pdev, 0),
-			pci_resource_len(adapter->pdev, 0));
-		if (addr == NULL)
-			return -ENOMEM;
-		adapter->db = addr;
+		if (adapter->if_type == SLI_INTF_TYPE_2 ||
+		    adapter->if_type == SLI_INTF_TYPE_3) {
+			addr = ioremap_nocache(pci_resource_start(pdev, 0),
+			    pci_resource_len(adapter->pdev, 0));
+			if (addr == NULL)
+				return -ENOMEM;
+			adapter->db = addr;
+		}
+		if (adapter->if_type == SLI_INTF_TYPE_3) {
+			if (lancer_roce_map_pci_bars(adapter))
+				goto pci_map_err;
+		}
 		return 0;
 	}
 
@@ -3176,7 +3218,11 @@  static int be_map_pci_bars(struct be_adapter *adapter)
 	if (addr == NULL)
 		goto pci_map_err;
 	adapter->db = addr;
-
+	if (adapter->sli_family == SKYHAWK_SLI_FAMILY) {
+		adapter->roce_db_size = 4096;
+		adapter->roce_db_io_addr = pci_resource_start(pdev, db_reg);
+		adapter->roce_db_total_size = pci_resource_len(pdev, db_reg);
+	}
 	return 0;
 pci_map_err:
 	be_unmap_pci_bars(adapter);
@@ -3289,6 +3335,8 @@  static void __devexit be_remove(struct pci_dev *pdev)
 	if (!adapter)
 		return;
 
+	be_roce_dev_remove(adapter);
+
 	cancel_delayed_work_sync(&adapter->work);
 
 	unregister_netdev(adapter->netdev);
@@ -3343,7 +3391,6 @@  static int be_dev_family_check(struct be_adapter *adapter)
 		break;
 	case BE_DEVICE_ID2:
 	case OC_DEVICE_ID2:
-	case OC_DEVICE_ID5:
 		adapter->generation = BE_GEN3;
 		break;
 	case OC_DEVICE_ID3:
@@ -3353,7 +3400,21 @@  static int be_dev_family_check(struct be_adapter *adapter)
 						SLI_INTF_IF_TYPE_SHIFT;
 
 		if (((sli_intf & SLI_INTF_VALID_MASK) != SLI_INTF_VALID) ||
-			if_type != 0x02) {
+			((if_type != SLI_INTF_TYPE_2) &&
+			 (if_type != SLI_INTF_TYPE_3))) {
+			dev_err(&pdev->dev, "SLI_INTF reg val is not valid\n");
+			return -EINVAL;
+		}
+		adapter->sli_family = ((sli_intf & SLI_INTF_FAMILY_MASK) >>
+							SLI_INTF_FAMILY_SHIFT);
+		adapter->generation = BE_GEN3;
+		adapter->if_type = if_type;
+		break;
+	case OC_DEVICE_ID5:
+		pci_read_config_dword(pdev, SLI_INTF_REG_OFFSET, &sli_intf);
+		if_type = (sli_intf & SLI_INTF_IF_TYPE_MASK) >>
+						SLI_INTF_IF_TYPE_SHIFT;
+		if ((sli_intf & SLI_INTF_VALID_MASK) != SLI_INTF_VALID) {
 			dev_err(&pdev->dev, "SLI_INTF reg val is not valid\n");
 			return -EINVAL;
 		}
@@ -3621,6 +3682,8 @@  static int __devinit be_probe(struct pci_dev *pdev,
 	if (status != 0)
 		goto unsetup;
 
+	be_roce_dev_add(adapter);
+
 	dev_info(&pdev->dev, "%s port %d\n", nic_name(pdev), adapter->port_num);
 
 	schedule_delayed_work(&adapter->work, msecs_to_jiffies(100));
diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c
new file mode 100644
index 0000000..7002160
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_roce.c
@@ -0,0 +1,185 @@ 
+/*
+ * Copyright (C) 2005 - 2011 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation. The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+
+#include "be.h"
+#include "be_cmds.h"
+
+static struct ocrdma_driver *ocrdma_drv;
+static LIST_HEAD(be_adapter_list);
+static DEFINE_MUTEX(be_adapter_list_lock);
+
+static void _be_roce_dev_add(struct be_adapter *adapter)
+{
+	struct be_dev_info dev_info;
+	int i, roce_vect_cnt = 0;
+	struct pci_dev *pdev = adapter->pdev;
+
+	if (!ocrdma_drv || !ocrdma_drv->add || adapter->ocrdma_dev)
+		return;
+	if (pdev->device == OC_DEVICE_ID5) {
+		/* only msix is supported on these devices */
+		if (!msix_enabled(adapter))
+			return;
+		/* DPP region address and length */
+		dev_info.dpp_unmapped_addr = pci_resource_start(pdev, 2);
+		dev_info.dpp_unmapped_len = pci_resource_len(pdev, 2);
+	} else {
+		dev_info.dpp_unmapped_addr = 0;
+		dev_info.dpp_unmapped_len = 0;
+	}
+	dev_info.pdev = adapter->pdev;
+	if (adapter->sli_family == SKYHAWK_SLI_FAMILY)
+		dev_info.db = adapter->db;
+	else
+		dev_info.db = adapter->roce_db;
+	dev_info.unmapped_db = adapter->roce_db_io_addr;
+	dev_info.db_page_size = adapter->roce_db_size;
+	dev_info.db_total_size = adapter->roce_db_total_size;
+	dev_info.netdev = adapter->netdev;
+	memcpy(dev_info.mac_addr, adapter->netdev->dev_addr, ETH_ALEN);
+	dev_info.dev_family = adapter->sli_family;
+	dev_info.vendor_id = adapter->pdev->vendor;
+	dev_info.device_id = adapter->pdev->device;
+	if (msix_enabled(adapter)) {
+		/* provide all the vectors, so that EQ creation response
+		 * can decide which one to use.
+		 */
+		roce_vect_cnt = adapter->num_msix_vec;
+		dev_info.intr_mode = BE_INTERRUPT_MODE_MSIX;
+		dev_info.msix.num_vectors =
+		    min(roce_vect_cnt, MAX_ROCE_MSIX_VECTORS);
+		/* provide start index of the vector,
+		 * so in case of linear usage,
+		 * it can use the base as starting point.
+		 */
+		dev_info.msix.start_vector = adapter->num_eqs;
+		for (i = 0; i < dev_info.msix.num_vectors; i++)
+			dev_info.msix.vector_list[i] =
+			    adapter->msix_entries[i].vector;
+	} else {
+		dev_info.msix.num_vectors = 0;
+		dev_info.intr_mode = BE_INTERRUPT_MODE_INTX;
+	}
+	adapter->ocrdma_dev = ocrdma_drv->add(&dev_info);
+}
+
+void be_roce_dev_add(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		INIT_LIST_HEAD(&adapter->entry);
+		mutex_lock(&be_adapter_list_lock);
+		list_add_tail(&adapter->entry, &be_adapter_list);
+
+		/* invoke add() routine of roce driver only if
+		 * valid driver registered with add method and add() is not yet
+		 * invoked on a given adapter.
+		 */
+		_be_roce_dev_add(adapter);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+void _be_roce_dev_remove(struct be_adapter *adapter)
+{
+	if (ocrdma_drv && ocrdma_drv->remove && adapter->ocrdma_dev)
+		ocrdma_drv->remove(adapter->ocrdma_dev);
+	adapter->ocrdma_dev = NULL;
+}
+
+void be_roce_dev_remove(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		mutex_lock(&be_adapter_list_lock);
+		_be_roce_dev_remove(adapter);
+		list_del(&adapter->entry);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+void _be_roce_dev_open(struct be_adapter *adapter)
+{
+	if (ocrdma_drv && adapter->ocrdma_dev &&
+	    ocrdma_drv->state_change_handler)
+		ocrdma_drv->state_change_handler(adapter->ocrdma_dev, 0);
+}
+
+void be_roce_dev_open(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		mutex_lock(&be_adapter_list_lock);
+		_be_roce_dev_open(adapter);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+void _be_roce_dev_close(struct be_adapter *adapter)
+{
+	if (ocrdma_drv && adapter->ocrdma_dev &&
+	    ocrdma_drv->state_change_handler)
+		ocrdma_drv->state_change_handler(adapter->ocrdma_dev, 1);
+}
+
+void be_roce_dev_close(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		mutex_lock(&be_adapter_list_lock);
+		_be_roce_dev_close(adapter);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+int be_roce_register_driver(struct ocrdma_driver *drv)
+{
+	struct be_adapter *dev;
+	struct net_device *netdev = NULL;
+	int status = 0;
+
+	mutex_lock(&be_adapter_list_lock);
+	if (ocrdma_drv) {
+		mutex_unlock(&be_adapter_list_lock);
+		return -EINVAL;
+	}
+	ocrdma_drv = drv;
+	list_for_each_entry(dev, &be_adapter_list, entry) {
+		_be_roce_dev_add(dev);
+		netdev = dev->netdev;
+		if (netif_running(netdev) && netif_oper_up(netdev))
+			_be_roce_dev_open(dev);
+	}
+	mutex_unlock(&be_adapter_list_lock);
+	return status;
+}
+EXPORT_SYMBOL(be_roce_register_driver);
+
+int be_roce_unregister_driver(struct ocrdma_driver *drv)
+{
+	struct be_adapter *dev;
+	mutex_lock(&be_adapter_list_lock);
+	list_for_each_entry(dev, &be_adapter_list, entry) {
+		if (dev->ocrdma_dev)
+			_be_roce_dev_remove(dev);
+	}
+	ocrdma_drv = NULL;
+	mutex_unlock(&be_adapter_list_lock);
+	return 0;
+}
+EXPORT_SYMBOL(be_roce_unregister_driver);
diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h
new file mode 100644
index 0000000..0bb3634
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_roce.h
@@ -0,0 +1,75 @@ 
+/*
+ * Copyright (C) 2005 - 2011 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation. The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+#ifndef BE_ROCE_H
+#define BE_ROCE_H
+
+struct pci_dev;
+struct net_device;
+
+enum be_interrupt_mode {
+	BE_INTERRUPT_MODE_MSIX = 0,
+	BE_INTERRUPT_MODE_INTX = 1,
+	BE_INTERRUPT_MODE_MSI = 2,
+};
+
+#define MAX_ROCE_MSIX_VECTORS   16
+struct be_dev_info {
+	u16 vendor_id;
+	u16 device_id;
+	u8 __iomem *db;
+	u64 unmapped_db;
+	u32 db_page_size;
+	u32 db_total_size;
+	u64 dpp_unmapped_addr;
+	u32 dpp_unmapped_len;
+	struct pci_dev *pdev;
+	struct net_device *netdev;
+	u8 mac_addr[ETH_ALEN];
+	u32 dev_family;
+	enum be_interrupt_mode intr_mode;
+	struct {
+		int num_vectors;
+		int start_vector;
+		u32 vector_list[MAX_ROCE_MSIX_VECTORS];
+	} msix;
+};
+
+/* ocrdma driver register's the callback functions with nic driver. */
+struct ocrdma_driver {
+	unsigned char name[32];
+	void *(*add) (struct be_dev_info *dev_info);
+	void (*remove) (void *device_handle);
+	void (*state_change_handler) (void *roce_handle, u32 new_state);
+};
+
+enum {
+	BE_DEV_UP = 0,
+	BE_DEV_DOWN = 1
+};
+
+/* APIs for RoCE driver to register callback handlers,
+ * which will be invoked when device is added, removed, ifup, ifdown
+ */
+int be_roce_register_driver(struct ocrdma_driver *drv);
+int be_roce_unregister_driver(struct ocrdma_driver *drv);
+
+/* API for RoCE driver to issue mailbox commands */
+int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload,
+		    int wrb_payload_size, u16 *cmd_status, u16 *ext_status);
+
+#endif /* BE_ROCE_H */