
[for,bnxt_re,V4,03/21] RDMA/bnxt_re: register with the NIC driver

Message ID 1482320530-5344-4-git-send-email-selvin.xavier@broadcom.com
State Not Applicable, archived
Delegated to: David Miller

Commit Message

Selvin Xavier Dec. 21, 2016, 11:41 a.m. UTC
This patch handles registration with the bnxt_en driver.
The bnxt_re driver first registers with the netdev notifier chain and,
upon receiving the NETDEV_REGISTER event, registers with the bnxt_en
driver.

	1. bnxt_en's ulp_probe function returns a structure that contains
	   information about the device and additional entry points.
	2. The 'struct bnxt_en_dev' returned by the bnxt_en driver contains
	   the set of operation vectors that the bnxt_re driver invokes later.
	3. bnxt_request_msix() allows the bnxt_re driver to specify the
	   number of MSI-X vectors that it needs.
	4. bnxt_send_fw_msg() is used to send messages to the FW.
	5. bnxt_register_async_events() is used to register for async
	   event callbacks.
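
For reference, the sequence below is a minimal sketch of how these entry
points are exercised, using only the structures and calls introduced in
this patch (illustrative only, not part of the diff; error handling
omitted — see bnxt_re_register_netdev() and bnxt_re_request_msix()):

	/* Sketch: obtain the bnxt_en handle, register bnxt_re as a ULP,
	 * then request MSI-X vectors through the returned ops table.
	 */
	struct bnxt *bp = netdev_priv(netdev);
	struct bnxt_en_dev *en_dev = bp->ulp_probe(netdev);	/* steps 1-2 */

	en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP,
					     &bnxt_re_ulp_ops, rdev);
	en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP,
					  rdev->msix_entries,
					  BNXT_RE_MIN_MSIX);	/* step 3 */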

v2: Removed some sparse warnings. Also removed some unused code from the
    unreg path.
v3: Removed condition checks on rdev that were reported by static code
    analysis. Check the return value of try_module_get() while taking the
    bnxt_en module reference.

Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h |  51 ++++
 drivers/infiniband/hw/bnxt_re/main.c    | 448 ++++++++++++++++++++++++++++++++
 2 files changed, 499 insertions(+)

Comments

Leon Romanovsky Jan. 18, 2017, 10:11 a.m. UTC | #1
On Wed, Dec 21, 2016 at 03:41:52AM -0800, Selvin Xavier wrote:
> This patch handles the registration with the bnxt_en driver.
> The bnxt_re driver first registers with netdev notifier chain and upon
> receiving the NETDEV_REGISTER event, it registers with bnxt_en driver.
>
> 	1. bnxt_en's ulp_probe function returns a structure that contains
> 	   information 	about the device and additional entry points.
> 	2. bnxt_en driver returns 'struct bnxt_eth_dev' that contains set
> 	   of operation  vectors that bnxt_re driver invokes later.
> 	3. bnxt_request_msix() allows the bnxt_re driver to specify the
> 	   number of MSI-X vectors that are needed.
> 	4. bnxt_send_fw_msg () is used to send messages to the FW
> 	5. bnxt_register_async_events() is used to register for async
> 	   event callbacks.
>
> v2: Remove some sparse warning. Also, remove some unused code from unreg
>     path.
> v3: Removed condition checks for rdev reported during static code analysis.
>     Check the return value of try_module_get while getting bnxt_en
>     reference.
>
> Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
> Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
> Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
> Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
> ---
>  drivers/infiniband/hw/bnxt_re/bnxt_re.h |  51 ++++
>  drivers/infiniband/hw/bnxt_re/main.c    | 448 ++++++++++++++++++++++++++++++++
>  2 files changed, 499 insertions(+)
>
> diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
> index f9b8542..8b73e3d 100644
> --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
> +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
> @@ -42,5 +42,56 @@
>  #define ROCE_DRV_MODULE_NAME		"bnxt_re"
>  #define ROCE_DRV_MODULE_VERSION		"1.0.0"
>
> +#define BNXT_RE_REF_WAIT_COUNT		10
>  #define BNXT_RE_DESC	"Broadcom NetXtreme-C/E RoCE Driver"
> +
> +struct bnxt_re_work {
> +	struct work_struct	work;
> +	unsigned long		event;
> +	struct bnxt_re_dev      *rdev;
> +	struct net_device	*vlan_dev;
> +};
> +
> +#define BNXT_RE_MIN_MSIX		2
> +#define BNXT_RE_MAX_MSIX		16
> +struct bnxt_re_dev {
> +	struct ib_device		ibdev;
> +	struct list_head		list;
> +	atomic_t			ref_count;
> +	unsigned long			flags;
> +#define BNXT_RE_FLAG_NETDEV_REGISTERED	0
> +#define BNXT_RE_FLAG_IBDEV_REGISTERED	1
> +#define BNXT_RE_FLAG_GOT_MSIX		2
> +#define BNXT_RE_FLAG_RCFW_CHANNEL_EN	8
> +#define BNXT_RE_FLAG_QOS_WORK_REG	16
> +	struct net_device		*netdev;
> +	unsigned int			version, major, minor;
> +	struct bnxt_en_dev		*en_dev;
> +	struct bnxt_msix_entry		msix_entries[BNXT_RE_MAX_MSIX];
> +	int				num_msix;
> +
> +	int				id;
> +
> +	atomic_t			qp_count;
> +	struct mutex			qp_lock;	/* protect qp list */
> +	struct list_head		qp_list;
> +
> +	atomic_t			cq_count;
> +	atomic_t			srq_count;
> +	atomic_t			mr_count;
> +	atomic_t			mw_count;
> +	/* Max of 2 lossless traffic class supported per port */
> +	u16				cosq[2];
> +};
> +
> +#define to_bnxt_re_dev(ptr, member)	\
> +	container_of((ptr), struct bnxt_re_dev, member)
> +
> +static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev)
> +{
> +	if (rdev)
> +		return  &rdev->ibdev.dev;
> +	return NULL;
> +}
> +
>  #endif
> diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
> index 6c22d51..71d7501 100644
> --- a/drivers/infiniband/hw/bnxt_re/main.c
> +++ b/drivers/infiniband/hw/bnxt_re/main.c
> @@ -38,10 +38,24 @@
>
>  #include <linux/module.h>
>  #include <linux/netdevice.h>
> +#include <linux/ethtool.h>
>  #include <linux/mutex.h>
>  #include <linux/list.h>
>  #include <linux/rculist.h>
> +#include <linux/spinlock.h>
> +#include <linux/pci.h>
> +#include <net/ipv6.h>
> +#include <net/addrconf.h>
> +
> +#include <rdma/ib_verbs.h>
> +#include <rdma/ib_user_verbs.h>
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_addr.h>
> +
> +#include "bnxt_ulp.h"
> +#include "roce_hsi.h"
>  #include "bnxt_re.h"
> +#include "bnxt.h"
>  static char version[] =
>  		BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n";
>
> @@ -55,6 +69,384 @@ static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list);
>  /* Mutex to protect the list of bnxt_re devices added */
>  static DEFINE_MUTEX(bnxt_re_dev_lock);
>  static struct workqueue_struct *bnxt_re_wq;
> +
> +/* for handling bnxt_en callbacks later */
> +static void bnxt_re_stop(void *p)
> +{
> +}
> +
> +static void bnxt_re_start(void *p)
> +{
> +}
> +
> +static void bnxt_re_sriov_config(void *p, int num_vfs)
> +{
> +}
> +
> +static struct bnxt_ulp_ops bnxt_re_ulp_ops = {
> +	.ulp_async_notifier = NULL,
> +	.ulp_stop = bnxt_re_stop,
> +	.ulp_start = bnxt_re_start,
> +	.ulp_sriov_config = bnxt_re_sriov_config
> +};
> +
> +/* The rdev ref_count is to protect immature removal of the device */
> +static inline void bnxt_re_hold(struct bnxt_re_dev *rdev)
> +{
> +	atomic_inc(&rdev->ref_count);
> +}
> +
> +static inline void bnxt_re_put(struct bnxt_re_dev *rdev)
> +{
> +	atomic_dec(&rdev->ref_count);
> +}
> +
> +/* RoCE -> Net driver */
> +
> +/* Driver registration routines used to let the networking driver (bnxt_en)
> + * to know that the RoCE driver is now installed
> + */
> +static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait)
> +{
> +	struct bnxt_en_dev *en_dev;
> +	int rc;
> +
> +	if (!rdev)
> +		return -EINVAL;
> +
> +	en_dev = rdev->en_dev;
> +	/* Acquire rtnl lock if it is not invokded from netdev event */
> +	if (lock_wait)
> +		rtnl_lock();
> +
> +	rc = en_dev->en_ops->bnxt_unregister_device(rdev->en_dev,
> +						    BNXT_ROCE_ULP);
> +	if (lock_wait)
> +		rtnl_unlock();
> +	return rc;
> +}
> +
> +static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev)
> +{
> +	struct bnxt_en_dev *en_dev;
> +	int rc = 0;
> +
> +	if (!rdev)
> +		return -EINVAL;
> +
> +	en_dev = rdev->en_dev;
> +
> +	rtnl_lock();
> +	rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP,
> +						  &bnxt_re_ulp_ops, rdev);
> +	rtnl_unlock();
> +	return rc;
> +}
> +
> +static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait)
> +{
> +	struct bnxt_en_dev *en_dev;
> +	int rc;
> +
> +	if (!rdev)
> +		return -EINVAL;
> +
> +	en_dev = rdev->en_dev;
> +
> +	if (lock_wait)
> +		rtnl_lock();
> +
> +	rc = en_dev->en_ops->bnxt_free_msix(rdev->en_dev, BNXT_ROCE_ULP);
> +
> +	if (lock_wait)
> +		rtnl_unlock();
> +	return rc;
> +}
> +
> +static int bnxt_re_request_msix(struct bnxt_re_dev *rdev)
> +{
> +	int rc = 0, num_msix_want = BNXT_RE_MIN_MSIX, num_msix_got;
> +	struct bnxt_en_dev *en_dev;
> +
> +	if (!rdev)
> +		return -EINVAL;
> +
> +	en_dev = rdev->en_dev;
> +
> +	rtnl_lock();
> +	num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP,
> +							 rdev->msix_entries,
> +							 num_msix_want);
> +	if (num_msix_got < BNXT_RE_MIN_MSIX) {
> +		rc = -EINVAL;
> +		goto done;
> +	}
> +	if (num_msix_got != num_msix_want) {
> +		dev_warn(rdev_to_dev(rdev),
> +			 "Requested %d MSI-X vectors, got %d\n",
> +			 num_msix_want, num_msix_got);
> +	}
> +	rdev->num_msix = num_msix_got;
> +done:
> +	rtnl_unlock();
> +	return rc;
> +}
> +
> +/* Device */
> +
> +static bool is_bnxt_re_dev(struct net_device *netdev)
> +{
> +	struct ethtool_drvinfo drvinfo;
> +
> +	if (netdev->ethtool_ops && netdev->ethtool_ops->get_drvinfo) {
> +		memset(&drvinfo, 0, sizeof(drvinfo));
> +		netdev->ethtool_ops->get_drvinfo(netdev, &drvinfo);
> +
> +		if (strcmp(drvinfo.driver, "bnxt_en"))
> +			return false;
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev)
> +{
> +	struct bnxt_re_dev *rdev;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(rdev, &bnxt_re_dev_list, list) {
> +		if (rdev->netdev == netdev) {
> +			rcu_read_unlock();
> +			return rdev;
> +		}
> +	}
> +	rcu_read_unlock();
> +	return NULL;
> +}
> +
> +static void bnxt_re_dev_unprobe(struct net_device *netdev,
> +				struct bnxt_en_dev *en_dev)
> +{
> +	dev_put(netdev);
> +	module_put(en_dev->pdev->driver->driver.owner);
> +}
> +
> +static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
> +{
> +	struct bnxt *bp = netdev_priv(netdev);
> +	struct bnxt_en_dev *en_dev;
> +	struct pci_dev *pdev;
> +
> +	/* Call bnxt_en's RoCE probe via indirect API */
> +	if (!bp->ulp_probe)
> +		return ERR_PTR(-EINVAL);
> +
> +	en_dev = bp->ulp_probe(netdev);
> +	if (IS_ERR(en_dev))
> +		return en_dev;
> +
> +	pdev = en_dev->pdev;
> +	if (!pdev)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* Bump net device reference count */
> +	if (!try_module_get(pdev->driver->driver.owner))
> +		return ERR_PTR(-ENODEV);
> +
> +	dev_hold(netdev);
> +
> +	return en_dev;
> +}
> +
> +static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
> +{
> +	int i = BNXT_RE_REF_WAIT_COUNT;
> +
> +	/* Wait for rdev refcount to come down */
> +	while ((atomic_read(&rdev->ref_count) > 1) && i--)
> +		msleep(100);
> +
> +	if (atomic_read(&rdev->ref_count) > 1)
> +		dev_err(rdev_to_dev(rdev),
> +			"Failed waiting for ref count to deplete %d",
> +			atomic_read(&rdev->ref_count));
> +
> +	atomic_set(&rdev->ref_count, 0);
> +	dev_put(rdev->netdev);
> +	rdev->netdev = NULL;
> +
> +	mutex_lock(&bnxt_re_dev_lock);
> +	list_del_rcu(&rdev->list);
> +	mutex_unlock(&bnxt_re_dev_lock);
> +
> +	synchronize_rcu();
> +	flush_workqueue(bnxt_re_wq);
> +
> +	ib_dealloc_device(&rdev->ibdev);
> +	/* rdev is gone */
> +}
> +
> +static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
> +					   struct bnxt_en_dev *en_dev)
> +{
> +	struct bnxt_re_dev *rdev;
> +
> +	/* Allocate bnxt_re_dev instance here */
> +	rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev));
> +	if (!rdev) {
> +		dev_err(NULL, "%s: bnxt_re_dev allocation failure!",
> +			ROCE_DRV_MODULE_NAME);
> +		return NULL;
> +	}
> +	/* Default values */
> +	atomic_set(&rdev->ref_count, 0);
> +	rdev->netdev = netdev;
> +	dev_hold(rdev->netdev);
> +	rdev->en_dev = en_dev;
> +	rdev->id = rdev->en_dev->pdev->devfn;
> +	INIT_LIST_HEAD(&rdev->qp_list);
> +	mutex_init(&rdev->qp_lock);
> +	atomic_set(&rdev->qp_count, 0);
> +	atomic_set(&rdev->cq_count, 0);
> +	atomic_set(&rdev->srq_count, 0);
> +	atomic_set(&rdev->mr_count, 0);
> +	atomic_set(&rdev->mw_count, 0);
> +	rdev->cosq[0] = 0xFFFF;
> +	rdev->cosq[1] = 0xFFFF;
> +
> +	mutex_lock(&bnxt_re_dev_lock);
> +	list_add_tail_rcu(&rdev->list, &bnxt_re_dev_list);
> +	mutex_unlock(&bnxt_re_dev_lock);
> +	return rdev;
> +}
> +
> +static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait)
> +{
> +	int rc;
> +
> +	if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
> +		rc = bnxt_re_free_msix(rdev, lock_wait);
> +		if (rc)
> +			dev_warn(rdev_to_dev(rdev),
> +				 "Failed to free MSI-X vectors: %#x", rc);
> +	}
> +	if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) {
> +		rc = bnxt_re_unregister_netdev(rdev, lock_wait);
> +		if (rc)
> +			dev_warn(rdev_to_dev(rdev),
> +				 "Failed to unregister with netdev: %#x", rc);
> +	}
> +}
> +
> +static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
> +{
> +	int i, j, rc;
> +
> +	/* Registered a new RoCE device instance to netdev */
> +	rc = bnxt_re_register_netdev(rdev);
> +	if (rc) {
> +		pr_err("Failed to register with netedev: %#x\n", rc);
> +		return -EINVAL;
> +	}
> +	set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
> +
> +	rc = bnxt_re_request_msix(rdev);
> +	if (rc) {
> +		pr_err("Failed to get MSI-X vectors: %#x\n", rc);
> +		rc = -EINVAL;
> +		goto fail;
> +	}
> +	set_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags);
> +
> +	return 0;
> +fail:
> +	bnxt_re_ib_unreg(rdev, true);
> +	return rc;
> +}
> +
> +static void bnxt_re_dev_unreg(struct bnxt_re_dev *rdev)
> +{
> +	struct bnxt_en_dev *en_dev = rdev->en_dev;
> +	struct net_device *netdev = rdev->netdev;
> +
> +	bnxt_re_dev_remove(rdev);
> +
> +	if (netdev)
> +		bnxt_re_dev_unprobe(netdev, en_dev);
> +}
> +
> +static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev)
> +{
> +	struct bnxt_en_dev *en_dev;
> +	int rc = 0;
> +
> +	if (!is_bnxt_re_dev(netdev))
> +		return -ENODEV;
> +
> +	en_dev = bnxt_re_dev_probe(netdev);
> +	if (IS_ERR(en_dev)) {
> +		pr_err("%s: Failed to probe\n", ROCE_DRV_MODULE_NAME);
> +		rc = PTR_ERR(en_dev);
> +		goto exit;
> +	}
> +	*rdev = bnxt_re_dev_add(netdev, en_dev);
> +	if (!*rdev) {
> +		rc = -ENOMEM;
> +		bnxt_re_dev_unprobe(netdev, en_dev);
> +		goto exit;
> +	}
> +	bnxt_re_hold(*rdev);
> +exit:
> +	return rc;
> +}
> +
> +static void bnxt_re_remove_one(struct bnxt_re_dev *rdev)
> +{
> +	pci_dev_put(rdev->en_dev->pdev);
> +}
> +
> +/* Handle all deferred netevents tasks */
> +static void bnxt_re_task(struct work_struct *work)
> +{
> +	struct bnxt_re_work *re_work;
> +	struct bnxt_re_dev *rdev;
> +	int rc = 0;
> +
> +	re_work = container_of(work, struct bnxt_re_work, work);
> +	rdev = re_work->rdev;
> +
> +	if (re_work->event != NETDEV_REGISTER &&
> +	    !test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
> +		return;
> +
> +	switch (re_work->event) {
> +	case NETDEV_REGISTER:
> +		rc = bnxt_re_ib_reg(rdev);
> +		if (rc)
> +			dev_err(rdev_to_dev(rdev),
> +				"Failed to register with IB: %#x", rc);
> +		break;
> +	case NETDEV_UP:
> +
> +		break;
> +	case NETDEV_DOWN:
> +
> +		break;
> +
> +	case NETDEV_CHANGE:
> +
> +		break;
> +	default:
> +		break;
> +	}

Why do you need empty lines and "break"s between cases?
They can be written as a fall-through:
case ..:
case ..:
	break;
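
Applied to this switch, the suggested form would look roughly like this
(an illustrative sketch, not a tested change):

	switch (re_work->event) {
	case NETDEV_REGISTER:
		rc = bnxt_re_ib_reg(rdev);
		if (rc)
			dev_err(rdev_to_dev(rdev),
				"Failed to register with IB: %#x", rc);
		break;
	case NETDEV_UP:
	case NETDEV_DOWN:
	case NETDEV_CHANGE:
	default:
		break;
	}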

> +	kfree(re_work);
> +}
> +
> +static void bnxt_re_init_one(struct bnxt_re_dev *rdev)
> +{
> +	pci_dev_get(rdev->en_dev->pdev);
> +}
> +
>  /*
>   * "Notifier chain callback can be invoked for the same chain from
>   * different CPUs at the same time".
> @@ -72,6 +464,62 @@ static struct workqueue_struct *bnxt_re_wq;
>  static int bnxt_re_netdev_event(struct notifier_block *notifier,
>  				unsigned long event, void *ptr)
>  {
> +	struct net_device *real_dev, *netdev = netdev_notifier_info_to_dev(ptr);
> +	struct bnxt_re_work *re_work;
> +	struct bnxt_re_dev *rdev;
> +	int rc = 0;
> +
> +	real_dev = rdma_vlan_dev_real_dev(netdev);
> +	if (!real_dev)
> +		real_dev = netdev;
> +
> +	rdev = bnxt_re_from_netdev(real_dev);
> +	if (!rdev && event != NETDEV_REGISTER)
> +		goto exit;
> +	if (real_dev != netdev)
> +		goto exit;
> +
> +	if (rdev)
> +		bnxt_re_hold(rdev);
> +
> +	switch (event) {
> +	case NETDEV_REGISTER:
> +		if (rdev)
> +			break;
> +		rc = bnxt_re_dev_reg(&rdev, real_dev);
> +		if (rc == -ENODEV)
> +			break;
> +		if (rc) {
> +			pr_err("Failed to register with the device %s: %#x\n",
> +			       real_dev->name, rc);
> +			break;
> +		}
> +		bnxt_re_init_one(rdev);
> +		goto sch_work;
> +
> +	case NETDEV_UNREGISTER:
> +		bnxt_re_ib_unreg(rdev, false);
> +		bnxt_re_remove_one(rdev);
> +		bnxt_re_dev_unreg(rdev);
> +		break;
> +
> +	default:
> +sch_work:

A goto label in the middle of a switch construction is very awkward.
Please consider rewriting it.
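
One way to avoid the label is to move the scheduling into a small helper
and call it from both the NETDEV_REGISTER case and the default case, for
example (a rough sketch; the helper name is hypothetical):

	/* Hypothetical helper: queue deferred handling of a netdev event */
	static void bnxt_re_sched_work(struct bnxt_re_dev *rdev,
				       unsigned long event,
				       struct net_device *vlan_dev)
	{
		struct bnxt_re_work *re_work;

		re_work = kzalloc(sizeof(*re_work), GFP_ATOMIC);
		if (!re_work)
			return;

		re_work->rdev = rdev;
		re_work->event = event;
		re_work->vlan_dev = vlan_dev;
		INIT_WORK(&re_work->work, bnxt_re_task);
		queue_work(bnxt_re_wq, &re_work->work);
	}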

> +		/* Allocate for the deferred task */
> +		re_work = kzalloc(sizeof(*re_work), GFP_ATOMIC);
> +		if (!re_work)
> +			break;
> +
> +		re_work->rdev = rdev;
> +		re_work->event = event;
> +		re_work->vlan_dev = (real_dev == netdev ? NULL : netdev);
> +		INIT_WORK(&re_work->work, bnxt_re_task);
> +		queue_work(bnxt_re_wq, &re_work->work);
> +		break;
> +	}
> +	if (rdev)
> +		bnxt_re_put(rdev);
> +exit:
>  	return NOTIFY_DONE;
>  }
>
> --
> 2.5.5
>
Doug Ledford Feb. 6, 2017, 8:26 p.m. UTC | #2
On Wed, 2016-12-21 at 03:41 -0800, Selvin Xavier wrote:
> +static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
> +{
> +       int i = BNXT_RE_REF_WAIT_COUNT;
> +
> +       /* Wait for rdev refcount to come down */
> +       while ((atomic_read(&rdev->ref_count) > 1) && i--)
> +               msleep(100);
> +
> +       if (atomic_read(&rdev->ref_count) > 1)
> +               dev_err(rdev_to_dev(rdev),
> +                       "Failed waiting for ref count to deplete %d",
> +                       atomic_read(&rdev->ref_count));
> +
> +       atomic_set(&rdev->ref_count, 0);
> +       dev_put(rdev->netdev);
> +       rdev->netdev = NULL;
> +
> +       mutex_lock(&bnxt_re_dev_lock);
> +       list_del_rcu(&rdev->list);
> +       mutex_unlock(&bnxt_re_dev_lock);
> +
> +       synchronize_rcu();
> +       flush_workqueue(bnxt_re_wq);
> +
> +       ib_dealloc_device(&rdev->ibdev);
> +       /* rdev is gone */
> +}

This looks bad.  Either your ref counting is right, and your ref count
should go to 1, or you have an issue that won't be helped by forcibly
removing the device.  In its current form, this looks like an oopser
waiting to happen.

If you know your ref counting is right, then simply wait until it goes
to 1; don't have this bailout logic.  If you truly need to bail out for
some reason, then you need to not free things.  Better to shut down,
leak, and live than to release and have a use-after-free data corrupter.
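
In other words, the wait could simply be (an illustrative sketch of the
suggestion, not reviewed code):

	/* Block until every other holder has dropped its reference */
	while (atomic_read(&rdev->ref_count) > 1)
		msleep(100);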
Selvin Xavier Feb. 10, 2017, 11:41 a.m. UTC | #3
On Tue, Feb 7, 2017 at 1:56 AM, Doug Ledford <dledford@redhat.com> wrote:
>> +static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
>> +{
>> +       int i = BNXT_RE_REF_WAIT_COUNT;
>> +
>> +       /* Wait for rdev refcount to come down */
>> +       while ((atomic_read(&rdev->ref_count) > 1) && i--)
>> +               msleep(100);
>> +
>> +       if (atomic_read(&rdev->ref_count) > 1)
>> +               dev_err(rdev_to_dev(rdev),
>> +                       "Failed waiting for ref count to deplete %d",
>> +                       atomic_read(&rdev->ref_count));
>> +
>> +       atomic_set(&rdev->ref_count, 0);
>> +       dev_put(rdev->netdev);
>> +       rdev->netdev = NULL;
>> +
>> +       mutex_lock(&bnxt_re_dev_lock);
>> +       list_del_rcu(&rdev->list);
>> +       mutex_unlock(&bnxt_re_dev_lock);
>> +
>> +       synchronize_rcu();
>> +       flush_workqueue(bnxt_re_wq);
>> +
>> +       ib_dealloc_device(&rdev->ibdev);
>> +       /* rdev is gone */
>> +}
>
> This looks bad.  Either your ref counting is right, and your ref count
> should go to 1, or you have an issue that won't be helped by forcibly
> removing the device.  In its current form, this looks like an oopser
> waiting to happen.
>
> If you know your ref counting is right, then simply wait until it goes
> to 1, don't have this bailout logic.  If you truly need to bailout for
> some reason, then you need to not free things.  Better to shutdown then
> leak and live than release and have a use after free data corrupter.

Thanks for your comment.
I reviewed the usage of ref_count again and it is not really required
in this patch series. It is only incremented and decremented from the
netdev notifier, so it will always be 1 in bnxt_re_dev_remove().
bnxt_re_dev_remove() is also invoked from the netdev notifier, so there
is no need to wait in this function. I have removed the ref_count
variable and cleaned up this function in the v5 patch set.
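
For reference, a rough sketch of the cleaned-up function under that
reasoning (illustrative only; the actual v5 code may differ):

	static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
	{
		/* Called only from the netdev notifier, so no waiting needed */
		dev_put(rdev->netdev);
		rdev->netdev = NULL;

		mutex_lock(&bnxt_re_dev_lock);
		list_del_rcu(&rdev->list);
		mutex_unlock(&bnxt_re_dev_lock);

		synchronize_rcu();
		flush_workqueue(bnxt_re_wq);

		ib_dealloc_device(&rdev->ibdev);
	}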

Thanks,
Selvin

Patch

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index f9b8542..8b73e3d 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -42,5 +42,56 @@ 
 #define ROCE_DRV_MODULE_NAME		"bnxt_re"
 #define ROCE_DRV_MODULE_VERSION		"1.0.0"
 
+#define BNXT_RE_REF_WAIT_COUNT		10
 #define BNXT_RE_DESC	"Broadcom NetXtreme-C/E RoCE Driver"
+
+struct bnxt_re_work {
+	struct work_struct	work;
+	unsigned long		event;
+	struct bnxt_re_dev      *rdev;
+	struct net_device	*vlan_dev;
+};
+
+#define BNXT_RE_MIN_MSIX		2
+#define BNXT_RE_MAX_MSIX		16
+struct bnxt_re_dev {
+	struct ib_device		ibdev;
+	struct list_head		list;
+	atomic_t			ref_count;
+	unsigned long			flags;
+#define BNXT_RE_FLAG_NETDEV_REGISTERED	0
+#define BNXT_RE_FLAG_IBDEV_REGISTERED	1
+#define BNXT_RE_FLAG_GOT_MSIX		2
+#define BNXT_RE_FLAG_RCFW_CHANNEL_EN	8
+#define BNXT_RE_FLAG_QOS_WORK_REG	16
+	struct net_device		*netdev;
+	unsigned int			version, major, minor;
+	struct bnxt_en_dev		*en_dev;
+	struct bnxt_msix_entry		msix_entries[BNXT_RE_MAX_MSIX];
+	int				num_msix;
+
+	int				id;
+
+	atomic_t			qp_count;
+	struct mutex			qp_lock;	/* protect qp list */
+	struct list_head		qp_list;
+
+	atomic_t			cq_count;
+	atomic_t			srq_count;
+	atomic_t			mr_count;
+	atomic_t			mw_count;
+	/* Max of 2 lossless traffic class supported per port */
+	u16				cosq[2];
+};
+
+#define to_bnxt_re_dev(ptr, member)	\
+	container_of((ptr), struct bnxt_re_dev, member)
+
+static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev)
+{
+	if (rdev)
+		return  &rdev->ibdev.dev;
+	return NULL;
+}
+
 #endif
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 6c22d51..71d7501 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -38,10 +38,24 @@ 
 
 #include <linux/module.h>
 #include <linux/netdevice.h>
+#include <linux/ethtool.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_addr.h>
+
+#include "bnxt_ulp.h"
+#include "roce_hsi.h"
 #include "bnxt_re.h"
+#include "bnxt.h"
 static char version[] =
 		BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n";
 
@@ -55,6 +69,384 @@  static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list);
 /* Mutex to protect the list of bnxt_re devices added */
 static DEFINE_MUTEX(bnxt_re_dev_lock);
 static struct workqueue_struct *bnxt_re_wq;
+
+/* for handling bnxt_en callbacks later */
+static void bnxt_re_stop(void *p)
+{
+}
+
+static void bnxt_re_start(void *p)
+{
+}
+
+static void bnxt_re_sriov_config(void *p, int num_vfs)
+{
+}
+
+static struct bnxt_ulp_ops bnxt_re_ulp_ops = {
+	.ulp_async_notifier = NULL,
+	.ulp_stop = bnxt_re_stop,
+	.ulp_start = bnxt_re_start,
+	.ulp_sriov_config = bnxt_re_sriov_config
+};
+
+/* The rdev ref_count is to protect immature removal of the device */
+static inline void bnxt_re_hold(struct bnxt_re_dev *rdev)
+{
+	atomic_inc(&rdev->ref_count);
+}
+
+static inline void bnxt_re_put(struct bnxt_re_dev *rdev)
+{
+	atomic_dec(&rdev->ref_count);
+}
+
+/* RoCE -> Net driver */
+
+/* Driver registration routines used to let the networking driver (bnxt_en)
+ * to know that the RoCE driver is now installed
+ */
+static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait)
+{
+	struct bnxt_en_dev *en_dev;
+	int rc;
+
+	if (!rdev)
+		return -EINVAL;
+
+	en_dev = rdev->en_dev;
+	/* Acquire rtnl lock if it is not invokded from netdev event */
+	if (lock_wait)
+		rtnl_lock();
+
+	rc = en_dev->en_ops->bnxt_unregister_device(rdev->en_dev,
+						    BNXT_ROCE_ULP);
+	if (lock_wait)
+		rtnl_unlock();
+	return rc;
+}
+
+static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev)
+{
+	struct bnxt_en_dev *en_dev;
+	int rc = 0;
+
+	if (!rdev)
+		return -EINVAL;
+
+	en_dev = rdev->en_dev;
+
+	rtnl_lock();
+	rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP,
+						  &bnxt_re_ulp_ops, rdev);
+	rtnl_unlock();
+	return rc;
+}
+
+static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait)
+{
+	struct bnxt_en_dev *en_dev;
+	int rc;
+
+	if (!rdev)
+		return -EINVAL;
+
+	en_dev = rdev->en_dev;
+
+	if (lock_wait)
+		rtnl_lock();
+
+	rc = en_dev->en_ops->bnxt_free_msix(rdev->en_dev, BNXT_ROCE_ULP);
+
+	if (lock_wait)
+		rtnl_unlock();
+	return rc;
+}
+
+static int bnxt_re_request_msix(struct bnxt_re_dev *rdev)
+{
+	int rc = 0, num_msix_want = BNXT_RE_MIN_MSIX, num_msix_got;
+	struct bnxt_en_dev *en_dev;
+
+	if (!rdev)
+		return -EINVAL;
+
+	en_dev = rdev->en_dev;
+
+	rtnl_lock();
+	num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP,
+							 rdev->msix_entries,
+							 num_msix_want);
+	if (num_msix_got < BNXT_RE_MIN_MSIX) {
+		rc = -EINVAL;
+		goto done;
+	}
+	if (num_msix_got != num_msix_want) {
+		dev_warn(rdev_to_dev(rdev),
+			 "Requested %d MSI-X vectors, got %d\n",
+			 num_msix_want, num_msix_got);
+	}
+	rdev->num_msix = num_msix_got;
+done:
+	rtnl_unlock();
+	return rc;
+}
+
+/* Device */
+
+static bool is_bnxt_re_dev(struct net_device *netdev)
+{
+	struct ethtool_drvinfo drvinfo;
+
+	if (netdev->ethtool_ops && netdev->ethtool_ops->get_drvinfo) {
+		memset(&drvinfo, 0, sizeof(drvinfo));
+		netdev->ethtool_ops->get_drvinfo(netdev, &drvinfo);
+
+		if (strcmp(drvinfo.driver, "bnxt_en"))
+			return false;
+		return true;
+	}
+	return false;
+}
+
+static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev)
+{
+	struct bnxt_re_dev *rdev;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rdev, &bnxt_re_dev_list, list) {
+		if (rdev->netdev == netdev) {
+			rcu_read_unlock();
+			return rdev;
+		}
+	}
+	rcu_read_unlock();
+	return NULL;
+}
+
+static void bnxt_re_dev_unprobe(struct net_device *netdev,
+				struct bnxt_en_dev *en_dev)
+{
+	dev_put(netdev);
+	module_put(en_dev->pdev->driver->driver.owner);
+}
+
+static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
+{
+	struct bnxt *bp = netdev_priv(netdev);
+	struct bnxt_en_dev *en_dev;
+	struct pci_dev *pdev;
+
+	/* Call bnxt_en's RoCE probe via indirect API */
+	if (!bp->ulp_probe)
+		return ERR_PTR(-EINVAL);
+
+	en_dev = bp->ulp_probe(netdev);
+	if (IS_ERR(en_dev))
+		return en_dev;
+
+	pdev = en_dev->pdev;
+	if (!pdev)
+		return ERR_PTR(-EINVAL);
+
+	/* Bump net device reference count */
+	if (!try_module_get(pdev->driver->driver.owner))
+		return ERR_PTR(-ENODEV);
+
+	dev_hold(netdev);
+
+	return en_dev;
+}
+
+static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
+{
+	int i = BNXT_RE_REF_WAIT_COUNT;
+
+	/* Wait for rdev refcount to come down */
+	while ((atomic_read(&rdev->ref_count) > 1) && i--)
+		msleep(100);
+
+	if (atomic_read(&rdev->ref_count) > 1)
+		dev_err(rdev_to_dev(rdev),
+			"Failed waiting for ref count to deplete %d",
+			atomic_read(&rdev->ref_count));
+
+	atomic_set(&rdev->ref_count, 0);
+	dev_put(rdev->netdev);
+	rdev->netdev = NULL;
+
+	mutex_lock(&bnxt_re_dev_lock);
+	list_del_rcu(&rdev->list);
+	mutex_unlock(&bnxt_re_dev_lock);
+
+	synchronize_rcu();
+	flush_workqueue(bnxt_re_wq);
+
+	ib_dealloc_device(&rdev->ibdev);
+	/* rdev is gone */
+}
+
+static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
+					   struct bnxt_en_dev *en_dev)
+{
+	struct bnxt_re_dev *rdev;
+
+	/* Allocate bnxt_re_dev instance here */
+	rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev));
+	if (!rdev) {
+		dev_err(NULL, "%s: bnxt_re_dev allocation failure!",
+			ROCE_DRV_MODULE_NAME);
+		return NULL;
+	}
+	/* Default values */
+	atomic_set(&rdev->ref_count, 0);
+	rdev->netdev = netdev;
+	dev_hold(rdev->netdev);
+	rdev->en_dev = en_dev;
+	rdev->id = rdev->en_dev->pdev->devfn;
+	INIT_LIST_HEAD(&rdev->qp_list);
+	mutex_init(&rdev->qp_lock);
+	atomic_set(&rdev->qp_count, 0);
+	atomic_set(&rdev->cq_count, 0);
+	atomic_set(&rdev->srq_count, 0);
+	atomic_set(&rdev->mr_count, 0);
+	atomic_set(&rdev->mw_count, 0);
+	rdev->cosq[0] = 0xFFFF;
+	rdev->cosq[1] = 0xFFFF;
+
+	mutex_lock(&bnxt_re_dev_lock);
+	list_add_tail_rcu(&rdev->list, &bnxt_re_dev_list);
+	mutex_unlock(&bnxt_re_dev_lock);
+	return rdev;
+}
+
+static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait)
+{
+	int rc;
+
+	if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
+		rc = bnxt_re_free_msix(rdev, lock_wait);
+		if (rc)
+			dev_warn(rdev_to_dev(rdev),
+				 "Failed to free MSI-X vectors: %#x", rc);
+	}
+	if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) {
+		rc = bnxt_re_unregister_netdev(rdev, lock_wait);
+		if (rc)
+			dev_warn(rdev_to_dev(rdev),
+				 "Failed to unregister with netdev: %#x", rc);
+	}
+}
+
+static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
+{
+	int i, j, rc;
+
+	/* Registered a new RoCE device instance to netdev */
+	rc = bnxt_re_register_netdev(rdev);
+	if (rc) {
+		pr_err("Failed to register with netedev: %#x\n", rc);
+		return -EINVAL;
+	}
+	set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+
+	rc = bnxt_re_request_msix(rdev);
+	if (rc) {
+		pr_err("Failed to get MSI-X vectors: %#x\n", rc);
+		rc = -EINVAL;
+		goto fail;
+	}
+	set_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags);
+
+	return 0;
+fail:
+	bnxt_re_ib_unreg(rdev, true);
+	return rc;
+}
+
+static void bnxt_re_dev_unreg(struct bnxt_re_dev *rdev)
+{
+	struct bnxt_en_dev *en_dev = rdev->en_dev;
+	struct net_device *netdev = rdev->netdev;
+
+	bnxt_re_dev_remove(rdev);
+
+	if (netdev)
+		bnxt_re_dev_unprobe(netdev, en_dev);
+}
+
+static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev)
+{
+	struct bnxt_en_dev *en_dev;
+	int rc = 0;
+
+	if (!is_bnxt_re_dev(netdev))
+		return -ENODEV;
+
+	en_dev = bnxt_re_dev_probe(netdev);
+	if (IS_ERR(en_dev)) {
+		pr_err("%s: Failed to probe\n", ROCE_DRV_MODULE_NAME);
+		rc = PTR_ERR(en_dev);
+		goto exit;
+	}
+	*rdev = bnxt_re_dev_add(netdev, en_dev);
+	if (!*rdev) {
+		rc = -ENOMEM;
+		bnxt_re_dev_unprobe(netdev, en_dev);
+		goto exit;
+	}
+	bnxt_re_hold(*rdev);
+exit:
+	return rc;
+}
+
+static void bnxt_re_remove_one(struct bnxt_re_dev *rdev)
+{
+	pci_dev_put(rdev->en_dev->pdev);
+}
+
+/* Handle all deferred netevents tasks */
+static void bnxt_re_task(struct work_struct *work)
+{
+	struct bnxt_re_work *re_work;
+	struct bnxt_re_dev *rdev;
+	int rc = 0;
+
+	re_work = container_of(work, struct bnxt_re_work, work);
+	rdev = re_work->rdev;
+
+	if (re_work->event != NETDEV_REGISTER &&
+	    !test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
+		return;
+
+	switch (re_work->event) {
+	case NETDEV_REGISTER:
+		rc = bnxt_re_ib_reg(rdev);
+		if (rc)
+			dev_err(rdev_to_dev(rdev),
+				"Failed to register with IB: %#x", rc);
+		break;
+	case NETDEV_UP:
+
+		break;
+	case NETDEV_DOWN:
+
+		break;
+
+	case NETDEV_CHANGE:
+
+		break;
+	default:
+		break;
+	}
+	kfree(re_work);
+}
+
+static void bnxt_re_init_one(struct bnxt_re_dev *rdev)
+{
+	pci_dev_get(rdev->en_dev->pdev);
+}
+
 /*
  * "Notifier chain callback can be invoked for the same chain from
  * different CPUs at the same time".
@@ -72,6 +464,62 @@  static struct workqueue_struct *bnxt_re_wq;
 static int bnxt_re_netdev_event(struct notifier_block *notifier,
 				unsigned long event, void *ptr)
 {
+	struct net_device *real_dev, *netdev = netdev_notifier_info_to_dev(ptr);
+	struct bnxt_re_work *re_work;
+	struct bnxt_re_dev *rdev;
+	int rc = 0;
+
+	real_dev = rdma_vlan_dev_real_dev(netdev);
+	if (!real_dev)
+		real_dev = netdev;
+
+	rdev = bnxt_re_from_netdev(real_dev);
+	if (!rdev && event != NETDEV_REGISTER)
+		goto exit;
+	if (real_dev != netdev)
+		goto exit;
+
+	if (rdev)
+		bnxt_re_hold(rdev);
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		if (rdev)
+			break;
+		rc = bnxt_re_dev_reg(&rdev, real_dev);
+		if (rc == -ENODEV)
+			break;
+		if (rc) {
+			pr_err("Failed to register with the device %s: %#x\n",
+			       real_dev->name, rc);
+			break;
+		}
+		bnxt_re_init_one(rdev);
+		goto sch_work;
+
+	case NETDEV_UNREGISTER:
+		bnxt_re_ib_unreg(rdev, false);
+		bnxt_re_remove_one(rdev);
+		bnxt_re_dev_unreg(rdev);
+		break;
+
+	default:
+sch_work:
+		/* Allocate for the deferred task */
+		re_work = kzalloc(sizeof(*re_work), GFP_ATOMIC);
+		if (!re_work)
+			break;
+
+		re_work->rdev = rdev;
+		re_work->event = event;
+		re_work->vlan_dev = (real_dev == netdev ? NULL : netdev);
+		INIT_WORK(&re_work->work, bnxt_re_task);
+		queue_work(bnxt_re_wq, &re_work->work);
+		break;
+	}
+	if (rdev)
+		bnxt_re_put(rdev);
+exit:
 	return NOTIFY_DONE;
 }