diff mbox

[net-next,V3,1/2] IB/ipoib: Add rtnl_link_ops support

Message ID 1345724119-32110-2-git-send-email-ogerlitz@mellanox.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Or Gerlitz Aug. 23, 2012, 12:15 p.m. UTC
Add rtnl_link_ops to IPoIB, with the first usage being child device
create/delete through them. Childs devices are now either legacy ones,
created/deleted through the ipoib sysfs entries, or RTNL ones.

Adding support for RTNL childs involved refactoring of ipoib_vlan_add
which is now used by both the sysfs and the link_ops code.

Also, added ndo_uninit entry to support calling unregister_netdevice_queue
from the rtnl dellink entry. This required removal of calls to
ipoib_dev_cleanup from the driver in flows which use unregister_netdevice,
since the networking core will invoke ipoib_uninit which does exactly that.

Signed-off-by: Erez Shitrit <erezsh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 Documentation/infiniband/ipoib.txt           |    3 +
 drivers/infiniband/ulp/ipoib/Makefile        |    3 +-
 drivers/infiniband/ulp/ipoib/ipoib.h         |   13 +++
 drivers/infiniband/ulp/ipoib/ipoib_main.c    |   25 ++++-
 drivers/infiniband/ulp/ipoib/ipoib_netlink.c |  122 ++++++++++++++++++++++++++
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c    |  102 ++++++++++++----------
 6 files changed, 217 insertions(+), 51 deletions(-)
 create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_netlink.c

Comments

Patrick McHardy Aug. 29, 2012, 12:53 p.m. UTC | #1
On Thu, 23 Aug 2012, Or Gerlitz wrote:

> new file mode 100644
> index 0000000..18d6ac9
> --- /dev/null
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
> ...
> +
> +enum {
> +	IFLA_IPOIB_UNSPEC,
> +	IFLA_IPOIB_CHILD_PKEY,
> +	__IFLA_IPOIB_MAX
> +};
> +
> +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1)

This should go into include/linux/if_link.h

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Aug. 29, 2012, 12:59 p.m. UTC | #2
On 29/08/2012 15:53, Patrick McHardy wrote:
> On Thu, 23 Aug 2012, Or Gerlitz wrote:
>
>> new file mode 100644
>> index 0000000..18d6ac9
>> --- /dev/null
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
>> ...
>> +
>> +enum {
>> +    IFLA_IPOIB_UNSPEC,
>> +    IFLA_IPOIB_CHILD_PKEY,
>> +    __IFLA_IPOIB_MAX
>> +};
>> +
>> +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1)
>
> This should go into include/linux/if_link.h
>

sure, will fix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 12, 2012, 10:40 a.m. UTC | #3
Patrick McHardy wrote:
> Or Gerlitz wrote:
>
>> +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1)
>
> This should go into include/linux/if_link.h
>

This comment was easy to fix... HOWEVER, using V3 -- this posting
http://marc.info/?l=linux-netdev&m=134572609226343&w=2 -- of the patch
over latest net-next (commit 280050cc81ccb2e06e4061228ee34c0cc86b1560
"x86 bpf_jit: support MOD operation"), when I just run the following trivial
sequence which loads the module and creates/deletes legacy child

$ modprobe ib_ipoib debug_level=1
$ echo 0x8001 > /sys/class/net/ib0/create_child
$ echo 0x8001 > /sys/class/net/ib0/delete_child
$ modprobe -r ib_ipoib

I get the below lockdep warning, pointing to ipoib_vlan_delete which is
by not means called by the module unload sequence, confusing... any idea?

Or.

======================================================
[ INFO: possible circular locking dependency detected ]
3.6.0-rc3+ #144 Not tainted
-------------------------------------------------------
modprobe/4443 is trying to acquire lock:
  (s_active#155){++++.+}, at: [<ffffffff8114f93f>] 
sysfs_addrm_finish+0x29/0x52

but task is already holding lock:
  (rtnl_mutex){+.+.+.}, at: [<ffffffff812fc103>] rtnl_lock+0x12/0x14

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (rtnl_mutex){+.+.+.}:
        [<ffffffff81072b30>] lock_acquire+0x14f/0x19b
        [<ffffffff81396a43>] mutex_lock_nested+0x64/0x2ce
        [<ffffffff812fc103>] rtnl_lock+0x12/0x14
        [<ffffffff812eecf1>] netdev_run_todo+0xa5/0x27e
        [<ffffffff812fc0dd>] rtnl_unlock+0x9/0xb
        [<ffffffffa0394889>] ipoib_vlan_delete+0x111/0x148 [ib_ipoib]
        [<ffffffffa038d29b>] delete_child+0x44/0x60 [ib_ipoib]
        [<ffffffff81247bd8>] dev_attr_store+0x1b/0x1d
        [<ffffffff8114e223>] sysfs_write_file+0x103/0x13f
        [<ffffffff810f206b>] vfs_write+0xae/0x133
        [<ffffffff810f21a9>] sys_write+0x45/0x6c
        [<ffffffff813a05e2>] system_call_fastpath+0x16/0x1b

-> #0 (s_active#155){++++.+}:
        [<ffffffff81072364>] __lock_acquire+0x10d1/0x174e
        [<ffffffff81072b30>] lock_acquire+0x14f/0x19b
        [<ffffffff8114ef79>] sysfs_deactivate+0x93/0xca
        [<ffffffff8114f93f>] sysfs_addrm_finish+0x29/0x52
        [<ffffffff8114fa36>] sysfs_remove_dir+0x8b/0x9e
        [<ffffffff81199c6d>] kobject_del+0x16/0x37
        [<ffffffff812491ca>] device_del+0x18f/0x19f
        [<ffffffff813000d3>] netdev_unregister_kobject+0x52/0x57
        [<ffffffff812eeacc>] rollback_registered_many+0x238/0x27c
        [<ffffffff812eebe9>] unregister_netdevice_queue+0x7f/0xbf
        [<ffffffff812eec45>] unregister_netdev+0x1c/0x23
        [<ffffffffa038d1eb>] ipoib_remove_one+0xad/0xe7 [ib_ipoib]
        [<ffffffffa01a89ec>] ib_unregister_client+0x3d/0x11c [ib_core]
        [<ffffffffa0398fcf>] ipoib_cleanup_module+0x2f/0x4e [ib_ipoib]
        [<ffffffff8107d81d>] sys_delete_module+0x1ac/0x210
        [<ffffffff813a05e2>] system_call_fastpath+0x16/0x1b

other info that might help us debug this:

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(rtnl_mutex);
                                lock(s_active#155);
                                lock(rtnl_mutex);
   lock(s_active#155);

  *** DEADLOCK ***

2 locks held by modprobe/4443:
  #0:  (device_mutex){+.+.+.}, at: [<ffffffffa01a89d1>] 
ib_unregister_client+0x22/0x11c [ib_core]
  #1:  (rtnl_mutex){+.+.+.}, at: [<ffffffff812fc103>] rtnl_lock+0x12/0x14

stack backtrace:
Pid: 4443, comm: modprobe Not tainted 3.6.0-rc3+ #144
Call Trace:
  [<ffffffff8102dad8>] ? console_unlock+0x329/0x37e
  [<ffffffff81070c15>] print_circular_bug+0x28e/0x29f
  [<ffffffff81072364>] __lock_acquire+0x10d1/0x174e
  [<ffffffff8114f93f>] ? sysfs_addrm_finish+0x29/0x52
  [<ffffffff81072b30>] lock_acquire+0x14f/0x19b
  [<ffffffff8114f93f>] ? sysfs_addrm_finish+0x29/0x52
  [<ffffffff8114ef79>] sysfs_deactivate+0x93/0xca
  [<ffffffff8114f93f>] ? sysfs_addrm_finish+0x29/0x52
  [<ffffffff8114f93f>] sysfs_addrm_finish+0x29/0x52
  [<ffffffff8114fa36>] sysfs_remove_dir+0x8b/0x9e
  [<ffffffff81199c6d>] kobject_del+0x16/0x37
  [<ffffffff812491ca>] device_del+0x18f/0x19f
  [<ffffffff813000d3>] netdev_unregister_kobject+0x52/0x57
  [<ffffffff812eeacc>] rollback_registered_many+0x238/0x27c
  [<ffffffff812eebe9>] unregister_netdevice_queue+0x7f/0xbf
  [<ffffffff812eec45>] unregister_netdev+0x1c/0x23
  [<ffffffffa038d1eb>] ipoib_remove_one+0xad/0xe7 [ib_ipoib]
  [<ffffffffa01a89ec>] ib_unregister_client+0x3d/0x11c [ib_core]
  [<ffffffffa0398fcf>] ipoib_cleanup_module+0x2f/0x4e [ib_ipoib]
  [<ffffffff8107d81d>] sys_delete_module+0x1ac/0x210
  [<ffffffff8106fc4f>] ? trace_hardirqs_on_caller+0x11e/0x155
  [<ffffffff811a2a0e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
  [<ffffffff813a05e2>] system_call_fastpath+0x16/0x1b

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rami Rosen Sept. 12, 2012, 2:53 p.m. UTC | #4
Hi,

From the dump of CPU #1, it seems indeed not related at all to "modprobe -r".

Could it be that there is some IB stack sysfs write activity?
(regardless of the modprobe -r" you issued) ?  I see some candidates
for it.

delete_child() is a method of the IB stack (ipoib/ipoib_main.c)

Maybe in order to help debug the problem, you might try to add in
delete_child() method, print of the name of the attribute which is
being deleted ?

  (struct device_attribute has a a member "struct attribute attr",
which in turn has  "const char *name").


Regards,
Rami Rosen
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Sept. 12, 2012, 3:13 p.m. UTC | #5
On Wed, 2012-09-12 at 17:53 +0300, Rami Rosen wrote:
> Hi,
> 
> From the dump of CPU #1, it seems indeed not related at all to "modprobe -r".
> 
> Could it be that there is some IB stack sysfs write activity?
> (regardless of the modprobe -r" you issued) ?  I see some candidates
> for it.
> 
> delete_child() is a method of the IB stack (ipoib/ipoib_main.c)
> 
> Maybe in order to help debug the problem, you might try to add in
> delete_child() method, print of the name of the attribute which is
> being deleted ?
> 
>   (struct device_attribute has a a member "struct attribute attr",
> which in turn has  "const char *name").


It might be related to module load/unload

udevd or some external daemon can access sysfs files while you unload
the module


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 13, 2012, 10:54 a.m. UTC | #6
On 12/09/2012 18:13, Eric Dumazet wrote:
> It might be related to module load/unload udevd or some external 
> daemon can access sysfs files while you unload the module 

Hi Eric,

I see, the IPoIB add/delete child sysfs handlers (ipoib_vlan_add/delete) 
use
RTNL locking to protect against netdev changes while the handlers are in 
action.

IPoIB uses devide_create_file to add the sysfs entries, but doesn't care 
to use
device_remove_file as these entries sit under the netdevice 
/sys/class/net/DEV
directory and are removed by higher layer when DEV gets 
unregistered/deleted, I'm
not sure if/how the driver is supposed to protect against access to 
these entries
while going down, when the module is unloaded, etc, any idea will be 
appreciated.


Or.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 13, 2012, 11:28 a.m. UTC | #7
On 12/09/2012 17:53, Rami Rosen wrote:
>  From the dump of CPU #1, it seems indeed not related at all to "modprobe -r".
>
> Could it be that there is some IB stack sysfs write activity?
> (regardless of the modprobe -r" you issued) ?  I see some candidates for it.
>
> delete_child() is a method of the IB stack (ipoib/ipoib_main.c)
>
> Maybe in order to help debug the problem, you might try to add in
> delete_child() method, print of the name of the attribute which is being deleted?
>
> (struct device_attribute has a a member "struct attribute attr",
> which in turn has  "const char *name").
>

> the existing dependency chain (in reverse order) is:
>
> -> #1 (rtnl_mutex){+.+.+.}:
>        [<ffffffff81072b30>] lock_acquire+0x14f/0x19b
>        [<ffffffff81396a43>] mutex_lock_nested+0x64/0x2ce
>        [<ffffffff812fc103>] rtnl_lock+0x12/0x14
>        [<ffffffff812eecf1>] netdev_run_todo+0xa5/0x27e
>        [<ffffffff812fc0dd>] rtnl_unlock+0x9/0xb
>        [<ffffffffa0394889>] ipoib_vlan_delete+0x111/0x148 [ib_ipoib]
>        [<ffffffffa038d29b>] delete_child+0x44/0x60 [ib_ipoib]
>        [<ffffffff81247bd8>] dev_attr_store+0x1b/0x1d
>        [<ffffffff8114e223>] sysfs_write_file+0x103/0x13f
>        [<ffffffff810f206b>] vfs_write+0xae/0x133
>        [<ffffffff810f21a9>] sys_write+0x45/0x6c
>        [<ffffffff813a05e2>] system_call_fastpath+0x16/0x1b 

I've added code in ipoib_delete_child to print the caller PID and dump 
the stack,
its triggeredwhen I do "echo 0x8001 > /sys/class/net/ib0/delete_child" 
but the lockdep
warningis raised only when I actually unload the module, and no print... 
that is
ipoib_delete_child  isn't called.

Or.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
index 64eeb55..f2cfe26 100644
--- a/Documentation/infiniband/ipoib.txt
+++ b/Documentation/infiniband/ipoib.txt
@@ -24,6 +24,9 @@  Partitions and P_Keys
   The P_Key for any interface is given by the "pkey" file, and the
   main interface for a subinterface is in "parent."
 
+  Child interface create/delete can also be done using IPoIB's
+  rtnl_link_ops, where childs created using either way behave the same.
+
 Datagram vs Connected modes
 
   The IPoIB driver supports two modes of operation: datagram and
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile
index 3090100..e5430dd 100644
--- a/drivers/infiniband/ulp/ipoib/Makefile
+++ b/drivers/infiniband/ulp/ipoib/Makefile
@@ -5,7 +5,8 @@  ib_ipoib-y					:= ipoib_main.o \
 						   ipoib_multicast.o \
 						   ipoib_verbs.o \
 						   ipoib_vlan.o \
-						   ipoib_ethtool.o
+						   ipoib_ethtool.o \
+						   ipoib_netlink.o
 ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM)		+= ipoib_cm.o
 ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ca43901..381f51b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -104,6 +104,10 @@  enum {
 
 	MAX_SEND_CQE		  = 16,
 	IPOIB_CM_COPYBREAK	  = 256,
+
+	IPOIB_NON_CHILD		  = 0,
+	IPOIB_LEGACY_CHILD	  = 1,
+	IPOIB_RTNL_CHILD	  = 2,
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -350,6 +354,7 @@  struct ipoib_dev_priv {
 	struct net_device *parent;
 	struct list_head child_intfs;
 	struct list_head list;
+	int    child_type;
 
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 	struct ipoib_cm_dev_priv cm;
@@ -509,6 +514,14 @@  void ipoib_event(struct ib_event_handler *handler,
 int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
 int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
 
+int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
+		     u16 pkey, int child_type);
+
+int  __init ipoib_netlink_init(void);
+void __exit ipoib_netlink_fini(void);
+
+void ipoib_setup(struct net_device *dev);
+
 void ipoib_pkey_poll(struct work_struct *work);
 int ipoib_pkey_dev_delay_open(struct net_device *dev);
 void ipoib_drain_cq(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 3e2085a..b3e9709 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -173,6 +173,11 @@  static int ipoib_stop(struct net_device *dev)
 	return 0;
 }
 
+static void ipoib_uninit(struct net_device *dev)
+{
+	ipoib_dev_cleanup(dev);
+}
+
 static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1262,6 +1267,9 @@  out:
 void ipoib_dev_cleanup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
+	LIST_HEAD(head);
+
+	ASSERT_RTNL();
 
 	ipoib_delete_debug_files(dev);
 
@@ -1270,10 +1278,9 @@  void ipoib_dev_cleanup(struct net_device *dev)
 		/* Stop GC on child */
 		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
 		cancel_delayed_work(&cpriv->neigh_reap_task);
-		unregister_netdev(cpriv->dev);
-		ipoib_dev_cleanup(cpriv->dev);
-		free_netdev(cpriv->dev);
+		unregister_netdevice_queue(cpriv->dev, &head);
 	}
+	unregister_netdevice_many(&head);
 
 	ipoib_ib_dev_cleanup(dev);
 
@@ -1291,6 +1298,7 @@  static const struct header_ops ipoib_header_ops = {
 };
 
 static const struct net_device_ops ipoib_netdev_ops = {
+	.ndo_uninit		 = ipoib_uninit,
 	.ndo_open		 = ipoib_open,
 	.ndo_stop		 = ipoib_stop,
 	.ndo_change_mtu		 = ipoib_change_mtu,
@@ -1300,7 +1308,7 @@  static const struct net_device_ops ipoib_netdev_ops = {
 	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
 };
 
-static void ipoib_setup(struct net_device *dev)
+void ipoib_setup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
@@ -1662,7 +1670,6 @@  static void ipoib_remove_one(struct ib_device *device)
 		flush_workqueue(ipoib_workqueue);
 
 		unregister_netdev(priv->dev);
-		ipoib_dev_cleanup(priv->dev);
 		free_netdev(priv->dev);
 	}
 
@@ -1714,8 +1721,15 @@  static int __init ipoib_init_module(void)
 	if (ret)
 		goto err_sa;
 
+	ret = ipoib_netlink_init();
+	if (ret)
+		goto err_client;
+
 	return 0;
 
+err_client:
+	ib_unregister_client(&ipoib_client);
+
 err_sa:
 	ib_sa_unregister_client(&ipoib_sa_client);
 	destroy_workqueue(ipoib_workqueue);
@@ -1728,6 +1742,7 @@  err_fs:
 
 static void __exit ipoib_cleanup_module(void)
 {
+	ipoib_netlink_fini();
 	ib_unregister_client(&ipoib_client);
 	ib_sa_unregister_client(&ipoib_sa_client);
 	ipoib_unregister_debugfs();
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
new file mode 100644
index 0000000..18d6ac9
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -0,0 +1,122 @@ 
+/*
+ * Copyright (c) 2012 Mellanox Technologies. -  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <net/rtnetlink.h>
+#include "ipoib.h"
+
+enum {
+	IFLA_IPOIB_UNSPEC,
+	IFLA_IPOIB_CHILD_PKEY,
+	__IFLA_IPOIB_MAX
+};
+
+#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1)
+
+static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = {
+	[IFLA_IPOIB_CHILD_PKEY]		= { .type = NLA_U16 },
+};
+
+static int ipoib_new_child_link(struct net *src_net, struct net_device *dev,
+			       struct nlattr *tb[], struct nlattr *data[])
+{
+	struct net_device *pdev;
+	struct ipoib_dev_priv *ppriv;
+	u16 child_pkey;
+	int err;
+
+	if (!tb[IFLA_LINK])
+		return -EINVAL;
+
+	pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+	if (!pdev)
+		return -ENODEV;
+
+	ppriv = netdev_priv(pdev);
+
+	if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) {
+		ipoib_warn(ppriv, "child creation disallowed for child devices\n");
+		return -EINVAL;
+	}
+
+	if (!data || !data[IFLA_IPOIB_CHILD_PKEY]) {
+		ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n");
+		child_pkey  = ppriv->pkey;
+	} else
+		child_pkey  = nla_get_u16(data[IFLA_IPOIB_CHILD_PKEY]);
+
+	err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD);
+
+	return err;
+}
+
+static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head)
+{
+	struct ipoib_dev_priv *priv, *ppriv;
+
+	priv = netdev_priv(dev);
+	ppriv = netdev_priv(priv->parent);
+
+	mutex_lock(&ppriv->vlan_mutex);
+	unregister_netdevice_queue(dev, head);
+	list_del(&priv->list);
+	mutex_unlock(&ppriv->vlan_mutex);
+}
+
+static size_t ipoib_get_size(const struct net_device *dev)
+{
+	return nla_total_size(2);	/* IFLA_IPOIB_CHILD_PKEY */
+}
+
+static struct rtnl_link_ops ipoib_link_ops __read_mostly = {
+	.kind		= "ipoib",
+	.maxtype	= IFLA_IPOIB_MAX,
+	.policy		= ipoib_policy,
+	.priv_size	= sizeof(struct ipoib_dev_priv),
+	.setup		= ipoib_setup,
+	.newlink	= ipoib_new_child_link,
+	.dellink	= ipoib_unregister_child_dev,
+	.get_size	= ipoib_get_size,
+};
+
+int __init ipoib_netlink_init(void)
+{
+	return rtnl_link_register(&ipoib_link_ops);
+}
+
+void __exit ipoib_netlink_fini(void)
+{
+	rtnl_link_unregister(&ipoib_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("ipoib");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index d7e9740..238bbf9 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -49,47 +49,11 @@  static ssize_t show_parent(struct device *d, struct device_attribute *attr,
 }
 static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
 
-int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
+int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
+		     u16 pkey, int type)
 {
-	struct ipoib_dev_priv *ppriv, *priv;
-	char intf_name[IFNAMSIZ];
 	int result;
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
-	ppriv = netdev_priv(pdev);
-
-	if (!rtnl_trylock())
-		return restart_syscall();
-	mutex_lock(&ppriv->vlan_mutex);
-
-	/*
-	 * First ensure this isn't a duplicate. We check the parent device and
-	 * then all of the child interfaces to make sure the Pkey doesn't match.
-	 */
-	if (ppriv->pkey == pkey) {
-		result = -ENOTUNIQ;
-		priv = NULL;
-		goto err;
-	}
-
-	list_for_each_entry(priv, &ppriv->child_intfs, list) {
-		if (priv->pkey == pkey) {
-			result = -ENOTUNIQ;
-			priv = NULL;
-			goto err;
-		}
-	}
-
-	snprintf(intf_name, sizeof intf_name, "%s.%04x",
-		 ppriv->dev->name, pkey);
-	priv = ipoib_intf_alloc(intf_name);
-	if (!priv) {
-		result = -ENOMEM;
-		goto err;
-	}
-
 	priv->max_ib_mtu = ppriv->max_ib_mtu;
 	/* MTU will be reset when mcast join happens */
 	priv->dev->mtu   = IPOIB_UD_MTU(priv->max_ib_mtu);
@@ -134,14 +98,13 @@  int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 	if (device_create_file(&priv->dev->dev, &dev_attr_parent))
 		goto sysfs_failed;
 
+	priv->child_type = type;
 	list_add_tail(&priv->list, &ppriv->child_intfs);
 
-	mutex_unlock(&ppriv->vlan_mutex);
-	rtnl_unlock();
-
 	return 0;
 
 sysfs_failed:
+	result = -ENOMEM;
 	ipoib_delete_debug_files(priv->dev);
 	unregister_netdevice(priv->dev);
 
@@ -149,11 +112,60 @@  register_failed:
 	ipoib_dev_cleanup(priv->dev);
 
 err:
+	return result;
+}
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *priv;
+	char intf_name[IFNAMSIZ];
+	struct ipoib_dev_priv *tpriv;
+	int result;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = netdev_priv(pdev);
+
+	snprintf(intf_name, sizeof intf_name, "%s.%04x",
+		 ppriv->dev->name, pkey);
+	priv = ipoib_intf_alloc(intf_name);
+	if (!priv)
+		return -ENOMEM;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	mutex_lock(&ppriv->vlan_mutex);
+
+	/*
+	 * First ensure this isn't a duplicate. We check the parent device and
+	 * then all of the legacy child interfaces to make sure the Pkey
+	 * doesn't match.
+	 */
+	if (ppriv->pkey == pkey) {
+		result = -ENOTUNIQ;
+		goto out;
+	}
+
+	list_for_each_entry(tpriv, &ppriv->child_intfs, list) {
+		if (tpriv->pkey == pkey &&
+		    tpriv->child_type == IPOIB_LEGACY_CHILD) {
+			result = -ENOTUNIQ;
+			goto out;
+		}
+	}
+
+	result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
+
+out:
 	mutex_unlock(&ppriv->vlan_mutex);
-	rtnl_unlock();
-	if (priv)
+
+	if (result)
 		free_netdev(priv->dev);
 
+	rtnl_unlock();
+
 	return result;
 }
 
@@ -171,9 +183,9 @@  int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
 		return restart_syscall();
 	mutex_lock(&ppriv->vlan_mutex);
 	list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
-		if (priv->pkey == pkey) {
+		if (priv->pkey == pkey &&
+		    priv->child_type == IPOIB_LEGACY_CHILD) {
 			unregister_netdevice(priv->dev);
-			ipoib_dev_cleanup(priv->dev);
 			list_del(&priv->list);
 			dev = priv->dev;
 			break;