diff mbox

[1/3] net: introduce a list of device addresses dev_addr_list (v3)

Message ID 20090417115723.GE9556@psychotron.englab.brq.redhat.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Jiri Pirko April 17, 2009, 11:57 a.m. UTC
v2 -> v3 (current):
-removed unnecessary rcu read locking
-moved dev_addr_flush() calling to ensure no null dereference of dev_addr

v1 -> v2:
-added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
-removed unnecessary rcu_read locking in dev_addr_init
-use compare_ether_addr_64bits instead of compare_ether_addr
-use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
-use call_rcu instead of rcu_synchronize
-moved is_etherdev_addr into __KERNEL__ ifdef

This patch introduces a new list in struct net_device and brings a set of
functions to handle the work with the device address list. The list is a
replacement for the original dev_addr field, because in some situations there
is a need to carry several device addresses with the net device. To be backward
compatible, dev_addr is made to point to the first member of the list, so
original drivers see no difference.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 include/linux/etherdevice.h |   27 +++++
 include/linux/netdevice.h   |   32 +++++-
 net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 318 insertions(+), 2 deletions(-)

Comments

stephen hemminger April 17, 2009, 3:33 p.m. UTC | #1
On Fri, 17 Apr 2009 13:57:24 +0200
Jiri Pirko <jpirko@redhat.com> wrote:

> v2 -> v3 (current):
> -removed unnecessary rcu read locking
> -moved dev_addr_flush() calling to ensure no null dereference of dev_addr
> 
> v1 -> v2:
> -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush
> -removed unnecessary rcu_read locking in dev_addr_init
> -use compare_ether_addr_64bits instead of compare_ether_addr
> -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr
> -use call_rcu instead of rcu_synchronize
> -moved is_etherdev_addr into __KERNEL__ ifdef
> 
> This patch introduces a new list in struct net_device and brings a set of
> functions to handle the work with device address list. The list is a replacement
> for the original dev_addr field and because in some situations there is need to
> carry several device addresses with the net device. To be backward compatible,
> dev_addr is made to point to the first member of the list so original drivers
> sees no difference.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  include/linux/etherdevice.h |   27 +++++
>  include/linux/netdevice.h   |   32 +++++-
>  net/core/dev.c              |  261 +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 318 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
> index a1f17ab..3d7a668 100644
> --- a/include/linux/etherdevice.h
> +++ b/include/linux/etherdevice.h
> @@ -182,6 +182,33 @@ static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
>  	return compare_ether_addr(addr1, addr2);
>  #endif
>  }
> +
> +/**
> + * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
> + * @dev: Pointer to a device structure
> + * @addr: Pointer to a six-byte array containing the Ethernet address
> + *
> + * Compare passed address with all addresses of the device. Return true if the
> + * address if one of the device addresses.
> + *
> + * Note that this function calls compare_ether_addr_64bits() so take care of
> + * the right padding.
> + */
> +static inline bool is_etherdev_addr(const struct net_device *dev,
> +				    const u8 addr[6 + 2])
> +{
> +	struct netdev_hw_addr *ha;
> +	int res = 1;
> +
> +	rcu_read_lock();
> +	for_each_dev_addr(dev, ha) {
> +		res = compare_ether_addr_64bits(addr, ha->addr);
> +		if (!res)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return !res;
> +}
>  #endif	/* __KERNEL__ */
>  
>  /**
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 2e7783f..89ad6d2 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -210,6 +210,13 @@ struct dev_addr_list
>  #define dmi_users	da_users
>  #define dmi_gusers	da_gusers
>  
> +struct netdev_hw_addr {
> +	struct list_head	list;
> +	unsigned char		addr[MAX_ADDR_LEN];
> +	int			refcount;
> +	struct rcu_head		rcu_head;
> +};

Minor nit: the ordering of the elements causes holes that might not be
needed.

Space saving: is rcu_head needed, or would using synchronize_net()
make the code cleaner and save space?

>  struct hh_cache
>  {
>  	struct hh_cache *hh_next;	/* Next entry			     */
> @@ -776,8 +783,11 @@ struct net_device
>   */
>  	unsigned long		last_rx;	/* Time of last Rx	*/
>  	/* Interface address info used in eth_type_trans() */
> -	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
> -							   because most packets are unicast) */
> +	unsigned char		*dev_addr;	/* hw address, (before bcast
> +						   because most packets are
> +						   unicast) */
> +
> +	struct list_head	dev_addr_list; /* list of device hw addresses */
>  
>  	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
>  
> @@ -1778,6 +1788,13 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
>  	spin_unlock_bh(&dev->addr_list_lock);
>  }
>  
> +/*
> + * dev_addr_list walker. Should be used only for read access. Call with
> + * rcu_read_lock held.
> + */
> +#define for_each_dev_addr(dev, ha) \
> +		list_for_each_entry_rcu(ha, &dev->dev_addr_list, list)
> +
>  /* These functions live elsewhere (drivers/net/net_init.c, but related) */
>  
>  extern void		ether_setup(struct net_device *dev);
> @@ -1790,6 +1807,17 @@ extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  	alloc_netdev_mq(sizeof_priv, name, setup, 1)
>  extern int		register_netdev(struct net_device *dev);
>  extern void		unregister_netdev(struct net_device *dev);
> +
> +/* Functions used for device addresses handling */
> +extern int		dev_addr_add(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_del(struct net_device *dev,
> +				     unsigned char *addr);
> +extern int		dev_addr_add_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +extern int		dev_addr_del_multiple(struct net_device *to_dev,
> +					      struct net_device *from_dev);
> +
>  /* Functions used for secondary unicast and multicast support */
>  extern void		dev_set_rx_mode(struct net_device *dev);
>  extern void		__dev_set_rx_mode(struct net_device *dev);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 343883f..b4503ac 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3438,6 +3438,263 @@ void dev_set_rx_mode(struct net_device *dev)
>  	netif_addr_unlock_bh(dev);
>  }
>  
> +/* hw addresses list handling functions */
> +
> +static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	if (addr_len > MAX_ADDR_LEN)
> +		return -EINVAL;
> +
> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			ha->refcount++;
> +			return 0;
> +		}
> +	}
> +
> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
> +	if (!ha)
> +		return -ENOMEM;
Since you are initializing all fields, kzalloc isn't really needed

> +	memcpy(ha->addr, addr, addr_len);
> +	ha->refcount = 1;
> +	list_add_tail_rcu(&ha->list, list);
> +	return 0;
> +}
> +
> +static int __hw_addr_add(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_add_ii(list, addr, addr_len, -1);
> +}
> +
> +static void ha_rcu_free(struct rcu_head *head)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	ha = container_of(head, struct netdev_hw_addr, rcu_head);
> +	kfree(ha);
> +}
> +
> +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
> +			    int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +	int i = 0;
> +
> +	list_for_each_entry(ha, list, list) {
> +		if (i++ != ignore_index &&
> +		    !memcmp(ha->addr, addr, addr_len)) {
> +			if (--ha->refcount)
> +				return 0;
> +			list_del_rcu(&ha->list);
> +			call_rcu(&ha->rcu_head, ha_rcu_free);
> +			return 0;
> +		}
> +	}
> +	return -ENOENT;
> +}
> +
> +static int __hw_addr_del(struct list_head *list, unsigned char *addr,
> +			 int addr_len)
> +{
> +	return __hw_addr_del_ii(list, addr, addr_len, -1);
> +}
> +
> +static int __hw_addr_add_multiple_ii(struct list_head *to_list,
> +				     struct list_head *from_list,
> +				     int addr_len, int ignore_index)
> +{
> +	int err;
> +	struct netdev_hw_addr *ha, *ha2;
> +
> +	list_for_each_entry(ha, from_list, list) {
> +		err = __hw_addr_add_ii(to_list, ha->addr, addr_len, 0);
> +		if (err)
> +			goto unroll;
> +	}
> +	return 0;
> +
> +unroll:
> +	list_for_each_entry(ha2, from_list, list) {
> +		if (ha2 == ha)
> +			break;
> +		__hw_addr_del_ii(to_list, ha2->addr, addr_len, 0);
> +	}
> +	return err;
> +}
> +
> +static int __hw_addr_add_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_del_multiple_ii(struct list_head *to_list,
> +				      struct list_head *from_list,
> +				      int addr_len, int ignore_index)
> +{
> +	struct netdev_hw_addr *ha;
> +
> +	list_for_each_entry(ha, from_list, list) {
> +		__hw_addr_del_ii(to_list, ha->addr, addr_len, 0);
> +	}
> +}
> +
> +static void __hw_addr_del_multiple(struct list_head *to_list,
> +					 struct list_head *from_list,
> +					 int addr_len)
> +{
> +	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
> +}
> +
> +static void __hw_addr_flush(struct list_head *list)
> +{
> +	struct netdev_hw_addr *ha, *tmp;
> +
> +	list_for_each_entry_safe(ha, tmp, list, list) {
> +		list_del_rcu(&ha->list);
> +		call_rcu(&ha->rcu_head, ha_rcu_free);
> +	}
> +}
> +
> +/* Device addresses handling functions */
> +
> +static void dev_addr_flush(struct net_device *dev)
> +{
> +	ASSERT_RTNL();
> +
Since this is local you should be able to audit all
the callers and remove this ASSERT.

> +	__hw_addr_flush(&dev->dev_addr_list);
> +	dev->dev_addr = NULL;
> +}
> +
> +static int dev_addr_init(struct net_device *dev)
> +{
> +	unsigned char addr[MAX_ADDR_LEN];
> +	struct netdev_hw_addr *ha;
> +	int err;
> +
> +	ASSERT_RTNL();
Ditto, ASSERT_RTNL makes sense for exposed kernel API and
initial testing.

> +	INIT_LIST_HEAD(&dev->dev_addr_list);
> +	memset(addr, 0, sizeof(*addr));
> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
> +	if (!err) {
> +		/*
> +		 * Get the first (previously created) address from the list
> +		 * and set dev_addr pointer to this location.
> +		 */
> +		ha = list_first_entry(&dev->dev_addr_list,
> +				      struct netdev_hw_addr, list);
> +		dev->dev_addr = ha->addr;
> +	}
> +	return err;
> +}
> +
> +/**
> + *	dev_addr_add	- Add a device address
> + *	@dev: device
> + *	@addr: address to add
> + *
> + *	Add a device address to the device or increase the reference count if
> + *	it already exists.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add);
> +
> +/**
> + *	dev_addr_del	- Release a device address.
> + *	@dev: device
> + *	@addr: address to delete
> + *
> + *	Release reference to a device address and remove it from the device
> + *	if the reference count drops to zero.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del(struct net_device *dev, unsigned char *addr)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_del);
> +
> +/**
> + *	dev_addr_add_multiple	- Add device addresses from another device
> + *	@to_dev: device to which addresses will be added
> + *	@from_dev: device from which addresses will be added
> + *
> + *	Add device addresses of the one device to another.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_add_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +					&from_dev->dev_addr_list,
> +					to_dev->addr_len, 0);
> +	if (!err)
> +		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return err;
> +}
> +EXPORT_SYMBOL(dev_addr_add_multiple);
> +
> +/**
> + *	dev_addr_del_multiple	- Delete device addresses by another device
> + *	@to_dev: device where the addresses will be deleted
> + *	@from_dev: device by which addresses the addresses will be deleted
> + *
> + *	Deletes addresses in to device by the list of addresses in from device.
> + *
> + *	The caller must hold the rtnl_mutex.
> + */
> +int dev_addr_del_multiple(struct net_device *to_dev,
> +			  struct net_device *from_dev)
> +{
> +	ASSERT_RTNL();
> +
> +	if (from_dev->addr_len != to_dev->addr_len)
> +		return -EINVAL;
> +	__hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
> +				  &from_dev->dev_addr_list,
> +				  to_dev->addr_len, 0);
> +	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
> +	return 0;
> +}
> +EXPORT_SYMBOL(dev_addr_del_multiple);
> +
> +/* unicast and multicast addresses handling functions */
> +
>  int __dev_addr_delete(struct dev_addr_list **list, int *count,
>  		      void *addr, int alen, int glbl)
>  {
> @@ -4780,6 +5037,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>  
>  	dev->gso_max_size = GSO_MAX_SIZE;
>  
> +	dev_addr_init(dev);
>  	netdev_init_queues(dev);
>  
>  	INIT_LIST_HEAD(&dev->napi_list);
> @@ -4805,6 +5063,9 @@ void free_netdev(struct net_device *dev)
>  
>  	kfree(dev->_tx);
>  
> +	/* Flush device addresses */
> +	dev_addr_flush(dev);
> +
>  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>  		netif_napi_del(p);
>  
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko April 18, 2009, 7:01 a.m. UTC | #2
Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:

<snip>

>> +struct netdev_hw_addr {
>> +	struct list_head	list;
>> +	unsigned char		addr[MAX_ADDR_LEN];
>> +	int			refcount;
>> +	struct rcu_head		rcu_head;
>> +};
>
>Minor nit, the ordering of elements cause holes that might not be
>needed.

Agree that ordering might be done better. Will do.
>
>Space saving? is rcu_head needed or would using synchronize_net
>make code cleaner and save space. 
>

Well, I originally had this done by synchronize_rcu(). Eric argued that it might
cause especially __hw_addr_del_multiple_ii() to run long and suggested using
call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
called) once it hits the tree.

<snip>

>> +	ha = kzalloc(max(sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
>> +	if (!ha)
>> +		return -ENOMEM;
>Since you are initializing all fields, kzalloc isn't really needed

Noted.
>
>> +	memcpy(ha->addr, addr, addr_len);
>> +	ha->refcount = 1;
>> +	list_add_tail_rcu(&ha->list, list);
>> +	return 0;
>> +}

<snip>

>> +static void dev_addr_flush(struct net_device *dev)
>> +{
>> +	ASSERT_RTNL();
>> +
>Since this is local you should be able to audit all
>the callers and remove this ASSERT.

Okay. I will at least put a comment instead of this.
>
>> +	__hw_addr_flush(&dev->dev_addr_list);
>> +	dev->dev_addr = NULL;
>> +}
>> +
>> +static int dev_addr_init(struct net_device *dev)
>> +{
>> +	unsigned char addr[MAX_ADDR_LEN];
>> +	struct netdev_hw_addr *ha;
>> +	int err;
>> +
>> +	ASSERT_RTNL();
>Ditto, ASSERT_RTNL makes sense for exposed kernel API and
>initial testing.
>
>> +	INIT_LIST_HEAD(&dev->dev_addr_list);
>> +	memset(addr, 0, sizeof(*addr));
>> +	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr));
>> +	if (!err) {
>> +		/*
>> +		 * Get the first (previously created) address from the list
>> +		 * and set dev_addr pointer to this location.
>> +		 */
>> +		ha = list_first_entry(&dev->dev_addr_list,
>> +				      struct netdev_hw_addr, list);
>> +		dev->dev_addr = ha->addr;
>> +	}
>> +	return err;
>> +}

<snip>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 18, 2009, 7:35 a.m. UTC | #3
Jiri Pirko a écrit :
> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
> 
> <snip>
> 
>>> +struct netdev_hw_addr {
>>> +	struct list_head	list;
>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>> +	int			refcount;
>>> +	struct rcu_head		rcu_head;
>>> +};
>> Minor nit, the ordering of elements cause holes that might not be
>> needed.
> 
> Agree that ordering might be done better. Will do.
>> Space saving? is rcu_head needed or would using synchronize_net
>> make code cleaner and save space. 
>>
> 
> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
> called) once it hits the tree.
> 

Yes, and don't forget we won't save space, as we allocate a full
cache line to hold a 'struct netdev_hw_addr', since we don't want this
critical and read_mostly object polluted by a hot spot elsewhere in the kernel...

Considering this, letting 'rcu_head' at the end of structure, even if we
have an eventual hole on 64 bit arches is not really a problem, and IMHO
the best thing to do, as rcu_head is only used at dismantle time.

And yes, maybe kfree_rcu() will makes its way in kernel, eventually :)

Thank you


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko April 18, 2009, 7:44 a.m. UTC | #4
Sat, Apr 18, 2009 at 09:35:32AM CEST, dada1@cosmosbay.com wrote:
>Jiri Pirko a écrit :
>> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
>> 
>> <snip>
>> 
>>>> +struct netdev_hw_addr {
>>>> +	struct list_head	list;
>>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>>> +	int			refcount;
>>>> +	struct rcu_head		rcu_head;
>>>> +};
>>> Minor nit, the ordering of elements cause holes that might not be
>>> needed.
>> 
>> Agree that ordering might be done better. Will do.
>>> Space saving? is rcu_head needed or would using synchronize_net
>>> make code cleaner and save space. 
>>>
>> 
>> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
>> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
>> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
>> called) once it hits the tree.
>> 
>
>Yes, and dont forget we wont save space, as we allocate a full
>cache line to hold a 'struct netdev_hw_addr', since we dont want this
>critical and read_mostly object polluted by a hot spot elsewhere in kernel...
>
>Considering this, letting 'rcu_head' at the end of structure, even if we
>have an eventual hole on 64 bit arches is not really a problem, and IMHO
>the best thing to do, as rcu_head is only used at dismantle time.

I will order the struct better, there are archs with small cache line size where
it makes sense.

>
>And yes, maybe kfree_rcu() will makes its way in kernel, eventually :)
>
>Thank you
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 18, 2009, 8:06 a.m. UTC | #5
Jiri Pirko a écrit :
> Sat, Apr 18, 2009 at 09:35:32AM CEST, dada1@cosmosbay.com wrote:
>> Jiri Pirko a écrit :
>>> Fri, Apr 17, 2009 at 05:33:15PM CEST, shemminger@vyatta.com wrote:
>>>
>>> <snip>
>>>
>>>>> +struct netdev_hw_addr {
>>>>> +	struct list_head	list;
>>>>> +	unsigned char		addr[MAX_ADDR_LEN];
>>>>> +	int			refcount;
>>>>> +	struct rcu_head		rcu_head;
>>>>> +};
>>>> Minor nit, the ordering of elements cause holes that might not be
>>>> needed.
>>> Agree that ordering might be done better. Will do.
>>>> Space saving? is rcu_head needed or would using synchronize_net
>>>> make code cleaner and save space. 
>>>>
>>> Well I originaly had this done by synchronize_rcu(). Eric argued that it might
>>> cause especially __hw_addr_del_multiple_ii() to run long and suggested to use
>>> call_rcu() instead. I plan to switch this to kfree_rcu() (or whatever it's
>>> called) once it hits the tree.
>>>
>> Yes, and dont forget we wont save space, as we allocate a full
>> cache line to hold a 'struct netdev_hw_addr', since we dont want this
>> critical and read_mostly object polluted by a hot spot elsewhere in kernel...
>>
>> Considering this, letting 'rcu_head' at the end of structure, even if we
>> have an eventual hole on 64 bit arches is not really a problem, and IMHO
>> the best thing to do, as rcu_head is only used at dismantle time.
> 
> I will order the struct better, there are archs with small cache line size where
> it makes sense.

How exactly ?

If you consider a 32bit arch with 16 or 32 bytes cache line,
sizeof(struct list_head) is 8
sizeof(addr) = 32     (but we really use 6 bytes for ethernet)

struct netdev_hw_addr {
	unsigned char		addr[MAX_ADDR_LEN];
	struct list_head	list;
	int			refcount;
	struct rcu_head		rcu_head;
};

would cost more at lookup time, since we would use two cache lines

struct netdev_hw_addr {
	struct list_head	list;
	unsigned char		addr[MAX_ADDR_LEN];
	int			refcount;
	struct rcu_head		rcu_head;
};

is nicer, because at least 8 bytes of addr share the same cache line
as list. So a direct dev->dev_addr access would be fast (for devices with
one address), and is_etherdev_addr() would still use one cache line per
item.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1f17ab..3d7a668 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -182,6 +182,33 @@  static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
 	return compare_ether_addr(addr1, addr2);
 #endif
 }
+
+/**
+ * is_etherdev_addr - Check whether an Ethernet address belongs to a device.
+ * @dev: Pointer to a device structure
+ * @addr: Pointer to the Ethernet address to look up
+ *
+ * Walk the device address list under rcu_read_lock() and compare @addr with
+ * each entry. Return true if @addr is one of the device addresses.
+ *
+ * The comparison is done by compare_ether_addr_64bits(), so @addr must be
+ * padded as declared in the prototype (6 + 2 bytes).
+ */
+static inline bool is_etherdev_addr(const struct net_device *dev,
+				    const u8 addr[6 + 2])
+{
+	struct netdev_hw_addr *ha;
+	bool found = false;
+
+	rcu_read_lock();
+	for_each_dev_addr(dev, ha) {
+		found = !compare_ether_addr_64bits(addr, ha->addr);
+		if (found)
+			break;
+	}
+	rcu_read_unlock();
+	return found;
+}
 #endif	/* __KERNEL__ */
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..89ad6d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -210,6 +210,13 @@  struct dev_addr_list
 #define dmi_users	da_users
 #define dmi_gusers	da_gusers
 
+struct netdev_hw_addr {
+	struct list_head	list;		/* link in an address list */
+	unsigned char		addr[MAX_ADDR_LEN];	/* hardware address */
+	int			refcount;	/* add/del reference count */
+	struct rcu_head		rcu_head;	/* for freeing via call_rcu() */
+};
+
 struct hh_cache
 {
 	struct hh_cache *hh_next;	/* Next entry			     */
@@ -776,8 +783,11 @@  struct net_device
  */
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
-	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast
-							   because most packets are unicast) */
+	unsigned char		*dev_addr;	/* hw address, (before bcast
+						   because most packets are
+						   unicast) */
+
+	struct list_head	dev_addr_list; /* list of device hw addresses */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -1778,6 +1788,13 @@  static inline void netif_addr_unlock_bh(struct net_device *dev)
 	spin_unlock_bh(&dev->addr_list_lock);
 }
 
+/*
+ * Walk @dev's dev_addr_list with @ha (struct netdev_hw_addr *) as the cursor.
+ * Read access only; the caller must hold rcu_read_lock().
+ */
+#define for_each_dev_addr(dev, ha) \
+		list_for_each_entry_rcu(ha, &(dev)->dev_addr_list, list)
+
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
 
 extern void		ether_setup(struct net_device *dev);
@@ -1790,6 +1807,17 @@  extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
+
+/* Functions used for device addresses handling */
+extern int		dev_addr_add(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_del(struct net_device *dev,
+				     unsigned char *addr);
+extern int		dev_addr_add_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+extern int		dev_addr_del_multiple(struct net_device *to_dev,
+					      struct net_device *from_dev);
+
 /* Functions used for secondary unicast and multicast support */
 extern void		dev_set_rx_mode(struct net_device *dev);
 extern void		__dev_set_rx_mode(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f..b4503ac 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3438,6 +3438,263 @@  void dev_set_rx_mode(struct net_device *dev)
 	netif_addr_unlock_bh(dev);
 }
 
+/* hw addresses list handling functions */
+
+/*
+ * Take a reference on @addr in @list, appending a new entry if none matches.
+ * The entry at position @ignore_index is never matched (-1 matches nothing).
+ */
+static int __hw_addr_add_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int i = 0;
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+	list_for_each_entry(ha, list, list) {
+		if (i++ != ignore_index && !memcmp(ha->addr, addr, addr_len)) {
+			ha->refcount++;
+			return 0;
+		}
+	}
+	ha = kzalloc(max_t(size_t, sizeof(*ha), L1_CACHE_BYTES), GFP_ATOMIC);
+	if (!ha)
+		return -ENOMEM;
+	memcpy(ha->addr, addr, addr_len);
+	ha->refcount = 1;
+	list_add_tail_rcu(&ha->list, list);
+	return 0;
+}
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_add_ii(list, addr, addr_len, -1); /* -1: skip none */
+}
+
+/*
+ * RCU callback: free the netdev_hw_addr whose rcu_head member is @head.
+ */
+static void ha_rcu_free(struct rcu_head *head)
+{
+	kfree(container_of(head, struct netdev_hw_addr, rcu_head));
+}
+
+/* Drop a reference on @addr; unlink and RCU-free the entry at zero. */
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+			    int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+	int pos = 0;
+
+	list_for_each_entry(ha, list, list) {
+		if (pos++ == ignore_index || memcmp(ha->addr, addr, addr_len))
+			continue;
+		if (--ha->refcount == 0) {
+			list_del_rcu(&ha->list);
+			call_rcu(&ha->rcu_head, ha_rcu_free);
+		}
+		return 0;
+	}
+	return -ENOENT;
+}
+
+static int __hw_addr_del(struct list_head *list, unsigned char *addr,
+			 int addr_len)
+{
+	return __hw_addr_del_ii(list, addr, addr_len, -1); /* -1: skip none */
+}
+
+/* Add all @from_list addresses to @to_list; on error undo the ones added. */
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+				     struct list_head *from_list,
+				     int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *a, *undo;
+	int rc;
+
+	list_for_each_entry(a, from_list, list) {
+		rc = __hw_addr_add_ii(to_list, a->addr, addr_len, ignore_index);
+		if (rc)
+			goto unroll;
+	}
+	return 0;
+unroll:
+	list_for_each_entry(undo, from_list, list) {
+		if (undo == a)
+			break;
+		__hw_addr_del_ii(to_list, undo->addr, addr_len, ignore_index);
+	}
+	return rc;
+}
+
+/* Wrapper around __hw_addr_add_multiple_ii() passing ignore_index = -1. */
+static int __hw_addr_add_multiple(struct list_head *to_list,
+				  struct list_head *from_list, int addr_len)
+{
+	return __hw_addr_add_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+/* Drop one reference in @to_list for every address found in @from_list. */
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+				      struct list_head *from_list,
+				      int addr_len, int ignore_index)
+{
+	struct netdev_hw_addr *ha;
+
+	list_for_each_entry(ha, from_list, list)
+		__hw_addr_del_ii(to_list, ha->addr, addr_len, ignore_index);
+}
+
+/* Wrapper around __hw_addr_del_multiple_ii() passing ignore_index = -1. */
+static void __hw_addr_del_multiple(struct list_head *to_list,
+				   struct list_head *from_list, int addr_len)
+{
+	__hw_addr_del_multiple_ii(to_list, from_list, addr_len, -1);
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+	struct netdev_hw_addr *ha, *tmp;
+
+	list_for_each_entry_safe(ha, tmp, list, list) {
+		list_del_rcu(&ha->list);		/* unlink every entry */
+		call_rcu(&ha->rcu_head, ha_rcu_free);	/* free it via RCU */
+	}
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+	ASSERT_RTNL();		/* callers are expected to hold RTNL */
+
+	__hw_addr_flush(&dev->dev_addr_list);	/* drop all entries */
+	dev->dev_addr = NULL;	/* it pointed into a list entry */
+}
+
+/* Create @dev's address list with one all-zero entry that dev_addr aliases. */
+static int dev_addr_init(struct net_device *dev)
+{
+	unsigned char addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *ha;
+	int err;
+
+	ASSERT_RTNL();
+
+	INIT_LIST_HEAD(&dev->dev_addr_list);
+	/* sizeof(addr), not sizeof(*addr): cover the array, not one byte */
+	memset(addr, 0, sizeof(addr));
+	err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(addr));
+	if (err)
+		return err;
+
+	/* The entry just added is the first one; point dev_addr at it. */
+	ha = list_first_entry(&dev->dev_addr_list,
+			      struct netdev_hw_addr, list);
+	dev->dev_addr = ha->addr;
+	return 0;
+}
+
+/**
+ *	dev_addr_add	- Add a device address
+ *	@dev: device
+ *	@addr: address to add (dev->addr_len bytes are used)
+ *
+ *	Append @addr to the device address list, or take one more reference
+ *	if an entry other than the first one already holds it. On success
+ *	NETDEV_CHANGEADDR is signalled. The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr)
+{
+	int rc;
+
+	ASSERT_RTNL();
+
+	rc = __hw_addr_add_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (rc)
+		return rc;
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del	- Release a device address.
+ *	@dev: device
+ *	@addr: address to delete (dev->addr_len bytes are compared)
+ *
+ *	Drop one reference to @addr; the entry leaves the list once nobody
+ *	references it. The first list entry is never a candidate. On success
+ *	NETDEV_CHANGEADDR is signalled. The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr)
+{
+	int rc;
+
+	ASSERT_RTNL();
+
+	rc = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, 0);
+	if (rc)
+		return rc;
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple	- Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *
+ *	Copy the addresses of @from_dev to @to_dev. Both devices must use the
+ *	same address length. The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	int rc;
+
+	ASSERT_RTNL();
+
+	if (to_dev->addr_len != from_dev->addr_len)
+		return -EINVAL;
+	rc = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+				       &from_dev->dev_addr_list,
+				       to_dev->addr_len, 0);
+	if (rc)
+		return rc;
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple	- Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device whose address list selects what to delete
+ *
+ *	Release, in @to_dev, one reference to each address listed in @from_dev.
+ *	Both devices must use the same address length (-EINVAL otherwise).
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev)
+{
+	ASSERT_RTNL();
+
+	if (from_dev->addr_len != to_dev->addr_len)
+		return -EINVAL;
+	__hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+				  &from_dev->dev_addr_list,
+				  to_dev->addr_len, 0);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
 int __dev_addr_delete(struct dev_addr_list **list, int *count,
 		      void *addr, int alen, int glbl)
 {
@@ -4780,6 +5037,7 @@  struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
+	dev_addr_init(dev);
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -4805,6 +5063,9 @@  void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);