diff mbox

[V3,1/2] virtio-net: fix the set affinity bug when CPU IDs are not consecutive

Message ID 1357639660-6660-1-git-send-email-gaowanlong@cn.fujitsu.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Wanlong Gao Jan. 8, 2013, 10:07 a.m. UTC
As Michael mentioned, set affinity and select queue will not work very
well when CPU IDs are not consecutive, this can happen with hot unplug.
Fix this bug by traversing the online CPUs, and create a per-CPU variable
to find the mapping from CPU to the preferable virtual-queue.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Eric Dumazet <erdnetdev@gmail.com>
Cc: virtualization@lists.linux-foundation.org
Cc: netdev@vger.kernel.org
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 drivers/net/virtio_net.c | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

Comments

Jason Wang Jan. 8, 2013, 10:26 a.m. UTC | #1
On 01/08/2013 06:07 PM, Wanlong Gao wrote:
> As Michael mentioned, set affinity and select queue will not work very
> well when CPU IDs are not consecutive, this can happen with hot unplug.
> Fix this bug by traversal the online CPUs, and create a per cpu variable
> to find the mapping from CPU to the preferable virtual-queue.
>
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Jason Wang <jasowang@redhat.com>
> Cc: Eric Dumazet <erdnetdev@gmail.com>
> Cc: virtualization@lists.linux-foundation.org
> Cc: netdev@vger.kernel.org
> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
> ---
>  drivers/net/virtio_net.c | 39 +++++++++++++++++++++++++++++----------
>  1 file changed, 29 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a6fcf15..a77f86c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -41,6 +41,8 @@ module_param(gso, bool, 0444);
>  #define VIRTNET_SEND_COMMAND_SG_MAX    2
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
> +DEFINE_PER_CPU(int, vq_index) = -1;
> +

I think this should not be a global one; consider that we may have more than
one virtio-net card, each with a different max number of queues.
>  struct virtnet_stats {
>  	struct u64_stats_sync tx_syncp;
>  	struct u64_stats_sync rx_syncp;
> @@ -1016,6 +1018,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid)
>  static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
>  {
>  	int i;
> +	int cpu;
>  
>  	/* In multiqueue mode, when the number of cpu is equal to the number of
>  	 * queue pairs, we let the queue pairs to be private to one cpu by
> @@ -1029,16 +1032,29 @@ static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
>  			return;
>  	}
>  
> -	for (i = 0; i < vi->max_queue_pairs; i++) {
> -		int cpu = set ? i : -1;
> -		virtqueue_set_affinity(vi->rq[i].vq, cpu);
> -		virtqueue_set_affinity(vi->sq[i].vq, cpu);
> -	}
> +	if (set) {
> +		i = 0;
> +		for_each_online_cpu(cpu) {
> +			virtqueue_set_affinity(vi->rq[i].vq, cpu);
> +			virtqueue_set_affinity(vi->sq[i].vq, cpu);
> +			per_cpu(vq_index, cpu) = i;
> +			i++;
> +			if (i >= vi->max_queue_pairs)
> +				break;

Can this happen? We only set affinity when the number of CPUs equals the number of queue pairs.
> +		}
>  
> -	if (set)
>  		vi->affinity_hint_set = true;
> -	else
> +	} else {
> +		for(i = 0; i < vi->max_queue_pairs; i++) {
> +			virtqueue_set_affinity(vi->rq[i].vq, -1);
> +			virtqueue_set_affinity(vi->sq[i].vq, -1);
> +		}
> +
> +		for_each_online_cpu(cpu)
> +			per_cpu(vq_index, cpu) = -1;
> +

This looks suboptimal since it may lead to only txq zero being used.
>  		vi->affinity_hint_set = false;
> +	}
>  }
>  
>  static void virtnet_get_ringparam(struct net_device *dev,
> @@ -1127,12 +1143,15 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
>  
>  /* To avoid contending a lock hold by a vcpu who would exit to host, select the
>   * txq based on the processor id.
> - * TODO: handle cpu hotplug.
>   */
>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
>  {
> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
> -		  smp_processor_id();
> +	int txq = 0;
> +
> +	if (skb_rx_queue_recorded(skb))
> +		txq = skb_get_rx_queue(skb);
> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
> +		txq = 0;
>  
>  	while (unlikely(txq >= dev->real_num_tx_queues))
>  		txq -= dev->real_num_tx_queues;

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell Jan. 8, 2013, 11:31 p.m. UTC | #2
Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>   */
>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
>  {
> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
> -		  smp_processor_id();
> +	int txq = 0;
> +
> +	if (skb_rx_queue_recorded(skb))
> +		txq = skb_get_rx_queue(skb);
> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
> +		txq = 0;

You should use __get_cpu_var() instead of smp_processor_id() here, ie:

        else if ((txq = __get_cpu_var(vq_index)) == -1)

And AFAICT, no reason to initialize txq to 0 to start with.

So:

        int txq;

        if (skb_rx_queue_recorded(skb))
		txq = skb_get_rx_queue(skb);
        else {
                txq = __get_cpu_var(vq_index);
                if (txq == -1)
                        txq = 0;
        }

Now, just to confirm, I assume this can happen even if we use vq_index,
right, because of races with virtnet_set_channels?

  	while (unlikely(txq >= dev->real_num_tx_queues))
  		txq -= dev->real_num_tx_queues;


Thanks,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanlong Gao Jan. 9, 2013, 1:52 a.m. UTC | #3
On 01/08/2013 06:26 PM, Jason Wang wrote:
> On 01/08/2013 06:07 PM, Wanlong Gao wrote:
>> As Michael mentioned, set affinity and select queue will not work very
>> well when CPU IDs are not consecutive, this can happen with hot unplug.
>> Fix this bug by traversal the online CPUs, and create a per cpu variable
>> to find the mapping from CPU to the preferable virtual-queue.
>>
>> Cc: Rusty Russell <rusty@rustcorp.com.au>
>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>> Cc: Jason Wang <jasowang@redhat.com>
>> Cc: Eric Dumazet <erdnetdev@gmail.com>
>> Cc: virtualization@lists.linux-foundation.org
>> Cc: netdev@vger.kernel.org
>> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
>> ---
>>  drivers/net/virtio_net.c | 39 +++++++++++++++++++++++++++++----------
>>  1 file changed, 29 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index a6fcf15..a77f86c 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -41,6 +41,8 @@ module_param(gso, bool, 0444);
>>  #define VIRTNET_SEND_COMMAND_SG_MAX    2
>>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>>  
>> +DEFINE_PER_CPU(int, vq_index) = -1;
>> +
> 
> I think this should not be a global one, consider we may have more than
> one virtio-net cards with different max queues.

Yes, would you move this into virtnet_info?

>>  struct virtnet_stats {
>>  	struct u64_stats_sync tx_syncp;
>>  	struct u64_stats_sync rx_syncp;
>> @@ -1016,6 +1018,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid)
>>  static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
>>  {
>>  	int i;
>> +	int cpu;
>>  
>>  	/* In multiqueue mode, when the number of cpu is equal to the number of
>>  	 * queue pairs, we let the queue pairs to be private to one cpu by
>> @@ -1029,16 +1032,29 @@ static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
>>  			return;
>>  	}
>>  
>> -	for (i = 0; i < vi->max_queue_pairs; i++) {
>> -		int cpu = set ? i : -1;
>> -		virtqueue_set_affinity(vi->rq[i].vq, cpu);
>> -		virtqueue_set_affinity(vi->sq[i].vq, cpu);
>> -	}
>> +	if (set) {
>> +		i = 0;
>> +		for_each_online_cpu(cpu) {
>> +			virtqueue_set_affinity(vi->rq[i].vq, cpu);
>> +			virtqueue_set_affinity(vi->sq[i].vq, cpu);
>> +			per_cpu(vq_index, cpu) = i;
>> +			i++;
>> +			if (i >= vi->max_queue_pairs)
>> +				break;
> 
> Can this happen? we check only set when the number are equal.

will remove.

>> +		}
>>  
>> -	if (set)
>>  		vi->affinity_hint_set = true;
>> -	else
>> +	} else {
>> +		for(i = 0; i < vi->max_queue_pairs; i++) {
>> +			virtqueue_set_affinity(vi->rq[i].vq, -1);
>> +			virtqueue_set_affinity(vi->sq[i].vq, -1);
>> +		}
>> +
>> +		for_each_online_cpu(cpu)
>> +			per_cpu(vq_index, cpu) = -1;
>> +
> 
> This looks suboptimal since it may leads only txq zero is used.

So, which value is best for txq when we don't set affinity?
Just keep using smp_processor_id()?

Thanks,
Wanlong Gao

>>  		vi->affinity_hint_set = false;
>> +	}
>>  }
>>  
>>  static void virtnet_get_ringparam(struct net_device *dev,
>> @@ -1127,12 +1143,15 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
>>  
>>  /* To avoid contending a lock hold by a vcpu who would exit to host, select the
>>   * txq based on the processor id.
>> - * TODO: handle cpu hotplug.
>>   */
>>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
>>  {
>> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
>> -		  smp_processor_id();
>> +	int txq = 0;
>> +
>> +	if (skb_rx_queue_recorded(skb))
>> +		txq = skb_get_rx_queue(skb);
>> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
>> +		txq = 0;
>>  
>>  	while (unlikely(txq >= dev->real_num_tx_queues))
>>  		txq -= dev->real_num_tx_queues;
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanlong Gao Jan. 9, 2013, 1:54 a.m. UTC | #4
On 01/09/2013 07:31 AM, Rusty Russell wrote:
> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>>   */
>>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
>>  {
>> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
>> -		  smp_processor_id();
>> +	int txq = 0;
>> +
>> +	if (skb_rx_queue_recorded(skb))
>> +		txq = skb_get_rx_queue(skb);
>> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
>> +		txq = 0;
> 
> You should use __get_cpu_var() instead of smp_processor_id() here, ie:
> 
>         else if ((txq = __get_cpu_var(vq_index)) == -1)
> 
> And AFAICT, no reason to initialize txq to 0 to start with.
> 
> So:
> 
>         int txq;
> 
>         if (skb_rx_queue_recorded(skb))
> 		txq = skb_get_rx_queue(skb);
>         else {
>                 txq = __get_cpu_var(vq_index);
>                 if (txq == -1)
>                         txq = 0;
>         }

Got it, thank you.

> 
> Now, just to confirm, I assume this can happen even if we use vq_index,
> right, because of races with virtnet_set_channels?

I still can't understand this race, could you explain more? thank you.

Regards,
Wanlong Gao

> 
>   	while (unlikely(txq >= dev->real_num_tx_queues))
>   		txq -= dev->real_num_tx_queues;
> 
> 
> Thanks,
> Rusty.
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Wang Jan. 9, 2013, 3:06 a.m. UTC | #5
On 01/09/2013 09:52 AM, Wanlong Gao wrote:
> On 01/08/2013 06:26 PM, Jason Wang wrote:
>> On 01/08/2013 06:07 PM, Wanlong Gao wrote:
>>> As Michael mentioned, set affinity and select queue will not work very
>>> well when CPU IDs are not consecutive, this can happen with hot unplug.
>>> Fix this bug by traversal the online CPUs, and create a per cpu variable
>>> to find the mapping from CPU to the preferable virtual-queue.
>>>
>>> Cc: Rusty Russell <rusty@rustcorp.com.au>
>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>> Cc: Jason Wang <jasowang@redhat.com>
>>> Cc: Eric Dumazet <erdnetdev@gmail.com>
>>> Cc: virtualization@lists.linux-foundation.org
>>> Cc: netdev@vger.kernel.org
>>> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
>>> ---
>>>  drivers/net/virtio_net.c | 39 +++++++++++++++++++++++++++++----------
>>>  1 file changed, 29 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index a6fcf15..a77f86c 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -41,6 +41,8 @@ module_param(gso, bool, 0444);
>>>  #define VIRTNET_SEND_COMMAND_SG_MAX    2
>>>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>>>  
>>> +DEFINE_PER_CPU(int, vq_index) = -1;
>>> +
>> I think this should not be a global one, consider we may have more than
>> one virtio-net cards with different max queues.
> Yes, would you move this into virtio_info?

Yes, I think it's better.
>>>  struct virtnet_stats {
>>>  	struct u64_stats_sync tx_syncp;
>>>  	struct u64_stats_sync rx_syncp;
>>> @@ -1016,6 +1018,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid)
>>>  static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
>>>  {
>>>  	int i;
>>> +	int cpu;
>>>  
>>>  	/* In multiqueue mode, when the number of cpu is equal to the number of
>>>  	 * queue pairs, we let the queue pairs to be private to one cpu by
>>> @@ -1029,16 +1032,29 @@ static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
>>>  			return;
>>>  	}
>>>  
>>> -	for (i = 0; i < vi->max_queue_pairs; i++) {
>>> -		int cpu = set ? i : -1;
>>> -		virtqueue_set_affinity(vi->rq[i].vq, cpu);
>>> -		virtqueue_set_affinity(vi->sq[i].vq, cpu);
>>> -	}
>>> +	if (set) {
>>> +		i = 0;
>>> +		for_each_online_cpu(cpu) {
>>> +			virtqueue_set_affinity(vi->rq[i].vq, cpu);
>>> +			virtqueue_set_affinity(vi->sq[i].vq, cpu);
>>> +			per_cpu(vq_index, cpu) = i;
>>> +			i++;
>>> +			if (i >= vi->max_queue_pairs)
>>> +				break;
>> Can this happen? we check only set when the number are equal.
> will remove.
>
>>> +		}
>>>  
>>> -	if (set)
>>>  		vi->affinity_hint_set = true;
>>> -	else
>>> +	} else {
>>> +		for(i = 0; i < vi->max_queue_pairs; i++) {
>>> +			virtqueue_set_affinity(vi->rq[i].vq, -1);
>>> +			virtqueue_set_affinity(vi->sq[i].vq, -1);
>>> +		}
>>> +
>>> +		for_each_online_cpu(cpu)
>>> +			per_cpu(vq_index, cpu) = -1;
>>> +
>> This looks suboptimal since it may leads only txq zero is used.
> So, which value is best for txq when we don't set affinity?
> just remain to smp_processor_id()?

Any value that lets us use all queues is OK.

How about this?
 
i = 0;
for_each_online_cpu(cpu)
    per_cpu(vq_index, cpu) = ++i % vi->curr_queues;
> Thanks,
> Wanlong Gao
>
>>>  		vi->affinity_hint_set = false;
>>> +	}
>>>  }
>>>  
>>>  static void virtnet_get_ringparam(struct net_device *dev,
>>> @@ -1127,12 +1143,15 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
>>>  
>>>  /* To avoid contending a lock hold by a vcpu who would exit to host, select the
>>>   * txq based on the processor id.
>>> - * TODO: handle cpu hotplug.
>>>   */
>>>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
>>>  {
>>> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
>>> -		  smp_processor_id();
>>> +	int txq = 0;
>>> +
>>> +	if (skb_rx_queue_recorded(skb))
>>> +		txq = skb_get_rx_queue(skb);
>>> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
>>> +		txq = 0;
>>>  
>>>  	while (unlikely(txq >= dev->real_num_tx_queues))
>>>  		txq -= dev->real_num_tx_queues;
>>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell Jan. 10, 2013, 12:49 a.m. UTC | #6
Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
> On 01/09/2013 07:31 AM, Rusty Russell wrote:
>> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>>>   */
>>>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
>>>  {
>>> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
>>> -		  smp_processor_id();
>>> +	int txq = 0;
>>> +
>>> +	if (skb_rx_queue_recorded(skb))
>>> +		txq = skb_get_rx_queue(skb);
>>> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
>>> +		txq = 0;
>> 
>> You should use __get_cpu_var() instead of smp_processor_id() here, ie:
>> 
>>         else if ((txq = __get_cpu_var(vq_index)) == -1)
>> 
>> And AFAICT, no reason to initialize txq to 0 to start with.
>> 
>> So:
>> 
>>         int txq;
>> 
>>         if (skb_rx_queue_recorded(skb))
>> 		txq = skb_get_rx_queue(skb);
>>         else {
>>                 txq = __get_cpu_var(vq_index);
>>                 if (txq == -1)
>>                         txq = 0;
>>         }
>
> Got it, thank you.
>
>> 
>> Now, just to confirm, I assume this can happen even if we use vq_index,
>> right, because of races with virtnet_set_channels?
>
> I still can't understand this race, could you explain more? thank you.

I assume that someone can call virtnet_set_channels() while we are
inside virtnet_select_queue(), so they reduce dev->real_num_tx_queues,
causing virtnet_select_queue to do:

	while (unlikely(txq >= dev->real_num_tx_queues))
		txq -= dev->real_num_tx_queues;

Otherwise, when is this loop called?

Thanks,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanlong Gao Jan. 10, 2013, 9:26 a.m. UTC | #7
On 01/10/2013 08:49 AM, Rusty Russell wrote:
> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>> On 01/09/2013 07:31 AM, Rusty Russell wrote:
>>> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>>>>   */
>>>>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
>>>>  {
>>>> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
>>>> -		  smp_processor_id();
>>>> +	int txq = 0;
>>>> +
>>>> +	if (skb_rx_queue_recorded(skb))
>>>> +		txq = skb_get_rx_queue(skb);
>>>> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
>>>> +		txq = 0;
>>>
>>> You should use __get_cpu_var() instead of smp_processor_id() here, ie:
>>>
>>>         else if ((txq = __get_cpu_var(vq_index)) == -1)
>>>
>>> And AFAICT, no reason to initialize txq to 0 to start with.
>>>
>>> So:
>>>
>>>         int txq;
>>>
>>>         if (skb_rx_queue_recorded(skb))
>>> 		txq = skb_get_rx_queue(skb);
>>>         else {
>>>                 txq = __get_cpu_var(vq_index);
>>>                 if (txq == -1)
>>>                         txq = 0;
>>>         }
>>
>> Got it, thank you.
>>
>>>
>>> Now, just to confirm, I assume this can happen even if we use vq_index,
>>> right, because of races with virtnet_set_channels?
>>
>> I still can't understand this race, could you explain more? thank you.
> 
> I assume that someone can call virtnet_set_channels() while we are
> inside virtnet_select_queue(), so they reduce dev->real_num_tx_queues,
> causing virtnet_set_channels to do:
> 
> 	while (unlikely(txq >= dev->real_num_tx_queues))
> 		txq -= dev->real_num_tx_queues;
> 
> Otherwise, when is this loop called?

How about just removing this loop?

Eric, can you help here?

Thanks,
Wanlong Gao

> 
> Thanks,
> Rusty.
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ben Hutchings Jan. 10, 2013, 7:12 p.m. UTC | #8
On Thu, 2013-01-10 at 11:19 +1030, Rusty Russell wrote:
> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
> > On 01/09/2013 07:31 AM, Rusty Russell wrote:
> >> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
> >>>   */
> >>>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
> >>>  {
> >>> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
> >>> -		  smp_processor_id();
> >>> +	int txq = 0;
> >>> +
> >>> +	if (skb_rx_queue_recorded(skb))
> >>> +		txq = skb_get_rx_queue(skb);
> >>> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
> >>> +		txq = 0;
> >> 
> >> You should use __get_cpu_var() instead of smp_processor_id() here, ie:
> >> 
> >>         else if ((txq = __get_cpu_var(vq_index)) == -1)
> >> 
> >> And AFAICT, no reason to initialize txq to 0 to start with.
> >> 
> >> So:
> >> 
> >>         int txq;
> >> 
> >>         if (skb_rx_queue_recorded(skb))
> >> 		txq = skb_get_rx_queue(skb);
> >>         else {
> >>                 txq = __get_cpu_var(vq_index);
> >>                 if (txq == -1)
> >>                         txq = 0;
> >>         }
> >
> > Got it, thank you.
> >
> >> 
> >> Now, just to confirm, I assume this can happen even if we use vq_index,
> >> right, because of races with virtnet_set_channels?
> >
> > I still can't understand this race, could you explain more? thank you.
> 
> I assume that someone can call virtnet_set_channels() while we are
> inside virtnet_select_queue(), so they reduce dev->real_num_tx_queues,
> causing virtnet_set_channels to do:
> 
> 	while (unlikely(txq >= dev->real_num_tx_queues))
> 		txq -= dev->real_num_tx_queues;
> 
> Otherwise, when is this loop called?

In fact, this race can result in the TX scheduler using a queue that has
been disabled, or other weirdness (consider what happens if
real_num_tx_queues increases between those two uses).

virtnet_set_channels() really must disable TX temporarily:

	netif_tx_lock(dev);
	netif_device_detach(dev);
	netif_tx_unlock(dev);
	...
	netif_device_attach(dev);

Ben.
Jason Wang Jan. 11, 2013, 8:37 a.m. UTC | #9
On 01/11/2013 03:12 AM, Ben Hutchings wrote:
> On Thu, 2013-01-10 at 11:19 +1030, Rusty Russell wrote:
>> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>>> On 01/09/2013 07:31 AM, Rusty Russell wrote:
>>>> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>>>>>   */
>>>>>  static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
>>>>>  {
>>>>> -	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
>>>>> -		  smp_processor_id();
>>>>> +	int txq = 0;
>>>>> +
>>>>> +	if (skb_rx_queue_recorded(skb))
>>>>> +		txq = skb_get_rx_queue(skb);
>>>>> +	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
>>>>> +		txq = 0;
>>>> You should use __get_cpu_var() instead of smp_processor_id() here, ie:
>>>>
>>>>         else if ((txq = __get_cpu_var(vq_index)) == -1)
>>>>
>>>> And AFAICT, no reason to initialize txq to 0 to start with.
>>>>
>>>> So:
>>>>
>>>>         int txq;
>>>>
>>>>         if (skb_rx_queue_recorded(skb))
>>>> 		txq = skb_get_rx_queue(skb);
>>>>         else {
>>>>                 txq = __get_cpu_var(vq_index);
>>>>                 if (txq == -1)
>>>>                         txq = 0;
>>>>         }
>>> Got it, thank you.
>>>
>>>> Now, just to confirm, I assume this can happen even if we use vq_index,
>>>> right, because of races with virtnet_set_channels?
>>> I still can't understand this race, could you explain more? thank you.
>> I assume that someone can call virtnet_set_channels() while we are
>> inside virtnet_select_queue(), so they reduce dev->real_num_tx_queues,
>> causing virtnet_set_channels to do:
>>
>> 	while (unlikely(txq >= dev->real_num_tx_queues))
>> 		txq -= dev->real_num_tx_queues;
>>
>> Otherwise, when is this loop called?
> In fact, this race can result in the TX scheduler using a queue that has
> been disabled, or other weirdness (consider what happens if
> real_num_tx_queues increases between those two uses).
>
> virtnet_set_channels() really must disable TX temporarily:
>
> 	netif_tx_lock(dev);
> 	netif_device_detach(dev);
> 	netif_tx_unlock(dev);
> 	...
> 	netif_device_attach(dev);
>
> Ben.
>

Michael, I think the future plan is to use multiqueue by default
instead of switching between the modes? If yes, we can temporarily
disable the tx instead of doing extra hacks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a6fcf15..a77f86c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -41,6 +41,8 @@  module_param(gso, bool, 0444);
 #define VIRTNET_SEND_COMMAND_SG_MAX    2
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
+DEFINE_PER_CPU(int, vq_index) = -1;
+
 struct virtnet_stats {
 	struct u64_stats_sync tx_syncp;
 	struct u64_stats_sync rx_syncp;
@@ -1016,6 +1018,7 @@  static int virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid)
 static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
 {
 	int i;
+	int cpu;
 
 	/* In multiqueue mode, when the number of cpu is equal to the number of
 	 * queue pairs, we let the queue pairs to be private to one cpu by
@@ -1029,16 +1032,29 @@  static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
 			return;
 	}
 
-	for (i = 0; i < vi->max_queue_pairs; i++) {
-		int cpu = set ? i : -1;
-		virtqueue_set_affinity(vi->rq[i].vq, cpu);
-		virtqueue_set_affinity(vi->sq[i].vq, cpu);
-	}
+	if (set) {
+		i = 0;
+		for_each_online_cpu(cpu) {
+			virtqueue_set_affinity(vi->rq[i].vq, cpu);
+			virtqueue_set_affinity(vi->sq[i].vq, cpu);
+			per_cpu(vq_index, cpu) = i;
+			i++;
+			if (i >= vi->max_queue_pairs)
+				break;
+		}
 
-	if (set)
 		vi->affinity_hint_set = true;
-	else
+	} else {
+		for(i = 0; i < vi->max_queue_pairs; i++) {
+			virtqueue_set_affinity(vi->rq[i].vq, -1);
+			virtqueue_set_affinity(vi->sq[i].vq, -1);
+		}
+
+		for_each_online_cpu(cpu)
+			per_cpu(vq_index, cpu) = -1;
+
 		vi->affinity_hint_set = false;
+	}
 }
 
 static void virtnet_get_ringparam(struct net_device *dev,
@@ -1127,12 +1143,15 @@  static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
 
 /* To avoid contending a lock hold by a vcpu who would exit to host, select the
  * txq based on the processor id.
- * TODO: handle cpu hotplug.
  */
 static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
 {
-	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
-		  smp_processor_id();
+	int txq = 0;
+
+	if (skb_rx_queue_recorded(skb))
+		txq = skb_get_rx_queue(skb);
+	else if ((txq = per_cpu(vq_index, smp_processor_id())) == -1)
+		txq = 0;
 
 	while (unlikely(txq >= dev->real_num_tx_queues))
 		txq -= dev->real_num_tx_queues;