Message ID:    20081103213758.59a8361d@extreme
State:         RFC, archived
Delegated to:  David Miller
Stephen Hemminger a écrit :
> This is something I was trying out to improve loopback performance.
> It moves loopback processing from the generic backlog per-cpu queues
> to its own set of queues. I thought this might help cache locality,
> and the loopback device doesn't have to do relatively expensive
> local_irq_disable() operations.
>
> BUT it didn't seem to help with tbench; it might help others who want
> to go further with it. Code runs, but treat it as-is at this point.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> --- a/drivers/net/loopback.c	2008-11-03 10:13:31.000000000 -0800
> +++ b/drivers/net/loopback.c	2008-11-03 11:05:52.000000000 -0800
> @@ -59,51 +59,79 @@
>  #include <linux/percpu.h>
>  #include <net/net_namespace.h>
>
> -struct pcpu_lstats {
> +struct loopback_per_cpu {
> +	struct sk_buff_head	rxq;
> +	struct napi_struct	napi;
> +
>  	unsigned long		packets;
>  	unsigned long		bytes;
>  };
>
> +
>  /*
>   * The higher levels take care of making this non-reentrant (it's
>   * called with bh's disabled).
>   */
>  static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
> -	struct pcpu_lstats *pcpu_lstats, *lb_stats;
> +	struct loopback_per_cpu *lo = dev->ml_priv, *pcpu;
>
>  	skb_orphan(skb);
>
> -	skb->protocol = eth_type_trans(skb,dev);
> +	skb->protocol = eth_type_trans(skb, dev);
>
>  	dev->last_rx = jiffies;
>
>  	/* it's OK to use per_cpu_ptr() because BHs are off */
> -	pcpu_lstats = dev->ml_priv;
> -	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
> -	lb_stats->bytes += skb->len;
> -	lb_stats->packets++;
> +	pcpu = per_cpu_ptr(lo, smp_processor_id());
>
> -	netif_rx(skb);
> +	if (likely(pcpu->rxq.qlen < netdev_max_backlog)) {
> +		if (pcpu->rxq.qlen == 0)
> +			__napi_schedule(&lo->napi);

Hum... Most of the time, I suspect, we take this __napi_schedule() path.

So are you sure you use the right one (&lo->napi, not &pcpu->napi)?

This is the point where we still do a cache-line ping-pong.
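To make Eric's concern concrete: as far as the 2008-era percpu API can be reconstructed here, the pointer returned by alloc_percpu() is only a cookie to hand to per_cpu_ptr(), never something to dereference directly. A sketch of the distinction, illustrative and not from the thread:

	/* 'lo' is the alloc_percpu() cookie stored in dev->ml_priv;
	 * the real per-CPU copy is only reachable via per_cpu_ptr().
	 */
	struct loopback_per_cpu *lo = dev->ml_priv;
	struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, smp_processor_id());

	/* &lo->napi evaluates to the same address on every CPU, so every
	 * transmitting CPU hits one shared location -- the cache-line
	 * ping-pong Eric describes -- and it is not one of the napi_struct
	 * instances that loopback_dev_init() registers with netif_napi_add().
	 */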
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 04 Nov 2008 07:36:48 +0100

> So are you sure you use the right one (&lo->napi, not &pcpu->napi)?

Yes, I think this logic error in Stephen's patch might explain why it had
no effect. Stephen, please try it with lo->napi replaced by pcpu->napi here.
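Spelled out against the original patch (reproduced in full below), the change being asked for is a one-liner in loopback_xmit(); a sketch of the corrected branch, untested:

	if (likely(pcpu->rxq.qlen < netdev_max_backlog)) {
		if (pcpu->rxq.qlen == 0)
			__napi_schedule(&pcpu->napi);	/* was: &lo->napi */

		__skb_queue_tail(&pcpu->rxq, skb);
		pcpu->bytes += skb->len;
		pcpu->packets++;

		return NET_XMIT_SUCCESS;
	}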
This is something I was trying out to improve loopback performance.
It moves loopback processing from the generic backlog per-cpu queues
to its own set of queues. I thought this might help cache locality,
and the loopback device doesn't have to do relatively expensive
local_irq_disable() operations.

BUT it didn't seem to help with tbench; it might help others who want
to go further with it. Code runs, but treat it as-is at this point.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/drivers/net/loopback.c	2008-11-03 10:13:31.000000000 -0800
+++ b/drivers/net/loopback.c	2008-11-03 11:05:52.000000000 -0800
@@ -59,51 +59,79 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 
-struct pcpu_lstats {
+struct loopback_per_cpu {
+	struct sk_buff_head	rxq;
+	struct napi_struct	napi;
+
 	unsigned long		packets;
 	unsigned long		bytes;
 };
 
+
 /*
  * The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
 static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct pcpu_lstats *pcpu_lstats, *lb_stats;
+	struct loopback_per_cpu *lo = dev->ml_priv, *pcpu;
 
 	skb_orphan(skb);
 
-	skb->protocol = eth_type_trans(skb,dev);
+	skb->protocol = eth_type_trans(skb, dev);
 
 	dev->last_rx = jiffies;
 
 	/* it's OK to use per_cpu_ptr() because BHs are off */
-	pcpu_lstats = dev->ml_priv;
-	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
-	lb_stats->bytes += skb->len;
-	lb_stats->packets++;
+	pcpu = per_cpu_ptr(lo, smp_processor_id());
 
-	netif_rx(skb);
+	if (likely(pcpu->rxq.qlen < netdev_max_backlog)) {
+		if (pcpu->rxq.qlen == 0)
+			__napi_schedule(&lo->napi);
+
+		__skb_queue_tail(&pcpu->rxq, skb);
+		pcpu->bytes += skb->len;
+		pcpu->packets++;
+
+		return NET_XMIT_SUCCESS;
+	} else {
+		dev->stats.rx_dropped++;
+		dev_kfree_skb_any(skb);
+		return NET_XMIT_DROP;
+	}
+}
 
-	return 0;
+static int loopback_poll(struct napi_struct *napi, int work_limit)
+{
+	struct loopback_per_cpu *pcpu
+		= container_of(napi, struct loopback_per_cpu, napi);
+	struct sk_buff *skb;
+	int work_done = 0;
+
+	while ( (skb = __skb_dequeue(&pcpu->rxq)) ) {
+		netif_receive_skb(skb);
+
+		if (++work_done >= work_limit)
+			goto done;
+	}
+
+	__napi_complete(napi);
+done:
+	return work_done;
 }
 
 static struct net_device_stats *get_stats(struct net_device *dev)
 {
-	const struct pcpu_lstats *pcpu_lstats;
+	const struct loopback_per_cpu *lo = dev->ml_priv;
 	struct net_device_stats *stats = &dev->stats;
 	unsigned long bytes = 0;
 	unsigned long packets = 0;
 	int i;
 
-	pcpu_lstats = dev->ml_priv;
 	for_each_possible_cpu(i) {
-		const struct pcpu_lstats *lb_stats;
-
-		lb_stats = per_cpu_ptr(pcpu_lstats, i);
-		bytes += lb_stats->bytes;
-		packets += lb_stats->packets;
+		const struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
+		bytes += pcpu->bytes;
+		packets += pcpu->packets;
 	}
 	stats->rx_packets = packets;
 	stats->tx_packets = packets;
@@ -127,21 +155,43 @@ static const struct ethtool_ops loopback
 
 static int loopback_dev_init(struct net_device *dev)
 {
-	struct pcpu_lstats *lstats;
+	struct loopback_per_cpu *lo;
+	int i;
 
-	lstats = alloc_percpu(struct pcpu_lstats);
-	if (!lstats)
+	lo = alloc_percpu(struct loopback_per_cpu);
+	if (!lo)
 		return -ENOMEM;
 
-	dev->ml_priv = lstats;
+	dev->ml_priv = lo;
+
+	for_each_possible_cpu(i) {
+		struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
+		skb_queue_head_init(&pcpu->rxq);
+		netif_napi_add(dev, &pcpu->napi, loopback_poll, 64);
+	}
+
+	return 0;
+}
+
+static int loopback_dev_stop(struct net_device *dev)
+{
+	struct loopback_per_cpu *lo = dev->ml_priv;
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct loopback_per_cpu *pcpu = per_cpu_ptr(lo, i);
+
+		napi_synchronize(&pcpu->napi);
+		__skb_queue_purge(&pcpu->rxq);
+	}
 	return 0;
 }
 
 static void loopback_dev_free(struct net_device *dev)
 {
-	struct pcpu_lstats *lstats = dev->ml_priv;
+	struct loopback_per_cpu *lo = dev->ml_priv;
 
-	free_percpu(lstats);
+	free_percpu(lo);
 	free_netdev(dev);
 }
 
@@ -169,6 +219,7 @@ static void loopback_setup(struct net_de
 	dev->header_ops = &eth_header_ops;
 	dev->init = loopback_dev_init;
 	dev->destructor = loopback_dev_free;
+	dev->stop = loopback_dev_stop;
 }
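For context on the "relatively expensive local_irq_disable()" the description mentions: the generic netif_rx() path that this patch bypasses must mask hard interrupts around the shared per-CPU backlog. The sketch below is an abridged, from-memory paraphrase of the 2.6.2x code, not a verbatim excerpt:

	/* Paraphrased netif_rx(): the softnet backlog is also fed from
	 * hard-IRQ context, so enqueueing needs local_irq_save() --
	 * the cost loopback_xmit() avoids by owning its queues and
	 * running with only BHs disabled.
	 */
	int netif_rx(struct sk_buff *skb)
	{
		struct softnet_data *queue;
		unsigned long flags;

		local_irq_save(flags);			/* mask hard IRQs */
		queue = &__get_cpu_var(softnet_data);
		if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
			if (!queue->input_pkt_queue.qlen)
				napi_schedule(&queue->backlog);
			__skb_queue_tail(&queue->input_pkt_queue, skb);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}
		local_irq_restore(flags);
		kfree_skb(skb);
		return NET_RX_DROP;
	}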
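Finally, for anyone who wants a sanity check besides tbench: the toy userspace blaster below pushes 1 GiB through 127.0.0.1 and prints throughput. It is purely illustrative and not from the thread; the file name, port, and sizes are made up.

	/* lo_bench.c -- crude loopback throughput check (illustrative only).
	 * Build: cc -O2 -o lo_bench lo_bench.c
	 */
	#include <netinet/in.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <sys/time.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#define CHUNK (64 * 1024)
	#define TOTAL (1024L * 1024 * 1024)	/* 1 GiB */

	int main(void)
	{
		struct sockaddr_in addr = { .sin_family = AF_INET };
		static char buf[CHUNK];
		struct timeval t0, t1;
		long left;
		ssize_t n;
		int srv, cli, conn, one = 1;
		double secs;

		addr.sin_port = htons(9099);
		addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		srv = socket(AF_INET, SOCK_STREAM, 0);
		setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
		if (bind(srv, (struct sockaddr *)&addr, sizeof(addr)) ||
		    listen(srv, 1)) {
			perror("bind/listen");
			return 1;
		}

		if (fork() == 0) {	/* child: sink all data from the socket */
			conn = accept(srv, NULL, NULL);
			while (read(conn, buf, sizeof(buf)) > 0)
				;
			_exit(0);
		}

		cli = socket(AF_INET, SOCK_STREAM, 0);
		if (connect(cli, (struct sockaddr *)&addr, sizeof(addr))) {
			perror("connect");
			return 1;
		}

		memset(buf, 0xa5, sizeof(buf));
		gettimeofday(&t0, NULL);
		for (left = TOTAL; left > 0; left -= n) {
			n = write(cli, buf, CHUNK);
			if (n <= 0) {
				perror("write");
				return 1;
			}
		}
		close(cli);		/* EOF lets the child drain and exit */
		wait(NULL);
		gettimeofday(&t1, NULL);

		secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6;
		printf("%.0f MB/s over loopback\n", TOTAL / secs / 1e6);
		return 0;
	}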