Message ID | 1389682387-28601-1-git-send-email-jasowang@redhat.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: > We used to limit the number of packets queued through tx_queue_length. This > has several issues: > > - tx_queue_length is the control of qdisc queue length, simply reusing it > to control the packets queued by device may cause confusion. > - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add > support of packet capture on macvtap device."), an unexpected qdisc > caused by non-zero tx_queue_length will lead qdisc lock contention for > multiqueue deivce. > - What we really want is to limit the total amount of memory occupied not > the number of packets. > > So this patch tries to solve the above issues by using socket rcvbuf to > limit the packets could be queued for tun/macvtap. This was done by using > sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two > new ioctl() were introduced for userspace to change the rcvbuf like what we > have done for sndbuf. > > With this fix, we can safely change the tx_queue_len of macvtap to > zero. This will make multiqueue works without extra lock contention. > > Cc: Vlad Yasevich <vyasevic@redhat.com> > Cc: Michael S. Tsirkin <mst@redhat.com> > Cc: John Fastabend <john.r.fastabend@intel.com> > Cc: Stephen Hemminger <stephen@networkplumber.org> > Cc: Herbert Xu <herbert@gondor.apana.org.au> > Signed-off-by: Jason Wang <jasowang@redhat.com> No, I don't think we can change userspace-visible behaviour like that. This will break any existing user that tries to control queue length through sysfs,netlink or device ioctl. Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com which gives one way to set tx_queue_len to zero without breaking userspace. > --- > drivers/net/macvtap.c | 31 ++++++++++++++++++++--------- > drivers/net/tun.c | 48 +++++++++++++++++++++++++++++++++------------ > include/uapi/linux/if_tun.h | 3 +++ > 3 files changed, 60 insertions(+), 22 deletions(-) > > diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c > index a2c3a89..c429c56 100644 > --- a/drivers/net/macvtap.c > +++ b/drivers/net/macvtap.c > @@ -292,9 +292,6 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) > if (!q) > return RX_HANDLER_PASS; > > - if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len) > - goto drop; > - > skb_push(skb, ETH_HLEN); > > /* Apply the forward feature mask so that we perform segmentation > @@ -310,8 +307,10 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) > goto drop; > > if (!segs) { > - skb_queue_tail(&q->sk.sk_receive_queue, skb); > - goto wake_up; > + if (sock_queue_rcv_skb(&q->sk, skb)) > + goto drop; > + else > + goto wake_up; > } > > kfree_skb(skb); > @@ -319,11 +318,17 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) > struct sk_buff *nskb = segs->next; > > segs->next = NULL; > - skb_queue_tail(&q->sk.sk_receive_queue, segs); > + if (sock_queue_rcv_skb(&q->sk, segs)) { > + skb = segs; > + skb->next = nskb; > + goto drop; > + } > + > segs = nskb; > } > } else { > - skb_queue_tail(&q->sk.sk_receive_queue, skb); > + if (sock_queue_rcv_skb(&q->sk, skb)) > + goto drop; > } > > wake_up: > @@ -333,7 +338,7 @@ wake_up: > drop: > /* Count errors/drops only here, thus don't care about args. */ > macvlan_count_rx(vlan, 0, 0, 0); > - kfree_skb(skb); > + kfree_skb_list(skb); > return RX_HANDLER_CONSUMED; > } > > @@ -414,7 +419,7 @@ static void macvtap_dellink(struct net_device *dev, > static void macvtap_setup(struct net_device *dev) > { > macvlan_common_setup(dev); > - dev->tx_queue_len = TUN_READQ_SIZE; > + dev->tx_queue_len = 0; > } > > static struct rtnl_link_ops macvtap_link_ops __read_mostly = { > @@ -469,6 +474,7 @@ static int macvtap_open(struct inode *inode, struct file *file) > sock_init_data(&q->sock, &q->sk); > q->sk.sk_write_space = macvtap_sock_write_space; > q->sk.sk_destruct = macvtap_sock_destruct; > + q->sk.sk_rcvbuf = TUN_RCVBUF; > q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; > q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); > > @@ -1040,6 +1046,13 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, > q->sk.sk_sndbuf = u; > return 0; > > + case TUNSETRCVBUF: > + if (get_user(u, up)) > + return -EFAULT; > + > + q->sk.sk_rcvbuf = u; > + return 0; > + > case TUNGETVNETHDRSZ: > s = q->vnet_hdr_sz; > if (put_user(s, sp)) > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 09f6662..7a08fa3 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -177,6 +177,7 @@ struct tun_struct { > > int vnet_hdr_sz; > int sndbuf; > + int rcvbuf; > struct tap_filter txflt; > struct sock_fprog fprog; > /* protected by rtnl lock */ > @@ -771,17 +772,6 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) > if (!check_filter(&tun->txflt, skb)) > goto drop; > > - if (tfile->socket.sk->sk_filter && > - sk_filter(tfile->socket.sk, skb)) > - goto drop; > - > - /* Limit the number of packets queued by dividing txq length with the > - * number of queues. > - */ > - if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) > - >= dev->tx_queue_len / tun->numqueues) > - goto drop; > - > if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) > goto drop; > > @@ -798,7 +788,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) > nf_reset(skb); > > /* Enqueue packet */ > - skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); > + if (sock_queue_rcv_skb(tfile->socket.sk, skb)) > + goto drop; > > /* Notify and wake up reader process */ > if (tfile->flags & TUN_FASYNC) > @@ -1668,6 +1659,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > > tun->filter_attached = false; > tun->sndbuf = tfile->socket.sk->sk_sndbuf; > + tun->rcvbuf = tfile->socket.sk->sk_rcvbuf; > > spin_lock_init(&tun->lock); > > @@ -1837,6 +1829,17 @@ static void tun_set_sndbuf(struct tun_struct *tun) > } > } > > +static void tun_set_rcvbuf(struct tun_struct *tun) > +{ > + struct tun_file *tfile; > + int i; > + > + for (i = 0; i < tun->numqueues; i++) { > + tfile = rtnl_dereference(tun->tfiles[i]); > + tfile->socket.sk->sk_sndbuf = tun->sndbuf; > + } > +} > + > static int tun_set_queue(struct file *file, struct ifreq *ifr) > { > struct tun_file *tfile = file->private_data; > @@ -1878,7 +1881,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > struct ifreq ifr; > kuid_t owner; > kgid_t group; > - int sndbuf; > + int sndbuf, rcvbuf; > int vnet_hdr_sz; > unsigned int ifindex; > int ret; > @@ -2061,6 +2064,22 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > tun_set_sndbuf(tun); > break; > > + case TUNGETRCVBUF: > + rcvbuf = tfile->socket.sk->sk_rcvbuf; > + if (copy_to_user(argp, &rcvbuf, sizeof(rcvbuf))) > + ret = -EFAULT; > + break; > + > + case TUNSETRCVBUF: > + if (copy_from_user(&rcvbuf, argp, sizeof(rcvbuf))) { > + ret = -EFAULT; > + break; > + } > + > + tun->rcvbuf = rcvbuf; > + tun_set_rcvbuf(tun); > + break; > + > case TUNGETVNETHDRSZ: > vnet_hdr_sz = tun->vnet_hdr_sz; > if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) > @@ -2139,6 +2158,8 @@ static long tun_chr_compat_ioctl(struct file *file, > case TUNSETTXFILTER: > case TUNGETSNDBUF: > case TUNSETSNDBUF: > + case TUNGETRCVBUF: > + case TUNSETRCVBUF: > case SIOCGIFHWADDR: > case SIOCSIFHWADDR: > arg = (unsigned long)compat_ptr(arg); > @@ -2204,6 +2225,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) > > tfile->sk.sk_write_space = tun_sock_write_space; > tfile->sk.sk_sndbuf = INT_MAX; > + tfile->sk.sk_rcvbuf = TUN_RCVBUF; > > file->private_data = tfile; > set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags); > diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > index e9502dd..8e04657 100644 > --- a/include/uapi/linux/if_tun.h > +++ b/include/uapi/linux/if_tun.h > @@ -22,6 +22,7 @@ > > /* Read queue size */ > #define TUN_READQ_SIZE 500 > +#define TUN_RCVBUF (512 * PAGE_SIZE) > > /* TUN device flags */ > #define TUN_TUN_DEV 0x0001 That's about 16x less than we were able to queue previously by default. How can you be sure this won't break any applications? > @@ -58,6 +59,8 @@ > #define TUNSETQUEUE _IOW('T', 217, int) > #define TUNSETIFINDEX _IOW('T', 218, unsigned int) > #define TUNGETFILTER _IOR('T', 219, struct sock_fprog) > +#define TUNGETRCVBUF _IOR('T', 220, int) > +#define TUNSETRCVBUF _IOW('T', 221, int) > > /* TUNSETIFF ifr flags */ > #define IFF_TUN 0x0001 > -- > 1.8.3.2 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: > On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: >> We used to limit the number of packets queued through tx_queue_length. This >> has several issues: >> >> - tx_queue_length is the control of qdisc queue length, simply reusing it >> to control the packets queued by device may cause confusion. >> - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add >> support of packet capture on macvtap device."), an unexpected qdisc >> caused by non-zero tx_queue_length will lead qdisc lock contention for >> multiqueue deivce. >> - What we really want is to limit the total amount of memory occupied not >> the number of packets. >> >> So this patch tries to solve the above issues by using socket rcvbuf to >> limit the packets could be queued for tun/macvtap. This was done by using >> sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two >> new ioctl() were introduced for userspace to change the rcvbuf like what we >> have done for sndbuf. >> >> With this fix, we can safely change the tx_queue_len of macvtap to >> zero. This will make multiqueue works without extra lock contention. >> >> Cc: Vlad Yasevich <vyasevic@redhat.com> >> Cc: Michael S. Tsirkin <mst@redhat.com> >> Cc: John Fastabend <john.r.fastabend@intel.com> >> Cc: Stephen Hemminger <stephen@networkplumber.org> >> Cc: Herbert Xu <herbert@gondor.apana.org.au> >> Signed-off-by: Jason Wang <jasowang@redhat.com> > No, I don't think we can change userspace-visible behaviour like that. > > This will break any existing user that tries to control > queue length through sysfs,netlink or device ioctl. But it looks like a buggy API, since tx_queue_len should be for qdisc queue length instead of device itself. If we really want to preserve the behaviour, how about using a new feature flag and change the behaviour only when the device is created (TUNSETIFF) with the new flag? > > Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com > which gives one way to set tx_queue_len to zero without > breaking userspace. If I read the patch correctly, it will make no way for the user who really want to change the qdisc queue length for tun. > > >> --- >> drivers/net/macvtap.c | 31 ++++++++++++++++++++--------- >> drivers/net/tun.c | 48 +++++++++++++++++++++++++++++++++------------ >> include/uapi/linux/if_tun.h | 3 +++ >> 3 files changed, 60 insertions(+), 22 deletions(-) >> >> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c >> index a2c3a89..c429c56 100644 >> --- a/drivers/net/macvtap.c >> +++ b/drivers/net/macvtap.c >> @@ -292,9 +292,6 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) >> if (!q) >> return RX_HANDLER_PASS; >> >> - if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len) >> - goto drop; >> - >> skb_push(skb, ETH_HLEN); >> >> /* Apply the forward feature mask so that we perform segmentation >> @@ -310,8 +307,10 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) >> goto drop; >> >> if (!segs) { >> - skb_queue_tail(&q->sk.sk_receive_queue, skb); >> - goto wake_up; >> + if (sock_queue_rcv_skb(&q->sk, skb)) >> + goto drop; >> + else >> + goto wake_up; >> } >> >> kfree_skb(skb); >> @@ -319,11 +318,17 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) >> struct sk_buff *nskb = segs->next; >> >> segs->next = NULL; >> - skb_queue_tail(&q->sk.sk_receive_queue, segs); >> + if (sock_queue_rcv_skb(&q->sk, segs)) { >> + skb = segs; >> + skb->next = nskb; >> + goto drop; >> + } >> + >> segs = nskb; >> } >> } else { >> - skb_queue_tail(&q->sk.sk_receive_queue, skb); >> + if (sock_queue_rcv_skb(&q->sk, skb)) >> + goto drop; >> } >> >> wake_up: >> @@ -333,7 +338,7 @@ wake_up: >> drop: >> /* Count errors/drops only here, thus don't care about args. */ >> macvlan_count_rx(vlan, 0, 0, 0); >> - kfree_skb(skb); >> + kfree_skb_list(skb); >> return RX_HANDLER_CONSUMED; >> } >> >> @@ -414,7 +419,7 @@ static void macvtap_dellink(struct net_device *dev, >> static void macvtap_setup(struct net_device *dev) >> { >> macvlan_common_setup(dev); >> - dev->tx_queue_len = TUN_READQ_SIZE; >> + dev->tx_queue_len = 0; >> } >> >> static struct rtnl_link_ops macvtap_link_ops __read_mostly = { >> @@ -469,6 +474,7 @@ static int macvtap_open(struct inode *inode, struct file *file) >> sock_init_data(&q->sock, &q->sk); >> q->sk.sk_write_space = macvtap_sock_write_space; >> q->sk.sk_destruct = macvtap_sock_destruct; >> + q->sk.sk_rcvbuf = TUN_RCVBUF; >> q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; >> q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); >> >> @@ -1040,6 +1046,13 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, >> q->sk.sk_sndbuf = u; >> return 0; >> >> + case TUNSETRCVBUF: >> + if (get_user(u, up)) >> + return -EFAULT; >> + >> + q->sk.sk_rcvbuf = u; >> + return 0; >> + >> case TUNGETVNETHDRSZ: >> s = q->vnet_hdr_sz; >> if (put_user(s, sp)) >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index 09f6662..7a08fa3 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -177,6 +177,7 @@ struct tun_struct { >> >> int vnet_hdr_sz; >> int sndbuf; >> + int rcvbuf; >> struct tap_filter txflt; >> struct sock_fprog fprog; >> /* protected by rtnl lock */ >> @@ -771,17 +772,6 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) >> if (!check_filter(&tun->txflt, skb)) >> goto drop; >> >> - if (tfile->socket.sk->sk_filter && >> - sk_filter(tfile->socket.sk, skb)) >> - goto drop; >> - >> - /* Limit the number of packets queued by dividing txq length with the >> - * number of queues. >> - */ >> - if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) >> - >= dev->tx_queue_len / tun->numqueues) >> - goto drop; >> - >> if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) >> goto drop; >> >> @@ -798,7 +788,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) >> nf_reset(skb); >> >> /* Enqueue packet */ >> - skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); >> + if (sock_queue_rcv_skb(tfile->socket.sk, skb)) >> + goto drop; >> >> /* Notify and wake up reader process */ >> if (tfile->flags & TUN_FASYNC) >> @@ -1668,6 +1659,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >> >> tun->filter_attached = false; >> tun->sndbuf = tfile->socket.sk->sk_sndbuf; >> + tun->rcvbuf = tfile->socket.sk->sk_rcvbuf; >> >> spin_lock_init(&tun->lock); >> >> @@ -1837,6 +1829,17 @@ static void tun_set_sndbuf(struct tun_struct *tun) >> } >> } >> >> +static void tun_set_rcvbuf(struct tun_struct *tun) >> +{ >> + struct tun_file *tfile; >> + int i; >> + >> + for (i = 0; i < tun->numqueues; i++) { >> + tfile = rtnl_dereference(tun->tfiles[i]); >> + tfile->socket.sk->sk_sndbuf = tun->sndbuf; >> + } >> +} >> + >> static int tun_set_queue(struct file *file, struct ifreq *ifr) >> { >> struct tun_file *tfile = file->private_data; >> @@ -1878,7 +1881,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> struct ifreq ifr; >> kuid_t owner; >> kgid_t group; >> - int sndbuf; >> + int sndbuf, rcvbuf; >> int vnet_hdr_sz; >> unsigned int ifindex; >> int ret; >> @@ -2061,6 +2064,22 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, >> tun_set_sndbuf(tun); >> break; >> >> + case TUNGETRCVBUF: >> + rcvbuf = tfile->socket.sk->sk_rcvbuf; >> + if (copy_to_user(argp, &rcvbuf, sizeof(rcvbuf))) >> + ret = -EFAULT; >> + break; >> + >> + case TUNSETRCVBUF: >> + if (copy_from_user(&rcvbuf, argp, sizeof(rcvbuf))) { >> + ret = -EFAULT; >> + break; >> + } >> + >> + tun->rcvbuf = rcvbuf; >> + tun_set_rcvbuf(tun); >> + break; >> + >> case TUNGETVNETHDRSZ: >> vnet_hdr_sz = tun->vnet_hdr_sz; >> if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) >> @@ -2139,6 +2158,8 @@ static long tun_chr_compat_ioctl(struct file *file, >> case TUNSETTXFILTER: >> case TUNGETSNDBUF: >> case TUNSETSNDBUF: >> + case TUNGETRCVBUF: >> + case TUNSETRCVBUF: >> case SIOCGIFHWADDR: >> case SIOCSIFHWADDR: >> arg = (unsigned long)compat_ptr(arg); >> @@ -2204,6 +2225,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) >> >> tfile->sk.sk_write_space = tun_sock_write_space; >> tfile->sk.sk_sndbuf = INT_MAX; >> + tfile->sk.sk_rcvbuf = TUN_RCVBUF; >> >> file->private_data = tfile; >> set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags); >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >> index e9502dd..8e04657 100644 >> --- a/include/uapi/linux/if_tun.h >> +++ b/include/uapi/linux/if_tun.h >> @@ -22,6 +22,7 @@ >> >> /* Read queue size */ >> #define TUN_READQ_SIZE 500 >> +#define TUN_RCVBUF (512 * PAGE_SIZE) >> >> /* TUN device flags */ >> #define TUN_TUN_DEV 0x0001 > That's about 16x less than we were able to queue previously > by default. > How can you be sure this won't break any applications? > Ok, we can change it back to 500 * 65535, but I'm not sure whether this or not this value (about 32M) is too big for a single socket. >> @@ -58,6 +59,8 @@ >> #define TUNSETQUEUE _IOW('T', 217, int) >> #define TUNSETIFINDEX _IOW('T', 218, unsigned int) >> #define TUNGETFILTER _IOR('T', 219, struct sock_fprog) >> +#define TUNGETRCVBUF _IOR('T', 220, int) >> +#define TUNSETRCVBUF _IOW('T', 221, int) >> >> /* TUNSETIFF ifr flags */ >> #define IFF_TUN 0x0001 >> -- >> 1.8.3.2 > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 14, 2014 at 04:45:24PM +0800, Jason Wang wrote: > On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: > > On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: > >> We used to limit the number of packets queued through tx_queue_length. This > >> has several issues: > >> > >> - tx_queue_length is the control of qdisc queue length, simply reusing it > >> to control the packets queued by device may cause confusion. > >> - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add > >> support of packet capture on macvtap device."), an unexpected qdisc > >> caused by non-zero tx_queue_length will lead qdisc lock contention for > >> multiqueue deivce. > >> - What we really want is to limit the total amount of memory occupied not > >> the number of packets. > >> > >> So this patch tries to solve the above issues by using socket rcvbuf to > >> limit the packets could be queued for tun/macvtap. This was done by using > >> sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two > >> new ioctl() were introduced for userspace to change the rcvbuf like what we > >> have done for sndbuf. > >> > >> With this fix, we can safely change the tx_queue_len of macvtap to > >> zero. This will make multiqueue works without extra lock contention. > >> > >> Cc: Vlad Yasevich <vyasevic@redhat.com> > >> Cc: Michael S. Tsirkin <mst@redhat.com> > >> Cc: John Fastabend <john.r.fastabend@intel.com> > >> Cc: Stephen Hemminger <stephen@networkplumber.org> > >> Cc: Herbert Xu <herbert@gondor.apana.org.au> > >> Signed-off-by: Jason Wang <jasowang@redhat.com> > > No, I don't think we can change userspace-visible behaviour like that. > > > > This will break any existing user that tries to control > > queue length through sysfs,netlink or device ioctl. > > But it looks like a buggy API, since tx_queue_len should be for qdisc > queue length instead of device itself. Probably, but it's been like this since 2.6.x time. Also, qdisc queue is unused for tun so it seemed kind of reasonable to override tx_queue_len. > If we really want to preserve the > behaviour, how about using a new feature flag and change the behaviour > only when the device is created (TUNSETIFF) with the new flag? OK this addresses the issue partially, but there's also an issue of permissions: tx_queue_len can only be changed if capable(CAP_NET_ADMIN). OTOH in your patch a regular user can change the amount of memory consumed per queue by calling TUNSETRCVBUF. > > > > Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com > > which gives one way to set tx_queue_len to zero without > > breaking userspace. > > If I read the patch correctly, it will make no way for the user who > really want to change the qdisc queue length for tun. Why would this matter? As far as I can see qdisc queue is currently unused. > > > > > >> --- > >> drivers/net/macvtap.c | 31 ++++++++++++++++++++--------- > >> drivers/net/tun.c | 48 +++++++++++++++++++++++++++++++++------------ > >> include/uapi/linux/if_tun.h | 3 +++ > >> 3 files changed, 60 insertions(+), 22 deletions(-) > >> > >> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c > >> index a2c3a89..c429c56 100644 > >> --- a/drivers/net/macvtap.c > >> +++ b/drivers/net/macvtap.c > >> @@ -292,9 +292,6 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) > >> if (!q) > >> return RX_HANDLER_PASS; > >> > >> - if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len) > >> - goto drop; > >> - > >> skb_push(skb, ETH_HLEN); > >> > >> /* Apply the forward feature mask so that we perform segmentation > >> @@ -310,8 +307,10 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) > >> goto drop; > >> > >> if (!segs) { > >> - skb_queue_tail(&q->sk.sk_receive_queue, skb); > >> - goto wake_up; > >> + if (sock_queue_rcv_skb(&q->sk, skb)) > >> + goto drop; > >> + else > >> + goto wake_up; > >> } > >> > >> kfree_skb(skb); > >> @@ -319,11 +318,17 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) > >> struct sk_buff *nskb = segs->next; > >> > >> segs->next = NULL; > >> - skb_queue_tail(&q->sk.sk_receive_queue, segs); > >> + if (sock_queue_rcv_skb(&q->sk, segs)) { > >> + skb = segs; > >> + skb->next = nskb; > >> + goto drop; > >> + } > >> + > >> segs = nskb; > >> } > >> } else { > >> - skb_queue_tail(&q->sk.sk_receive_queue, skb); > >> + if (sock_queue_rcv_skb(&q->sk, skb)) > >> + goto drop; > >> } > >> > >> wake_up: > >> @@ -333,7 +338,7 @@ wake_up: > >> drop: > >> /* Count errors/drops only here, thus don't care about args. */ > >> macvlan_count_rx(vlan, 0, 0, 0); > >> - kfree_skb(skb); > >> + kfree_skb_list(skb); > >> return RX_HANDLER_CONSUMED; > >> } > >> > >> @@ -414,7 +419,7 @@ static void macvtap_dellink(struct net_device *dev, > >> static void macvtap_setup(struct net_device *dev) > >> { > >> macvlan_common_setup(dev); > >> - dev->tx_queue_len = TUN_READQ_SIZE; > >> + dev->tx_queue_len = 0; > >> } > >> > >> static struct rtnl_link_ops macvtap_link_ops __read_mostly = { > >> @@ -469,6 +474,7 @@ static int macvtap_open(struct inode *inode, struct file *file) > >> sock_init_data(&q->sock, &q->sk); > >> q->sk.sk_write_space = macvtap_sock_write_space; > >> q->sk.sk_destruct = macvtap_sock_destruct; > >> + q->sk.sk_rcvbuf = TUN_RCVBUF; > >> q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; > >> q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); > >> > >> @@ -1040,6 +1046,13 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, > >> q->sk.sk_sndbuf = u; > >> return 0; > >> > >> + case TUNSETRCVBUF: > >> + if (get_user(u, up)) > >> + return -EFAULT; > >> + > >> + q->sk.sk_rcvbuf = u; > >> + return 0; > >> + > >> case TUNGETVNETHDRSZ: > >> s = q->vnet_hdr_sz; > >> if (put_user(s, sp)) > >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c > >> index 09f6662..7a08fa3 100644 > >> --- a/drivers/net/tun.c > >> +++ b/drivers/net/tun.c > >> @@ -177,6 +177,7 @@ struct tun_struct { > >> > >> int vnet_hdr_sz; > >> int sndbuf; > >> + int rcvbuf; > >> struct tap_filter txflt; > >> struct sock_fprog fprog; > >> /* protected by rtnl lock */ > >> @@ -771,17 +772,6 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) > >> if (!check_filter(&tun->txflt, skb)) > >> goto drop; > >> > >> - if (tfile->socket.sk->sk_filter && > >> - sk_filter(tfile->socket.sk, skb)) > >> - goto drop; > >> - > >> - /* Limit the number of packets queued by dividing txq length with the > >> - * number of queues. > >> - */ > >> - if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) > >> - >= dev->tx_queue_len / tun->numqueues) > >> - goto drop; > >> - > >> if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) > >> goto drop; > >> > >> @@ -798,7 +788,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) > >> nf_reset(skb); > >> > >> /* Enqueue packet */ > >> - skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); > >> + if (sock_queue_rcv_skb(tfile->socket.sk, skb)) > >> + goto drop; > >> > >> /* Notify and wake up reader process */ > >> if (tfile->flags & TUN_FASYNC) > >> @@ -1668,6 +1659,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > >> > >> tun->filter_attached = false; > >> tun->sndbuf = tfile->socket.sk->sk_sndbuf; > >> + tun->rcvbuf = tfile->socket.sk->sk_rcvbuf; > >> > >> spin_lock_init(&tun->lock); > >> > >> @@ -1837,6 +1829,17 @@ static void tun_set_sndbuf(struct tun_struct *tun) > >> } > >> } > >> > >> +static void tun_set_rcvbuf(struct tun_struct *tun) > >> +{ > >> + struct tun_file *tfile; > >> + int i; > >> + > >> + for (i = 0; i < tun->numqueues; i++) { > >> + tfile = rtnl_dereference(tun->tfiles[i]); > >> + tfile->socket.sk->sk_sndbuf = tun->sndbuf; > >> + } > >> +} > >> + > >> static int tun_set_queue(struct file *file, struct ifreq *ifr) > >> { > >> struct tun_file *tfile = file->private_data; > >> @@ -1878,7 +1881,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > >> struct ifreq ifr; > >> kuid_t owner; > >> kgid_t group; > >> - int sndbuf; > >> + int sndbuf, rcvbuf; > >> int vnet_hdr_sz; > >> unsigned int ifindex; > >> int ret; > >> @@ -2061,6 +2064,22 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, > >> tun_set_sndbuf(tun); > >> break; > >> > >> + case TUNGETRCVBUF: > >> + rcvbuf = tfile->socket.sk->sk_rcvbuf; > >> + if (copy_to_user(argp, &rcvbuf, sizeof(rcvbuf))) > >> + ret = -EFAULT; > >> + break; > >> + > >> + case TUNSETRCVBUF: > >> + if (copy_from_user(&rcvbuf, argp, sizeof(rcvbuf))) { > >> + ret = -EFAULT; > >> + break; > >> + } > >> + > >> + tun->rcvbuf = rcvbuf; > >> + tun_set_rcvbuf(tun); > >> + break; > >> + > >> case TUNGETVNETHDRSZ: > >> vnet_hdr_sz = tun->vnet_hdr_sz; > >> if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) > >> @@ -2139,6 +2158,8 @@ static long tun_chr_compat_ioctl(struct file *file, > >> case TUNSETTXFILTER: > >> case TUNGETSNDBUF: > >> case TUNSETSNDBUF: > >> + case TUNGETRCVBUF: > >> + case TUNSETRCVBUF: > >> case SIOCGIFHWADDR: > >> case SIOCSIFHWADDR: > >> arg = (unsigned long)compat_ptr(arg); > >> @@ -2204,6 +2225,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) > >> > >> tfile->sk.sk_write_space = tun_sock_write_space; > >> tfile->sk.sk_sndbuf = INT_MAX; > >> + tfile->sk.sk_rcvbuf = TUN_RCVBUF; > >> > >> file->private_data = tfile; > >> set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags); > >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h > >> index e9502dd..8e04657 100644 > >> --- a/include/uapi/linux/if_tun.h > >> +++ b/include/uapi/linux/if_tun.h > >> @@ -22,6 +22,7 @@ > >> > >> /* Read queue size */ > >> #define TUN_READQ_SIZE 500 > >> +#define TUN_RCVBUF (512 * PAGE_SIZE) > >> > >> /* TUN device flags */ > >> #define TUN_TUN_DEV 0x0001 > > That's about 16x less than we were able to queue previously > > by default. > > How can you be sure this won't break any applications? > > > > Ok, we can change it back to 500 * 65535, but I'm not sure whether this > or not this value (about 32M) is too big for a single socket. > >> @@ -58,6 +59,8 @@ > >> #define TUNSETQUEUE _IOW('T', 217, int) > >> #define TUNSETIFINDEX _IOW('T', 218, unsigned int) > >> #define TUNGETFILTER _IOR('T', 219, struct sock_fprog) > >> +#define TUNGETRCVBUF _IOR('T', 220, int) > >> +#define TUNSETRCVBUF _IOW('T', 221, int) > >> > >> /* TUNSETIFF ifr flags */ > >> #define IFF_TUN 0x0001 > >> -- > >> 1.8.3.2 > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 01/14/2014 05:52 PM, Michael S. Tsirkin wrote: > On Tue, Jan 14, 2014 at 04:45:24PM +0800, Jason Wang wrote: >> > On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: >>> > > On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: >>>> > >> We used to limit the number of packets queued through tx_queue_length. This >>>> > >> has several issues: >>>> > >> >>>> > >> - tx_queue_length is the control of qdisc queue length, simply reusing it >>>> > >> to control the packets queued by device may cause confusion. >>>> > >> - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add >>>> > >> support of packet capture on macvtap device."), an unexpected qdisc >>>> > >> caused by non-zero tx_queue_length will lead qdisc lock contention for >>>> > >> multiqueue deivce. >>>> > >> - What we really want is to limit the total amount of memory occupied not >>>> > >> the number of packets. >>>> > >> >>>> > >> So this patch tries to solve the above issues by using socket rcvbuf to >>>> > >> limit the packets could be queued for tun/macvtap. This was done by using >>>> > >> sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two >>>> > >> new ioctl() were introduced for userspace to change the rcvbuf like what we >>>> > >> have done for sndbuf. >>>> > >> >>>> > >> With this fix, we can safely change the tx_queue_len of macvtap to >>>> > >> zero. This will make multiqueue works without extra lock contention. >>>> > >> >>>> > >> Cc: Vlad Yasevich <vyasevic@redhat.com> >>>> > >> Cc: Michael S. Tsirkin <mst@redhat.com> >>>> > >> Cc: John Fastabend <john.r.fastabend@intel.com> >>>> > >> Cc: Stephen Hemminger <stephen@networkplumber.org> >>>> > >> Cc: Herbert Xu <herbert@gondor.apana.org.au> >>>> > >> Signed-off-by: Jason Wang <jasowang@redhat.com> >>> > > No, I don't think we can change userspace-visible behaviour like that. >>> > > >>> > > This will break any existing user that tries to control >>> > > queue length through sysfs,netlink or device ioctl. >> > >> > But it looks like a buggy API, since tx_queue_len should be for qdisc >> > queue length instead of device itself. > Probably, but it's been like this since 2.6.x time. > Also, qdisc queue is unused for tun so it seemed kind of > reasonable to override tx_queue_len. > >> > If we really want to preserve the >> > behaviour, how about using a new feature flag and change the behaviour >> > only when the device is created (TUNSETIFF) with the new flag? > OK this addresses the issue partially, but there's also an issue > of permissions: tx_queue_len can only be changed if > capable(CAP_NET_ADMIN). OTOH in your patch a regular user > can change the amount of memory consumed per queue > by calling TUNSETRCVBUF. Yes, but we have the same issue for TUNSETSNDBUF. > >>> > > >>> > > Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com >>> > > which gives one way to set tx_queue_len to zero without >>> > > breaking userspace. >> > >> > If I read the patch correctly, it will make no way for the user who >> > really want to change the qdisc queue length for tun. > Why would this matter? As far as I can see qdisc queue is currently unused. > User may use qdisc to do port mirroring, bandwidth limitation, traffic prioritization or more for a VM. So we do have users and maybe more consider the case of vpn. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 15, 2014 at 11:36:01AM +0800, Jason Wang wrote: > On 01/14/2014 05:52 PM, Michael S. Tsirkin wrote: > > On Tue, Jan 14, 2014 at 04:45:24PM +0800, Jason Wang wrote: > >> > On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: > >>> > > On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: > >>>> > >> We used to limit the number of packets queued through tx_queue_length. This > >>>> > >> has several issues: > >>>> > >> > >>>> > >> - tx_queue_length is the control of qdisc queue length, simply reusing it > >>>> > >> to control the packets queued by device may cause confusion. > >>>> > >> - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add > >>>> > >> support of packet capture on macvtap device."), an unexpected qdisc > >>>> > >> caused by non-zero tx_queue_length will lead qdisc lock contention for > >>>> > >> multiqueue deivce. > >>>> > >> - What we really want is to limit the total amount of memory occupied not > >>>> > >> the number of packets. > >>>> > >> > >>>> > >> So this patch tries to solve the above issues by using socket rcvbuf to > >>>> > >> limit the packets could be queued for tun/macvtap. This was done by using > >>>> > >> sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two > >>>> > >> new ioctl() were introduced for userspace to change the rcvbuf like what we > >>>> > >> have done for sndbuf. > >>>> > >> > >>>> > >> With this fix, we can safely change the tx_queue_len of macvtap to > >>>> > >> zero. This will make multiqueue works without extra lock contention. > >>>> > >> > >>>> > >> Cc: Vlad Yasevich <vyasevic@redhat.com> > >>>> > >> Cc: Michael S. Tsirkin <mst@redhat.com> > >>>> > >> Cc: John Fastabend <john.r.fastabend@intel.com> > >>>> > >> Cc: Stephen Hemminger <stephen@networkplumber.org> > >>>> > >> Cc: Herbert Xu <herbert@gondor.apana.org.au> > >>>> > >> Signed-off-by: Jason Wang <jasowang@redhat.com> > >>> > > No, I don't think we can change userspace-visible behaviour like that. > >>> > > > >>> > > This will break any existing user that tries to control > >>> > > queue length through sysfs,netlink or device ioctl. > >> > > >> > But it looks like a buggy API, since tx_queue_len should be for qdisc > >> > queue length instead of device itself. > > Probably, but it's been like this since 2.6.x time. > > Also, qdisc queue is unused for tun so it seemed kind of > > reasonable to override tx_queue_len. > > > >> > If we really want to preserve the > >> > behaviour, how about using a new feature flag and change the behaviour > >> > only when the device is created (TUNSETIFF) with the new flag? > > OK this addresses the issue partially, but there's also an issue > > of permissions: tx_queue_len can only be changed if > > capable(CAP_NET_ADMIN). OTOH in your patch a regular user > > can change the amount of memory consumed per queue > > by calling TUNSETRCVBUF. > > Yes, but we have the same issue for TUNSETSNDBUF. To an extent, but TUNSETSNDBUF is different. It limits how much device can queue *in the networking stack* but each queue in the stack is also limited, when we exceed that we star dropping packets. So while with infinite value (which is the default btw) you can keep host pretty busy, you will not be able to run it out of memory. The proposed TUNSETRCVBUF would keep configured amount of memory around indefinitely so you can run host out of memory. So assuming all this How about an ethtool or netlink command to configure this instead? > > > >>> > > > >>> > > Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com > >>> > > which gives one way to set tx_queue_len to zero without > >>> > > breaking userspace. > >> > > >> > If I read the patch correctly, it will make no way for the user who > >> > really want to change the qdisc queue length for tun. > > Why would this matter? As far as I can see qdisc queue is currently unused. > > > > User may use qdisc to do port mirroring, bandwidth limitation, traffic > prioritization or more for a VM. So we do have users and maybe more > consider the case of vpn. Well it's not used by default at least. I remember that we discussed this previously actually. If all we want to do actually is utilize no_qdisc by default, we can simply use Eric's patch: http://article.gmane.org/gmane.linux.kernel/1279597 and a similar patch for macvtap. I tried it at the time and it didn't seem to help performance at all, but a lot has changed since, in particular I didn't test mq. If you now have results showing how it's beneficial, pls post them.
On 01/15/2014 03:21 PM, Michael S. Tsirkin wrote: > On Wed, Jan 15, 2014 at 11:36:01AM +0800, Jason Wang wrote: >> On 01/14/2014 05:52 PM, Michael S. Tsirkin wrote: >>> On Tue, Jan 14, 2014 at 04:45:24PM +0800, Jason Wang wrote: >>>>> On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: >>>>>>> On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: >>>>>>>>> We used to limit the number of packets queued through tx_queue_length. This >>>>>>>>> has several issues: >>>>>>>>> >>>>>>>>> - tx_queue_length is the control of qdisc queue length, simply reusing it >>>>>>>>> to control the packets queued by device may cause confusion. >>>>>>>>> - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add >>>>>>>>> support of packet capture on macvtap device."), an unexpected qdisc >>>>>>>>> caused by non-zero tx_queue_length will lead qdisc lock contention for >>>>>>>>> multiqueue deivce. >>>>>>>>> - What we really want is to limit the total amount of memory occupied not >>>>>>>>> the number of packets. >>>>>>>>> >>>>>>>>> So this patch tries to solve the above issues by using socket rcvbuf to >>>>>>>>> limit the packets could be queued for tun/macvtap. This was done by using >>>>>>>>> sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two >>>>>>>>> new ioctl() were introduced for userspace to change the rcvbuf like what we >>>>>>>>> have done for sndbuf. >>>>>>>>> >>>>>>>>> With this fix, we can safely change the tx_queue_len of macvtap to >>>>>>>>> zero. This will make multiqueue works without extra lock contention. >>>>>>>>> >>>>>>>>> Cc: Vlad Yasevich<vyasevic@redhat.com> >>>>>>>>> Cc: Michael S. Tsirkin<mst@redhat.com> >>>>>>>>> Cc: John Fastabend<john.r.fastabend@intel.com> >>>>>>>>> Cc: Stephen Hemminger<stephen@networkplumber.org> >>>>>>>>> Cc: Herbert Xu<herbert@gondor.apana.org.au> >>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com> >>>>>>> No, I don't think we can change userspace-visible behaviour like that. >>>>>>> >>>>>>> This will break any existing user that tries to control >>>>>>> queue length through sysfs,netlink or device ioctl. >>>>> But it looks like a buggy API, since tx_queue_len should be for qdisc >>>>> queue length instead of device itself. >>> Probably, but it's been like this since 2.6.x time. >>> Also, qdisc queue is unused for tun so it seemed kind of >>> reasonable to override tx_queue_len. >>> >>>>> If we really want to preserve the >>>>> behaviour, how about using a new feature flag and change the behaviour >>>>> only when the device is created (TUNSETIFF) with the new flag? >>> OK this addresses the issue partially, but there's also an issue >>> of permissions: tx_queue_len can only be changed if >>> capable(CAP_NET_ADMIN). OTOH in your patch a regular user >>> can change the amount of memory consumed per queue >>> by calling TUNSETRCVBUF. >> Yes, but we have the same issue for TUNSETSNDBUF. > To an extent, but TUNSETSNDBUF is different. It limits how much device can queue > *in the networking stack* but each queue in the stack is also > limited, when we exceed that we star dropping packets. > So while with infinite value (which is the default btw) > you can keep host pretty busy, you will not be able to run > it out of memory. > > The proposed TUNSETRCVBUF would keep configured amount > of memory around indefinitely so you can run host out of memory. > > So assuming all this > How about an ethtool or netlink command to configure this > instead? > Ok, so we can add net admin check for before trying to set rcvbuf. I think it's better to use ioctl since we've already use it for sndbuf. Using ethool means you need a dedicated new ethtool method just for tuntap which seems sub-optimal. Netlink looks better, but we should also implement other ioctl also. >>>>>>> Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com >>>>>>> which gives one way to set tx_queue_len to zero without >>>>>>> breaking userspace. >>>>> If I read the patch correctly, it will make no way for the user who >>>>> really want to change the qdisc queue length for tun. >>> Why would this matter? As far as I can see qdisc queue is currently unused. >>> >> User may use qdisc to do port mirroring, bandwidth limitation, traffic >> prioritization or more for a VM. So we do have users and maybe more >> consider the case of vpn. > Well it's not used by default at least. > I remember that we discussed this previously actually. > > If all we want to do actually is utilize no_qdisc by default, > we can simply use Eric's patch: > > http://article.gmane.org/gmane.linux.kernel/1279597 > > and a similar patch for macvtap. > I tried it at the time and it didn't seem to help performance > at all, but a lot has changed since, in particular I didn't > test mq. > > If you now have results showing how it's beneficial, pls post them. > I will have a test to see the difference. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 16, 2014 at 12:29:35PM +0800, Jason Wang wrote: > On 01/15/2014 03:21 PM, Michael S. Tsirkin wrote: > >On Wed, Jan 15, 2014 at 11:36:01AM +0800, Jason Wang wrote: > >>On 01/14/2014 05:52 PM, Michael S. Tsirkin wrote: > >>>On Tue, Jan 14, 2014 at 04:45:24PM +0800, Jason Wang wrote: > >>>>>On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: > >>>>>>>On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: > >>>>>>>>>We used to limit the number of packets queued through tx_queue_length. This > >>>>>>>>>has several issues: > >>>>>>>>> > >>>>>>>>>- tx_queue_length is the control of qdisc queue length, simply reusing it > >>>>>>>>> to control the packets queued by device may cause confusion. > >>>>>>>>>- After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add > >>>>>>>>> support of packet capture on macvtap device."), an unexpected qdisc > >>>>>>>>> caused by non-zero tx_queue_length will lead qdisc lock contention for > >>>>>>>>> multiqueue deivce. > >>>>>>>>>- What we really want is to limit the total amount of memory occupied not > >>>>>>>>> the number of packets. > >>>>>>>>> > >>>>>>>>>So this patch tries to solve the above issues by using socket rcvbuf to > >>>>>>>>>limit the packets could be queued for tun/macvtap. This was done by using > >>>>>>>>>sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two > >>>>>>>>>new ioctl() were introduced for userspace to change the rcvbuf like what we > >>>>>>>>>have done for sndbuf. > >>>>>>>>> > >>>>>>>>>With this fix, we can safely change the tx_queue_len of macvtap to > >>>>>>>>>zero. This will make multiqueue works without extra lock contention. > >>>>>>>>> > >>>>>>>>>Cc: Vlad Yasevich<vyasevic@redhat.com> > >>>>>>>>>Cc: Michael S. Tsirkin<mst@redhat.com> > >>>>>>>>>Cc: John Fastabend<john.r.fastabend@intel.com> > >>>>>>>>>Cc: Stephen Hemminger<stephen@networkplumber.org> > >>>>>>>>>Cc: Herbert Xu<herbert@gondor.apana.org.au> > >>>>>>>>>Signed-off-by: Jason Wang<jasowang@redhat.com> > >>>>>>>No, I don't think we can change userspace-visible behaviour like that. > >>>>>>> > >>>>>>>This will break any existing user that tries to control > >>>>>>>queue length through sysfs,netlink or device ioctl. > >>>>>But it looks like a buggy API, since tx_queue_len should be for qdisc > >>>>>queue length instead of device itself. > >>>Probably, but it's been like this since 2.6.x time. > >>>Also, qdisc queue is unused for tun so it seemed kind of > >>>reasonable to override tx_queue_len. > >>> > >>>>>If we really want to preserve the > >>>>>behaviour, how about using a new feature flag and change the behaviour > >>>>>only when the device is created (TUNSETIFF) with the new flag? > >>>OK this addresses the issue partially, but there's also an issue > >>>of permissions: tx_queue_len can only be changed if > >>>capable(CAP_NET_ADMIN). OTOH in your patch a regular user > >>>can change the amount of memory consumed per queue > >>>by calling TUNSETRCVBUF. > >>Yes, but we have the same issue for TUNSETSNDBUF. > >To an extent, but TUNSETSNDBUF is different. It limits how much device can queue > >*in the networking stack* but each queue in the stack is also > >limited, when we exceed that we star dropping packets. > >So while with infinite value (which is the default btw) > >you can keep host pretty busy, you will not be able to run > >it out of memory. > > > >The proposed TUNSETRCVBUF would keep configured amount > >of memory around indefinitely so you can run host out of memory. > > > >So assuming all this > >How about an ethtool or netlink command to configure this > >instead? > > > > Ok, so we can add net admin check for before trying to set rcvbuf. No, in practice I think using ioctl for sndbuf was also a mistake. Applications have no idea what to set it to - you need to know what else is running on the system, after a while QEMU ended up setting it back to infinity otherwise things kept breaking. ethtool or netlink would not have this problem. Which of the two is preferable I'm not sure. I wonder what do management tools such as libvirt prefer. > I > think it's better to use ioctl since we've already use it for > sndbuf. Using ethool means you need a dedicated new ethtool method > just for tuntap which seems sub-optimal. > Netlink looks better, but > we should also implement other ioctl also. I'm not sure what this last phrase means. Can you clarify pls? > >>>>>>>Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com > >>>>>>>which gives one way to set tx_queue_len to zero without > >>>>>>>breaking userspace. > >>>>>If I read the patch correctly, it will make no way for the user who > >>>>>really want to change the qdisc queue length for tun. > >>>Why would this matter? As far as I can see qdisc queue is currently unused. > >>> > >>User may use qdisc to do port mirroring, bandwidth limitation, traffic > >>prioritization or more for a VM. So we do have users and maybe more > >>consider the case of vpn. > >Well it's not used by default at least. > >I remember that we discussed this previously actually. > > > >If all we want to do actually is utilize no_qdisc by default, > >we can simply use Eric's patch: > > > >http://article.gmane.org/gmane.linux.kernel/1279597 > > > >and a similar patch for macvtap. > >I tried it at the time and it didn't seem to help performance > >at all, but a lot has changed since, in particular I didn't > >test mq. > > > >If you now have results showing how it's beneficial, pls post them. > > > > I will have a test to see the difference. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 01/16/2014 01:47 PM, Michael S. Tsirkin wrote: > On Thu, Jan 16, 2014 at 12:29:35PM +0800, Jason Wang wrote: >> On 01/15/2014 03:21 PM, Michael S. Tsirkin wrote: >>> On Wed, Jan 15, 2014 at 11:36:01AM +0800, Jason Wang wrote: >>>> On 01/14/2014 05:52 PM, Michael S. Tsirkin wrote: >>>>> On Tue, Jan 14, 2014 at 04:45:24PM +0800, Jason Wang wrote: >>>>>>> On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: >>>>>>>>> On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: >>>>>>>>>>> We used to limit the number of packets queued through tx_queue_length. This >>>>>>>>>>> has several issues: >>>>>>>>>>> >>>>>>>>>>> - tx_queue_length is the control of qdisc queue length, simply reusing it >>>>>>>>>>> to control the packets queued by device may cause confusion. >>>>>>>>>>> - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add >>>>>>>>>>> support of packet capture on macvtap device."), an unexpected qdisc >>>>>>>>>>> caused by non-zero tx_queue_length will lead qdisc lock contention for >>>>>>>>>>> multiqueue deivce. >>>>>>>>>>> - What we really want is to limit the total amount of memory occupied not >>>>>>>>>>> the number of packets. >>>>>>>>>>> >>>>>>>>>>> So this patch tries to solve the above issues by using socket rcvbuf to >>>>>>>>>>> limit the packets could be queued for tun/macvtap. This was done by using >>>>>>>>>>> sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two >>>>>>>>>>> new ioctl() were introduced for userspace to change the rcvbuf like what we >>>>>>>>>>> have done for sndbuf. >>>>>>>>>>> >>>>>>>>>>> With this fix, we can safely change the tx_queue_len of macvtap to >>>>>>>>>>> zero. This will make multiqueue works without extra lock contention. >>>>>>>>>>> >>>>>>>>>>> Cc: Vlad Yasevich<vyasevic@redhat.com> >>>>>>>>>>> Cc: Michael S. Tsirkin<mst@redhat.com> >>>>>>>>>>> Cc: John Fastabend<john.r.fastabend@intel.com> >>>>>>>>>>> Cc: Stephen Hemminger<stephen@networkplumber.org> >>>>>>>>>>> Cc: Herbert Xu<herbert@gondor.apana.org.au> >>>>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com> >>>>>>>>> No, I don't think we can change userspace-visible behaviour like that. >>>>>>>>> >>>>>>>>> This will break any existing user that tries to control >>>>>>>>> queue length through sysfs,netlink or device ioctl. >>>>>>> But it looks like a buggy API, since tx_queue_len should be for qdisc >>>>>>> queue length instead of device itself. >>>>> Probably, but it's been like this since 2.6.x time. >>>>> Also, qdisc queue is unused for tun so it seemed kind of >>>>> reasonable to override tx_queue_len. >>>>> >>>>>>> If we really want to preserve the >>>>>>> behaviour, how about using a new feature flag and change the behaviour >>>>>>> only when the device is created (TUNSETIFF) with the new flag? >>>>> OK this addresses the issue partially, but there's also an issue >>>>> of permissions: tx_queue_len can only be changed if >>>>> capable(CAP_NET_ADMIN). OTOH in your patch a regular user >>>>> can change the amount of memory consumed per queue >>>>> by calling TUNSETRCVBUF. >>>> Yes, but we have the same issue for TUNSETSNDBUF. >>> To an extent, but TUNSETSNDBUF is different. It limits how much device can queue >>> *in the networking stack* but each queue in the stack is also >>> limited, when we exceed that we star dropping packets. >>> So while with infinite value (which is the default btw) >>> you can keep host pretty busy, you will not be able to run >>> it out of memory. >>> >>> The proposed TUNSETRCVBUF would keep configured amount >>> of memory around indefinitely so you can run host out of memory. >>> >>> So assuming all this >>> How about an ethtool or netlink command to configure this >>> instead? >>> >> Ok, so we can add net admin check for before trying to set rcvbuf. > No, in practice I think using ioctl for sndbuf was also a mistake. > Applications have no idea what to set it to - you need to know what else > is running on the system, after a while > QEMU ended up setting it back to infinity otherwise things kept > breaking. Yes, but it's not the problem of ioctl itself. > ethtool or netlink would not have this problem. > Which of the two is preferable I'm not sure. > I wonder what do management tools such as libvirt prefer. > Libvirt usually create file descriptor, so it can also use ioctl() to set sndbuf. >> I >> think it's better to use ioctl since we've already use it for >> sndbuf. Using ethool means you need a dedicated new ethtool method >> just for tuntap which seems sub-optimal. >> Netlink looks better, but >> we should also implement other ioctl also. > I'm not sure what this last phrase means. Can you clarify pls? > Sure, I mean if we choose netlink, we'd better add other function such as sndbuf, persistent flag and so on. Consider lots of existed function were implemented through ioctl(), using it also for rcvbuf looks ok and can simplify the changes of userspace. >>>>>>>>> Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com >>>>>>>>> which gives one way to set tx_queue_len to zero without >>>>>>>>> breaking userspace. >>>>>>> If I read the patch correctly, it will make no way for the user who >>>>>>> really want to change the qdisc queue length for tun. >>>>> Why would this matter? As far as I can see qdisc queue is currently unused. >>>>> >>>> User may use qdisc to do port mirroring, bandwidth limitation, traffic >>>> prioritization or more for a VM. So we do have users and maybe more >>>> consider the case of vpn. >>> Well it's not used by default at least. >>> I remember that we discussed this previously actually. >>> >>> If all we want to do actually is utilize no_qdisc by default, >>> we can simply use Eric's patch: >>> >>> http://article.gmane.org/gmane.linux.kernel/1279597 >>> >>> and a similar patch for macvtap. >>> I tried it at the time and it didn't seem to help performance >>> at all, but a lot has changed since, in particular I didn't >>> test mq. >>> >>> If you now have results showing how it's beneficial, pls post them. >>> >> I will have a test to see the difference. > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 16, 2014 at 02:03:38PM +0800, Jason Wang wrote: > On 01/16/2014 01:47 PM, Michael S. Tsirkin wrote: > >On Thu, Jan 16, 2014 at 12:29:35PM +0800, Jason Wang wrote: > >>On 01/15/2014 03:21 PM, Michael S. Tsirkin wrote: > >>>On Wed, Jan 15, 2014 at 11:36:01AM +0800, Jason Wang wrote: > >>>>On 01/14/2014 05:52 PM, Michael S. Tsirkin wrote: > >>>>>On Tue, Jan 14, 2014 at 04:45:24PM +0800, Jason Wang wrote: > >>>>>>>On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: > >>>>>>>>>On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: > >>>>>>>>>>>We used to limit the number of packets queued through tx_queue_length. This > >>>>>>>>>>>has several issues: > >>>>>>>>>>> > >>>>>>>>>>>- tx_queue_length is the control of qdisc queue length, simply reusing it > >>>>>>>>>>> to control the packets queued by device may cause confusion. > >>>>>>>>>>>- After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add > >>>>>>>>>>> support of packet capture on macvtap device."), an unexpected qdisc > >>>>>>>>>>> caused by non-zero tx_queue_length will lead qdisc lock contention for > >>>>>>>>>>> multiqueue deivce. > >>>>>>>>>>>- What we really want is to limit the total amount of memory occupied not > >>>>>>>>>>> the number of packets. > >>>>>>>>>>> > >>>>>>>>>>>So this patch tries to solve the above issues by using socket rcvbuf to > >>>>>>>>>>>limit the packets could be queued for tun/macvtap. This was done by using > >>>>>>>>>>>sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two > >>>>>>>>>>>new ioctl() were introduced for userspace to change the rcvbuf like what we > >>>>>>>>>>>have done for sndbuf. > >>>>>>>>>>> > >>>>>>>>>>>With this fix, we can safely change the tx_queue_len of macvtap to > >>>>>>>>>>>zero. This will make multiqueue works without extra lock contention. > >>>>>>>>>>> > >>>>>>>>>>>Cc: Vlad Yasevich<vyasevic@redhat.com> > >>>>>>>>>>>Cc: Michael S. Tsirkin<mst@redhat.com> > >>>>>>>>>>>Cc: John Fastabend<john.r.fastabend@intel.com> > >>>>>>>>>>>Cc: Stephen Hemminger<stephen@networkplumber.org> > >>>>>>>>>>>Cc: Herbert Xu<herbert@gondor.apana.org.au> > >>>>>>>>>>>Signed-off-by: Jason Wang<jasowang@redhat.com> > >>>>>>>>>No, I don't think we can change userspace-visible behaviour like that. > >>>>>>>>> > >>>>>>>>>This will break any existing user that tries to control > >>>>>>>>>queue length through sysfs,netlink or device ioctl. > >>>>>>>But it looks like a buggy API, since tx_queue_len should be for qdisc > >>>>>>>queue length instead of device itself. > >>>>>Probably, but it's been like this since 2.6.x time. > >>>>>Also, qdisc queue is unused for tun so it seemed kind of > >>>>>reasonable to override tx_queue_len. > >>>>> > >>>>>>>If we really want to preserve the > >>>>>>>behaviour, how about using a new feature flag and change the behaviour > >>>>>>>only when the device is created (TUNSETIFF) with the new flag? > >>>>>OK this addresses the issue partially, but there's also an issue > >>>>>of permissions: tx_queue_len can only be changed if > >>>>>capable(CAP_NET_ADMIN). OTOH in your patch a regular user > >>>>>can change the amount of memory consumed per queue > >>>>>by calling TUNSETRCVBUF. > >>>>Yes, but we have the same issue for TUNSETSNDBUF. > >>>To an extent, but TUNSETSNDBUF is different. It limits how much device can queue > >>>*in the networking stack* but each queue in the stack is also > >>>limited, when we exceed that we star dropping packets. > >>>So while with infinite value (which is the default btw) > >>>you can keep host pretty busy, you will not be able to run > >>>it out of memory. > >>> > >>>The proposed TUNSETRCVBUF would keep configured amount > >>>of memory around indefinitely so you can run host out of memory. > >>> > >>>So assuming all this > >>>How about an ethtool or netlink command to configure this > >>>instead? > >>> > >>Ok, so we can add net admin check for before trying to set rcvbuf. > >No, in practice I think using ioctl for sndbuf was also a mistake. > >Applications have no idea what to set it to - you need to know what else > >is running on the system, after a while > >QEMU ended up setting it back to infinity otherwise things kept > >breaking. > > Yes, but it's not the problem of ioctl itself. > >ethtool or netlink would not have this problem. > >Which of the two is preferable I'm not sure. > >I wonder what do management tools such as libvirt prefer. > > > > Libvirt usually create file descriptor, so it can also use ioctl() > to set sndbuf. > >>I > >>think it's better to use ioctl since we've already use it for > >>sndbuf. Using ethool means you need a dedicated new ethtool method > >>just for tuntap which seems sub-optimal. > >>Netlink looks better, but > >>we should also implement other ioctl also. > >I'm not sure what this last phrase means. Can you clarify pls? > > > > Sure, I mean if we choose netlink, we'd better add other function > such as sndbuf, persistent flag and so on. Consider lots of existed > function were implemented through ioctl(), using it also for rcvbuf > looks ok and can simplify the changes of userspace. I'm not sure how long will it take for userspace to take advantage of this, or what guidance we'll give userspace. The advantage of ethtool is that users can tweak settings without updating userspace management tools at all. Basically this new interface comes to replace the use of tx_queue_len which can be set from sysfs and ethtool so it should not be harder to use. > >>>>>>>>>Take a look at my patch in msg ID 20140109071721.GD19559@redhat.com > >>>>>>>>>which gives one way to set tx_queue_len to zero without > >>>>>>>>>breaking userspace. > >>>>>>>If I read the patch correctly, it will make no way for the user who > >>>>>>>really want to change the qdisc queue length for tun. > >>>>>Why would this matter? As far as I can see qdisc queue is currently unused. > >>>>> > >>>>User may use qdisc to do port mirroring, bandwidth limitation, traffic > >>>>prioritization or more for a VM. So we do have users and maybe more > >>>>consider the case of vpn. > >>>Well it's not used by default at least. > >>>I remember that we discussed this previously actually. > >>> > >>>If all we want to do actually is utilize no_qdisc by default, > >>>we can simply use Eric's patch: > >>> > >>>http://article.gmane.org/gmane.linux.kernel/1279597 > >>> > >>>and a similar patch for macvtap. > >>>I tried it at the time and it didn't seem to help performance > >>>at all, but a lot has changed since, in particular I didn't > >>>test mq. > >>> > >>>If you now have results showing how it's beneficial, pls post them. > >>> > >>I will have a test to see the difference. > >-- > >To unsubscribe from this list: send the line "unsubscribe netdev" in > >the body of a message to majordomo@vger.kernel.org > >More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a2c3a89..c429c56 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -292,9 +292,6 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) if (!q) return RX_HANDLER_PASS; - if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len) - goto drop; - skb_push(skb, ETH_HLEN); /* Apply the forward feature mask so that we perform segmentation @@ -310,8 +307,10 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) goto drop; if (!segs) { - skb_queue_tail(&q->sk.sk_receive_queue, skb); - goto wake_up; + if (sock_queue_rcv_skb(&q->sk, skb)) + goto drop; + else + goto wake_up; } kfree_skb(skb); @@ -319,11 +318,17 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) struct sk_buff *nskb = segs->next; segs->next = NULL; - skb_queue_tail(&q->sk.sk_receive_queue, segs); + if (sock_queue_rcv_skb(&q->sk, segs)) { + skb = segs; + skb->next = nskb; + goto drop; + } + segs = nskb; } } else { - skb_queue_tail(&q->sk.sk_receive_queue, skb); + if (sock_queue_rcv_skb(&q->sk, skb)) + goto drop; } wake_up: @@ -333,7 +338,7 @@ wake_up: drop: /* Count errors/drops only here, thus don't care about args. */ macvlan_count_rx(vlan, 0, 0, 0); - kfree_skb(skb); + kfree_skb_list(skb); return RX_HANDLER_CONSUMED; } @@ -414,7 +419,7 @@ static void macvtap_dellink(struct net_device *dev, static void macvtap_setup(struct net_device *dev) { macvlan_common_setup(dev); - dev->tx_queue_len = TUN_READQ_SIZE; + dev->tx_queue_len = 0; } static struct rtnl_link_ops macvtap_link_ops __read_mostly = { @@ -469,6 +474,7 @@ static int macvtap_open(struct inode *inode, struct file *file) sock_init_data(&q->sock, &q->sk); q->sk.sk_write_space = macvtap_sock_write_space; q->sk.sk_destruct = macvtap_sock_destruct; + q->sk.sk_rcvbuf = TUN_RCVBUF; q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); @@ -1040,6 +1046,13 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, q->sk.sk_sndbuf = u; return 0; + case TUNSETRCVBUF: + if (get_user(u, up)) + return -EFAULT; + + q->sk.sk_rcvbuf = u; + return 0; + case TUNGETVNETHDRSZ: s = q->vnet_hdr_sz; if (put_user(s, sp)) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 09f6662..7a08fa3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -177,6 +177,7 @@ struct tun_struct { int vnet_hdr_sz; int sndbuf; + int rcvbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ @@ -771,17 +772,6 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (!check_filter(&tun->txflt, skb)) goto drop; - if (tfile->socket.sk->sk_filter && - sk_filter(tfile->socket.sk, skb)) - goto drop; - - /* Limit the number of packets queued by dividing txq length with the - * number of queues. - */ - if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) - >= dev->tx_queue_len / tun->numqueues) - goto drop; - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; @@ -798,7 +788,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset(skb); /* Enqueue packet */ - skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); + if (sock_queue_rcv_skb(tfile->socket.sk, skb)) + goto drop; /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) @@ -1668,6 +1659,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; + tun->rcvbuf = tfile->socket.sk->sk_rcvbuf; spin_lock_init(&tun->lock); @@ -1837,6 +1829,17 @@ static void tun_set_sndbuf(struct tun_struct *tun) } } +static void tun_set_rcvbuf(struct tun_struct *tun) +{ + struct tun_file *tfile; + int i; + + for (i = 0; i < tun->numqueues; i++) { + tfile = rtnl_dereference(tun->tfiles[i]); + tfile->socket.sk->sk_sndbuf = tun->sndbuf; + } +} + static int tun_set_queue(struct file *file, struct ifreq *ifr) { struct tun_file *tfile = file->private_data; @@ -1878,7 +1881,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, struct ifreq ifr; kuid_t owner; kgid_t group; - int sndbuf; + int sndbuf, rcvbuf; int vnet_hdr_sz; unsigned int ifindex; int ret; @@ -2061,6 +2064,22 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_set_sndbuf(tun); break; + case TUNGETRCVBUF: + rcvbuf = tfile->socket.sk->sk_rcvbuf; + if (copy_to_user(argp, &rcvbuf, sizeof(rcvbuf))) + ret = -EFAULT; + break; + + case TUNSETRCVBUF: + if (copy_from_user(&rcvbuf, argp, sizeof(rcvbuf))) { + ret = -EFAULT; + break; + } + + tun->rcvbuf = rcvbuf; + tun_set_rcvbuf(tun); + break; + case TUNGETVNETHDRSZ: vnet_hdr_sz = tun->vnet_hdr_sz; if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) @@ -2139,6 +2158,8 @@ static long tun_chr_compat_ioctl(struct file *file, case TUNSETTXFILTER: case TUNGETSNDBUF: case TUNSETSNDBUF: + case TUNGETRCVBUF: + case TUNSETRCVBUF: case SIOCGIFHWADDR: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); @@ -2204,6 +2225,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; + tfile->sk.sk_rcvbuf = TUN_RCVBUF; file->private_data = tfile; set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags); diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index e9502dd..8e04657 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -22,6 +22,7 @@ /* Read queue size */ #define TUN_READQ_SIZE 500 +#define TUN_RCVBUF (512 * PAGE_SIZE) /* TUN device flags */ #define TUN_TUN_DEV 0x0001 @@ -58,6 +59,8 @@ #define TUNSETQUEUE _IOW('T', 217, int) #define TUNSETIFINDEX _IOW('T', 218, unsigned int) #define TUNGETFILTER _IOR('T', 219, struct sock_fprog) +#define TUNGETRCVBUF _IOR('T', 220, int) +#define TUNSETRCVBUF _IOW('T', 221, int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001
We used to limit the number of packets queued through tx_queue_length. This has several issues: - tx_queue_length is the control of qdisc queue length, simply reusing it to control the packets queued by device may cause confusion. - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add support of packet capture on macvtap device."), an unexpected qdisc caused by non-zero tx_queue_length will lead qdisc lock contention for multiqueue deivce. - What we really want is to limit the total amount of memory occupied not the number of packets. So this patch tries to solve the above issues by using socket rcvbuf to limit the packets could be queued for tun/macvtap. This was done by using sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two new ioctl() were introduced for userspace to change the rcvbuf like what we have done for sndbuf. With this fix, we can safely change the tx_queue_len of macvtap to zero. This will make multiqueue works without extra lock contention. Cc: Vlad Yasevich <vyasevic@redhat.com> Cc: Michael S. Tsirkin <mst@redhat.com> Cc: John Fastabend <john.r.fastabend@intel.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Jason Wang <jasowang@redhat.com> --- drivers/net/macvtap.c | 31 ++++++++++++++++++++--------- drivers/net/tun.c | 48 +++++++++++++++++++++++++++++++++------------ include/uapi/linux/if_tun.h | 3 +++ 3 files changed, 60 insertions(+), 22 deletions(-)