Message ID | 20190118130305.11504-4-bjorn.topel@gmail.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | AF_XDP: add socket monitoring support | expand |
On 01/18/2019 02:03 PM, bjorn.topel@gmail.com wrote: > From: Björn Töpel <bjorn.topel@intel.com> > > This patch adds the sock_diag interface for querying sockets from user > space. Tools like iproute2 ss(8) can use this interface to list open > AF_XDP sockets. > > The user-space ABI is defined in linux/xdp_diag.h and includes netlink > request and response structs. The request can query sockets and the > response contains socket information about the rings, umems, inode and > more. > > Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Series looks good, few minor nits inline: > --- > include/uapi/linux/xdp_diag.h | 72 +++++++++++++ > net/xdp/Kconfig | 8 ++ > net/xdp/Makefile | 1 + > net/xdp/xsk.c | 6 +- > net/xdp/xsk.h | 12 +++ > net/xdp/xsk_diag.c | 192 ++++++++++++++++++++++++++++++++++ > 6 files changed, 286 insertions(+), 5 deletions(-) > create mode 100644 include/uapi/linux/xdp_diag.h > create mode 100644 net/xdp/xsk.h > create mode 100644 net/xdp/xsk_diag.c > > diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h > new file mode 100644 > index 000000000000..efe8ce281dce > --- /dev/null > +++ b/include/uapi/linux/xdp_diag.h > @@ -0,0 +1,72 @@ > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > +/* > + * xdp_diag: interface for query/monitor XDP sockets > + * Copyright(c) 2019 Intel Corporation. > + */ > + > +#ifndef _LINUX_XDP_DIAG_H > +#define _LINUX_XDP_DIAG_H > + > +#include <linux/types.h> > + > +struct xdp_diag_req { > + __u8 sdiag_family; > + __u8 sdiag_protocol; > + __u16 pad; Presumably this one is for future use? Maybe better as '__u16 :16;' to avoid compile errors if someone tries to zero 'pad' member manually? > + __u32 xdiag_ino; > + __u32 xdiag_show; > + __u32 xdiag_cookie[2]; > +}; > + > +struct xdp_diag_msg { > + __u8 xdiag_family; > + __u8 xdiag_type; > + __u32 xdiag_ino; > + __u32 xdiag_cookie[2]; > +}; > + > +#define XDP_SHOW_INFO (1 << 0) /* Basic information */ > +#define XDP_SHOW_RING_CFG (1 << 1) > +#define XDP_SHOW_UMEM (1 << 2) > +#define XDP_SHOW_MEMINFO (1 << 3) > + > +enum { > + XDP_DIAG_NONE, > + XDP_DIAG_INFO, > + XDP_DIAG_UID, > + XDP_DIAG_RX_RING, > + XDP_DIAG_TX_RING, > + XDP_DIAG_UMEM, > + XDP_DIAG_UMEM_FILL_RING, > + XDP_DIAG_UMEM_COMPLETION_RING, > + XDP_DIAG_MEMINFO, > + __XDP_DIAG_MAX, > +}; > + > +#define XDP_DIAG_MAX (__XDP_DIAG_MAX - 1) > + > +struct xdp_diag_info { > + __u32 ifindex; > + __u32 queue_id; > +}; > + > +struct xdp_diag_ring { > + __u32 entries; /*num descs */ > +}; > + > +#define XDP_DU_F_ZEROCOPY (1 << 0) > + > +struct xdp_diag_umem { > + __u64 size; > + __u32 id; > + __u32 num_pages; > + __u32 chunk_size; > + __u32 headroom; > + __u32 ifindex; > + __u32 queue_id; > + __u32 flags; > + __u32 refs; > +}; > + > + Nit: one newline too much > +#endif /* _LINUX_XDP_DIAG_H */ > diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig > index 90e4a7152854..0255b33cff4b 100644 > --- a/net/xdp/Kconfig > +++ b/net/xdp/Kconfig > @@ -5,3 +5,11 @@ config XDP_SOCKETS > help > XDP sockets allows a channel between XDP programs and > userspace applications. > + > +config XDP_SOCKETS_DIAG > + tristate "XDP sockets: monitoring interface" > + depends on XDP_SOCKETS > + default n > + help > + Support for PF_XDP sockets monitoring interface used by the ss tool. > + If unsure, say Y. > diff --git a/net/xdp/Makefile b/net/xdp/Makefile > index 04f073146256..59dbfdf93dca 100644 > --- a/net/xdp/Makefile > +++ b/net/xdp/Makefile > @@ -1 +1,2 @@ > obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o > +obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c > index 80ca48cefc42..949d3bbccb2f 100644 > --- a/net/xdp/xsk.c > +++ b/net/xdp/xsk.c > @@ -27,14 +27,10 @@ > > #include "xsk_queue.h" > #include "xdp_umem.h" > +#include "xsk.h" > > #define TX_BATCH_SIZE 16 > > -static struct xdp_sock *xdp_sk(struct sock *sk) > -{ > - return (struct xdp_sock *)sk; > -} > - > bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) > { > return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && > diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h > new file mode 100644 > index 000000000000..ba8120610426 > --- /dev/null > +++ b/net/xdp/xsk.h > @@ -0,0 +1,12 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* Copyright(c) 2019 Intel Corporation. */ > + > +#ifndef XSK_H_ > +#define XSK_H_ > + > +static inline struct xdp_sock *xdp_sk(struct sock *sk) > +{ > + return (struct xdp_sock *)sk; > +} > + > +#endif /* XSK_H_ */ > diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c > new file mode 100644 > index 000000000000..2585d7a4ac2f > --- /dev/null > +++ b/net/xdp/xsk_diag.c > @@ -0,0 +1,192 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* XDP sockets monitoring support > + * > + * Copyright(c) 2019 Intel Corporation. > + * > + * Author: Björn Töpel <bjorn.topel@intel.com> > + */ > + > +#include <linux/module.h> > +#include <net/xdp_sock.h> > +#include <linux/xdp_diag.h> > +#include <linux/sock_diag.h> > + > +#include "xsk_queue.h" > +#include "xsk.h" > + > +static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb) > +{ > + struct xdp_diag_info di; Not a bug, but can we generally zero all the structs on stack such that once we extend them nothing gets potentially leaked. > + di.ifindex = xs->dev ? xs->dev->ifindex : 0; > + di.queue_id = xs->queue_id; > + return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di); > +} > + > +static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type, > + struct sk_buff *nlskb) > +{ > + struct xdp_diag_ring dr; Ditto. > + dr.entries = queue->nentries; > + return nla_put(nlskb, nl_type, sizeof(dr), &dr); > +} > + > +static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs, > + struct sk_buff *nlskb) > +{ > + int err = 0; > + > + if (xs->rx) > + err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb); > + if (!err && xs->tx) > + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb); > + return err; > +} > + > +static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) > +{ > + struct xdp_umem *umem = xs->umem; > + struct xdp_diag_umem du; Ditto > + int err; > + > + if (!umem) > + return 0; > + > + du.id = umem->id; > + du.size = umem->size; > + du.num_pages = umem->npgs; > + du.chunk_size = (__u32)(~umem->chunk_mask + 1); > + du.headroom = umem->headroom; > + du.ifindex = umem->dev ? umem->dev->ifindex : 0; > + du.queue_id = umem->queue_id; > + du.flags = 0; > + if (umem->zc) > + du.flags |= XDP_DU_F_ZEROCOPY; > + du.refs = refcount_read(&umem->users); > + > + err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); > + > + if (!err && umem->fq) > + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_FILL_RING, nlskb); > + if (!err && umem->cq) { > + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_COMPLETION_RING, > + nlskb); > + } > + return err; > +} > + > +static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, > + struct xdp_diag_req *req, > + struct user_namespace *user_ns, > + u32 portid, u32 seq, u32 flags, int sk_ino) > +{ > + struct xdp_sock *xs = xdp_sk(sk); > + struct xdp_diag_msg *msg; > + struct nlmsghdr *nlh; > + > + nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg), > + flags); > + if (!nlh) > + return -EMSGSIZE; > + > + msg = nlmsg_data(nlh); > + msg->xdiag_family = AF_XDP; > + msg->xdiag_type = sk->sk_type; > + msg->xdiag_ino = sk_ino; > + sock_diag_save_cookie(sk, msg->xdiag_cookie); Don't we have a hole in struct xdp_diag_msg after xdiag_type? Probably better to memset everything in the beginning as well. > + if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) > + goto out_nlmsg_trim; > + > + if ((req->xdiag_show & XDP_SHOW_INFO) && > + nla_put_u32(nlskb, XDP_DIAG_UID, > + from_kuid_munged(user_ns, sock_i_uid(sk)))) > + goto out_nlmsg_trim; > + > + if ((req->xdiag_show & XDP_SHOW_RING_CFG) && > + xsk_diag_put_rings_cfg(xs, nlskb)) > + goto out_nlmsg_trim; > + > + if ((req->xdiag_show & XDP_SHOW_UMEM) && > + xsk_diag_put_umem(xs, nlskb)) > + goto out_nlmsg_trim; > + > + if ((req->xdiag_show & XDP_SHOW_MEMINFO) && > + sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) > + goto out_nlmsg_trim; > + > + nlmsg_end(nlskb, nlh); > + return 0; > + > +out_nlmsg_trim: > + nlmsg_cancel(nlskb, nlh); > + return -EMSGSIZE; > +} > + > +static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb) > +{ > + struct xdp_diag_req *req = nlmsg_data(cb->nlh); > + struct net *net = sock_net(nlskb->sk); > + int num = 0, s_num = cb->args[0]; > + struct sock *sk; > + > + mutex_lock(&net->xdp.lock); > + > + sk_for_each(sk, &net->xdp.list) { > + if (!net_eq(sock_net(sk), net)) > + continue; > + if (num++ < s_num) > + continue; > + > + if (xsk_diag_fill(sk, nlskb, req, > + sk_user_ns(NETLINK_CB(cb->skb).sk), > + NETLINK_CB(cb->skb).portid, > + cb->nlh->nlmsg_seq, NLM_F_MULTI, > + sock_i_ino(sk)) < 0) { > + num--; > + break; > + } > + } > + > + mutex_unlock(&net->xdp.lock); > + cb->args[0] = num; > + return nlskb->len; > +} > + > +static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) > +{ > + struct netlink_dump_control c = { .dump = xsk_diag_dump }; > + int hdrlen = sizeof(struct xdp_diag_req); > + struct net *net = sock_net(nlskb->sk); > + struct xdp_diag_req *req; > + > + if (nlmsg_len(hdr) < hdrlen) > + return -EINVAL; > + > + if (!(hdr->nlmsg_flags & NLM_F_DUMP)) > + return -EOPNOTSUPP; > + > + req = nlmsg_data(hdr); > + return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c); > +} > + > +static const struct sock_diag_handler xsk_diag_handler = { > + .family = AF_XDP, > + .dump = xsk_diag_handler_dump, > +}; > + > +static int __init xsk_diag_init(void) > +{ > + return sock_diag_register(&xsk_diag_handler); > +} > + > +static void __exit xsk_diag_exit(void) > +{ > + sock_diag_unregister(&xsk_diag_handler); > +} > + > +module_init(xsk_diag_init); > +module_exit(xsk_diag_exit); > +MODULE_LICENSE("GPL"); > +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP); >
Den ons 23 jan. 2019 kl 14:19 skrev Daniel Borkmann <daniel@iogearbox.net>: > > On 01/18/2019 02:03 PM, bjorn.topel@gmail.com wrote: > > From: Björn Töpel <bjorn.topel@intel.com> > > > > This patch adds the sock_diag interface for querying sockets from user > > space. Tools like iproute2 ss(8) can use this interface to list open > > AF_XDP sockets. > > > > The user-space ABI is defined in linux/xdp_diag.h and includes netlink > > request and response structs. The request can query sockets and the > > response contains socket information about the rings, umems, inode and > > more. > > > > Signed-off-by: Björn Töpel <bjorn.topel@intel.com> > > Series looks good, few minor nits inline: > > > --- > > include/uapi/linux/xdp_diag.h | 72 +++++++++++++ > > net/xdp/Kconfig | 8 ++ > > net/xdp/Makefile | 1 + > > net/xdp/xsk.c | 6 +- > > net/xdp/xsk.h | 12 +++ > > net/xdp/xsk_diag.c | 192 ++++++++++++++++++++++++++++++++++ > > 6 files changed, 286 insertions(+), 5 deletions(-) > > create mode 100644 include/uapi/linux/xdp_diag.h > > create mode 100644 net/xdp/xsk.h > > create mode 100644 net/xdp/xsk_diag.c > > > > diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h > > new file mode 100644 > > index 000000000000..efe8ce281dce > > --- /dev/null > > +++ b/include/uapi/linux/xdp_diag.h > > @@ -0,0 +1,72 @@ > > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > > +/* > > + * xdp_diag: interface for query/monitor XDP sockets > > + * Copyright(c) 2019 Intel Corporation. > > + */ > > + > > +#ifndef _LINUX_XDP_DIAG_H > > +#define _LINUX_XDP_DIAG_H > > + > > +#include <linux/types.h> > > + > > +struct xdp_diag_req { > > + __u8 sdiag_family; > > + __u8 sdiag_protocol; > > + __u16 pad; > > Presumably this one is for future use? Maybe better as '__u16 :16;' to > avoid compile errors if someone tries to zero 'pad' member manually? > The "pad" was there simply to have an explicitly named structure hole. I'm not following the bitfield argument. Why does that avoid compiler errors? > > + __u32 xdiag_ino; > > + __u32 xdiag_show; > > + __u32 xdiag_cookie[2]; > > +}; > > + > > +struct xdp_diag_msg { > > + __u8 xdiag_family; > > + __u8 xdiag_type; > > + __u32 xdiag_ino; > > + __u32 xdiag_cookie[2]; > > +}; > > + > > +#define XDP_SHOW_INFO (1 << 0) /* Basic information */ > > +#define XDP_SHOW_RING_CFG (1 << 1) > > +#define XDP_SHOW_UMEM (1 << 2) > > +#define XDP_SHOW_MEMINFO (1 << 3) > > + > > +enum { > > + XDP_DIAG_NONE, > > + XDP_DIAG_INFO, > > + XDP_DIAG_UID, > > + XDP_DIAG_RX_RING, > > + XDP_DIAG_TX_RING, > > + XDP_DIAG_UMEM, > > + XDP_DIAG_UMEM_FILL_RING, > > + XDP_DIAG_UMEM_COMPLETION_RING, > > + XDP_DIAG_MEMINFO, > > + __XDP_DIAG_MAX, > > +}; > > + > > +#define XDP_DIAG_MAX (__XDP_DIAG_MAX - 1) > > + > > +struct xdp_diag_info { > > + __u32 ifindex; > > + __u32 queue_id; > > +}; > > + > > +struct xdp_diag_ring { > > + __u32 entries; /*num descs */ > > +}; > > + > > +#define XDP_DU_F_ZEROCOPY (1 << 0) > > + > > +struct xdp_diag_umem { > > + __u64 size; > > + __u32 id; > > + __u32 num_pages; > > + __u32 chunk_size; > > + __u32 headroom; > > + __u32 ifindex; > > + __u32 queue_id; > > + __u32 flags; > > + __u32 refs; > > +}; > > + > > + > > Nit: one newline too much > Good catch! > > +#endif /* _LINUX_XDP_DIAG_H */ > > diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig > > index 90e4a7152854..0255b33cff4b 100644 > > --- a/net/xdp/Kconfig > > +++ b/net/xdp/Kconfig > > @@ -5,3 +5,11 @@ config XDP_SOCKETS > > help > > XDP sockets allows a channel between XDP programs and > > userspace applications. > > + > > +config XDP_SOCKETS_DIAG > > + tristate "XDP sockets: monitoring interface" > > + depends on XDP_SOCKETS > > + default n > > + help > > + Support for PF_XDP sockets monitoring interface used by the ss tool. > > + If unsure, say Y. > > diff --git a/net/xdp/Makefile b/net/xdp/Makefile > > index 04f073146256..59dbfdf93dca 100644 > > --- a/net/xdp/Makefile > > +++ b/net/xdp/Makefile > > @@ -1 +1,2 @@ > > obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o > > +obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c > > index 80ca48cefc42..949d3bbccb2f 100644 > > --- a/net/xdp/xsk.c > > +++ b/net/xdp/xsk.c > > @@ -27,14 +27,10 @@ > > > > #include "xsk_queue.h" > > #include "xdp_umem.h" > > +#include "xsk.h" > > > > #define TX_BATCH_SIZE 16 > > > > -static struct xdp_sock *xdp_sk(struct sock *sk) > > -{ > > - return (struct xdp_sock *)sk; > > -} > > - > > bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) > > { > > return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && > > diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h > > new file mode 100644 > > index 000000000000..ba8120610426 > > --- /dev/null > > +++ b/net/xdp/xsk.h > > @@ -0,0 +1,12 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ > > +/* Copyright(c) 2019 Intel Corporation. */ > > + > > +#ifndef XSK_H_ > > +#define XSK_H_ > > + > > +static inline struct xdp_sock *xdp_sk(struct sock *sk) > > +{ > > + return (struct xdp_sock *)sk; > > +} > > + > > +#endif /* XSK_H_ */ > > diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c > > new file mode 100644 > > index 000000000000..2585d7a4ac2f > > --- /dev/null > > +++ b/net/xdp/xsk_diag.c > > @@ -0,0 +1,192 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* XDP sockets monitoring support > > + * > > + * Copyright(c) 2019 Intel Corporation. > > + * > > + * Author: Björn Töpel <bjorn.topel@intel.com> > > + */ > > + > > +#include <linux/module.h> > > +#include <net/xdp_sock.h> > > +#include <linux/xdp_diag.h> > > +#include <linux/sock_diag.h> > > + > > +#include "xsk_queue.h" > > +#include "xsk.h" > > + > > +static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb) > > +{ > > + struct xdp_diag_info di; > > Not a bug, but can we generally zero all the structs on stack such that once > we extend them nothing gets potentially leaked. > Makes sense! Will fix here and... > > + di.ifindex = xs->dev ? xs->dev->ifindex : 0; > > + di.queue_id = xs->queue_id; > > + return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di); > > +} > > + > > +static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type, > > + struct sk_buff *nlskb) > > +{ > > + struct xdp_diag_ring dr; > > Ditto. > ...here... > > + dr.entries = queue->nentries; > > + return nla_put(nlskb, nl_type, sizeof(dr), &dr); > > +} > > + > > +static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs, > > + struct sk_buff *nlskb) > > +{ > > + int err = 0; > > + > > + if (xs->rx) > > + err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb); > > + if (!err && xs->tx) > > + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb); > > + return err; > > +} > > + > > +static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) > > +{ > > + struct xdp_umem *umem = xs->umem; > > + struct xdp_diag_umem du; > > Ditto > ...and here. > > + int err; > > + > > + if (!umem) > > + return 0; > > + > > + du.id = umem->id; > > + du.size = umem->size; > > + du.num_pages = umem->npgs; > > + du.chunk_size = (__u32)(~umem->chunk_mask + 1); > > + du.headroom = umem->headroom; > > + du.ifindex = umem->dev ? umem->dev->ifindex : 0; > > + du.queue_id = umem->queue_id; > > + du.flags = 0; > > + if (umem->zc) > > + du.flags |= XDP_DU_F_ZEROCOPY; > > + du.refs = refcount_read(&umem->users); > > + > > + err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); > > + > > + if (!err && umem->fq) > > + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_FILL_RING, nlskb); > > + if (!err && umem->cq) { > > + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_COMPLETION_RING, > > + nlskb); > > + } > > + return err; > > +} > > + > > +static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, > > + struct xdp_diag_req *req, > > + struct user_namespace *user_ns, > > + u32 portid, u32 seq, u32 flags, int sk_ino) > > +{ > > + struct xdp_sock *xs = xdp_sk(sk); > > + struct xdp_diag_msg *msg; > > + struct nlmsghdr *nlh; > > + > > + nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg), > > + flags); > > + if (!nlh) > > + return -EMSGSIZE; > > + > > + msg = nlmsg_data(nlh); > > + msg->xdiag_family = AF_XDP; > > + msg->xdiag_type = sk->sk_type; > > + msg->xdiag_ino = sk_ino; > > + sock_diag_save_cookie(sk, msg->xdiag_cookie); > > Don't we have a hole in struct xdp_diag_msg after xdiag_type? Probably better > to memset everything in the beginning as well. > You're right! Good catch. In general for uapi structures; Explicitly named "holes" or not? The pad in req is exactly that. So, either add a pad to _msg or remove in _req. What is preferred? I'll spin up a v2. Thanks for looking at the code! Björn > > + if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) > > + goto out_nlmsg_trim; > > + > > + if ((req->xdiag_show & XDP_SHOW_INFO) && > > + nla_put_u32(nlskb, XDP_DIAG_UID, > > + from_kuid_munged(user_ns, sock_i_uid(sk)))) > > + goto out_nlmsg_trim; > > + > > + if ((req->xdiag_show & XDP_SHOW_RING_CFG) && > > + xsk_diag_put_rings_cfg(xs, nlskb)) > > + goto out_nlmsg_trim; > > + > > + if ((req->xdiag_show & XDP_SHOW_UMEM) && > > + xsk_diag_put_umem(xs, nlskb)) > > + goto out_nlmsg_trim; > > + > > + if ((req->xdiag_show & XDP_SHOW_MEMINFO) && > > + sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) > > + goto out_nlmsg_trim; > > + > > + nlmsg_end(nlskb, nlh); > > + return 0; > > + > > +out_nlmsg_trim: > > + nlmsg_cancel(nlskb, nlh); > > + return -EMSGSIZE; > > +} > > + > > +static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb) > > +{ > > + struct xdp_diag_req *req = nlmsg_data(cb->nlh); > > + struct net *net = sock_net(nlskb->sk); > > + int num = 0, s_num = cb->args[0]; > > + struct sock *sk; > > + > > + mutex_lock(&net->xdp.lock); > > + > > + sk_for_each(sk, &net->xdp.list) { > > + if (!net_eq(sock_net(sk), net)) > > + continue; > > + if (num++ < s_num) > > + continue; > > + > > + if (xsk_diag_fill(sk, nlskb, req, > > + sk_user_ns(NETLINK_CB(cb->skb).sk), > > + NETLINK_CB(cb->skb).portid, > > + cb->nlh->nlmsg_seq, NLM_F_MULTI, > > + sock_i_ino(sk)) < 0) { > > + num--; > > + break; > > + } > > + } > > + > > + mutex_unlock(&net->xdp.lock); > > + cb->args[0] = num; > > + return nlskb->len; > > +} > > + > > +static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) > > +{ > > + struct netlink_dump_control c = { .dump = xsk_diag_dump }; > > + int hdrlen = sizeof(struct xdp_diag_req); > > + struct net *net = sock_net(nlskb->sk); > > + struct xdp_diag_req *req; > > + > > + if (nlmsg_len(hdr) < hdrlen) > > + return -EINVAL; > > + > > + if (!(hdr->nlmsg_flags & NLM_F_DUMP)) > > + return -EOPNOTSUPP; > > + > > + req = nlmsg_data(hdr); > > + return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c); > > +} > > + > > +static const struct sock_diag_handler xsk_diag_handler = { > > + .family = AF_XDP, > > + .dump = xsk_diag_handler_dump, > > +}; > > + > > +static int __init xsk_diag_init(void) > > +{ > > + return sock_diag_register(&xsk_diag_handler); > > +} > > + > > +static void __exit xsk_diag_exit(void) > > +{ > > + sock_diag_unregister(&xsk_diag_handler); > > +} > > + > > +module_init(xsk_diag_init); > > +module_exit(xsk_diag_exit); > > +MODULE_LICENSE("GPL"); > > +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP); > > >
On 01/23/2019 03:24 PM, Björn Töpel wrote: > Den ons 23 jan. 2019 kl 14:19 skrev Daniel Borkmann <daniel@iogearbox.net>: >> >> On 01/18/2019 02:03 PM, bjorn.topel@gmail.com wrote: >>> From: Björn Töpel <bjorn.topel@intel.com> >>> >>> This patch adds the sock_diag interface for querying sockets from user >>> space. Tools like iproute2 ss(8) can use this interface to list open >>> AF_XDP sockets. >>> >>> The user-space ABI is defined in linux/xdp_diag.h and includes netlink >>> request and response structs. The request can query sockets and the >>> response contains socket information about the rings, umems, inode and >>> more. >>> >>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com> >> >> Series looks good, few minor nits inline: >> >>> --- >>> include/uapi/linux/xdp_diag.h | 72 +++++++++++++ >>> net/xdp/Kconfig | 8 ++ >>> net/xdp/Makefile | 1 + >>> net/xdp/xsk.c | 6 +- >>> net/xdp/xsk.h | 12 +++ >>> net/xdp/xsk_diag.c | 192 ++++++++++++++++++++++++++++++++++ >>> 6 files changed, 286 insertions(+), 5 deletions(-) >>> create mode 100644 include/uapi/linux/xdp_diag.h >>> create mode 100644 net/xdp/xsk.h >>> create mode 100644 net/xdp/xsk_diag.c >>> >>> diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h >>> new file mode 100644 >>> index 000000000000..efe8ce281dce >>> --- /dev/null >>> +++ b/include/uapi/linux/xdp_diag.h >>> @@ -0,0 +1,72 @@ >>> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ >>> +/* >>> + * xdp_diag: interface for query/monitor XDP sockets >>> + * Copyright(c) 2019 Intel Corporation. >>> + */ >>> + >>> +#ifndef _LINUX_XDP_DIAG_H >>> +#define _LINUX_XDP_DIAG_H >>> + >>> +#include <linux/types.h> >>> + >>> +struct xdp_diag_req { >>> + __u8 sdiag_family; >>> + __u8 sdiag_protocol; >>> + __u16 pad; >> >> Presumably this one is for future use? Maybe better as '__u16 :16;' to >> avoid compile errors if someone tries to zero 'pad' member manually? > > The "pad" was there simply to have an explicitly named structure hole. > I'm not following the bitfield argument. Why does that avoid compiler > errors? Mostly in the sense that an application would explicitly set 'pad = 0' whereas pad could later on potentially be renamed and reused otherwise (which __u16 :16 would avoid in first place). But looking at other *_diag_req structs, it's explicitly named as 'pad' elsewhere already, then nevermind, lets have it rather consistent then and keep as is. One small thing I still spotted when looking at it again, in function xsk_diag_handler_dump() the req is unused. Thanks, Daniel
Den tors 24 jan. 2019 kl 14:08 skrev Daniel Borkmann <daniel@iogearbox.net>: > [...] > >>> +#ifndef _LINUX_XDP_DIAG_H > >>> +#define _LINUX_XDP_DIAG_H > >>> + > >>> +#include <linux/types.h> > >>> + > >>> +struct xdp_diag_req { > >>> + __u8 sdiag_family; > >>> + __u8 sdiag_protocol; > >>> + __u16 pad; > >> > >> Presumably this one is for future use? Maybe better as '__u16 :16;' to > >> avoid compile errors if someone tries to zero 'pad' member manually? > > > > The "pad" was there simply to have an explicitly named structure hole. > > I'm not following the bitfield argument. Why does that avoid compiler > > errors? > > Mostly in the sense that an application would explicitly set 'pad = 0' > whereas pad could later on potentially be renamed and reused otherwise > (which __u16 :16 would avoid in first place). But looking at other > *_diag_req structs, it's explicitly named as 'pad' elsewhere already, > then nevermind, lets have it rather consistent then and keep as is. > Ok, thanks for clearing it up. I'll address all your comments, and also adds a "pad" member in msg for consistency. > One small thing I still spotted when looking at it again, in function > xsk_diag_handler_dump() the req is unused. > Ah, thank you. Björn > Thanks, > Daniel
diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h new file mode 100644 index 000000000000..efe8ce281dce --- /dev/null +++ b/include/uapi/linux/xdp_diag.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * xdp_diag: interface for query/monitor XDP sockets + * Copyright(c) 2019 Intel Corporation. + */ + +#ifndef _LINUX_XDP_DIAG_H +#define _LINUX_XDP_DIAG_H + +#include <linux/types.h> + +struct xdp_diag_req { + __u8 sdiag_family; + __u8 sdiag_protocol; + __u16 pad; + __u32 xdiag_ino; + __u32 xdiag_show; + __u32 xdiag_cookie[2]; +}; + +struct xdp_diag_msg { + __u8 xdiag_family; + __u8 xdiag_type; + __u32 xdiag_ino; + __u32 xdiag_cookie[2]; +}; + +#define XDP_SHOW_INFO (1 << 0) /* Basic information */ +#define XDP_SHOW_RING_CFG (1 << 1) +#define XDP_SHOW_UMEM (1 << 2) +#define XDP_SHOW_MEMINFO (1 << 3) + +enum { + XDP_DIAG_NONE, + XDP_DIAG_INFO, + XDP_DIAG_UID, + XDP_DIAG_RX_RING, + XDP_DIAG_TX_RING, + XDP_DIAG_UMEM, + XDP_DIAG_UMEM_FILL_RING, + XDP_DIAG_UMEM_COMPLETION_RING, + XDP_DIAG_MEMINFO, + __XDP_DIAG_MAX, +}; + +#define XDP_DIAG_MAX (__XDP_DIAG_MAX - 1) + +struct xdp_diag_info { + __u32 ifindex; + __u32 queue_id; +}; + +struct xdp_diag_ring { + __u32 entries; /*num descs */ +}; + +#define XDP_DU_F_ZEROCOPY (1 << 0) + +struct xdp_diag_umem { + __u64 size; + __u32 id; + __u32 num_pages; + __u32 chunk_size; + __u32 headroom; + __u32 ifindex; + __u32 queue_id; + __u32 flags; + __u32 refs; +}; + + +#endif /* _LINUX_XDP_DIAG_H */ diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig index 90e4a7152854..0255b33cff4b 100644 --- a/net/xdp/Kconfig +++ b/net/xdp/Kconfig @@ -5,3 +5,11 @@ config XDP_SOCKETS help XDP sockets allows a channel between XDP programs and userspace applications. + +config XDP_SOCKETS_DIAG + tristate "XDP sockets: monitoring interface" + depends on XDP_SOCKETS + default n + help + Support for PF_XDP sockets monitoring interface used by the ss tool. + If unsure, say Y. diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 04f073146256..59dbfdf93dca 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 80ca48cefc42..949d3bbccb2f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -27,14 +27,10 @@ #include "xsk_queue.h" #include "xdp_umem.h" +#include "xsk.h" #define TX_BATCH_SIZE 16 -static struct xdp_sock *xdp_sk(struct sock *sk) -{ - return (struct xdp_sock *)sk; -} - bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h new file mode 100644 index 000000000000..ba8120610426 --- /dev/null +++ b/net/xdp/xsk.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. */ + +#ifndef XSK_H_ +#define XSK_H_ + +static inline struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +#endif /* XSK_H_ */ diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c new file mode 100644 index 000000000000..2585d7a4ac2f --- /dev/null +++ b/net/xdp/xsk_diag.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets monitoring support + * + * Copyright(c) 2019 Intel Corporation. + * + * Author: Björn Töpel <bjorn.topel@intel.com> + */ + +#include <linux/module.h> +#include <net/xdp_sock.h> +#include <linux/xdp_diag.h> +#include <linux/sock_diag.h> + +#include "xsk_queue.h" +#include "xsk.h" + +static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_info di; + + di.ifindex = xs->dev ? xs->dev->ifindex : 0; + di.queue_id = xs->queue_id; + return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di); +} + +static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type, + struct sk_buff *nlskb) +{ + struct xdp_diag_ring dr; + + dr.entries = queue->nentries; + return nla_put(nlskb, nl_type, sizeof(dr), &dr); +} + +static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs, + struct sk_buff *nlskb) +{ + int err = 0; + + if (xs->rx) + err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb); + if (!err && xs->tx) + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb); + return err; +} + +static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_umem *umem = xs->umem; + struct xdp_diag_umem du; + int err; + + if (!umem) + return 0; + + du.id = umem->id; + du.size = umem->size; + du.num_pages = umem->npgs; + du.chunk_size = (__u32)(~umem->chunk_mask + 1); + du.headroom = umem->headroom; + du.ifindex = umem->dev ? umem->dev->ifindex : 0; + du.queue_id = umem->queue_id; + du.flags = 0; + if (umem->zc) + du.flags |= XDP_DU_F_ZEROCOPY; + du.refs = refcount_read(&umem->users); + + err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); + + if (!err && umem->fq) + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_FILL_RING, nlskb); + if (!err && umem->cq) { + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_COMPLETION_RING, + nlskb); + } + return err; +} + +static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, + struct xdp_diag_req *req, + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags, int sk_ino) +{ + struct xdp_sock *xs = xdp_sk(sk); + struct xdp_diag_msg *msg; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg), + flags); + if (!nlh) + return -EMSGSIZE; + + msg = nlmsg_data(nlh); + msg->xdiag_family = AF_XDP; + msg->xdiag_type = sk->sk_type; + msg->xdiag_ino = sk_ino; + sock_diag_save_cookie(sk, msg->xdiag_cookie); + + if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_INFO) && + nla_put_u32(nlskb, XDP_DIAG_UID, + from_kuid_munged(user_ns, sock_i_uid(sk)))) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_RING_CFG) && + xsk_diag_put_rings_cfg(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_UMEM) && + xsk_diag_put_umem(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_MEMINFO) && + sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) + goto out_nlmsg_trim; + + nlmsg_end(nlskb, nlh); + return 0; + +out_nlmsg_trim: + nlmsg_cancel(nlskb, nlh); + return -EMSGSIZE; +} + +static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb) +{ + struct xdp_diag_req *req = nlmsg_data(cb->nlh); + struct net *net = sock_net(nlskb->sk); + int num = 0, s_num = cb->args[0]; + struct sock *sk; + + mutex_lock(&net->xdp.lock); + + sk_for_each(sk, &net->xdp.list) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num++ < s_num) + continue; + + if (xsk_diag_fill(sk, nlskb, req, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + num--; + break; + } + } + + mutex_unlock(&net->xdp.lock); + cb->args[0] = num; + return nlskb->len; +} + +static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) +{ + struct netlink_dump_control c = { .dump = xsk_diag_dump }; + int hdrlen = sizeof(struct xdp_diag_req); + struct net *net = sock_net(nlskb->sk); + struct xdp_diag_req *req; + + if (nlmsg_len(hdr) < hdrlen) + return -EINVAL; + + if (!(hdr->nlmsg_flags & NLM_F_DUMP)) + return -EOPNOTSUPP; + + req = nlmsg_data(hdr); + return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c); +} + +static const struct sock_diag_handler xsk_diag_handler = { + .family = AF_XDP, + .dump = xsk_diag_handler_dump, +}; + +static int __init xsk_diag_init(void) +{ + return sock_diag_register(&xsk_diag_handler); +} + +static void __exit xsk_diag_exit(void) +{ + sock_diag_unregister(&xsk_diag_handler); +} + +module_init(xsk_diag_init); +module_exit(xsk_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP);