diff mbox

cgroup namespaces: add a 'nsroot=' mountinfo field

Message ID 20160321234858.GA28232@ubuntumail
State New
Headers show

Commit Message

Serge E. Hallyn March 21, 2016, 11:48 p.m. UTC
[ note - this is a version of the patch I just sent to lkml ported to
  our xenial tree.  It's needed for things like docker and lxc to
  be certain of which cgroup tasks file is their own in certain nesting
  situations.  We currently work around it by blindly assuming that
  there are no legacy container managers running on cgroup-ns-enabled
  kernels ]

One practical problem I've found with cgroup namespaces is that there
is no way to disambiguate between a cgroupfs mount which was done in
a cgroup namespace, and a bind mount of a cgroupfs directory.  So
whether I do

unshare --cgroup -- bash -c "mount -t cgroup -o freezer f /mnt; cat /proc/self/mountinfo"

or whether I just

mount --bind /sys/fs/cgroup/freezer/$(awk -F: '/freezer/ { print $3 }' /proc/self/cgroup) /mnt

'mount root' field (field 3) in /proc/self/mountinfo will show the
same thing, the result of awk -F: '/freezer/ { print $3 }' /proc/self/cgroup.

This patch adds a 'nsroot=' field to cgroup mountinfo entries, so that
userspace can distinguish a mount made in a cgroup namespace from a bind
mount from a cgroup subdirectory.

Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
---
 fs/kernfs/mount.c       |  2 +-
 include/linux/kernfs.h  |  3 ++-
 kernel/cgroup.c         | 29 ++++++++++++++++++++++++++++-
 4 files changed, 37 insertions(+), 5 deletions(-)

Comments

Tim Gardner March 22, 2016, 1:46 p.m. UTC | #1
Applied anyway, but what are the odds that this'll break a user space
regular expression ?
Serge E. Hallyn March 22, 2016, 5:15 p.m. UTC | #2
Quoting Tim Gardner (tim.gardner@canonical.com):
> Applied anyway, but what are the odds that this'll break a user space
> regular expression ?

It's possible, although my xenial vms booted fine with it.  I would have
preferred to add this to the list of 'optional' fields before the -,
but there was not a hook for that.  As it is, elements in the options
field can be controllers if they are preceded by 'name=' or if they
are kernel controllers (listed in /proc/cgroups).  If userspace wants
every field element that it doesn't recognize to be a controller it could
be a problem.  As of now though the list of non-controller keywords I
regularly see includes rw, xattr (?), clone_children, release_agent=.
diff mbox

Patch

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 18ed4dc..5e621f3 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -37,7 +37,7 @@  static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 	struct kernfs_syscall_ops *scops = root->syscall_ops;
 
 	if (scops && scops->show_options)
-		return scops->show_options(sf, root);
+		return scops->show_options(sf, dentry, root);
 	return 0;
 }
 
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 307e1a6..d8bb68f 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -145,7 +145,8 @@  struct kernfs_node {
  */
 struct kernfs_syscall_ops {
 	int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
-	int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
+	int (*show_options)(struct seq_file *sf, struct dentry *dentry,
+			    struct kernfs_root *root);
 
 	int (*mkdir)(struct kernfs_node *parent, const char *name,
 		     umode_t mode);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a66fbc1..ef0c25d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1589,7 +1589,32 @@  static int rebind_subsystems(struct cgroup_root *dst_root,
 	return 0;
 }
 
-static int cgroup_show_options(struct seq_file *seq,
+static void cgroup_show_nsroot(struct seq_file *seq, struct dentry *dentry,
+			       struct kernfs_root *kf_root)
+{
+	struct kernfs_node *d_kn = dentry->d_fsdata;
+	char *nsroot;
+	int len, ret;
+
+	if (!kf_root)
+		return;
+	len = kernfs_path_from_node(d_kn, kf_root->kn, NULL, 0);
+	if (len <= 0)
+		return;
+	nsroot = kzalloc(len + 1, GFP_ATOMIC);
+	if (!nsroot)
+		return;
+	ret = kernfs_path_from_node(d_kn, kf_root->kn, nsroot, len + 1);
+	if (ret <= 0 || ret > len)
+		goto out;
+
+	seq_show_option(seq, "nsroot", nsroot);
+
+out:
+	kfree(nsroot);
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry,
 			       struct kernfs_root *kf_root)
 {
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
@@ -1615,6 +1640,8 @@  static int cgroup_show_options(struct seq_file *seq,
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_show_option(seq, "name", root->name);
+	cgroup_show_nsroot(seq, dentry, kf_root);
+
 	return 0;
 }