diff mbox

[1/5] fs: Verify access of user towards block device file when mounting

Message ID 1443644116-41366-2-git-send-email-seth.forshee@canonical.com
State Deferred
Headers show

Commit Message

Seth Forshee Sept. 30, 2015, 8:15 p.m. UTC
When mounting a filesystem on a block device there is currently
no verification that the user has appropriate access to the
device file passed to mount. This has not been an issue so far
since the user in question has always been root, but this must
be changed before allowing unprivileged users to mount in user
namespaces.

To fix this, add an argument to lookup_bdev() to specify the
required permissions. If the mask of permissions is zero, or
if the user has CAP_SYS_ADMIN, the permission check is skipped,
otherwise the lookup fails if the user does not have the
specified access rights for the inode at the supplied path.

Callers associated with mounting are updated to pass permission
masks to lookup_bdev() so that these mounts will fail for an
unprivileged user who lacks permissions for the block device
inode. All other callers pass 0 to maintain their current
behaviors.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
---
 drivers/md/bcache/super.c |  2 +-
 drivers/md/dm-table.c     |  2 +-
 drivers/mtd/mtdsuper.c    |  6 +++++-
 fs/block_dev.c            | 18 +++++++++++++++---
 fs/quota/quota.c          |  2 +-
 include/linux/fs.h        |  2 +-
 6 files changed, 24 insertions(+), 8 deletions(-)

Comments

Mike Snitzer Sept. 30, 2015, 11:42 p.m. UTC | #1
On Wed, Sep 30 2015 at  4:15pm -0400,
Seth Forshee <seth.forshee@canonical.com> wrote:

> When mounting a filesystem on a block device there is currently
> no verification that the user has appropriate access to the
> device file passed to mount. This has not been an issue so far
> since the user in question has always been root, but this must
> be changed before allowing unprivileged users to mount in user
> namespaces.
> 
> To fix this, add an argument to lookup_bdev() to specify the
> required permissions. If the mask of permissions is zero, or
> if the user has CAP_SYS_ADMIN, the permission check is skipped,
> otherwise the lookup fails if the user does not have the
> specified access rights for the inode at the supplied path.
> 
> Callers associated with mounting are updated to pass permission
> masks to lookup_bdev() so that these mounts will fail for an
> unprivileged user who lacks permissions for the block device
> inode. All other callers pass 0 to maintain their current
> behaviors.
> 
> Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> ---
>  drivers/md/bcache/super.c |  2 +-
>  drivers/md/dm-table.c     |  2 +-
>  drivers/mtd/mtdsuper.c    |  6 +++++-
>  fs/block_dev.c            | 18 +++++++++++++++---
>  fs/quota/quota.c          |  2 +-
>  include/linux/fs.h        |  2 +-
>  6 files changed, 24 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index e76ed003769e..35bb3ea4cbe2 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -380,7 +380,7 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
>  	BUG_ON(!t);
>  
>  	/* convert the path to a device */
> -	bdev = lookup_bdev(path);
> +	bdev = lookup_bdev(path, 0);
>  	if (IS_ERR(bdev)) {
>  		dev = name_to_dev_t(path);
>  		if (!dev)

Given dm_get_device() is passed @mode why not have it do something like
you did in blkdev_get_by_path()? e.g.:

> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 26cee058dc02..54d94cd64577 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1394,9 +1394,14 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
>  					void *holder)
>  {
>  	struct block_device *bdev;
> +	int perm = 0;
>  	int err;
>  
> -	bdev = lookup_bdev(path);
> +	if (mode & FMODE_READ)
> +		perm |= MAY_READ;
> +	if (mode & FMODE_WRITE)
> +		perm |= MAY_WRITE;
> +	bdev = lookup_bdev(path, perm);
>  	if (IS_ERR(bdev))
>  		return bdev;
>
Seth Forshee Oct. 1, 2015, 12:55 p.m. UTC | #2
On Wed, Sep 30, 2015 at 07:42:15PM -0400, Mike Snitzer wrote:
> On Wed, Sep 30 2015 at  4:15pm -0400,
> Seth Forshee <seth.forshee@canonical.com> wrote:
> 
> > When mounting a filesystem on a block device there is currently
> > no verification that the user has appropriate access to the
> > device file passed to mount. This has not been an issue so far
> > since the user in question has always been root, but this must
> > be changed before allowing unprivileged users to mount in user
> > namespaces.
> > 
> > To fix this, add an argument to lookup_bdev() to specify the
> > required permissions. If the mask of permissions is zero, or
> > if the user has CAP_SYS_ADMIN, the permission check is skipped,
> > otherwise the lookup fails if the user does not have the
> > specified access rights for the inode at the supplied path.
> > 
> > Callers associated with mounting are updated to pass permission
> > masks to lookup_bdev() so that these mounts will fail for an
> > unprivileged user who lacks permissions for the block device
> > inode. All other callers pass 0 to maintain their current
> > behaviors.
> > 
> > Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> > ---
> >  drivers/md/bcache/super.c |  2 +-
> >  drivers/md/dm-table.c     |  2 +-
> >  drivers/mtd/mtdsuper.c    |  6 +++++-
> >  fs/block_dev.c            | 18 +++++++++++++++---
> >  fs/quota/quota.c          |  2 +-
> >  include/linux/fs.h        |  2 +-
> >  6 files changed, 24 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > index e76ed003769e..35bb3ea4cbe2 100644
> > --- a/drivers/md/dm-table.c
> > +++ b/drivers/md/dm-table.c
> > @@ -380,7 +380,7 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
> >  	BUG_ON(!t);
> >  
> >  	/* convert the path to a device */
> > -	bdev = lookup_bdev(path);
> > +	bdev = lookup_bdev(path, 0);
> >  	if (IS_ERR(bdev)) {
> >  		dev = name_to_dev_t(path);
> >  		if (!dev)
> 
> Given dm_get_device() is passed @mode why not have it do something like
> you did in blkdev_get_by_path()? e.g.:

I only dealt with code related to mounting in this patch since that's
what I'm working on. I have it on my TODO list to consider converting
other callers of lookup_bdev. But if you're sure doing so makes sense
for dm_get_device and that it won't cause regressions then I could add a
patch for it.

Thanks,
Seth
Mike Snitzer Oct. 1, 2015, 1:40 p.m. UTC | #3
On Thu, Oct 01 2015 at  8:55am -0400,
Seth Forshee <seth.forshee@canonical.com> wrote:

> On Wed, Sep 30, 2015 at 07:42:15PM -0400, Mike Snitzer wrote:
> > On Wed, Sep 30 2015 at  4:15pm -0400,
> > Seth Forshee <seth.forshee@canonical.com> wrote:
> > 
> > > When mounting a filesystem on a block device there is currently
> > > no verification that the user has appropriate access to the
> > > device file passed to mount. This has not been an issue so far
> > > since the user in question has always been root, but this must
> > > be changed before allowing unprivileged users to mount in user
> > > namespaces.
> > > 
> > > To fix this, add an argument to lookup_bdev() to specify the
> > > required permissions. If the mask of permissions is zero, or
> > > if the user has CAP_SYS_ADMIN, the permission check is skipped,
> > > otherwise the lookup fails if the user does not have the
> > > specified access rights for the inode at the supplied path.
> > > 
> > > Callers associated with mounting are updated to pass permission
> > > masks to lookup_bdev() so that these mounts will fail for an
> > > unprivileged user who lacks permissions for the block device
> > > inode. All other callers pass 0 to maintain their current
> > > behaviors.
> > > 
> > > Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> > > ---
> > >  drivers/md/bcache/super.c |  2 +-
> > >  drivers/md/dm-table.c     |  2 +-
> > >  drivers/mtd/mtdsuper.c    |  6 +++++-
> > >  fs/block_dev.c            | 18 +++++++++++++++---
> > >  fs/quota/quota.c          |  2 +-
> > >  include/linux/fs.h        |  2 +-
> > >  6 files changed, 24 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > > index e76ed003769e..35bb3ea4cbe2 100644
> > > --- a/drivers/md/dm-table.c
> > > +++ b/drivers/md/dm-table.c
> > > @@ -380,7 +380,7 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
> > >  	BUG_ON(!t);
> > >  
> > >  	/* convert the path to a device */
> > > -	bdev = lookup_bdev(path);
> > > +	bdev = lookup_bdev(path, 0);
> > >  	if (IS_ERR(bdev)) {
> > >  		dev = name_to_dev_t(path);
> > >  		if (!dev)
> > 
> > Given dm_get_device() is passed @mode why not have it do something like
> > you did in blkdev_get_by_path()? e.g.:
> 
> I only dealt with code related to mounting in this patch since that's
> what I'm working on. I have it on my TODO list to consider converting
> other callers of lookup_bdev. But if you're sure doing so makes sense
> for dm_get_device and that it won't cause regressions then I could add a
> patch for it.

OK, dm_get_device() is called in DM device activation path (by tools
like lvm2).

After lookup_bdev() it goes on to call blkdev_get_by_dev() with this
call chain: 
  dm_get_device -> dm_get_table_device -> open_table_device -> blkdev_get_by_dev

Not immediately clear to me why we'd need to augment blkdev_get_by_dev()
to do this checking also.

However, thinking further: In a device stack (e.g. dm/lvm2, md, etc)
new virtual block devices are created that layer ontop of the
traditional block devices.  This level of indirection may cause your
lookup_bdev() check to go on to succeed (if access constraints were not
established on the upper level dm or md device?).  I'm just thinking
outloud here: but have you verified your changes work as intended on
devices created with either lvm2 or mdadm?

What layer establishes access rights to historically root-only
priviledged block devices?  Is it user namespaces?

I haven't kept up with user namespaces as it relates to stacking block
drivers like DM.  But I'm happy to come up to speed and at the same time
help you verify all works as expected with DM blocks devices...

Mike
Seth Forshee Oct. 1, 2015, 2:41 p.m. UTC | #4
On Thu, Oct 01, 2015 at 09:40:52AM -0400, Mike Snitzer wrote:
> On Thu, Oct 01 2015 at  8:55am -0400,
> Seth Forshee <seth.forshee@canonical.com> wrote:
> 
> > On Wed, Sep 30, 2015 at 07:42:15PM -0400, Mike Snitzer wrote:
> > > On Wed, Sep 30 2015 at  4:15pm -0400,
> > > Seth Forshee <seth.forshee@canonical.com> wrote:
> > > 
> > > > When mounting a filesystem on a block device there is currently
> > > > no verification that the user has appropriate access to the
> > > > device file passed to mount. This has not been an issue so far
> > > > since the user in question has always been root, but this must
> > > > be changed before allowing unprivileged users to mount in user
> > > > namespaces.
> > > > 
> > > > To fix this, add an argument to lookup_bdev() to specify the
> > > > required permissions. If the mask of permissions is zero, or
> > > > if the user has CAP_SYS_ADMIN, the permission check is skipped,
> > > > otherwise the lookup fails if the user does not have the
> > > > specified access rights for the inode at the supplied path.
> > > > 
> > > > Callers associated with mounting are updated to pass permission
> > > > masks to lookup_bdev() so that these mounts will fail for an
> > > > unprivileged user who lacks permissions for the block device
> > > > inode. All other callers pass 0 to maintain their current
> > > > behaviors.
> > > > 
> > > > Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> > > > ---
> > > >  drivers/md/bcache/super.c |  2 +-
> > > >  drivers/md/dm-table.c     |  2 +-
> > > >  drivers/mtd/mtdsuper.c    |  6 +++++-
> > > >  fs/block_dev.c            | 18 +++++++++++++++---
> > > >  fs/quota/quota.c          |  2 +-
> > > >  include/linux/fs.h        |  2 +-
> > > >  6 files changed, 24 insertions(+), 8 deletions(-)
> > > > 
> > > > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > > > index e76ed003769e..35bb3ea4cbe2 100644
> > > > --- a/drivers/md/dm-table.c
> > > > +++ b/drivers/md/dm-table.c
> > > > @@ -380,7 +380,7 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
> > > >  	BUG_ON(!t);
> > > >  
> > > >  	/* convert the path to a device */
> > > > -	bdev = lookup_bdev(path);
> > > > +	bdev = lookup_bdev(path, 0);
> > > >  	if (IS_ERR(bdev)) {
> > > >  		dev = name_to_dev_t(path);
> > > >  		if (!dev)
> > > 
> > > Given dm_get_device() is passed @mode why not have it do something like
> > > you did in blkdev_get_by_path()? e.g.:
> > 
> > I only dealt with code related to mounting in this patch since that's
> > what I'm working on. I have it on my TODO list to consider converting
> > other callers of lookup_bdev. But if you're sure doing so makes sense
> > for dm_get_device and that it won't cause regressions then I could add a
> > patch for it.
> 
> OK, dm_get_device() is called in DM device activation path (by tools
> like lvm2).
> 
> After lookup_bdev() it goes on to call blkdev_get_by_dev() with this
> call chain: 
>   dm_get_device -> dm_get_table_device -> open_table_device -> blkdev_get_by_dev
> 
> Not immediately clear to me why we'd need to augment blkdev_get_by_dev()
> to do this checking also.
> 
> However, thinking further: In a device stack (e.g. dm/lvm2, md, etc)
> new virtual block devices are created that layer ontop of the
> traditional block devices.  This level of indirection may cause your
> lookup_bdev() check to go on to succeed (if access constraints were not
> established on the upper level dm or md device?).  I'm just thinking
> outloud here: but have you verified your changes work as intended on
> devices created with either lvm2 or mdadm?
> 
> What layer establishes access rights to historically root-only
> priviledged block devices?  Is it user namespaces?

I'm going to start with this last question and work my way backwards.

Who determines access rights to the block devices varies to some degree.
Any normal user could unshare the user/mount namespaces and still see
all the block devices in /dev. If we're going to allow that user to then
mount block devices in their private namespaces, the kernel must verify
that the user has appropriate permissions for the block device inode.
That's the point of this patch. In a container context the host process
which sets up the container might share some block devices into the
container via bind mounts, in which case it would be responsible for
setting up access rights (both via inode permissions and potentially via
devcgroups). I also have some patches I'm working on for a loop psuedo
filesystem which would allow a container to allocate loop devices for
its own use (via the loop-control device), in which case the kernel
creates a device node in the loopfs filesystem from which the new loop
device was allocated, owned by the root user in the user namespace
associated with the loopfs superblock.

Obviously a DM block device is more complicated than a traditional block
device. This patch should ensure that if an unprivileged user tries to
mount a DM blkdev that it fails if the user lacks permissions for the DM
blkdev inode. There's also the blkdevs behind the DM blkdev, and I'm not
sure whether we need to additionally verify that the user has access to
those devices. Maybe it's enough that someone with access to them set up
the DM device, and someone with sufficient privileges gave that user
access to the DM device. Do you have any thoughts?

As for activating a DM device, I assume only root is permitted to do
this. In that case lookup_bdev will skip the inode permission check
even if a non-zero permission mask is passed.

> I haven't kept up with user namespaces as it relates to stacking block
> drivers like DM.  But I'm happy to come up to speed and at the same time
> help you verify all works as expected with DM blocks devices...

That's great, I really appreciate it.

Seth
Eric W. Biederman Oct. 1, 2015, 3:40 p.m. UTC | #5
Seth Forshee <seth.forshee@canonical.com> writes:

> When mounting a filesystem on a block device there is currently
> no verification that the user has appropriate access to the
> device file passed to mount. This has not been an issue so far
> since the user in question has always been root, but this must
> be changed before allowing unprivileged users to mount in user
> namespaces.
>
> To fix this, add an argument to lookup_bdev() to specify the
> required permissions. If the mask of permissions is zero, or
> if the user has CAP_SYS_ADMIN, the permission check is skipped,
> otherwise the lookup fails if the user does not have the
> specified access rights for the inode at the supplied path.
>
> Callers associated with mounting are updated to pass permission
> masks to lookup_bdev() so that these mounts will fail for an
> unprivileged user who lacks permissions for the block device
> inode. All other callers pass 0 to maintain their current
> behaviors.
>

Seth can you split this patch?

One patch to add an argument to lookup_bdev,
and then for each kind of callsite a follow-on patch (if we are ready
for that).

That will separate the logical changes and make things easier to track
via bisect and more importantly easier to review things.

Eric


> Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> ---
>  drivers/md/bcache/super.c |  2 +-
>  drivers/md/dm-table.c     |  2 +-
>  drivers/mtd/mtdsuper.c    |  6 +++++-
>  fs/block_dev.c            | 18 +++++++++++++++---
>  fs/quota/quota.c          |  2 +-
>  include/linux/fs.h        |  2 +-
>  6 files changed, 24 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 679a093a3bf6..e8287b0d1dac 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1926,7 +1926,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
>  				  sb);
>  	if (IS_ERR(bdev)) {
>  		if (bdev == ERR_PTR(-EBUSY)) {
> -			bdev = lookup_bdev(strim(path));
> +			bdev = lookup_bdev(strim(path), 0);
>  			mutex_lock(&bch_register_lock);
>  			if (!IS_ERR(bdev) && bch_is_open(bdev))
>  				err = "device already registered";
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index e76ed003769e..35bb3ea4cbe2 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -380,7 +380,7 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
>  	BUG_ON(!t);
>  
>  	/* convert the path to a device */
> -	bdev = lookup_bdev(path);
> +	bdev = lookup_bdev(path, 0);
>  	if (IS_ERR(bdev)) {
>  		dev = name_to_dev_t(path);
>  		if (!dev)
> diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
> index 20c02a3b7417..5d7e7705fed8 100644
> --- a/drivers/mtd/mtdsuper.c
> +++ b/drivers/mtd/mtdsuper.c
> @@ -125,6 +125,7 @@ struct dentry *mount_mtd(struct file_system_type *fs_type, int flags,
>  #ifdef CONFIG_BLOCK
>  	struct block_device *bdev;
>  	int ret, major;
> +	int perm;
>  #endif
>  	int mtdnr;
>  
> @@ -176,7 +177,10 @@ struct dentry *mount_mtd(struct file_system_type *fs_type, int flags,
>  	/* try the old way - the hack where we allowed users to mount
>  	 * /dev/mtdblock$(n) but didn't actually _use_ the blockdev
>  	 */
> -	bdev = lookup_bdev(dev_name);
> +	perm = MAY_READ;
> +	if (!(flags & MS_RDONLY))
> +		perm |= MAY_WRITE;
> +	bdev = lookup_bdev(dev_name, perm);
>  	if (IS_ERR(bdev)) {
>  		ret = PTR_ERR(bdev);
>  		pr_debug("MTDSB: lookup_bdev() returned %d\n", ret);
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 26cee058dc02..54d94cd64577 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1394,9 +1394,14 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
>  					void *holder)
>  {
>  	struct block_device *bdev;
> +	int perm = 0;
>  	int err;
>  
> -	bdev = lookup_bdev(path);
> +	if (mode & FMODE_READ)
> +		perm |= MAY_READ;
> +	if (mode & FMODE_WRITE)
> +		perm |= MAY_WRITE;
> +	bdev = lookup_bdev(path, perm);
>  	if (IS_ERR(bdev))
>  		return bdev;
>  
> @@ -1706,12 +1711,14 @@ EXPORT_SYMBOL(ioctl_by_bdev);
>  /**
>   * lookup_bdev  - lookup a struct block_device by name
>   * @pathname:	special file representing the block device
> + * @mask:	rights to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
>   *
>   * Get a reference to the blockdevice at @pathname in the current
>   * namespace if possible and return it.  Return ERR_PTR(error)
> - * otherwise.
> + * otherwise.  If @mask is non-zero, check for access rights to the
> + * inode at @pathname.
>   */
> -struct block_device *lookup_bdev(const char *pathname)
> +struct block_device *lookup_bdev(const char *pathname, int mask)
>  {
>  	struct block_device *bdev;
>  	struct inode *inode;
> @@ -1726,6 +1733,11 @@ struct block_device *lookup_bdev(const char *pathname)
>  		return ERR_PTR(error);
>  
>  	inode = d_backing_inode(path.dentry);
> +	if (mask != 0 && !capable(CAP_SYS_ADMIN)) {
> +		error = __inode_permission(inode, mask);
> +		if (error)
> +			goto fail;
> +	}
>  	error = -ENOTBLK;
>  	if (!S_ISBLK(inode->i_mode))
>  		goto fail;
> diff --git a/fs/quota/quota.c b/fs/quota/quota.c
> index 3746367098fd..a40eaecbd5cc 100644
> --- a/fs/quota/quota.c
> +++ b/fs/quota/quota.c
> @@ -733,7 +733,7 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
>  
>  	if (IS_ERR(tmp))
>  		return ERR_CAST(tmp);
> -	bdev = lookup_bdev(tmp->name);
> +	bdev = lookup_bdev(tmp->name, 0);
>  	putname(tmp);
>  	if (IS_ERR(bdev))
>  		return ERR_CAST(bdev);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 458ee7b213be..cc18dfb0b98e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2388,7 +2388,7 @@ static inline void unregister_chrdev(unsigned int major, const char *name)
>  #define BLKDEV_MAJOR_HASH_SIZE	255
>  extern const char *__bdevname(dev_t, char *buffer);
>  extern const char *bdevname(struct block_device *bdev, char *buffer);
> -extern struct block_device *lookup_bdev(const char *);
> +extern struct block_device *lookup_bdev(const char *, int mask);
>  extern void blkdev_show(struct seq_file *,off_t);
>  
>  #else
Seth Forshee Oct. 1, 2015, 3:55 p.m. UTC | #6
On Thu, Oct 01, 2015 at 10:40:08AM -0500, Eric W. Biederman wrote:
> Seth Forshee <seth.forshee@canonical.com> writes:
> 
> > When mounting a filesystem on a block device there is currently
> > no verification that the user has appropriate access to the
> > device file passed to mount. This has not been an issue so far
> > since the user in question has always been root, but this must
> > be changed before allowing unprivileged users to mount in user
> > namespaces.
> >
> > To fix this, add an argument to lookup_bdev() to specify the
> > required permissions. If the mask of permissions is zero, or
> > if the user has CAP_SYS_ADMIN, the permission check is skipped,
> > otherwise the lookup fails if the user does not have the
> > specified access rights for the inode at the supplied path.
> >
> > Callers associated with mounting are updated to pass permission
> > masks to lookup_bdev() so that these mounts will fail for an
> > unprivileged user who lacks permissions for the block device
> > inode. All other callers pass 0 to maintain their current
> > behaviors.
> >
> 
> Seth can you split this patch?
> 
> One patch to add an argument to lookup_bdev,
> and then for each kind of callsite a follow-on patch (if we are ready
> for that).
> 
> That will separate the logical changes and make things easier to track
> via bisect and more importantly easier to review things.

Sure, I'll do that.

Seth
Eric W. Biederman Oct. 1, 2015, 3:55 p.m. UTC | #7
Mike Snitzer <snitzer@redhat.com> writes:

> What layer establishes access rights to historically root-only
> priviledged block devices?  Is it user namespaces?

Block devices are weird.

Mounts historically have not checked the permissions on the block
devices because a mounter has CAP_SYS_ADMIN.

Unprivileged users are allowes to read/write block devices if
someone has given them permissions on the device node in the
filesystem.

The thinking with this patchset is to start performing the normal
block device access permission checks when mounting filesystems
when the mounter does not have the global CAP_SYS_ADMIN permission.

The truth is we are not much past the point of realizing that there were
no permission checks to use the actual block device passed in to mount,
so we could still be missing something. There is a lot going on with dm,
md, and lvm.  I don't know if the model of just look at the block device
inode and perform the permission checks is good enough.

> I haven't kept up with user namespaces as it relates to stacking block
> drivers like DM.  But I'm happy to come up to speed and at the same time
> help you verify all works as expected with DM blocks devices...

We are just getting there.  But if you can help that would be great.
The primary concern with dm is what happens when unprivileged users get
ahold of the code, and what happens when evil users corrupt the on-disk
format.

In principle dm like loop should be safe to use if there are not bugs
that make it unsafe for unprivileged users to access the code.

The goal if possible is to run things like docker without needed to be
root or even more fun to run docker in a container, and in general
enable nested containers.

Eric
Jan Kara Oct. 1, 2015, 11:07 p.m. UTC | #8
On Thu 01-10-15 10:55:50, Eric W. Biederman wrote:
> The goal if possible is to run things like docker without needed to be
> root or even more fun to run docker in a container, and in general
> enable nested containers.

Frankly at the filesystem side we are rather far from being able to safely
mount untrusted device and I don't think we'll ever be robust enough to
tolerate e.g. user changing the disk while fs is using it. So would this be
FUSE-only thing or is someone still hoping that general purpose filesystems
will be robust enough in future?

								Honza
Seth Forshee Oct. 5, 2015, 2:26 p.m. UTC | #9
On Fri, Oct 02, 2015 at 01:07:00AM +0200, Jan Kara wrote:
> On Thu 01-10-15 10:55:50, Eric W. Biederman wrote:
> > The goal if possible is to run things like docker without needed to be
> > root or even more fun to run docker in a container, and in general
> > enable nested containers.
> 
> Frankly at the filesystem side we are rather far from being able to safely
> mount untrusted device and I don't think we'll ever be robust enough to
> tolerate e.g. user changing the disk while fs is using it. So would this be
> FUSE-only thing or is someone still hoping that general purpose filesystems
> will be robust enough in future?

FUSE will almost certainly be first. I've also been working with ext4,
and I would like to see that eventually supported to some degree.

Seth
Seth Forshee Oct. 8, 2015, 3:41 p.m. UTC | #10
On Thu, Oct 01, 2015 at 09:41:37AM -0500, Seth Forshee wrote:
> On Thu, Oct 01, 2015 at 09:40:52AM -0400, Mike Snitzer wrote:
> > On Thu, Oct 01 2015 at  8:55am -0400,
> > Seth Forshee <seth.forshee@canonical.com> wrote:
> > 
> > > On Wed, Sep 30, 2015 at 07:42:15PM -0400, Mike Snitzer wrote:
> > > > On Wed, Sep 30 2015 at  4:15pm -0400,
> > > > Seth Forshee <seth.forshee@canonical.com> wrote:
> > > > 
> > > > > When mounting a filesystem on a block device there is currently
> > > > > no verification that the user has appropriate access to the
> > > > > device file passed to mount. This has not been an issue so far
> > > > > since the user in question has always been root, but this must
> > > > > be changed before allowing unprivileged users to mount in user
> > > > > namespaces.
> > > > > 
> > > > > To fix this, add an argument to lookup_bdev() to specify the
> > > > > required permissions. If the mask of permissions is zero, or
> > > > > if the user has CAP_SYS_ADMIN, the permission check is skipped,
> > > > > otherwise the lookup fails if the user does not have the
> > > > > specified access rights for the inode at the supplied path.
> > > > > 
> > > > > Callers associated with mounting are updated to pass permission
> > > > > masks to lookup_bdev() so that these mounts will fail for an
> > > > > unprivileged user who lacks permissions for the block device
> > > > > inode. All other callers pass 0 to maintain their current
> > > > > behaviors.
> > > > > 
> > > > > Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> > > > > ---
> > > > >  drivers/md/bcache/super.c |  2 +-
> > > > >  drivers/md/dm-table.c     |  2 +-
> > > > >  drivers/mtd/mtdsuper.c    |  6 +++++-
> > > > >  fs/block_dev.c            | 18 +++++++++++++++---
> > > > >  fs/quota/quota.c          |  2 +-
> > > > >  include/linux/fs.h        |  2 +-
> > > > >  6 files changed, 24 insertions(+), 8 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > > > > index e76ed003769e..35bb3ea4cbe2 100644
> > > > > --- a/drivers/md/dm-table.c
> > > > > +++ b/drivers/md/dm-table.c
> > > > > @@ -380,7 +380,7 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
> > > > >  	BUG_ON(!t);
> > > > >  
> > > > >  	/* convert the path to a device */
> > > > > -	bdev = lookup_bdev(path);
> > > > > +	bdev = lookup_bdev(path, 0);
> > > > >  	if (IS_ERR(bdev)) {
> > > > >  		dev = name_to_dev_t(path);
> > > > >  		if (!dev)
> > > > 
> > > > Given dm_get_device() is passed @mode why not have it do something like
> > > > you did in blkdev_get_by_path()? e.g.:
> > > 
> > > I only dealt with code related to mounting in this patch since that's
> > > what I'm working on. I have it on my TODO list to consider converting
> > > other callers of lookup_bdev. But if you're sure doing so makes sense
> > > for dm_get_device and that it won't cause regressions then I could add a
> > > patch for it.
> > 
> > OK, dm_get_device() is called in DM device activation path (by tools
> > like lvm2).
> > 
> > After lookup_bdev() it goes on to call blkdev_get_by_dev() with this
> > call chain: 
> >   dm_get_device -> dm_get_table_device -> open_table_device -> blkdev_get_by_dev
> > 
> > Not immediately clear to me why we'd need to augment blkdev_get_by_dev()
> > to do this checking also.
> > 
> > However, thinking further: In a device stack (e.g. dm/lvm2, md, etc)
> > new virtual block devices are created that layer ontop of the
> > traditional block devices.  This level of indirection may cause your
> > lookup_bdev() check to go on to succeed (if access constraints were not
> > established on the upper level dm or md device?).  I'm just thinking
> > outloud here: but have you verified your changes work as intended on
> > devices created with either lvm2 or mdadm?
> > 
> > What layer establishes access rights to historically root-only
> > priviledged block devices?  Is it user namespaces?
> 
> I'm going to start with this last question and work my way backwards.
> 
> Who determines access rights to the block devices varies to some degree.
> Any normal user could unshare the user/mount namespaces and still see
> all the block devices in /dev. If we're going to allow that user to then
> mount block devices in their private namespaces, the kernel must verify
> that the user has appropriate permissions for the block device inode.
> That's the point of this patch. In a container context the host process
> which sets up the container might share some block devices into the
> container via bind mounts, in which case it would be responsible for
> setting up access rights (both via inode permissions and potentially via
> devcgroups). I also have some patches I'm working on for a loop psuedo
> filesystem which would allow a container to allocate loop devices for
> its own use (via the loop-control device), in which case the kernel
> creates a device node in the loopfs filesystem from which the new loop
> device was allocated, owned by the root user in the user namespace
> associated with the loopfs superblock.
> 
> Obviously a DM block device is more complicated than a traditional block
> device. This patch should ensure that if an unprivileged user tries to
> mount a DM blkdev that it fails if the user lacks permissions for the DM
> blkdev inode. There's also the blkdevs behind the DM blkdev, and I'm not
> sure whether we need to additionally verify that the user has access to
> those devices. Maybe it's enough that someone with access to them set up
> the DM device, and someone with sufficient privileges gave that user
> access to the DM device. Do you have any thoughts?
> 
> As for activating a DM device, I assume only root is permitted to do
> this. In that case lookup_bdev will skip the inode permission check
> even if a non-zero permission mask is passed.

I've been looking into this more and doing some testing.

As far as mounting goes, it ends up behaving as I expected. As long as
the user has permission to access the inode for the DM block device, the
device can be mounted. Access to the block devices behind the DM block
device is not required. I think this makes sense - it's consistent with
how read/write access to the DM block device works already independent
of user namespaces.

One thing I noticed that might be a concern (I'm not sure) are ioctls on
DM block devices. It looks to me like often these will be passed through
to the underlying block device.

It appears that all code paths which call dm_get_device originate with
ioctls on /dev/mapper/control. Doing this requires CAP_SYS_ADMIN, in
which case the permission check in lookup_bdev is skipped. So there's no
benefit to having dm_get_device pass a permission mask to
blkdev_get_by_path, nor is there any harm. It will only matter if at
some point userns-root is allowed to set up DM devices.

Seth
diff mbox

Patch

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 679a093a3bf6..e8287b0d1dac 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1926,7 +1926,7 @@  static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 				  sb);
 	if (IS_ERR(bdev)) {
 		if (bdev == ERR_PTR(-EBUSY)) {
-			bdev = lookup_bdev(strim(path));
+			bdev = lookup_bdev(strim(path), 0);
 			mutex_lock(&bch_register_lock);
 			if (!IS_ERR(bdev) && bch_is_open(bdev))
 				err = "device already registered";
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e76ed003769e..35bb3ea4cbe2 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -380,7 +380,7 @@  int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 	BUG_ON(!t);
 
 	/* convert the path to a device */
-	bdev = lookup_bdev(path);
+	bdev = lookup_bdev(path, 0);
 	if (IS_ERR(bdev)) {
 		dev = name_to_dev_t(path);
 		if (!dev)
diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 20c02a3b7417..5d7e7705fed8 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -125,6 +125,7 @@  struct dentry *mount_mtd(struct file_system_type *fs_type, int flags,
 #ifdef CONFIG_BLOCK
 	struct block_device *bdev;
 	int ret, major;
+	int perm;
 #endif
 	int mtdnr;
 
@@ -176,7 +177,10 @@  struct dentry *mount_mtd(struct file_system_type *fs_type, int flags,
 	/* try the old way - the hack where we allowed users to mount
 	 * /dev/mtdblock$(n) but didn't actually _use_ the blockdev
 	 */
-	bdev = lookup_bdev(dev_name);
+	perm = MAY_READ;
+	if (!(flags & MS_RDONLY))
+		perm |= MAY_WRITE;
+	bdev = lookup_bdev(dev_name, perm);
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
 		pr_debug("MTDSB: lookup_bdev() returned %d\n", ret);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 26cee058dc02..54d94cd64577 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1394,9 +1394,14 @@  struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
 					void *holder)
 {
 	struct block_device *bdev;
+	int perm = 0;
 	int err;
 
-	bdev = lookup_bdev(path);
+	if (mode & FMODE_READ)
+		perm |= MAY_READ;
+	if (mode & FMODE_WRITE)
+		perm |= MAY_WRITE;
+	bdev = lookup_bdev(path, perm);
 	if (IS_ERR(bdev))
 		return bdev;
 
@@ -1706,12 +1711,14 @@  EXPORT_SYMBOL(ioctl_by_bdev);
 /**
  * lookup_bdev  - lookup a struct block_device by name
  * @pathname:	special file representing the block device
+ * @mask:	rights to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  *
  * Get a reference to the blockdevice at @pathname in the current
  * namespace if possible and return it.  Return ERR_PTR(error)
- * otherwise.
+ * otherwise.  If @mask is non-zero, check for access rights to the
+ * inode at @pathname.
  */
-struct block_device *lookup_bdev(const char *pathname)
+struct block_device *lookup_bdev(const char *pathname, int mask)
 {
 	struct block_device *bdev;
 	struct inode *inode;
@@ -1726,6 +1733,11 @@  struct block_device *lookup_bdev(const char *pathname)
 		return ERR_PTR(error);
 
 	inode = d_backing_inode(path.dentry);
+	if (mask != 0 && !capable(CAP_SYS_ADMIN)) {
+		error = __inode_permission(inode, mask);
+		if (error)
+			goto fail;
+	}
 	error = -ENOTBLK;
 	if (!S_ISBLK(inode->i_mode))
 		goto fail;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 3746367098fd..a40eaecbd5cc 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -733,7 +733,7 @@  static struct super_block *quotactl_block(const char __user *special, int cmd)
 
 	if (IS_ERR(tmp))
 		return ERR_CAST(tmp);
-	bdev = lookup_bdev(tmp->name);
+	bdev = lookup_bdev(tmp->name, 0);
 	putname(tmp);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 458ee7b213be..cc18dfb0b98e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2388,7 +2388,7 @@  static inline void unregister_chrdev(unsigned int major, const char *name)
 #define BLKDEV_MAJOR_HASH_SIZE	255
 extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
-extern struct block_device *lookup_bdev(const char *);
+extern struct block_device *lookup_bdev(const char *, int mask);
 extern void blkdev_show(struct seq_file *,off_t);
 
 #else