diff mbox series

[v4,3/5] 9p: Added virtfs option "remap_inodes"

Message ID 91b9f8920735847e6c0e84ab6dc2c689aed13cc7.1561575449.git.qemu_oss@crudebyte.com
State New
Headers show
Series 9p: Fix file ID collisions | expand

Commit Message

Cameron Esfahani via June 26, 2019, 6:42 p.m. UTC
To support multiple devices on the 9p share, and avoid
qid path collisions we take the device id as input
to generate a unique QID path. The lowest 48 bits of
the path will be set equal to the file inode, and the
top bits will be uniquely assigned based on the top
16 bits of the inode and the device id.

Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 fsdev/file-op-9p.h      |   1 +
 fsdev/qemu-fsdev-opts.c |   7 +++-
 fsdev/qemu-fsdev.c      |   6 +++
 hw/9pfs/9p.c            | 105 ++++++++++++++++++++++++++++++++++++++++++------
 hw/9pfs/9p.h            |  12 ++++++
 qemu-options.hx         |  17 +++++++-
 vl.c                    |   3 ++
 7 files changed, 135 insertions(+), 16 deletions(-)

Comments

Greg Kurz June 28, 2019, 10:09 a.m. UTC | #1
On Wed, 26 Jun 2019 20:42:13 +0200
Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:

> To support multiple devices on the 9p share, and avoid
> qid path collisions we take the device id as input
> to generate a unique QID path. The lowest 48 bits of
> the path will be set equal to the file inode, and the
> top bits will be uniquely assigned based on the top
> 16 bits of the inode and the device id.
> 
> Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>

Same remark about changes to the original patch.

BTW, I had a concern with the way v9fs_do_readdir() open-codes QID
generation without calling stat_to_qid().

See discussion here:

https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02724.html

I guess you should ensure in a preliminary patch that QIDs only
come out of stat_to_qid().

> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  fsdev/file-op-9p.h      |   1 +
>  fsdev/qemu-fsdev-opts.c |   7 +++-
>  fsdev/qemu-fsdev.c      |   6 +++
>  hw/9pfs/9p.c            | 105 ++++++++++++++++++++++++++++++++++++++++++------
>  hw/9pfs/9p.h            |  12 ++++++
>  qemu-options.hx         |  17 +++++++-
>  vl.c                    |   3 ++
>  7 files changed, 135 insertions(+), 16 deletions(-)
> 
> diff --git a/fsdev/file-op-9p.h b/fsdev/file-op-9p.h
> index c757c8099f..6c1663c252 100644
> --- a/fsdev/file-op-9p.h
> +++ b/fsdev/file-op-9p.h
> @@ -59,6 +59,7 @@ typedef struct ExtendedOps {
>  #define V9FS_RDONLY                 0x00000040
>  #define V9FS_PROXY_SOCK_FD          0x00000080
>  #define V9FS_PROXY_SOCK_NAME        0x00000100
> +#define V9FS_REMAP_INODES           0x00000200
>  
>  #define V9FS_SEC_MASK               0x0000003C
>  
> diff --git a/fsdev/qemu-fsdev-opts.c b/fsdev/qemu-fsdev-opts.c
> index 7c31ffffaf..64a8052266 100644
> --- a/fsdev/qemu-fsdev-opts.c
> +++ b/fsdev/qemu-fsdev-opts.c
> @@ -31,7 +31,9 @@ static QemuOptsList qemu_fsdev_opts = {
>          }, {
>              .name = "readonly",
>              .type = QEMU_OPT_BOOL,
> -
> +        }, {
> +            .name = "remap_inodes",
> +            .type = QEMU_OPT_BOOL,
>          }, {
>              .name = "socket",
>              .type = QEMU_OPT_STRING,
> @@ -76,6 +78,9 @@ static QemuOptsList qemu_virtfs_opts = {
>              .name = "readonly",
>              .type = QEMU_OPT_BOOL,
>          }, {
> +            .name = "remap_inodes",
> +            .type = QEMU_OPT_BOOL,
> +        }, {
>              .name = "socket",
>              .type = QEMU_OPT_STRING,
>          }, {
> diff --git a/fsdev/qemu-fsdev.c b/fsdev/qemu-fsdev.c
> index 077a8c4e2b..b6fa9799be 100644
> --- a/fsdev/qemu-fsdev.c
> +++ b/fsdev/qemu-fsdev.c
> @@ -121,6 +121,7 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
>      const char *fsdev_id = qemu_opts_id(opts);
>      const char *fsdriver = qemu_opt_get(opts, "fsdriver");
>      const char *writeout = qemu_opt_get(opts, "writeout");
> +    bool remap_inodes = qemu_opt_get_bool(opts, "remap_inodes", 0);
>      bool ro = qemu_opt_get_bool(opts, "readonly", 0);
>  
>      if (!fsdev_id) {
> @@ -161,6 +162,11 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
>      } else {
>          fsle->fse.export_flags &= ~V9FS_RDONLY;
>      }
> +    if (remap_inodes) {
> +        fsle->fse.export_flags |= V9FS_REMAP_INODES;
> +    } else {
> +        fsle->fse.export_flags &= ~V9FS_REMAP_INODES;
> +    }
>  
>      if (fsle->fse.ops->parse_opts) {
>          if (fsle->fse.ops->parse_opts(opts, &fsle->fse, errp)) {
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index cbaa212625..7ccc68a829 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -25,6 +25,7 @@
>  #include "trace.h"
>  #include "migration/blocker.h"
>  #include "sysemu/qtest.h"
> +#include "qemu/xxhash.h"
>  
>  int open_fd_hw;
>  int total_open_fd;
> @@ -571,24 +572,96 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
>                                  P9_STAT_MODE_NAMED_PIPE |   \
>                                  P9_STAT_MODE_SOCKET)
>  
> -/* This is the algorithm from ufs in spfs */
> +
> +/* creative abuse of tb_hash_func7, which is based on xxhash */
> +static uint32_t qpp_hash(QppEntry e)
> +{
> +    return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
> +}
> +
> +static bool qpp_lookup_func(const void *obj, const void *userp)
> +{
> +    const QppEntry *e1 = obj, *e2 = userp;
> +    return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
> +}
> +
> +static void qpp_table_remove(void *p, uint32_t h, void *up)
> +{
> +    g_free(p);
> +}
> +
> +static void qpp_table_destroy(struct qht *ht)
> +{
> +    qht_iter(ht, qpp_table_remove, NULL);
> +    qht_destroy(ht);
> +}
> +
> +/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
> + * to a unique QID path (64 bits). To avoid having to map and keep track
> + * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
> + * the device id to the 16 highest bits of the QID path. The 48 lowest bits
> + * of the QID path equal to the lowest bits of the inode number.
> + *
> + * This takes advantage of the fact that inode number are usually not
> + * random but allocated sequentially, so we have fewer items to keep
> + * track of.
> + */
> +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> +                                uint64_t *path)
> +{
> +    QppEntry lookup = {
> +        .dev = stbuf->st_dev,
> +        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> +    }, *val;
> +    uint32_t hash = qpp_hash(lookup);
> +
> +    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
> +
> +    if (!val) {
> +        if (pdu->s->qp_prefix_next == 0) {
> +            /* we ran out of prefixes */

And we won't ever be able to allocate a new one. Maybe worth
adding an error_report_once() to inform the user ?

> +            return -ENFILE;
> +        }
> +
> +        val = g_malloc0(sizeof(QppEntry));
> +        *val = lookup;
> +
> +        /* new unique inode prefix and device combo */
> +        val->qp_prefix = pdu->s->qp_prefix_next++;
> +        qht_insert(&pdu->s->qpp_table, val, hash, NULL);
> +    }
> +
> +    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
> +    return 0;
> +}
> +
>  static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
>  {
> -    size_t size;
> +    int err;
>  
> -    if (pdu->s->dev_id == 0) {
> -        pdu->s->dev_id = stbuf->st_dev;
> -    } else if (pdu->s->dev_id != stbuf->st_dev) {
> -        error_report_once(
> -            "9p: Multiple devices detected in same VirtFS export. "
> -            "You must use a separate export for each device."
> -        );
> -        return -ENOSYS;
> +    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
> +        /* map inode+device to qid path (fast path) */
> +        err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> +        if (err) {
> +            return err;
> +        }
> +    } else {
> +        if (pdu->s->dev_id == 0) {
> +            pdu->s->dev_id = stbuf->st_dev;
> +        } else if (pdu->s->dev_id != stbuf->st_dev) {
> +            error_report_once(
> +                "9p: Multiple devices detected in same VirtFS export. "
> +                "You must either use a separate export for each device "
> +                "shared from host or enable virtfs option 'remap_inodes'."
> +            );
> +            return -ENOSYS;
> +        }
> +        size_t size;

From CODING_STYLE:

5. Declarations

Mixed declarations (interleaving statements and declarations within
blocks) are generally not allowed; declarations should be at the beginning
of blocks.

Please do so for "size" and add an extra blank line.

> +        memset(&qidp->path, 0, sizeof(qidp->path));
> +        size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
> +        memcpy(&qidp->path, &stbuf->st_ino, size);
>      }
>  
> -    memset(&qidp->path, 0, sizeof(qidp->path));
> -    size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
> -    memcpy(&qidp->path, &stbuf->st_ino, size);
>      qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
>      qidp->type = 0;
>      if (S_ISDIR(stbuf->st_mode)) {
> @@ -3676,6 +3749,10 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
>  
>      s->dev_id = 0;
>  
> +    /* QID path hash table. 1 entry ought to be enough for anybody ;) */
> +    qht_init(&s->qpp_table, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE);
> +    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
> +
>      s->ctx.fst = &fse->fst;
>      fsdev_throttle_init(s->ctx.fst);
>  
> @@ -3689,6 +3766,7 @@ out:
>          }
>          g_free(s->tag);
>          g_free(s->ctx.fs_root);
> +        qpp_table_destroy(&s->qpp_table);
>          v9fs_path_free(&path);
>      }
>      return rc;
> @@ -3701,6 +3779,7 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
>      }
>      fsdev_throttle_cleanup(s->ctx.fst);
>      g_free(s->tag);
> +    qpp_table_destroy(&s->qpp_table);
>      g_free(s->ctx.fs_root);
>  }
>  
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 5e316178d5..0200e04176 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -8,6 +8,7 @@
>  #include "fsdev/9p-iov-marshal.h"
>  #include "qemu/thread.h"
>  #include "qemu/coroutine.h"
> +#include "qemu/qht.h"
>  
>  enum {
>      P9_TLERROR = 6,
> @@ -235,6 +236,15 @@ struct V9fsFidState
>      V9fsFidState *rclm_lst;
>  };
>  
> +#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)

This won't give the expected result on a 32-bit host. Since this
is a mask for 64-bit entities, it should rather be:

#define QPATH_INO_MASK        ((1ULL << 48) - 1)

> +
> +/* QID path prefix entry, see stat_to_qid */
> +typedef struct {
> +    dev_t dev;
> +    uint16_t ino_prefix;
> +    uint16_t qp_prefix;
> +} QppEntry;
> +
>  struct V9fsState
>  {
>      QLIST_HEAD(, V9fsPDU) free_list;
> @@ -257,6 +267,8 @@ struct V9fsState
>      V9fsConf fsconf;
>      V9fsQID root_qid;
>      dev_t dev_id;
> +    struct qht qpp_table;
> +    uint16_t qp_prefix_next;
>  };
>  
>  /* 9p2000.L open flags */
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 0d8beb4afd..e7ea136da1 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -1334,7 +1334,7 @@ ETEXI
>  
>  DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
>      "-virtfs local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-file|passthrough|none\n"
> -    "        [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n"
> +    "        [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,remap_inodes]\n"

This feature applies to all backends IIUC. We don't really care for the
synth backend since it generates non-colliding inode numbers by design,
but the proxy backend has the same issue as local. So...

>      "-virtfs proxy,mount_tag=tag,socket=socket[,id=id][,writeout=immediate][,readonly]\n"
>      "-virtfs proxy,mount_tag=tag,sock_fd=sock_fd[,id=id][,writeout=immediate][,readonly]\n"

... please update these two ^^ as well.

>      "-virtfs synth,mount_tag=tag[,id=id][,readonly]\n",
> @@ -1342,7 +1342,7 @@ DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
>  
>  STEXI
>  
> -@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}]
> +@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}][,remap_inodes]
>  @itemx -virtfs proxy,socket=@var{socket},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]
>  @itemx -virtfs proxy,sock_fd=@var{sock_fd},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]

Ditto.

>  @itemx -virtfs synth,mount_tag=@var{mount_tag}
> @@ -1398,6 +1398,19 @@ Specifies the default mode for newly created directories on the host. Works
>  only with security models "mapped-xattr" and "mapped-file".
>  @item mount_tag=@var{mount_tag}
>  Specifies the tag name to be used by the guest to mount this export point.
> +@item remap_inodes
> +By default virtfs 9p supports only one device per export in order to avoid
> +file ID collisions on guest which may otherwise happen because the original
> +device IDs from host are not passed and exposed on guest. Instead all files
> +of an export shared with virtfs do have the same device id on guest. So two
> +files with identical inode numbers but from actually different devices on
> +host would otherwise cause a file ID collision and hence potential
> +misbehaviours on guest. For that reason it is recommended to create a
> +separate virtfs export for each device to be shared with guests. However
> +you may also enable "remap_inodes" which allows you to share multiple
> +devices with only one export instead, which is achieved by remapping the
> +original inode numbers from host to guest in a way that would prevent such
> +collisions.
>  @end table
>  ETEXI
>  
> diff --git a/vl.c b/vl.c
> index 99a56b5556..de9d7b846c 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -3457,6 +3457,9 @@ int main(int argc, char **argv, char **envp)
>                  qemu_opt_set_bool(fsdev, "readonly",
>                                    qemu_opt_get_bool(opts, "readonly", 0),
>                                    &error_abort);
> +                qemu_opt_set_bool(fsdev, "remap_inodes",
> +                                  qemu_opt_get_bool(opts, "remap_inodes", 0),
> +                                  &error_abort);
>                  device = qemu_opts_create(qemu_find_opts("device"), NULL, 0,
>                                            &error_abort);
>                  qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort);
Cameron Esfahani via June 28, 2019, 1:47 p.m. UTC | #2
On Freitag, 28. Juni 2019 12:09:31 CEST Greg Kurz wrote:
> On Wed, 26 Jun 2019 20:42:13 +0200
> 
> Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:
> > To support multiple devices on the 9p share, and avoid
> > qid path collisions we take the device id as input
> > to generate a unique QID path. The lowest 48 bits of
> > the path will be set equal to the file inode, and the
> > top bits will be uniquely assigned based on the top
> > 16 bits of the inode and the device id.
> > 
> > Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> 
> Same remark about changes to the original patch.

ack_once();   :)

> BTW, I had a concern with the way v9fs_do_readdir() open-codes QID
> generation without calling stat_to_qid().
> 
> See discussion here:
> 
> https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02724.html
> 
> I guess you should ensure in a preliminary patch that QIDs only
> come out of stat_to_qid().

Mja, actually I first omitted your suggestion consciously, because I first 
thought it was an overkill pure visibility issue lmited to the default case 
remap_inodes==false, but now that I look at it again, it is actually an issue 
even when remap_inodes==true since dirent would expose wrong inode numbers on 
guest as well.

I will see what to do about it. However about your other concern here, quote:

	"Also, if we hit a collision while reading the directory, I'm
	 afraid the remaining entries won't be read at all. I'm not
	 sure this is really what we want."

That's however still a concern here that I would consider overkill to address. 
I mean if a user gets into that situation then because of a configuration error 
that must be corrected by user; the point of this patch set is to prevent 
undefined behaviour and to make the user aware about the root cause of the 
overall issue; the purpose is not to address all possible issues while there 
is still a configuration error.

> > +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> > +                                uint64_t *path)
> > +{
> > +    QppEntry lookup = {
> > +        .dev = stbuf->st_dev,
> > +        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> > +    }, *val;
> > +    uint32_t hash = qpp_hash(lookup);
> > +
> > +    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
> > +
> > +    if (!val) {
> > +        if (pdu->s->qp_prefix_next == 0) {
> > +            /* we ran out of prefixes */
> 
> And we won't ever be able to allocate a new one. Maybe worth
> adding an error_report_once() to inform the user ?

Yeah, I thought about that as well. Will do.

> >  static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID
> >  *qidp) {
> > 
> > -    size_t size;
> > +    int err;
> > 
> > -    if (pdu->s->dev_id == 0) {
> > -        pdu->s->dev_id = stbuf->st_dev;
> > -    } else if (pdu->s->dev_id != stbuf->st_dev) {
> > -        error_report_once(
> > -            "9p: Multiple devices detected in same VirtFS export. "
> > -            "You must use a separate export for each device."
> > -        );
> > -        return -ENOSYS;
> > +    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
> > +        /* map inode+device to qid path (fast path) */
> > +        err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> > +        if (err) {
> > +            return err;
> > +        }
> > +    } else {
> > +        if (pdu->s->dev_id == 0) {
> > +            pdu->s->dev_id = stbuf->st_dev;
> > +        } else if (pdu->s->dev_id != stbuf->st_dev) {
> > +            error_report_once(
> > +                "9p: Multiple devices detected in same VirtFS export. "
> > +                "You must either use a separate export for each device "
> > +                "shared from host or enable virtfs option
> > 'remap_inodes'."
> > +            );
> > +            return -ENOSYS;
> > +        }
> > +        size_t size;
> 
> From CODING_STYLE:
> 
> 5. Declarations
> 
> Mixed declarations (interleaving statements and declarations within
> blocks) are generally not allowed; declarations should be at the beginning
> of blocks.
> 
> Please do so for "size" and add an extra blank line.

Ok.

> > +#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
> 
> This won't give the expected result on a 32-bit host. Since this
> is a mask for 64-bit entities, it should rather be:
> 
> #define QPATH_INO_MASK        ((1ULL << 48) - 1)

Correct, will fix it.

> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 0d8beb4afd..e7ea136da1 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -1334,7 +1334,7 @@ ETEXI
> > 
> >  DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
> >  
> >      "-virtfs
> >      local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-fil
> >      e|passthrough|none\n"> 
> > -    "       
> > [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n" +
> >    "       
> > [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,rem
> > ap_inodes]\n"
> This feature applies to all backends IIUC. We don't really care for the
> synth backend since it generates non-colliding inode numbers by design,
> but the proxy backend has the same issue as local. So...

Yeah, I was not sure about these, because I did not even know what these two 
were for exactly. :)  [ lazyness disclaimer end]

Will do for the other manual locations you mentioned as well.

Best regards,
Christian Schoenebeck
Greg Kurz June 28, 2019, 2:23 p.m. UTC | #3
On Fri, 28 Jun 2019 15:47:52 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Freitag, 28. Juni 2019 12:09:31 CEST Greg Kurz wrote:
> > On Wed, 26 Jun 2019 20:42:13 +0200
> > 
> > Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:
> > > To support multiple devices on the 9p share, and avoid
> > > qid path collisions we take the device id as input
> > > to generate a unique QID path. The lowest 48 bits of
> > > the path will be set equal to the file inode, and the
> > > top bits will be uniquely assigned based on the top
> > > 16 bits of the inode and the device id.
> > > 
> > > Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> > 
> > Same remark about changes to the original patch.
> 
> ack_once();   :)
> 

:)

> > BTW, I had a concern with the way v9fs_do_readdir() open-codes QID
> > generation without calling stat_to_qid().
> > 
> > See discussion here:
> > 
> > https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02724.html
> > 
> > I guess you should ensure in a preliminary patch that QIDs only
> > come out of stat_to_qid().
> 
> Mja, actually I first omitted your suggestion consciously, because I first 
> thought it was an overkill pure visibility issue lmited to the default case 
> remap_inodes==false, but now that I look at it again, it is actually an issue 
> even when remap_inodes==true since dirent would expose wrong inode numbers on 
> guest as well.
> 
> I will see what to do about it. However about your other concern here, quote:
> 
> 	"Also, if we hit a collision while reading the directory, I'm
> 	 afraid the remaining entries won't be read at all. I'm not
> 	 sure this is really what we want."
> 
> That's however still a concern here that I would consider overkill to address. 
> I mean if a user gets into that situation then because of a configuration error 
> that must be corrected by user; the point of this patch set is to prevent 
> undefined behaviour and to make the user aware about the root cause of the 
> overall issue; the purpose is not to address all possible issues while there 
> is still a configuration error.
> 

Fair enough. And anyway, if we really need to address that, it can be done
later.

> > > +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> > > +                                uint64_t *path)
> > > +{
> > > +    QppEntry lookup = {
> > > +        .dev = stbuf->st_dev,
> > > +        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> > > +    }, *val;
> > > +    uint32_t hash = qpp_hash(lookup);
> > > +
> > > +    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
> > > +
> > > +    if (!val) {
> > > +        if (pdu->s->qp_prefix_next == 0) {
> > > +            /* we ran out of prefixes */
> > 
> > And we won't ever be able to allocate a new one. Maybe worth
> > adding an error_report_once() to inform the user ?
> 
> Yeah, I thought about that as well. Will do.
> 
> > >  static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID
> > >  *qidp) {
> > > 
> > > -    size_t size;
> > > +    int err;
> > > 
> > > -    if (pdu->s->dev_id == 0) {
> > > -        pdu->s->dev_id = stbuf->st_dev;
> > > -    } else if (pdu->s->dev_id != stbuf->st_dev) {
> > > -        error_report_once(
> > > -            "9p: Multiple devices detected in same VirtFS export. "
> > > -            "You must use a separate export for each device."
> > > -        );
> > > -        return -ENOSYS;
> > > +    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
> > > +        /* map inode+device to qid path (fast path) */
> > > +        err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> > > +        if (err) {
> > > +            return err;
> > > +        }
> > > +    } else {
> > > +        if (pdu->s->dev_id == 0) {
> > > +            pdu->s->dev_id = stbuf->st_dev;
> > > +        } else if (pdu->s->dev_id != stbuf->st_dev) {
> > > +            error_report_once(
> > > +                "9p: Multiple devices detected in same VirtFS export. "
> > > +                "You must either use a separate export for each device "
> > > +                "shared from host or enable virtfs option
> > > 'remap_inodes'."
> > > +            );
> > > +            return -ENOSYS;
> > > +        }
> > > +        size_t size;
> > 
> > From CODING_STYLE:
> > 
> > 5. Declarations
> > 
> > Mixed declarations (interleaving statements and declarations within
> > blocks) are generally not allowed; declarations should be at the beginning
> > of blocks.
> > 
> > Please do so for "size" and add an extra blank line.
> 
> Ok.
> 
> > > +#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
> > 
> > This won't give the expected result on a 32-bit host. Since this
> > is a mask for 64-bit entities, it should rather be:
> > 
> > #define QPATH_INO_MASK        ((1ULL << 48) - 1)
> 
> Correct, will fix it.
> 
> > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > index 0d8beb4afd..e7ea136da1 100644
> > > --- a/qemu-options.hx
> > > +++ b/qemu-options.hx
> > > @@ -1334,7 +1334,7 @@ ETEXI
> > > 
> > >  DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
> > >  
> > >      "-virtfs
> > >      local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-fil
> > >      e|passthrough|none\n"> 
> > > -    "       
> > > [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n" +
> > >    "       
> > > [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,rem
> > > ap_inodes]\n"
> > This feature applies to all backends IIUC. We don't really care for the
> > synth backend since it generates non-colliding inode numbers by design,
> > but the proxy backend has the same issue as local. So...
> 
> Yeah, I was not sure about these, because I did not even know what these two 
> were for exactly. :)  [ lazyness disclaimer end]
> 

"proxy" is a backend where all I/O accesses are performed by a separate
process running the virtfs-proxy-helper command. It runs with root
privileges, which provides the same level of functionality as "local"
with security_model=passthrough. It also chroot() into the shared
folder for extra security. But it is slower since it all requests
still go through the virtio-9p device in QEMU. This would call
for a vhost-9p implementation, but it's yet another story.

"synth" is a software pseudo-backend, currently used to test 9pfs
with QTest (see tests/virtio-9p-test.c).

> Will do for the other manual locations you mentioned as well.
> 
> Best regards,
> Christian Schoenebeck
Cameron Esfahani via June 29, 2019, 10:20 a.m. UTC | #4
On Freitag, 28. Juni 2019 16:23:08 CEST Greg Kurz wrote:
> > > This feature applies to all backends IIUC. We don't really care for the
> > > synth backend since it generates non-colliding inode numbers by design,
> > > but the proxy backend has the same issue as local. So...
> > 
> > Yeah, I was not sure about these, because I did not even know what these
> > two were for exactly. :)  [ lazyness disclaimer end]
> 
> "proxy" is a backend where all I/O accesses are performed by a separate
> process running the virtfs-proxy-helper command. It runs with root
> privileges, which provides the same level of functionality as "local"
> with security_model=passthrough. It also chroot() into the shared
> folder for extra security. But it is slower since it all requests
> still go through the virtio-9p device in QEMU. This would call
> for a vhost-9p implementation, but it's yet another story.
> 
> "synth" is a software pseudo-backend, currently used to test 9pfs
> with QTest (see tests/virtio-9p-test.c).

Thanks for the clarification!

So the proxy backend sounds like an idea that has not been implemented fully 
to its end. I guess it is not really used in production environments, right? 
What is the actual motivation for this proxy backend?

And now that I look at it, I am a bit surprised that there is this pure Unix 
pipe socket based proxy variant, but no TCPIP network socket variant. I mean 
the latter is AFAIK the original idea behind the 9p protocol and IMO might be 
interesting to physical separate pure storage backends that way.

Best regards,
Christian Schoenebeck
Greg Kurz July 2, 2019, 8:01 a.m. UTC | #5
On Sat, 29 Jun 2019 12:20:49 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Freitag, 28. Juni 2019 16:23:08 CEST Greg Kurz wrote:
> > > > This feature applies to all backends IIUC. We don't really care for the
> > > > synth backend since it generates non-colliding inode numbers by design,
> > > > but the proxy backend has the same issue as local. So...
> > > 
> > > Yeah, I was not sure about these, because I did not even know what these
> > > two were for exactly. :)  [ lazyness disclaimer end]
> > 
> > "proxy" is a backend where all I/O accesses are performed by a separate
> > process running the virtfs-proxy-helper command. It runs with root
> > privileges, which provides the same level of functionality as "local"
> > with security_model=passthrough. It also chroot() into the shared
> > folder for extra security. But it is slower since it all requests
> > still go through the virtio-9p device in QEMU. This would call
> > for a vhost-9p implementation, but it's yet another story.
> > 
> > "synth" is a software pseudo-backend, currently used to test 9pfs
> > with QTest (see tests/virtio-9p-test.c).
> 
> Thanks for the clarification!
> 
> So the proxy backend sounds like an idea that has not been implemented fully 
> to its end. I guess it is not really used in production environments, right? 

I don't have any feedback unfortunately... The biggest problem with proxy is
likely it's poor performance : every request needs to go through many hops.

guest->QEMU->proxy->QEMU->guest 

> What is the actual motivation for this proxy backend?
> 

The motivation is security: only the proxy helper runs with privileges (we
generally don't want to run QEMU as root), the helper can chroot() and thus
prevent the guest to access anything outside the shared folder.

> And now that I look at it, I am a bit surprised that there is this pure Unix 
> pipe socket based proxy variant, but no TCPIP network socket variant. I mean 

The Unix socket is required in order to pass open file descriptors between
QEMU and the proxy, using SCM_RIGHTS ancillary messages. There's no such
thing with TCPIP sockets.

> the latter is AFAIK the original idea behind the 9p protocol and IMO might be 
> interesting to physical separate pure storage backends that way.
> 

The right thing to do would be to have the "proxy" process to directly
access the rings of the virtio-9p device (ie, vhost), so that requests
only go through:

guest->proxy->guest

> Best regards,
> Christian Schoenebeck
diff mbox series

Patch

diff --git a/fsdev/file-op-9p.h b/fsdev/file-op-9p.h
index c757c8099f..6c1663c252 100644
--- a/fsdev/file-op-9p.h
+++ b/fsdev/file-op-9p.h
@@ -59,6 +59,7 @@  typedef struct ExtendedOps {
 #define V9FS_RDONLY                 0x00000040
 #define V9FS_PROXY_SOCK_FD          0x00000080
 #define V9FS_PROXY_SOCK_NAME        0x00000100
+#define V9FS_REMAP_INODES           0x00000200
 
 #define V9FS_SEC_MASK               0x0000003C
 
diff --git a/fsdev/qemu-fsdev-opts.c b/fsdev/qemu-fsdev-opts.c
index 7c31ffffaf..64a8052266 100644
--- a/fsdev/qemu-fsdev-opts.c
+++ b/fsdev/qemu-fsdev-opts.c
@@ -31,7 +31,9 @@  static QemuOptsList qemu_fsdev_opts = {
         }, {
             .name = "readonly",
             .type = QEMU_OPT_BOOL,
-
+        }, {
+            .name = "remap_inodes",
+            .type = QEMU_OPT_BOOL,
         }, {
             .name = "socket",
             .type = QEMU_OPT_STRING,
@@ -76,6 +78,9 @@  static QemuOptsList qemu_virtfs_opts = {
             .name = "readonly",
             .type = QEMU_OPT_BOOL,
         }, {
+            .name = "remap_inodes",
+            .type = QEMU_OPT_BOOL,
+        }, {
             .name = "socket",
             .type = QEMU_OPT_STRING,
         }, {
diff --git a/fsdev/qemu-fsdev.c b/fsdev/qemu-fsdev.c
index 077a8c4e2b..b6fa9799be 100644
--- a/fsdev/qemu-fsdev.c
+++ b/fsdev/qemu-fsdev.c
@@ -121,6 +121,7 @@  int qemu_fsdev_add(QemuOpts *opts, Error **errp)
     const char *fsdev_id = qemu_opts_id(opts);
     const char *fsdriver = qemu_opt_get(opts, "fsdriver");
     const char *writeout = qemu_opt_get(opts, "writeout");
+    bool remap_inodes = qemu_opt_get_bool(opts, "remap_inodes", 0);
     bool ro = qemu_opt_get_bool(opts, "readonly", 0);
 
     if (!fsdev_id) {
@@ -161,6 +162,11 @@  int qemu_fsdev_add(QemuOpts *opts, Error **errp)
     } else {
         fsle->fse.export_flags &= ~V9FS_RDONLY;
     }
+    if (remap_inodes) {
+        fsle->fse.export_flags |= V9FS_REMAP_INODES;
+    } else {
+        fsle->fse.export_flags &= ~V9FS_REMAP_INODES;
+    }
 
     if (fsle->fse.ops->parse_opts) {
         if (fsle->fse.ops->parse_opts(opts, &fsle->fse, errp)) {
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index cbaa212625..7ccc68a829 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -25,6 +25,7 @@ 
 #include "trace.h"
 #include "migration/blocker.h"
 #include "sysemu/qtest.h"
+#include "qemu/xxhash.h"
 
 int open_fd_hw;
 int total_open_fd;
@@ -571,24 +572,96 @@  static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                 P9_STAT_MODE_NAMED_PIPE |   \
                                 P9_STAT_MODE_SOCKET)
 
-/* This is the algorithm from ufs in spfs */
+
+/* creative abuse of tb_hash_func7, which is based on xxhash */
+static uint32_t qpp_hash(QppEntry e)
+{
+    return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
+}
+
+static bool qpp_lookup_func(const void *obj, const void *userp)
+{
+    const QppEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
+}
+
+static void qpp_table_remove(void *p, uint32_t h, void *up)
+{
+    g_free(p);
+}
+
+static void qpp_table_destroy(struct qht *ht)
+{
+    qht_iter(ht, qpp_table_remove, NULL);
+    qht_destroy(ht);
+}
+
+/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
+ * to a unique QID path (64 bits). To avoid having to map and keep track
+ * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
+ * the device id to the 16 highest bits of the QID path. The 48 lowest bits
+ * of the QID path equal to the lowest bits of the inode number.
+ *
+ * This takes advantage of the fact that inode number are usually not
+ * random but allocated sequentially, so we have fewer items to keep
+ * track of.
+ */
+static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
+                                uint64_t *path)
+{
+    QppEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
+    }, *val;
+    uint32_t hash = qpp_hash(lookup);
+
+    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_prefix_next == 0) {
+            /* we ran out of prefixes */
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        *val = lookup;
+
+        /* new unique inode prefix and device combo */
+        val->qp_prefix = pdu->s->qp_prefix_next++;
+        qht_insert(&pdu->s->qpp_table, val, hash, NULL);
+    }
+
+    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+    return 0;
+}
+
 static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
 {
-    size_t size;
+    int err;
 
-    if (pdu->s->dev_id == 0) {
-        pdu->s->dev_id = stbuf->st_dev;
-    } else if (pdu->s->dev_id != stbuf->st_dev) {
-        error_report_once(
-            "9p: Multiple devices detected in same VirtFS export. "
-            "You must use a separate export for each device."
-        );
-        return -ENOSYS;
+    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
+        /* map inode+device to qid path (fast path) */
+        err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
+        if (err) {
+            return err;
+        }
+    } else {
+        if (pdu->s->dev_id == 0) {
+            pdu->s->dev_id = stbuf->st_dev;
+        } else if (pdu->s->dev_id != stbuf->st_dev) {
+            error_report_once(
+                "9p: Multiple devices detected in same VirtFS export. "
+                "You must either use a separate export for each device "
+                "shared from host or enable virtfs option 'remap_inodes'."
+            );
+            return -ENOSYS;
+        }
+        size_t size;
+        memset(&qidp->path, 0, sizeof(qidp->path));
+        size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
+        memcpy(&qidp->path, &stbuf->st_ino, size);
     }
 
-    memset(&qidp->path, 0, sizeof(qidp->path));
-    size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
-    memcpy(&qidp->path, &stbuf->st_ino, size);
     qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
     qidp->type = 0;
     if (S_ISDIR(stbuf->st_mode)) {
@@ -3676,6 +3749,10 @@  int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
 
     s->dev_id = 0;
 
+    /* QID path hash table. 1 entry ought to be enough for anybody ;) */
+    qht_init(&s->qpp_table, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE);
+    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
+
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
 
@@ -3689,6 +3766,7 @@  out:
         }
         g_free(s->tag);
         g_free(s->ctx.fs_root);
+        qpp_table_destroy(&s->qpp_table);
         v9fs_path_free(&path);
     }
     return rc;
@@ -3701,6 +3779,7 @@  void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
     }
     fsdev_throttle_cleanup(s->ctx.fst);
     g_free(s->tag);
+    qpp_table_destroy(&s->qpp_table);
     g_free(s->ctx.fs_root);
 }
 
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 5e316178d5..0200e04176 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -8,6 +8,7 @@ 
 #include "fsdev/9p-iov-marshal.h"
 #include "qemu/thread.h"
 #include "qemu/coroutine.h"
+#include "qemu/qht.h"
 
 enum {
     P9_TLERROR = 6,
@@ -235,6 +236,15 @@  struct V9fsFidState
     V9fsFidState *rclm_lst;
 };
 
+#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+
+/* QID path prefix entry, see stat_to_qid */
+typedef struct {
+    dev_t dev;
+    uint16_t ino_prefix;
+    uint16_t qp_prefix;
+} QppEntry;
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
@@ -257,6 +267,8 @@  struct V9fsState
     V9fsConf fsconf;
     V9fsQID root_qid;
     dev_t dev_id;
+    struct qht qpp_table;
+    uint16_t qp_prefix_next;
 };
 
 /* 9p2000.L open flags */
diff --git a/qemu-options.hx b/qemu-options.hx
index 0d8beb4afd..e7ea136da1 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1334,7 +1334,7 @@  ETEXI
 
 DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
     "-virtfs local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-file|passthrough|none\n"
-    "        [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n"
+    "        [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,remap_inodes]\n"
     "-virtfs proxy,mount_tag=tag,socket=socket[,id=id][,writeout=immediate][,readonly]\n"
     "-virtfs proxy,mount_tag=tag,sock_fd=sock_fd[,id=id][,writeout=immediate][,readonly]\n"
     "-virtfs synth,mount_tag=tag[,id=id][,readonly]\n",
@@ -1342,7 +1342,7 @@  DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
 
 STEXI
 
-@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}]
+@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}][,remap_inodes]
 @itemx -virtfs proxy,socket=@var{socket},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]
 @itemx -virtfs proxy,sock_fd=@var{sock_fd},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]
 @itemx -virtfs synth,mount_tag=@var{mount_tag}
@@ -1398,6 +1398,19 @@  Specifies the default mode for newly created directories on the host. Works
 only with security models "mapped-xattr" and "mapped-file".
 @item mount_tag=@var{mount_tag}
 Specifies the tag name to be used by the guest to mount this export point.
+@item remap_inodes
+By default virtfs 9p supports only one device per export in order to avoid
+file ID collisions on guest which may otherwise happen because the original
+device IDs from host are not passed and exposed on guest. Instead all files
+of an export shared with virtfs do have the same device id on guest. So two
+files with identical inode numbers but from actually different devices on
+host would otherwise cause a file ID collision and hence potential
+misbehaviours on guest. For that reason it is recommended to create a
+separate virtfs export for each device to be shared with guests. However
+you may also enable "remap_inodes" which allows you to share multiple
+devices with only one export instead, which is achieved by remapping the
+original inode numbers from host to guest in a way that would prevent such
+collisions.
 @end table
 ETEXI
 
diff --git a/vl.c b/vl.c
index 99a56b5556..de9d7b846c 100644
--- a/vl.c
+++ b/vl.c
@@ -3457,6 +3457,9 @@  int main(int argc, char **argv, char **envp)
                 qemu_opt_set_bool(fsdev, "readonly",
                                   qemu_opt_get_bool(opts, "readonly", 0),
                                   &error_abort);
+                qemu_opt_set_bool(fsdev, "remap_inodes",
+                                  qemu_opt_get_bool(opts, "remap_inodes", 0),
+                                  &error_abort);
                 device = qemu_opts_create(qemu_find_opts("device"), NULL, 0,
                                           &error_abort);
                 qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort);