
[v6,3/5] 9pfs: add new function v9fs_co_readdir_many()

Message ID fdb0e29a86d1df6005021a08078d7e69ed0de1a2.1587309014.git.qemu_oss@crudebyte.com
State New
Series 9pfs: readdir optimization

Commit Message

Christian Schoenebeck April 19, 2020, 3:02 p.m. UTC
The newly added function v9fs_co_readdir_many() retrieves multiple
directory entries with a single fs driver request. It is intended to
replace uses of v9fs_co_readdir(), which only retrieves a single
directory entry per fs driver request.

The reason for this planned replacement is that for every fs driver
request the coroutine is dispatched from the main I/O thread to a
background I/O thread and eventually dispatched back to the main I/O
thread. Hopping between threads adds latency, so if a 9pfs Treaddir
request reads a large number of directory entries, this currently
adds up to huge latencies of several hundred ms or even more. Using
v9fs_co_readdir_many() instead of v9fs_co_readdir() will therefore
provide significant performance improvements.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.h    |  22 ++++++
 hw/9pfs/codir.c | 181 +++++++++++++++++++++++++++++++++++++++++++++---
 hw/9pfs/coth.h  |   3 +
 3 files changed, 195 insertions(+), 11 deletions(-)

Comments

Greg Kurz April 30, 2020, 11:42 a.m. UTC | #1
On Sun, 19 Apr 2020 17:02:27 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> The newly added function v9fs_co_readdir_many() retrieves multiple
> directory entries with a single fs driver request. It is intended to
> replace uses of v9fs_co_readdir(), the latter only retrives a single
> directory entry per fs driver request instead.
> 
> The reason for this planned replacement is that for every fs driver
> request the coroutine is dispatched from main I/O thread to a
> background I/O thread and eventually dispatched back to main I/O
> thread. Hopping between threads adds latency. So if a 9pfs Treaddir
> request reads a large amount of directory entries, this currently
> sums up to huge latencies of several hundred ms or even more. So
> using v9fs_co_readdir_many() instead of v9fs_co_readdir() will
> provide significant performance improvements.
> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  hw/9pfs/9p.h    |  22 ++++++
>  hw/9pfs/codir.c | 181 +++++++++++++++++++++++++++++++++++++++++++++---
>  hw/9pfs/coth.h  |   3 +
>  3 files changed, 195 insertions(+), 11 deletions(-)
> 
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 9553700dbb..116977939b 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -215,6 +215,28 @@ static inline void v9fs_readdir_init(V9fsDir *dir)
>      qemu_mutex_init(&dir->readdir_mutex);
>  }
>  
> +/**
> + * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests,
> + * which is a chained list of directory entries.
> + */
> +typedef struct V9fsDirEnt {
> +    /* mandatory (must not be NULL) information for all readdir requests */
> +    struct dirent *dent;
> +    /*
> +     * optional (may be NULL): A full stat of each directory entry is just
> +     * done if explicitly told to fs driver.
> +     */
> +    struct stat *st;
> +    /*
> +     * instead of an array, directory entries are always returned as
> +     * chained list, that's because the amount of entries retrieved by fs
> +     * drivers is dependent on the individual entries' name (since response
> +     * messages are size limited), so the final amount cannot be estimated
> +     * before hand
> +     */
> +    struct V9fsDirEnt *next;
> +} V9fsDirEnt;
> +
>  /*
>   * Filled by fs driver on open and other
>   * calls.
> diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
> index 73f9a751e1..45c65a8f5b 100644
> --- a/hw/9pfs/codir.c
> +++ b/hw/9pfs/codir.c
> @@ -18,28 +18,187 @@
>  #include "qemu/main-loop.h"
>  #include "coth.h"
>  
> +/*
> + * This is solely executed on a background IO thread.
> + */
> +static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
> +{
> +    int err = 0;
> +    V9fsState *s = pdu->s;
> +    struct dirent *entry;
> +
> +    errno = 0;
> +    entry = s->ops->readdir(&s->ctx, &fidp->fs);
> +    if (!entry && errno) {
> +        *dent = NULL;
> +        err = -errno;
> +    } else {
> +        *dent = entry;
> +    }
> +    return err;
> +}
> +
> +/*
> + * TODO: This will be removed for performance reasons.
> + * Use v9fs_co_readdir_many() instead.
> + */
>  int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
>                                   struct dirent **dent)
>  {
>      int err;
> -    V9fsState *s = pdu->s;
>  
>      if (v9fs_request_cancelled(pdu)) {
>          return -EINTR;
>      }
> -    v9fs_co_run_in_worker(
> -        {
> -            struct dirent *entry;
> +    v9fs_co_run_in_worker({
> +        err = do_readdir(pdu, fidp, dent);
> +    });
> +    return err;
> +}
> +
> +/*
> + * This is solely executed on a background IO thread.
> + *
> + * See v9fs_co_readdir_many() (as its only user) below for details.
> + */
> +static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> +                             struct V9fsDirEnt **entries,
> +                             int32_t maxsize, bool dostat)
> +{
> +    V9fsState *s = pdu->s;
> +    V9fsString name;
> +    int len, err = 0;
> +    int32_t size = 0;
> +    off_t saved_dir_pos;
> +    struct dirent *dent;
> +    struct V9fsDirEnt *e = NULL;
> +    V9fsPath path;
> +    struct stat stbuf;
>  
> -            errno = 0;
> -            entry = s->ops->readdir(&s->ctx, &fidp->fs);
> -            if (!entry && errno) {
> +    *entries = NULL;
> +    v9fs_path_init(&path);
> +
> +    /*
> +     * TODO: Here should be a warn_report_once() if lock failed.
> +     *
> +     * With a good 9p client we should not get into concurrency here,
> +     * because a good client would not use the same fid for concurrent
> +     * requests. We do the lock here for safety reasons though. However
> +     * the client would then suffer performance issues, so better log that
> +     * issue here.
> +     */
> +    v9fs_readdir_lock(&fidp->fs.dir);

I agree that a client that issues concurrent readdir requests on the
same fid is probably asking for trouble, but this is permitted by the
spec. Whether we should detect such conditions and warn or even fail
is a discussion for another thread.

The locking is only needed to avoid concurrent accesses to the dirent
structure returned by readdir(), otherwise we could return partially
overwritten file names to the client. It must be done around each individual
call to readdir(), but certainly not held across multiple calls, as in the
sketch below.
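
To illustrate the intended scope, here is a minimal standalone sketch with a
plain pthread mutex and simplified stand-in types (i.e. not the actual QEMU
structures or helpers):

#include <dirent.h>
#include <pthread.h>
#include <stdbool.h>
#include <string.h>

/* simplified stand-in for the per-directory state, not the real V9fsDir */
struct dir_state {
    DIR *stream;
    pthread_mutex_t readdir_mutex;
};

/*
 * The lock only covers one readdir() call plus copying the shared dirent
 * into caller-provided storage; it is released again before the next call.
 */
static bool read_one_entry(struct dir_state *dir, struct dirent *out)
{
    struct dirent *entry;

    pthread_mutex_lock(&dir->readdir_mutex);
    entry = readdir(dir->stream);          /* may reuse a shared buffer */
    if (entry) {
        memcpy(out, entry, sizeof(*out));  /* copy while still locked */
    }
    pthread_mutex_unlock(&dir->readdir_mutex);

    return entry != NULL;
}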

As discussed privately, I'm working on a patch to better address the
locking and I'd really prefer to merge this before your series. Sorry
for the delay again. I'll try to post ASAP.

Anyway, I have some more remarks.

> +
> +    /* save the directory position */
> +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> +    if (saved_dir_pos < 0) {
> +        err = saved_dir_pos;
> +        goto out;
> +    }
> +
> +    while (true) {
> +        /* get directory entry from fs driver */
> +        err = do_readdir(pdu, fidp, &dent);
> +        if (err || !dent) {
> +            break;
> +        }
> +
> +        /*
> +         * stop this loop as soon as it would exceed the allowed maximum
> +         * response message size for the directory entries collected so far,
> +         * because anything beyond that size would need to be discarded by
> +         * 9p controller (main thread / top half) anyway
> +         */
> +        v9fs_string_init(&name);
> +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> +        len = v9fs_readdir_response_size(&name);
> +        v9fs_string_free(&name);
> +        if (size + len > maxsize) {
> +            /* this is not an error case actually */
> +            break;
> +        }
> +
> +        /* append next node to result chain */
> +        if (!e) {
> +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> +        } else {
> +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> +        }
> +        e->dent = g_malloc0(sizeof(struct dirent));

So we're allocating a bunch of stuff here...

> +        memcpy(e->dent, dent, sizeof(struct dirent));
> +
> +        /* perform a full stat() for directory entry if requested by caller */
> +        if (dostat) {
> +            err = s->ops->name_to_path(
> +                &s->ctx, &fidp->path, dent->d_name, &path
> +            );
> +            if (err < 0) {
>                  err = -errno;
> -            } else {
> -                *dent = entry;
> -                err = 0;
> +                break;

... but we're erroring out there and it seems that we're leaking
all the entries that have been allocated so far.

Also I have the impression that the whole if (dostat) { } block could
be done before chaining a new entry.

So, I think I'll just rebase your series to accommodate the locking
fix I've mentioned earlier, re-post the whole thing and let you
add the missing rollback. Sounds like a plan?

>              }
> -        });
> +
> +            err = s->ops->lstat(&s->ctx, &path, &stbuf);
> +            if (err < 0) {
> +                err = -errno;
> +                break;
> +            }
> +
> +            e->st = g_malloc0(sizeof(struct stat));
> +            memcpy(e->st, &stbuf, sizeof(struct stat));
> +        }
> +
> +        size += len;
> +        saved_dir_pos = dent->d_off;
> +    }
> +
> +    /* restore (last) saved position */
> +    s->ops->seekdir(&s->ctx, &fidp->fs, saved_dir_pos);
> +
> +out:
> +    v9fs_readdir_unlock(&fidp->fs.dir);
> +    v9fs_path_free(&path);
> +    if (err < 0) {
> +        return err;
> +    }
> +    return size;
> +}
> +
> +/**
> + * @brief Reads multiple directory entries in one rush.
> + *
> + * Retrieves the requested (max. amount of) directory entries from the fs
> + * driver. This function must only be called by the main IO thread (top half).
> + * Internally this function call will be dispatched to a background IO thread
> + * (bottom half) where it is eventually executed by the fs driver.
> + *
> + * Acquiring multiple directory entries in one rush from the fs driver,
> + * instead of retrieving each directory entry individually, is very beneficial
> + * from performance point of view. Because for every fs driver request latency
> + * is added, which in practice could lead to overall latencies of several
> + * hundred ms for reading all entries (of just a single directory) if every
> + * directory entry was individually requested from driver.
> + *
> + * @param pdu - the causing 9p (T_readdir) client request
> + * @param fidp - already opened directory where readdir shall be performed on
> + * @param entries - output for directory entries (must not be NULL)
> + * @param maxsize - maximum result message body size (in bytes)
> + * @param dostat - whether a stat() should be performed and returned for
> + *                 each directory entry
> + * @returns resulting response message body size (in bytes) on success,
> + *          negative error code otherwise
> + */
> +int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> +                                      struct V9fsDirEnt **entries,
> +                                      int32_t maxsize, bool dostat)
> +{
> +    int err = 0;
> +
> +    if (v9fs_request_cancelled(pdu)) {
> +        return -EINTR;
> +    }
> +    v9fs_co_run_in_worker({
> +        err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
> +    });
>      return err;
>  }
>  
> diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
> index c2cdc7a9ea..a6851822d5 100644
> --- a/hw/9pfs/coth.h
> +++ b/hw/9pfs/coth.h
> @@ -49,6 +49,9 @@
>  void co_run_in_worker_bh(void *);
>  int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
>  int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
> +int coroutine_fn v9fs_co_readdir_many(V9fsPDU *, V9fsFidState *,
> +                                      struct V9fsDirEnt **,
> +                                      int32_t, bool);
>  off_t coroutine_fn v9fs_co_telldir(V9fsPDU *, V9fsFidState *);
>  void coroutine_fn v9fs_co_seekdir(V9fsPDU *, V9fsFidState *, off_t);
>  void coroutine_fn v9fs_co_rewinddir(V9fsPDU *, V9fsFidState *);
Christian Schoenebeck April 30, 2020, 12:50 p.m. UTC | #2
On Donnerstag, 30. April 2020 13:42:35 CEST Greg Kurz wrote:
> > +/*
> > + * This is solely executed on a background IO thread.
> > + *
> > + * See v9fs_co_readdir_many() (as its only user) below for details.
> > + */
> > +static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> > +                             struct V9fsDirEnt **entries,
> > +                             int32_t maxsize, bool dostat)
> > +{
> > +    V9fsState *s = pdu->s;
> > +    V9fsString name;
> > +    int len, err = 0;
> > +    int32_t size = 0;
> > +    off_t saved_dir_pos;
> > +    struct dirent *dent;
> > +    struct V9fsDirEnt *e = NULL;
> > +    V9fsPath path;
> > +    struct stat stbuf;
> > 
> > -            errno = 0;
> > -            entry = s->ops->readdir(&s->ctx, &fidp->fs);
> > -            if (!entry && errno) {
> > +    *entries = NULL;
> > +    v9fs_path_init(&path);
> > +
> > +    /*
> > +     * TODO: Here should be a warn_report_once() if lock failed.
> > +     *
> > +     * With a good 9p client we should not get into concurrency here,
> > +     * because a good client would not use the same fid for concurrent
> > +     * requests. We do the lock here for safety reasons though. However
> > +     * the client would then suffer performance issues, so better log
> > that
> > +     * issue here.
> > +     */
> > +    v9fs_readdir_lock(&fidp->fs.dir);
> 
> I agree that a client that issues concurrent readdir requests on the
> same fid is probably asking for troubles, but this permitted by the
> spec. Whether we should detect such conditions and warn or even fail
> is discussion for another thread.
> 
> The locking is only needed to avoid concurrent accesses to the dirent
> structure returned by readdir(), otherwise we could return partially
> overwritten file names to the client. It must be done for each individual
> call to readdir(), but certainly not for multiple calls.

Yeah, that would resolve this issue more appropriately for 9p2000.L, since
Treaddir specifies an offset, but for 9p2000.u the result of a concurrent read
on a directory would still be undefined.

> As discussed privately, I'm working on a patch to better address the
> locking and I'd really prefer to merge this before your series. Sorry
> for the delay again. I'll try to post ASAP.
> 
> Anyway, I have some more remarks.
> 
> > +
> > +    /* save the directory position */
> > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > +    if (saved_dir_pos < 0) {
> > +        err = saved_dir_pos;
> > +        goto out;
> > +    }
> > +
> > +    while (true) {
> > +        /* get directory entry from fs driver */
> > +        err = do_readdir(pdu, fidp, &dent);
> > +        if (err || !dent) {
> > +            break;
> > +        }
> > +
> > +        /*
> > +         * stop this loop as soon as it would exceed the allowed maximum
> > +         * response message size for the directory entries collected so
> > far, +         * because anything beyond that size would need to be
> > discarded by +         * 9p controller (main thread / top half) anyway
> > +         */
> > +        v9fs_string_init(&name);
> > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > +        len = v9fs_readdir_response_size(&name);
> > +        v9fs_string_free(&name);
> > +        if (size + len > maxsize) {
> > +            /* this is not an error case actually */
> > +            break;
> > +        }
> > +
> > +        /* append next node to result chain */
> > +        if (!e) {
> > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > +        } else {
> > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > +        }
> > +        e->dent = g_malloc0(sizeof(struct dirent));
> 
> So we're allocating a bunch of stuff here...
> 
> > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > +
> > +        /* perform a full stat() for directory entry if requested by
> > caller */ +        if (dostat) {
> > +            err = s->ops->name_to_path(
> > +                &s->ctx, &fidp->path, dent->d_name, &path
> > +            );
> > +            if (err < 0) {
> > 
> >                  err = -errno;
> > 
> > -            } else {
> > -                *dent = entry;
> > -                err = 0;
> > +                break;
> 
> ... but we're erroring out there and it seems that we're leaking
> all the entries that have been allocated so far.

No, they are not leaking actually.

You are right that they are not deallocated in do_readdir_many(), but that's
intentional: in the new implementation of v9fs_do_readdir() you see that
v9fs_free_dirents(entries) is *always* called at the very end of the function,
no matter whether it succeeded or failed. That's one of the measures to
simplify the overall code as much as possible.
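
Schematically, the calling pattern looks like this (just a sketch to
illustrate the point, not the actual v9fs_do_readdir() of the later patch;
the helper name and the omitted marshalling are made up here):

static int coroutine_fn example_readdir_caller(V9fsPDU *pdu,
                                               V9fsFidState *fidp,
                                               int32_t maxsize)
{
    struct V9fsDirEnt *entries = NULL;
    struct V9fsDirEnt *e;
    int err;

    err = v9fs_co_readdir_many(pdu, fidp, &entries, maxsize, false);
    if (err >= 0) {
        for (e = entries; e; e = e->next) {
            /* marshal e->dent (and e->st, if requested) into the reply */
        }
    }

    /* always called, independent of success or error */
    v9fs_free_dirents(entries);
    return err;
}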

As you might have noticed, the previous/current v9fs_do_readdir()
implementation had quite a bunch of individual error paths, which is quite
error prone or at least makes it difficult to maintain. So I think it makes
sense to strip unnecessary branches as much as possible.

> Also I have the impression that all the if (dostat) { } block could
> be done before chaining a new entry.

Yes, you could move it forward, but what would that buy you?

I think you mean the case when there's an error inside the if (dostat) {}
block: the comments on struct V9fsDirEnt already suggest that the "st" member
is optional and may be NULL. So if there's an error inside if (dostat) {}
then the caller still has a valid "dent" field at least, and it's up to the
caller whether or not it's a problem for its purpose that "st" is empty. For
that reason I would not move the block forward.

Best regards,
Christian Schoenebeck
Greg Kurz April 30, 2020, 1:30 p.m. UTC | #3
On Thu, 30 Apr 2020 14:50:31 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Donnerstag, 30. April 2020 13:42:35 CEST Greg Kurz wrote:
> > > +/*
> > > + * This is solely executed on a background IO thread.
> > > + *
> > > + * See v9fs_co_readdir_many() (as its only user) below for details.
> > > + */
> > > +static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> > > +                             struct V9fsDirEnt **entries,
> > > +                             int32_t maxsize, bool dostat)
> > > +{
> > > +    V9fsState *s = pdu->s;
> > > +    V9fsString name;
> > > +    int len, err = 0;
> > > +    int32_t size = 0;
> > > +    off_t saved_dir_pos;
> > > +    struct dirent *dent;
> > > +    struct V9fsDirEnt *e = NULL;
> > > +    V9fsPath path;
> > > +    struct stat stbuf;
> > > 
> > > -            errno = 0;
> > > -            entry = s->ops->readdir(&s->ctx, &fidp->fs);
> > > -            if (!entry && errno) {
> > > +    *entries = NULL;
> > > +    v9fs_path_init(&path);
> > > +
> > > +    /*
> > > +     * TODO: Here should be a warn_report_once() if lock failed.
> > > +     *
> > > +     * With a good 9p client we should not get into concurrency here,
> > > +     * because a good client would not use the same fid for concurrent
> > > +     * requests. We do the lock here for safety reasons though. However
> > > +     * the client would then suffer performance issues, so better log
> > > that
> > > +     * issue here.
> > > +     */
> > > +    v9fs_readdir_lock(&fidp->fs.dir);
> > 
> > I agree that a client that issues concurrent readdir requests on the
> > same fid is probably asking for troubles, but this permitted by the
> > spec. Whether we should detect such conditions and warn or even fail
> > is discussion for another thread.
> > 
> > The locking is only needed to avoid concurrent accesses to the dirent
> > structure returned by readdir(), otherwise we could return partially
> > overwritten file names to the client. It must be done for each individual
> > call to readdir(), but certainly not for multiple calls.
> 
> Yeah, that would resolve this issue more appropriately for 9p2000.L, since 
> Treaddir specifies an offset, but for 9p2000.u the result of a concurrent read 
> on a directory (9p2000.u) would still be undefined.
> 

The bad client behavior you want to tackle has nothing to do with
the locking itself. Since all the code in 9p.c runs serialized in
the context of the QEMU main loop, concurrent readdir requests could
easily be detected up-front with a simple flag in the fid structure.
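
For illustration, a minimal standalone sketch of such an up-front check
(simplified stand-in types; the flag name is made up and not an existing
field of V9fsFidState):

#include <stdbool.h>
#include <stdio.h>

/* simplified stand-in for the fid state; the flag below is hypothetical */
struct fid_state {
    bool readdir_in_progress;
};

/* only ever called from the main I/O thread, so no locking is needed */
static int readdir_request_begin(struct fid_state *fid)
{
    if (fid->readdir_in_progress) {
        fprintf(stderr, "9pfs: concurrent readdir on the same fid\n");
        return -1;                       /* or just warn and carry on */
    }
    fid->readdir_in_progress = true;
    return 0;
}

static void readdir_request_end(struct fid_state *fid)
{
    /* cleared when the request completes or fails */
    fid->readdir_in_progress = false;
}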

> > As discussed privately, I'm working on a patch to better address the
> > locking and I'd really prefer to merge this before your series. Sorry
> > for the delay again. I'll try to post ASAP.
> > 
> > Anyway, I have some more remarks.
> > 
> > > +
> > > +    /* save the directory position */
> > > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > > +    if (saved_dir_pos < 0) {
> > > +        err = saved_dir_pos;
> > > +        goto out;
> > > +    }
> > > +
> > > +    while (true) {
> > > +        /* get directory entry from fs driver */
> > > +        err = do_readdir(pdu, fidp, &dent);
> > > +        if (err || !dent) {
> > > +            break;
> > > +        }
> > > +
> > > +        /*
> > > +         * stop this loop as soon as it would exceed the allowed maximum
> > > +         * response message size for the directory entries collected so
> > > far, +         * because anything beyond that size would need to be
> > > discarded by +         * 9p controller (main thread / top half) anyway
> > > +         */
> > > +        v9fs_string_init(&name);
> > > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > > +        len = v9fs_readdir_response_size(&name);
> > > +        v9fs_string_free(&name);
> > > +        if (size + len > maxsize) {
> > > +            /* this is not an error case actually */
> > > +            break;
> > > +        }
> > > +
> > > +        /* append next node to result chain */
> > > +        if (!e) {
> > > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > > +        } else {
> > > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > > +        }
> > > +        e->dent = g_malloc0(sizeof(struct dirent));
> > 
> > So we're allocating a bunch of stuff here...
> > 
> > > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > > +
> > > +        /* perform a full stat() for directory entry if requested by
> > > caller */ +        if (dostat) {
> > > +            err = s->ops->name_to_path(
> > > +                &s->ctx, &fidp->path, dent->d_name, &path
> > > +            );
> > > +            if (err < 0) {
> > > 
> > >                  err = -errno;
> > > 
> > > -            } else {
> > > -                *dent = entry;
> > > -                err = 0;
> > > +                break;
> > 
> > ... but we're erroring out there and it seems that we're leaking
> > all the entries that have been allocated so far.
> 
> No, they are not leaking actually.
> 
> You are right that they are not deallocated in do_readdir_many(), but that's 
> intentional: in the new implementation of v9fs_do_readdir() you see that 
> v9fs_free_dirents(entries) is *always* called at the very end of the function, 
> no matter if success or any error. That's one of the measures to simplify 
> overall code as much as possible.
> 

Hmm... I still don't quite like the idea of a function that can fail but
still requires extra cleanup by the caller. I suggest you come up with an
idempotent version of v9fs_free_dirents(), move it to codir.c (I also prefer
locality of calls to g_malloc and g_free in the same unit), make it extern
and call it both on the error path of v9fs_co_readdir_many() and in
v9fs_do_readdir().
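
Something along these lines, for instance (just a sketch of the idea; the
actual series may well keep a plain-pointer signature instead of resetting
the list head in place):

/*
 * Releases a result chain and resets the head, so a repeated call (or a
 * call on an empty list) is a harmless no-op.
 */
void v9fs_free_dirents(struct V9fsDirEnt **entries)
{
    struct V9fsDirEnt *e = *entries;
    struct V9fsDirEnt *next;

    while (e) {
        next = e->next;
        g_free(e->dent);
        g_free(e->st);      /* g_free(NULL) is safe; st is optional */
        g_free(e);
        e = next;
    }
    *entries = NULL;
}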

> As you might have noticed, the previous/current v9fs_do_readdir() 
> implementation had quite a bunch of individual error pathes, which is quite 
> error prone or at least makes it difficult to maintain. So I think it makes 
> sense to strip unnecessary branches as much as possible.
> 
> > Also I have the impression that all the if (dostat) { } block could
> > be done before chaining a new entry.
> 
> Yes, you could move it forward, but what would you buy from that?
> 

It just seems like better practice to do the things that can fail up front.

> I think you mean the case when there's an error inside the if (dostat) {} 
> block: The comments on struct V9fsDirEnt already suggest that the "st" member 
> is optional and may be NULL. So if there's an error inside if (dostat) {}
> then caller still has a valid "dent" field at least and it's up to caller 
> whether or not it's a problem for its purpose that "st" is empty. For that 
> reason I would not move the block forward.
> 

Hrm... the comment is:

    /*
     * optional (may be NULL): A full stat of each directory entry is just
     * done if explicitly told to fs driver.
     */

I don't read this as saying that it is optional for the fs driver to populate
"st" if this was required by the caller. Also, looking at the next patch
I see that the condition for calling stat() is V9FS_REMAP_INODES and
the code explicitly requires "st" to be available in this case.

> Best regards,
> Christian Schoenebeck
> 
> 

Cheers,

--
Greg
Christian Schoenebeck May 1, 2020, 2:04 p.m. UTC | #4
On Donnerstag, 30. April 2020 15:30:49 CEST Greg Kurz wrote:
> > > I agree that a client that issues concurrent readdir requests on the
> > > same fid is probably asking for troubles, but this permitted by the
> > > spec. Whether we should detect such conditions and warn or even fail
> > > is discussion for another thread.
> > > 
> > > The locking is only needed to avoid concurrent accesses to the dirent
> > > structure returned by readdir(), otherwise we could return partially
> > > overwritten file names to the client. It must be done for each
> > > individual
> > > call to readdir(), but certainly not for multiple calls.
> > 
> > Yeah, that would resolve this issue more appropriately for 9p2000.L, since
> > Treaddir specifies an offset, but for 9p2000.u the result of a concurrent
> > read on a directory (9p2000.u) would still be undefined.
> 
> The bad client behavior you want to tackle has nothing to do with
> the locking itself. Since all the code in 9p.c runs serialized in
> the context of the QEMU main loop, concurrent readdir requests could
> easily be detected up-front with a simple flag in the fid structure.

Well, it's fine with me. I don't really see an issue here right now. But the
claim that all the code is serialized is not fully true. Most of the 9p.c code
is still heavily dispatching between the main thread and worker threads back
and forth, and for that reason the order of request processing might change
quite arbitrarily in between. Just keep that in mind.

> > > > +
> > > > +    /* save the directory position */
> > > > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > > > +    if (saved_dir_pos < 0) {
> > > > +        err = saved_dir_pos;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    while (true) {
> > > > +        /* get directory entry from fs driver */
> > > > +        err = do_readdir(pdu, fidp, &dent);
> > > > +        if (err || !dent) {
> > > > +            break;
> > > > +        }
> > > > +
> > > > +        /*
> > > > +         * stop this loop as soon as it would exceed the allowed
> > > > maximum
> > > > +         * response message size for the directory entries collected
> > > > so
> > > > far, +         * because anything beyond that size would need to be
> > > > discarded by +         * 9p controller (main thread / top half) anyway
> > > > +         */
> > > > +        v9fs_string_init(&name);
> > > > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > > > +        len = v9fs_readdir_response_size(&name);
> > > > +        v9fs_string_free(&name);
> > > > +        if (size + len > maxsize) {
> > > > +            /* this is not an error case actually */
> > > > +            break;
> > > > +        }
> > > > +
> > > > +        /* append next node to result chain */
> > > > +        if (!e) {
> > > > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > > > +        } else {
> > > > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > > > +        }
> > > > +        e->dent = g_malloc0(sizeof(struct dirent));
> > > 
> > > So we're allocating a bunch of stuff here...
> > > 
> > > > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > > > +
> > > > +        /* perform a full stat() for directory entry if requested by
> > > > caller */ +        if (dostat) {
> > > > +            err = s->ops->name_to_path(
> > > > +                &s->ctx, &fidp->path, dent->d_name, &path
> > > > +            );
> > > > +            if (err < 0) {
> > > > 
> > > >                  err = -errno;
> > > > 
> > > > -            } else {
> > > > -                *dent = entry;
> > > > -                err = 0;
> > > > +                break;
> > > 
> > > ... but we're erroring out there and it seems that we're leaking
> > > all the entries that have been allocated so far.
> > 
> > No, they are not leaking actually.
> > 
> > You are right that they are not deallocated in do_readdir_many(), but
> > that's intentional: in the new implementation of v9fs_do_readdir() you
> > see that v9fs_free_dirents(entries) is *always* called at the very end of
> > the function, no matter if success or any error. That's one of the
> > measures to simplify overall code as much as possible.
> 
> Hmm... I still don't quite like the idea of having an erroring function
> asking for extra cleanup. I suggest you come up with an idem-potent version
> of v9fs_free_dirents(), move it to codir.c (I also prefer locality of calls
> to g_malloc and g_free in the same unit), make it extern and call it
> both on the error path of v9fs_co_readdir_many() and in v9fs_do_readdir().

I understand your position of course, but I still don't find that to be a good
move.

My veto here has a reason: your requested change would prevent an application
that I actually had in mind for a future purpose: allowing "greedy" fetching of
directory entries, that is, returning all successfully read entries even though
some of the entries may have errored for some reason. Think of a directory
where one entry is another device which errors for some reason; a user would
probably still want to see the other entries at least. I know that purpose
would still need some minor adjustments, but no changes to the current
structure of this function actually.

But to also make this clear: there is no "extra" cleanup right now.
v9fs_free_dirents(entries) is *always* called by the caller, no matter whether
it is success or error, an allocated list or NULL. It couldn't be simpler. A
user of this function must call v9fs_free_dirents(entries) at a certain point
anyway.

If you have a bad feeling about it, I can make it clearer with an API doc
comment on v9fs_co_readdir_many() if you want, e.g. "You @b MUST @b ALWAYS
call v9fs_free_dirents() after using this function, on both success and
error.".

> > I think you mean the case when there's an error inside the if (dostat) {}
> > block: The comments on struct V9fsDirEnt already suggest that the "st"
> > member is optional and may be NULL. So if there's an error inside if
> > (dostat) {} then caller still has a valid "dent" field at least and it's
> > up to caller whether or not it's a problem for its purpose that "st" is
> > empty. For that reason I would not move the block forward.
> 
> Hrm... the comment is:
> 
>     /*
>      * optional (may be NULL): A full stat of each directory entry is just
>      * done if explicitly told to fs driver.
>      */
> 
> I don't read that it is optional for the fs driver to populate "st"
> if this was required by the caller.

Well, if you find that ambiguous, I could add an additional sentence "It might
also be NULL if stat failed.".

> Also, looking at the next patch
> I see that the condition for calling stat() is V9FS_REMAP_INODES and
> the code explicitly requires "st" to be available in this case.

Yes, but there is also an if (!st) { ... } in the subsequent patch already. So 
I don't see an issue here.

Changing the order in readdir_many() would prevent the future applications of
that function that I described. Or let's say, the order would need to be
reverted again later on.

Best regards,
Christian Schoenebeck
Greg Kurz May 4, 2020, 9:18 a.m. UTC | #5
On Fri, 01 May 2020 16:04:41 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Donnerstag, 30. April 2020 15:30:49 CEST Greg Kurz wrote:
> > > > I agree that a client that issues concurrent readdir requests on the
> > > > same fid is probably asking for troubles, but this permitted by the
> > > > spec. Whether we should detect such conditions and warn or even fail
> > > > is discussion for another thread.
> > > > 
> > > > The locking is only needed to avoid concurrent accesses to the dirent
> > > > structure returned by readdir(), otherwise we could return partially
> > > > overwritten file names to the client. It must be done for each
> > > > individual
> > > > call to readdir(), but certainly not for multiple calls.
> > > 
> > > Yeah, that would resolve this issue more appropriately for 9p2000.L, since
> > > Treaddir specifies an offset, but for 9p2000.u the result of a concurrent
> > > read on a directory (9p2000.u) would still be undefined.
> > 
> > The bad client behavior you want to tackle has nothing to do with
> > the locking itself. Since all the code in 9p.c runs serialized in
> > the context of the QEMU main loop, concurrent readdir requests could
> > easily be detected up-front with a simple flag in the fid structure.
> 
> Well, it's fine with me. I don't really see an issue here right now. But that 
> all the code was serialized is not fully true. Most of the 9p.c code is still 
> heavily dispatching between main thread and worker threads back and forth. And 
> for that reason the order of request processing might change quite arbitrarily 
> in between. Just keep that in mind.
> 

Just to make things clear: the code in 9p.c is ALWAYS exclusively run by
the main thread. Only the code called under v9fs_co_run_in_worker() is
dispatched on worker threads. So, yes, the order of individual backend
operations may change, but the start of a new client request is necessarily
serialized with the completion of pending ones, which is the only thing
we actually care about.

> > > > > +
> > > > > +    /* save the directory position */
> > > > > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > > > > +    if (saved_dir_pos < 0) {
> > > > > +        err = saved_dir_pos;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    while (true) {
> > > > > +        /* get directory entry from fs driver */
> > > > > +        err = do_readdir(pdu, fidp, &dent);
> > > > > +        if (err || !dent) {
> > > > > +            break;
> > > > > +        }
> > > > > +
> > > > > +        /*
> > > > > +         * stop this loop as soon as it would exceed the allowed
> > > > > maximum
> > > > > +         * response message size for the directory entries collected
> > > > > so
> > > > > far, +         * because anything beyond that size would need to be
> > > > > discarded by +         * 9p controller (main thread / top half) anyway
> > > > > +         */
> > > > > +        v9fs_string_init(&name);
> > > > > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > > > > +        len = v9fs_readdir_response_size(&name);
> > > > > +        v9fs_string_free(&name);
> > > > > +        if (size + len > maxsize) {
> > > > > +            /* this is not an error case actually */
> > > > > +            break;
> > > > > +        }
> > > > > +
> > > > > +        /* append next node to result chain */
> > > > > +        if (!e) {
> > > > > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > > > > +        } else {
> > > > > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > > > > +        }
> > > > > +        e->dent = g_malloc0(sizeof(struct dirent));
> > > > 
> > > > So we're allocating a bunch of stuff here...
> > > > 
> > > > > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > > > > +
> > > > > +        /* perform a full stat() for directory entry if requested by
> > > > > caller */ +        if (dostat) {
> > > > > +            err = s->ops->name_to_path(
> > > > > +                &s->ctx, &fidp->path, dent->d_name, &path
> > > > > +            );
> > > > > +            if (err < 0) {
> > > > > 
> > > > >                  err = -errno;
> > > > > 
> > > > > -            } else {
> > > > > -                *dent = entry;
> > > > > -                err = 0;
> > > > > +                break;
> > > > 
> > > > ... but we're erroring out there and it seems that we're leaking
> > > > all the entries that have been allocated so far.
> > > 
> > > No, they are not leaking actually.
> > > 
> > > You are right that they are not deallocated in do_readdir_many(), but
> > > that's intentional: in the new implementation of v9fs_do_readdir() you
> > > see that v9fs_free_dirents(entries) is *always* called at the very end of
> > > the function, no matter if success or any error. That's one of the
> > > measures to simplify overall code as much as possible.
> > 
> > Hmm... I still don't quite like the idea of having an erroring function
> > asking for extra cleanup. I suggest you come up with an idem-potent version
> > of v9fs_free_dirents(), move it to codir.c (I also prefer locality of calls
> > to g_malloc and g_free in the same unit), make it extern and call it
> > both on the error path of v9fs_co_readdir_many() and in v9fs_do_readdir().
> 
> I understand your position of course, but I still won't find that to be a good 
> move.
> 
> My veto here has a reason: your requested change would prevent an application 
> that I had in mind for future purpose actually: Allowing "greedy" fetching 

Are you saying that this series has some kind of hidden agenda related to
a possible future change?!?

> directory entries, that is returning all successful read entries while some of 
> the entries might have been errored for some reason. Think about a directory 
> where one entry is another device which errors for some reason, then a user 
> probably still would want to see the other entries at least. I know that 

Hrm... if dostat is such a weak requirement that callers might want to
simply ignore the missing stat, then readdir_many() shouldn't error out
in the first place.
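
In other words, the dostat block could shrink to something like the
following fragment (only an illustration of that alternative, not code from
the series; the driver calls are the ones already used in the patch):

/* best effort: a failed stat leaves e->st == NULL and the loop continues */
if (dostat) {
    if (s->ops->name_to_path(&s->ctx, &fidp->path,
                             dent->d_name, &path) >= 0 &&
        s->ops->lstat(&s->ctx, &path, &stbuf) >= 0) {
        e->st = g_malloc0(sizeof(struct stat));
        memcpy(e->st, &stbuf, sizeof(struct stat));
    }
}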

> purpose would still need some minor adjustments, but no changes to the current 
> structure of this function actually.
> 
> But to also make this clear: there is no "extra" cleanup right now. 
> v9fs_free_dirents(entries) is *always* called by caller, no matter if success, 
> error, allocated list or NULL. It couldn't be more simple. A user of this 
> function must call v9fs_free_dirents(entries) at a certain point anyway.
> 
> If you have a bad feeling about it, I can make it more clear by an API doc 
> comment on v9fs_co_readdir_many() if you want, like e.g. "You @b MUST @ ALWAYS 
> call v9fs_free_dirents() after using this function, both on success or 
> error.".
> 

No, I just cannot ack the case of a function that returns partially valid
data and an error at the same time. Doesn't look sane to me.

> > > I think you mean the case when there's an error inside the if (dostat) {}
> > > block: The comments on struct V9fsDirEnt already suggest that the "st"
> > > member is optional and may be NULL. So if there's an error inside if
> > > (dostat) {} then caller still has a valid "dent" field at least and it's
> > > up to caller whether or not it's a problem for its purpose that "st" is
> > > empty. For that reason I would not move the block forward.
> > 
> > Hrm... the comment is:
> > 
> >     /*
> >      * optional (may be NULL): A full stat of each directory entry is just
> >      * done if explicitly told to fs driver.
> >      */
> > 
> > I don't read that it is optional for the fs driver to populate "st"
> > if this was required by the caller.
> 
> Well, if you find that ambigious, I could add an additional sentence "It might 
> also be NULL if stat failed.".
> 
> > Also, looking at the next patch
> > I see that the condition for calling stat() is V9FS_REMAP_INODES and
> > the code explicitly requires "st" to be available in this case.
> 
> Yes, but there is also an if (!st) { ... } in the subsequent patch already. So 
> I don't see an issue here.
> 
> Changing the order in readdir_many() would prevent future applications of that 
> function that I described. Or let's say, order would need to be reverted back 
> again later on.
> 
> Best regards,
> Christian Schoenebeck
> 
>
Christian Schoenebeck May 4, 2020, 10:08 a.m. UTC | #6
On Montag, 4. Mai 2020 11:18:34 CEST Greg Kurz wrote:
> > > > > > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > > > > > +
> > > > > > +        /* perform a full stat() for directory entry if requested
> > > > > > by
> > > > > > caller */ +        if (dostat) {
> > > > > > +            err = s->ops->name_to_path(
> > > > > > +                &s->ctx, &fidp->path, dent->d_name, &path
> > > > > > +            );
> > > > > > +            if (err < 0) {
> > > > > > 
> > > > > >                  err = -errno;
> > > > > > 
> > > > > > -            } else {
> > > > > > -                *dent = entry;
> > > > > > -                err = 0;
> > > > > > +                break;
> > > > > 
> > > > > ... but we're erroring out there and it seems that we're leaking
> > > > > all the entries that have been allocated so far.
> > > > 
> > > > No, they are not leaking actually.
> > > > 
> > > > You are right that they are not deallocated in do_readdir_many(), but
> > > > that's intentional: in the new implementation of v9fs_do_readdir() you
> > > > see that v9fs_free_dirents(entries) is *always* called at the very end
> > > > of
> > > > the function, no matter if success or any error. That's one of the
> > > > measures to simplify overall code as much as possible.
> > > 
> > > Hmm... I still don't quite like the idea of having an erroring function
> > > asking for extra cleanup. I suggest you come up with an idem-potent
> > > version
> > > of v9fs_free_dirents(), move it to codir.c (I also prefer locality of
> > > calls
> > > to g_malloc and g_free in the same unit), make it extern and call it
> > > both on the error path of v9fs_co_readdir_many() and in
> > > v9fs_do_readdir().
> > 
> > I understand your position of course, but I still won't find that to be a
> > good move.
> > 
> > My veto here has a reason: your requested change would prevent an
> > application that I had in mind for future purpose actually: Allowing
> > "greedy" fetching
> Are you telling that this series has some kind of hidden agenda related to
> a possible future change ?!?

readdir_many() is intentionally written as a general-purpose directory
retrieval function, that is, with other future purposes in mind, yes.

What I don't do, of course, is add code which is not explicitly needed right
now. That would not make sense and would make the code unnecessarily bloated
and too complicated (e.g. readdir_many() currently simply calls
v9fs_readdir_response_size() directly to decide whether to terminate the
loop, instead of taking some complicated general-purpose loop-end "predicate"
structure or callback as a function argument).

But when it comes to the structure of the code that I have to add NOW, then I
indeed take potential future changes into account, yes! And this applies
specifically to the two changes you requested here inside readdir_many(),
because I already know I would need to revert those 2 changes later on. And I
don't see any issue whatsoever with retaining the current version of those 2.

Best regards,
Christian Schoenebeck
Christian Schoenebeck May 7, 2020, 12:16 p.m. UTC | #7
On Montag, 4. Mai 2020 11:18:34 CEST Greg Kurz wrote:
> On Fri, 01 May 2020 16:04:41 +0200
> 
> Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > On Donnerstag, 30. April 2020 15:30:49 CEST Greg Kurz wrote:
> > > > > I agree that a client that issues concurrent readdir requests on the
> > > > > same fid is probably asking for troubles, but this permitted by the
> > > > > spec. Whether we should detect such conditions and warn or even fail
> > > > > is discussion for another thread.
> > > > > 
> > > > > The locking is only needed to avoid concurrent accesses to the
> > > > > dirent
> > > > > structure returned by readdir(), otherwise we could return partially
> > > > > overwritten file names to the client. It must be done for each
> > > > > individual
> > > > > call to readdir(), but certainly not for multiple calls.
> > > > 
> > > > Yeah, that would resolve this issue more appropriately for 9p2000.L,
> > > > since
> > > > Treaddir specifies an offset, but for 9p2000.u the result of a
> > > > concurrent
> > > > read on a directory (9p2000.u) would still be undefined.
> > > 
> > > The bad client behavior you want to tackle has nothing to do with
> > > the locking itself. Since all the code in 9p.c runs serialized in
> > > the context of the QEMU main loop, concurrent readdir requests could
> > > easily be detected up-front with a simple flag in the fid structure.
> > 
> > Well, it's fine with me. I don't really see an issue here right now. But
> > that all the code was serialized is not fully true. Most of the 9p.c code
> > is still heavily dispatching between main thread and worker threads back
> > and forth. And for that reason the order of request processing might
> > change quite arbitrarily in between. Just keep that in mind.
> 
> Just to make things clear. The code in 9p.c is ALWAYS exclusively run by
> the main thread. Only the code called under v9fs_co_run_in_worker() is
> dispatched on worker threads. So, yes the order of individual backend
> operations may change, but the start of a new client request is necessarily
> serialized with the completion of pending ones, which is the only thing
> we care for actually.

I just looked at this. 9p.c code is called by the main I/O thread only, that's
clear. Requests also start in order, yes, but you seem to assume that the main
I/O thread would not grab the next client request from the queue before
completing the current/previous client request (that is, not before
transmitting the result to the client). If so, I am not so sure about this
claim:

For instance, v9fs_path_write_lock() is using a co-mutex, right? So an
unsuccessful lock would cause the main I/O thread to grab the next request
before completing the current/previous request.

And what happens on any run_in_worker({}) call? If there is another client 
request in the queue, main I/O thread would pull that request from the queue 
before waiting for the worker thread to complete its task, wouldn't it?

Just looked at the code so far, haven't tested it yet ...

Best regards,
Christian Schoenebeck
Greg Kurz May 7, 2020, 3:59 p.m. UTC | #8
On Thu, 07 May 2020 14:16:43 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Montag, 4. Mai 2020 11:18:34 CEST Greg Kurz wrote:
> > On Fri, 01 May 2020 16:04:41 +0200
> > 
> > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > On Donnerstag, 30. April 2020 15:30:49 CEST Greg Kurz wrote:
> > > > > > I agree that a client that issues concurrent readdir requests on the
> > > > > > same fid is probably asking for troubles, but this permitted by the
> > > > > > spec. Whether we should detect such conditions and warn or even fail
> > > > > > is discussion for another thread.
> > > > > > 
> > > > > > The locking is only needed to avoid concurrent accesses to the
> > > > > > dirent
> > > > > > structure returned by readdir(), otherwise we could return partially
> > > > > > overwritten file names to the client. It must be done for each
> > > > > > individual
> > > > > > call to readdir(), but certainly not for multiple calls.
> > > > > 
> > > > > Yeah, that would resolve this issue more appropriately for 9p2000.L,
> > > > > since
> > > > > Treaddir specifies an offset, but for 9p2000.u the result of a
> > > > > concurrent
> > > > > read on a directory (9p2000.u) would still be undefined.
> > > > 
> > > > The bad client behavior you want to tackle has nothing to do with
> > > > the locking itself. Since all the code in 9p.c runs serialized in
> > > > the context of the QEMU main loop, concurrent readdir requests could
> > > > easily be detected up-front with a simple flag in the fid structure.
> > > 
> > > Well, it's fine with me. I don't really see an issue here right now. But
> > > that all the code was serialized is not fully true. Most of the 9p.c code
> > > is still heavily dispatching between main thread and worker threads back
> > > and forth. And for that reason the order of request processing might
> > > change quite arbitrarily in between. Just keep that in mind.
> > 
> > Just to make things clear. The code in 9p.c is ALWAYS exclusively run by
> > the main thread. Only the code called under v9fs_co_run_in_worker() is
> > dispatched on worker threads. So, yes the order of individual backend
> > operations may change, but the start of a new client request is necessarily
> > serialized with the completion of pending ones, which is the only thing
> > we care for actually.
> 
> I just looked at this. 9p.c code is called by main I/O thread only, that's 
> clear. The start of requests come also in in order, yes, but it seems you 
> would think that main I/O thread would not grab the next client request from 
> queue before completing the current/previous client request (that is not 
> before transmitting result to client). If yes, I am not so sure about this 
> claim:
> 

Hrm... I don't think I've ever said anything like that :)

What I'm saying is that, thanks to the serialization of
request start and completion, v9fs_readdir() and v9fs_read()
can:
- flag the fid as being involved in a readdir request
- clear the flag when the request is complete or failed
- directly fail or print a warning if the fid already
  has the flag set (ie. a previous request hasn't
  completed yet)

But again, I'm not a big fan of adding code to handle
such an unlikely situation, which is not even explicitly
forbidden by the 9p spec.

> For instance v9fs_path_write_lock() is using a co-mutex, right? So an 
> unsuccesful lock would cause main I/O thread to grab the next request before 
> completing the current/previous request.
> 

An unsuccessful lock would cause the main I/O thread to switch
to some other task.

> And what happens on any run_in_worker({}) call? If there is another client 
> request in the queue, main I/O thread would pull that request from the queue 
> before waiting for the worker thread to complete its task, wouldn't it?
> 

The main I/O thread won't wait for anything, so, yes, it will probably
start handling the new request until it yields.

> Just looked at the code so far, haven't tested it yet ...
> 
> Best regards,
> Christian Schoenebeck
> 
>

Patch

diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 9553700dbb..116977939b 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -215,6 +215,28 @@  static inline void v9fs_readdir_init(V9fsDir *dir)
     qemu_mutex_init(&dir->readdir_mutex);
 }
 
+/**
+ * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests,
+ * which is a chained list of directory entries.
+ */
+typedef struct V9fsDirEnt {
+    /* mandatory (must not be NULL) information for all readdir requests */
+    struct dirent *dent;
+    /*
+     * optional (may be NULL): A full stat of each directory entry is just
+     * done if explicitly told to fs driver.
+     */
+    struct stat *st;
+    /*
+     * instead of an array, directory entries are always returned as
+     * chained list, that's because the amount of entries retrieved by fs
+     * drivers is dependent on the individual entries' name (since response
+     * messages are size limited), so the final amount cannot be estimated
+     * before hand
+     */
+    struct V9fsDirEnt *next;
+} V9fsDirEnt;
+
 /*
  * Filled by fs driver on open and other
  * calls.
diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
index 73f9a751e1..45c65a8f5b 100644
--- a/hw/9pfs/codir.c
+++ b/hw/9pfs/codir.c
@@ -18,28 +18,187 @@ 
 #include "qemu/main-loop.h"
 #include "coth.h"
 
+/*
+ * This is solely executed on a background IO thread.
+ */
+static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
+{
+    int err = 0;
+    V9fsState *s = pdu->s;
+    struct dirent *entry;
+
+    errno = 0;
+    entry = s->ops->readdir(&s->ctx, &fidp->fs);
+    if (!entry && errno) {
+        *dent = NULL;
+        err = -errno;
+    } else {
+        *dent = entry;
+    }
+    return err;
+}
+
+/*
+ * TODO: This will be removed for performance reasons.
+ * Use v9fs_co_readdir_many() instead.
+ */
 int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
                                  struct dirent **dent)
 {
     int err;
-    V9fsState *s = pdu->s;
 
     if (v9fs_request_cancelled(pdu)) {
         return -EINTR;
     }
-    v9fs_co_run_in_worker(
-        {
-            struct dirent *entry;
+    v9fs_co_run_in_worker({
+        err = do_readdir(pdu, fidp, dent);
+    });
+    return err;
+}
+
+/*
+ * This is solely executed on a background IO thread.
+ *
+ * See v9fs_co_readdir_many() (as its only user) below for details.
+ */
+static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
+                             struct V9fsDirEnt **entries,
+                             int32_t maxsize, bool dostat)
+{
+    V9fsState *s = pdu->s;
+    V9fsString name;
+    int len, err = 0;
+    int32_t size = 0;
+    off_t saved_dir_pos;
+    struct dirent *dent;
+    struct V9fsDirEnt *e = NULL;
+    V9fsPath path;
+    struct stat stbuf;
 
-            errno = 0;
-            entry = s->ops->readdir(&s->ctx, &fidp->fs);
-            if (!entry && errno) {
+    *entries = NULL;
+    v9fs_path_init(&path);
+
+    /*
+     * TODO: Here should be a warn_report_once() if lock failed.
+     *
+     * With a good 9p client we should not get into concurrency here,
+     * because a good client would not use the same fid for concurrent
+     * requests. We do the lock here for safety reasons though. However
+     * the client would then suffer performance issues, so better log that
+     * issue here.
+     */
+    v9fs_readdir_lock(&fidp->fs.dir);
+
+    /* save the directory position */
+    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
+    if (saved_dir_pos < 0) {
+        err = saved_dir_pos;
+        goto out;
+    }
+
+    while (true) {
+        /* get directory entry from fs driver */
+        err = do_readdir(pdu, fidp, &dent);
+        if (err || !dent) {
+            break;
+        }
+
+        /*
+         * stop this loop as soon as it would exceed the allowed maximum
+         * response message size for the directory entries collected so far,
+         * because anything beyond that size would need to be discarded by
+         * 9p controller (main thread / top half) anyway
+         */
+        v9fs_string_init(&name);
+        v9fs_string_sprintf(&name, "%s", dent->d_name);
+        len = v9fs_readdir_response_size(&name);
+        v9fs_string_free(&name);
+        if (size + len > maxsize) {
+            /* this is not an error case actually */
+            break;
+        }
+
+        /* append next node to result chain */
+        if (!e) {
+            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
+        } else {
+            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
+        }
+        e->dent = g_malloc0(sizeof(struct dirent));
+        memcpy(e->dent, dent, sizeof(struct dirent));
+
+        /* perform a full stat() for directory entry if requested by caller */
+        if (dostat) {
+            err = s->ops->name_to_path(
+                &s->ctx, &fidp->path, dent->d_name, &path
+            );
+            if (err < 0) {
                 err = -errno;
-            } else {
-                *dent = entry;
-                err = 0;
+                break;
             }
-        });
+
+            err = s->ops->lstat(&s->ctx, &path, &stbuf);
+            if (err < 0) {
+                err = -errno;
+                break;
+            }
+
+            e->st = g_malloc0(sizeof(struct stat));
+            memcpy(e->st, &stbuf, sizeof(struct stat));
+        }
+
+        size += len;
+        saved_dir_pos = dent->d_off;
+    }
+
+    /* restore (last) saved position */
+    s->ops->seekdir(&s->ctx, &fidp->fs, saved_dir_pos);
+
+out:
+    v9fs_readdir_unlock(&fidp->fs.dir);
+    v9fs_path_free(&path);
+    if (err < 0) {
+        return err;
+    }
+    return size;
+}
+
+/**
+ * @brief Reads multiple directory entries in one rush.
+ *
+ * Retrieves the requested (max. amount of) directory entries from the fs
+ * driver. This function must only be called by the main IO thread (top half).
+ * Internally this function call will be dispatched to a background IO thread
+ * (bottom half) where it is eventually executed by the fs driver.
+ *
+ * Acquiring multiple directory entries in one rush from the fs driver,
+ * instead of retrieving each directory entry individually, is very beneficial
+ * from performance point of view. Because for every fs driver request latency
+ * is added, which in practice could lead to overall latencies of several
+ * hundred ms for reading all entries (of just a single directory) if every
+ * directory entry was individually requested from driver.
+ *
+ * @param pdu - the causing 9p (T_readdir) client request
+ * @param fidp - already opened directory where readdir shall be performed on
+ * @param entries - output for directory entries (must not be NULL)
+ * @param maxsize - maximum result message body size (in bytes)
+ * @param dostat - whether a stat() should be performed and returned for
+ *                 each directory entry
+ * @returns resulting response message body size (in bytes) on success,
+ *          negative error code otherwise
+ */
+int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
+                                      struct V9fsDirEnt **entries,
+                                      int32_t maxsize, bool dostat)
+{
+    int err = 0;
+
+    if (v9fs_request_cancelled(pdu)) {
+        return -EINTR;
+    }
+    v9fs_co_run_in_worker({
+        err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
+    });
     return err;
 }
 
diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
index c2cdc7a9ea..a6851822d5 100644
--- a/hw/9pfs/coth.h
+++ b/hw/9pfs/coth.h
@@ -49,6 +49,9 @@ 
 void co_run_in_worker_bh(void *);
 int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
 int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
+int coroutine_fn v9fs_co_readdir_many(V9fsPDU *, V9fsFidState *,
+                                      struct V9fsDirEnt **,
+                                      int32_t, bool);
 off_t coroutine_fn v9fs_co_telldir(V9fsPDU *, V9fsFidState *);
 void coroutine_fn v9fs_co_seekdir(V9fsPDU *, V9fsFidState *, off_t);
 void coroutine_fn v9fs_co_rewinddir(V9fsPDU *, V9fsFidState *);