
[v4,10/11] 9pfs: T_readdir latency optimization

Message ID cbd48794ab09613589719b37b147589cda5af6dd.1579567020.git.qemu_oss@crudebyte.com
State: New
Series: 9pfs: readdir optimization

Commit Message

Christian Schoenebeck Jan. 21, 2020, 12:30 a.m. UTC
Make top half really top half and bottom half really bottom half:

Handling of a T_readdir request currently hops between threads (the main
I/O thread and background I/O driver threads) several times for every
individual directory entry, which sums up to huge latencies for handling
just a single T_readdir request.

Instead of doing that, now collect all required directory entries
(including all potentially required stat buffers for each entry) in one
go on a background I/O thread from the fs driver, then assemble the
entire resulting network response message for the readdir request on the
main I/O thread. The fs driver still aborts the directory entry retrieval
loop (on the background I/O thread) as soon as it would exceed the
client's requested maximum R_readdir response size, so we should not see
any performance penalty from this change.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c    | 124 +++++++++++++++-----------------
 hw/9pfs/9p.h    |  23 ++++++
 hw/9pfs/codir.c | 183 +++++++++++++++++++++++++++++++++++++++++++++---
 hw/9pfs/coth.h  |   3 +
 4 files changed, 254 insertions(+), 79 deletions(-)
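
To illustrate the kind of overhead the commit message is talking about, here is a small self-contained pthread toy program (an editorial illustration only, not QEMU code and not part of this patch): it compares handing N work items to a worker thread one at a time, i.e. one cross-thread round trip per item as in the old per-dirent dispatch, against handing the whole batch over in a single round trip. The absolute numbers are machine-dependent; the point is only the relative cost of the per-item hand-offs.

    /* batch_demo.c - build with: cc -O2 -pthread batch_demo.c -o batch_demo */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    #define ITEMS 20000

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int pending;  /* items handed over, not yet processed by the worker */
    static int done;     /* total items processed by the worker */
    static bool quit;

    /* Worker thread ("bottom half"): processes whatever was handed over. */
    static void *worker(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        for (;;) {
            while (pending == 0 && !quit) {
                pthread_cond_wait(&cond, &lock);
            }
            if (quit && pending == 0) {
                break;
            }
            done += pending;  /* stand-in for readdir()/lstat() work */
            pending = 0;
            pthread_cond_broadcast(&cond);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    /* Requester ("top half"): hands over 'batch' items per round trip. */
    static double run(int batch)
    {
        struct timespec t0, t1;
        pthread_t tid;

        pending = done = 0;
        quit = false;
        pthread_create(&tid, NULL, worker, NULL);

        clock_gettime(CLOCK_MONOTONIC, &t0);
        pthread_mutex_lock(&lock);
        while (done < ITEMS) {
            pending = batch;                /* hand work to the worker ... */
            pthread_cond_broadcast(&cond);
            while (pending != 0) {          /* ... and wait for completion */
                pthread_cond_wait(&cond, &lock);
            }
        }
        quit = true;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
        clock_gettime(CLOCK_MONOTONIC, &t1);

        pthread_join(tid, NULL);
        return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    }

    int main(void)
    {
        printf("one item per round trip: %.4fs\n", run(1));
        printf("one single batch:        %.4fs\n", run(ITEMS));
        return 0;
    }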

Comments

Greg Kurz Jan. 23, 2020, 11:33 a.m. UTC | #1
On Tue, 21 Jan 2020 01:30:10 +0100
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> Make top half really top half and bottom half really bottom half:
> 
> Each T_readdir request handling is hopping between threads (main
> I/O thread and background I/O driver threads) several times for
> every individual directory entry, which sums up to huge latencies
> for handling just a single T_readdir request.
> 
> Instead of doing that, collect now all required directory entries
> (including all potentially required stat buffers for each entry) in
> one rush on a background I/O thread from fs driver, then assemble
> the entire resulting network response message for the readdir
> request on main I/O thread. The fs driver is still aborting the
> directory entry retrieval loop (on the background I/O thread) as
> soon as it would exceed the client's requested maximum R_readdir
> response size. So we should not have any performance penalty by
> doing this.
> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---

Ok so this is it. I haven't reviewed this huge patch yet, but I could at
least give it a try. The gain is impressive indeed:

[greg@bahia qemu-9p]$ (cd .mbuild-$(stg branch)/obj ; export QTEST_QEMU_BINARY='x86_64-softmmu/qemu-system-x86_64'; make all tests/qtest/qos-test && for i in {1..100}; do tests/qtest/qos-test -p $(tests/qtest/qos-test -l | grep readdir/basic); done) |& awk '/IMPORTANT/ { print $10 }' | sed -e 's/s//' -e 's/^/n+=1;x+=/;$ascale=6;x/n' | bc
.009806

instead of .055654, i.e. nearly 6 times faster (0.055654 / 0.009806 ≈ 5.7)! This sounds promising :)

Now I need to find time to do a decent review... :-\

>  hw/9pfs/9p.c    | 124 +++++++++++++++-----------------
>  hw/9pfs/9p.h    |  23 ++++++
>  hw/9pfs/codir.c | 183 +++++++++++++++++++++++++++++++++++++++++++++---
>  hw/9pfs/coth.h  |   3 +
>  4 files changed, 254 insertions(+), 79 deletions(-)
> 
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index 18370183c4..e0ca45d46b 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -971,30 +971,6 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
>      return 0;
>  }
>  
> -static int coroutine_fn dirent_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
> -                                      struct dirent *dent, V9fsQID *qidp)
> -{
> -    struct stat stbuf;
> -    V9fsPath path;
> -    int err;
> -
> -    v9fs_path_init(&path);
> -
> -    err = v9fs_co_name_to_path(pdu, &fidp->path, dent->d_name, &path);
> -    if (err < 0) {
> -        goto out;
> -    }
> -    err = v9fs_co_lstat(pdu, &path, &stbuf);
> -    if (err < 0) {
> -        goto out;
> -    }
> -    err = stat_to_qid(pdu, &stbuf, qidp);
> -
> -out:
> -    v9fs_path_free(&path);
> -    return err;
> -}
> -
>  V9fsPDU *pdu_alloc(V9fsState *s)
>  {
>      V9fsPDU *pdu = NULL;
> @@ -2314,7 +2290,7 @@ out_nofid:
>      pdu_complete(pdu, err);
>  }
>  
> -static size_t v9fs_readdir_data_size(V9fsString *name)
> +size_t v9fs_readdir_response_size(V9fsString *name)
>  {
>      /*
>       * Size of each dirent on the wire: size of qid (13) + size of offset (8)
> @@ -2323,6 +2299,18 @@ static size_t v9fs_readdir_data_size(V9fsString *name)
>      return 24 + v9fs_string_size(name);
>  }
>  
> +static void v9fs_free_dirents(struct V9fsDirEnt *e)
> +{
> +    struct V9fsDirEnt *next = NULL;
> +
> +    for (; e; e = next) {
> +        next = e->next;
> +        g_free(e->dent);
> +        g_free(e->st);
> +        g_free(e);
> +    }
> +}
> +
>  static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
>                                          int32_t max_count)
>  {
> @@ -2331,54 +2319,53 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
>      V9fsString name;
>      int len, err = 0;
>      int32_t count = 0;
> -    off_t saved_dir_pos;
>      struct dirent *dent;
> +    struct stat *st;
> +    struct V9fsDirEnt *entries = NULL;
>  
> -    /* save the directory position */
> -    saved_dir_pos = v9fs_co_telldir(pdu, fidp);
> -    if (saved_dir_pos < 0) {
> -        return saved_dir_pos;
> -    }
> -
> -    while (1) {
> -        v9fs_readdir_lock(&fidp->fs.dir);
> +    /*
> +     * inode remapping requires the device id, which in turn might be
> +     * different for different directory entries, so if inode remapping is
> +     * enabled we have to make a full stat for each directory entry
> +     */
> +    const bool dostat = pdu->s->ctx.export_flags & V9FS_REMAP_INODES;
>  
> -        err = v9fs_co_readdir(pdu, fidp, &dent);
> -        if (err || !dent) {
> -            break;
> -        }
> -        v9fs_string_init(&name);
> -        v9fs_string_sprintf(&name, "%s", dent->d_name);
> -        if ((count + v9fs_readdir_data_size(&name)) > max_count) {
> -            v9fs_readdir_unlock(&fidp->fs.dir);
> +    /*
> +     * Fetch all required directory entries altogether on a background IO
> +     * thread from fs driver. We don't want to do that for each entry
> +     * individually, because hopping between threads (this main IO thread
> +     * and background IO driver thread) would sum up to huge latencies.
> +     */
> +    count = v9fs_co_readdir_lowlat(pdu, fidp, &entries, max_count, dostat);
> +    if (count < 0) {
> +        err = count;
> +        count = 0;
> +        goto out;
> +    }
> +    count = 0;
>  
> -            /* Ran out of buffer. Set dir back to old position and return */
> -            v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
> -            v9fs_string_free(&name);
> -            return count;
> -        }
> +    for (struct V9fsDirEnt *e = entries; e; e = e->next) {
> +        dent = e->dent;
>  
>          if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
> -            /*
> -             * dirent_to_qid() implies expensive stat call for each entry,
> -             * we must do that here though since inode remapping requires
> -             * the device id, which in turn might be different for
> -             * different entries; we cannot make any assumption to avoid
> -             * that here.
> -             */
> -            err = dirent_to_qid(pdu, fidp, dent, &qid);
> +            st = e->st;
> +            /* e->st should never be NULL, but just to be sure */
> +            if (!st) {
> +                err = -1;
> +                break;
> +            }
> +
> +            /* remap inode */
> +            err = stat_to_qid(pdu, st, &qid);
>              if (err < 0) {
> -                v9fs_readdir_unlock(&fidp->fs.dir);
> -                v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
> -                v9fs_string_free(&name);
> -                return err;
> +                break;
>              }
>          } else {
>              /*
>               * Fill up just the path field of qid because the client uses
>               * only that. To fill the entire qid structure we will have
>               * to stat each dirent found, which is expensive. For the
> -             * latter reason we don't call dirent_to_qid() here. Only drawback
> +             * latter reason we don't call stat_to_qid() here. Only drawback
>               * is that no multi-device export detection of stat_to_qid()
>               * would be done and provided as error to the user here. But
>               * user would get that error anyway when accessing those
> @@ -2391,25 +2378,26 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
>              qid.version = 0;
>          }
>  
> +        v9fs_string_init(&name);
> +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> +
>          /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */
>          len = pdu_marshal(pdu, 11 + count, "Qqbs",
>                            &qid, dent->d_off,
>                            dent->d_type, &name);
>  
> -        v9fs_readdir_unlock(&fidp->fs.dir);
> +        v9fs_string_free(&name);
>  
>          if (len < 0) {
> -            v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
> -            v9fs_string_free(&name);
> -            return len;
> +            err = len;
> +            break;
>          }
> +
>          count += len;
> -        v9fs_string_free(&name);
> -        saved_dir_pos = dent->d_off;
>      }
>  
> -    v9fs_readdir_unlock(&fidp->fs.dir);
> -
> +out:
> +    v9fs_free_dirents(entries);
>      if (err < 0) {
>          return err;
>      }
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 6fffe44f5a..1dbb0ad189 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -215,6 +215,28 @@ static inline void v9fs_readdir_init(V9fsDir *dir)
>      qemu_mutex_init(&dir->readdir_mutex);
>  }
>  
> +/*
> + * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests,
> + * which is a chained list of directory entries.
> + */
> +typedef struct V9fsDirEnt {
> +    /* mandatory (must not be NULL) information for all readdir requests */
> +    struct dirent *dent;
> +    /*
> +     * optional (may be NULL): A full stat of each directory entry is just
> +     * done if explicitly told to fs driver.
> +     */
> +    struct stat *st;
> +    /*
> +     * instead of an array, directory entries are always returned as
> +     * chained list, that's because the amount of entries retrieved by fs
> +     * drivers is dependent on the individual entries' name (since response
> +     * messages are size limited), so the final amount cannot be estimated
> +     * before hand
> +     */
> +    struct V9fsDirEnt *next;
> +} V9fsDirEnt;
> +
>  /*
>   * Filled by fs driver on open and other
>   * calls.
> @@ -419,6 +441,7 @@ void v9fs_path_init(V9fsPath *path);
>  void v9fs_path_free(V9fsPath *path);
>  void v9fs_path_sprintf(V9fsPath *path, const char *fmt, ...);
>  void v9fs_path_copy(V9fsPath *dst, const V9fsPath *src);
> +size_t v9fs_readdir_response_size(V9fsString *name);
>  int v9fs_name_to_path(V9fsState *s, V9fsPath *dirpath,
>                        const char *name, V9fsPath *path);
>  int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
> diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
> index 73f9a751e1..6ce7dc8cde 100644
> --- a/hw/9pfs/codir.c
> +++ b/hw/9pfs/codir.c
> @@ -18,28 +18,189 @@
>  #include "qemu/main-loop.h"
>  #include "coth.h"
>  
> +/*
> + * This is solely executed on a background IO thread.
> + */
> +static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
> +{
> +    int err = 0;
> +    V9fsState *s = pdu->s;
> +    struct dirent *entry;
> +
> +    errno = 0;
> +    entry = s->ops->readdir(&s->ctx, &fidp->fs);
> +    if (!entry && errno) {
> +        *dent = NULL;
> +        err = -errno;
> +    } else {
> +        *dent = entry;
> +    }
> +    return err;
> +}
> +
> +/*
> + * TODO: This will be removed for performance reasons.
> + * Use v9fs_co_readdir_lowlat() instead.
> + */
>  int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
>                                   struct dirent **dent)
>  {
>      int err;
> -    V9fsState *s = pdu->s;
>  
>      if (v9fs_request_cancelled(pdu)) {
>          return -EINTR;
>      }
> -    v9fs_co_run_in_worker(
> -        {
> -            struct dirent *entry;
> +    v9fs_co_run_in_worker({
> +        err = do_readdir(pdu, fidp, dent);
> +    });
> +    return err;
> +}
> +
> +/*
> + * This is solely executed on a background IO thread.
> + *
> + * See v9fs_co_readdir_lowlat() (as its only user) below for details.
> + */
> +static int do_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> +                             struct V9fsDirEnt **entries,
> +                             int32_t maxsize, bool dostat)
> +{
> +    V9fsState *s = pdu->s;
> +    V9fsString name;
> +    int len, err = 0;
> +    int32_t size = 0;
> +    off_t saved_dir_pos;
> +    struct dirent *dent;
> +    struct V9fsDirEnt *e = NULL;
> +    V9fsPath path;
> +    struct stat stbuf;
> +
> +    *entries = NULL;
> +    v9fs_path_init(&path);
> +
> +    /*
> +     * TODO: Here should be a warn_report_once() if lock failed.
> +     *
> +     * With a good 9p client we should not get into concurrency here,
> +     * because a good client would not use the same fid for concurrent
> +     * requests. We do the lock here for safety reasons though. However
> +     * the client would then suffer performance issues, so better log that
> +     * issue here.
> +     */
> +    v9fs_readdir_lock(&fidp->fs.dir);
> +
> +    /* save the directory position */
> +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> +    if (saved_dir_pos < 0) {
> +        err = saved_dir_pos;
> +        goto out;
> +    }
> +
> +    while (true) {
> +        /* get directory entry from fs driver */
> +        err = do_readdir(pdu, fidp, &dent);
> +        if (err || !dent) {
> +            break;
> +        }
> +
> +        /*
> +         * stop this loop as soon as it would exceed the allowed maximum
> +         * response message size for the directory entries collected so far,
> +         * because anything beyond that size would need to be discarded by
> +         * 9p controller (main thread / top half) anyway
> +         */
> +        v9fs_string_init(&name);
> +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> +        len = v9fs_readdir_response_size(&name);
> +        v9fs_string_free(&name);
> +        if (size + len > maxsize) {
> +            /* this is not an error case actually */
> +            break;
> +        }
> +
> +        /* append next node to result chain */
> +        if (!e) {
> +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> +        } else {
> +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> +        }
> +        e->dent = g_malloc0(sizeof(struct dirent));
> +        memcpy(e->dent, dent, sizeof(struct dirent));
>  
> -            errno = 0;
> -            entry = s->ops->readdir(&s->ctx, &fidp->fs);
> -            if (!entry && errno) {
> +        /* perform a full stat() for directory entry if requested by caller */
> +        if (dostat) {
> +            err = s->ops->name_to_path(
> +                &s->ctx, &fidp->path, dent->d_name, &path
> +            );
> +            if (err < 0) {
>                  err = -errno;
> -            } else {
> -                *dent = entry;
> -                err = 0;
> +                break;
>              }
> -        });
> +
> +            err = s->ops->lstat(&s->ctx, &path, &stbuf);
> +            if (err < 0) {
> +                err = -errno;
> +                break;
> +            }
> +
> +            e->st = g_malloc0(sizeof(struct stat));
> +            memcpy(e->st, &stbuf, sizeof(struct stat));
> +        }
> +
> +        size += len;
> +        saved_dir_pos = dent->d_off;
> +    }
> +
> +    /* restore (last) saved position */
> +    s->ops->seekdir(&s->ctx, &fidp->fs, saved_dir_pos);
> +
> +out:
> +    v9fs_readdir_unlock(&fidp->fs.dir);
> +    v9fs_path_free(&path);
> +    if (err < 0) {
> +        return err;
> +    }
> +    return size;
> +}
> +
> +/**
> + * @brief Low latency variant of fs driver readdir handling.
> + *
> + * Retrieves the requested (max. amount of) directory entries from the fs
> + * driver. This function must only be called by the main IO thread (top half).
> + * Internally this function call will be dispatched to a background IO thread
> + * (bottom half) where it is eventually executed by the fs driver.
> + *
> + * The old readdir implementation above just retrieves always one dir entry
> + * per call. The problem of that implementation above is that latency is
> + * added for (retrieval of) each directory entry, which in practice lead to
> + * latencies of several hundred ms for readdir of only one directory.
> + *
> + * This is avoided in this function by letting the fs driver retrieve all
> + * required directory entries with only call of this function and hence with
> + * only a single fs driver request.
> + *
> + * @param pdu - the causing 9p (T_readdir) client request
> + * @param fidp - already opened directory where readdir shall be performed on
> + * @param entries - output for directory entries (must not be NULL)
> + * @param maxsize - maximum result message body size (in bytes)
> + * @param dostat - whether a stat() should be performed and returned for
> + *                 each directory entry
> + * @returns resulting response message body size (in bytes) on success,
> + *          negative error code otherwise
> + */
> +int coroutine_fn v9fs_co_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> +                                        struct V9fsDirEnt **entries,
> +                                        int32_t maxsize, bool dostat)
> +{
> +    int err = 0;
> +
> +    if (v9fs_request_cancelled(pdu)) {
> +        return -EINTR;
> +    }
> +    v9fs_co_run_in_worker({
> +        err = do_readdir_lowlat(pdu, fidp, entries, maxsize, dostat);
> +    });
>      return err;
>  }
>  
> diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
> index c2cdc7a9ea..1249dbe6df 100644
> --- a/hw/9pfs/coth.h
> +++ b/hw/9pfs/coth.h
> @@ -49,6 +49,9 @@
>  void co_run_in_worker_bh(void *);
>  int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
>  int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
> +int coroutine_fn v9fs_co_readdir_lowlat(V9fsPDU *, V9fsFidState *,
> +                                        struct V9fsDirEnt **,
> +                                        int32_t, bool);
>  off_t coroutine_fn v9fs_co_telldir(V9fsPDU *, V9fsFidState *);
>  void coroutine_fn v9fs_co_seekdir(V9fsPDU *, V9fsFidState *, off_t);
>  void coroutine_fn v9fs_co_rewinddir(V9fsPDU *, V9fsFidState *);
Christian Schoenebeck Jan. 23, 2020, 12:57 p.m. UTC | #2
On Thursday, 23 January 2020 12:33:42 CET Greg Kurz wrote:
> On Tue, 21 Jan 2020 01:30:10 +0100
> 
> Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > Make top half really top half and bottom half really bottom half:
> > 
> > Each T_readdir request handling is hopping between threads (main
> > I/O thread and background I/O driver threads) several times for
> > every individual directory entry, which sums up to huge latencies
> > for handling just a single T_readdir request.
> > 
> > Instead of doing that, collect now all required directory entries
> > (including all potentially required stat buffers for each entry) in
> > one rush on a background I/O thread from fs driver, then assemble
> > the entire resulting network response message for the readdir
> > request on main I/O thread. The fs driver is still aborting the
> > directory entry retrieval loop (on the background I/O thread) as
> > soon as it would exceed the client's requested maximum R_readdir
> > response size. So we should not have any performance penalty by
> > doing this.
> > 
> > Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > ---
> 
> Ok so this is it. Not reviewed this huge patch yet but I could at
> least give a try. The gain is impressive indeed:

Tseses, so much scepticism. :)

> [greg@bahia qemu-9p]$ (cd .mbuild-$(stg branch)/obj ; export
> QTEST_QEMU_BINARY='x86_64-softmmu/qemu-system-x86_64'; make all
> tests/qtest/qos-test && for i in {1..100}; do tests/qtest/qos-test -p
> $(tests/qtest/qos-test -l | grep readdir/basic); done) |& awk '/IMPORTANT/
> { print $10 }' | sed -e 's/s//' -e 's/^/n+=1;x+=/;$ascale=6;x/n' | bc
> .009806
> 
> instead of .055654, i.e. nearly 6 times faster ! This sounds promising :)

As mentioned in the other email, the performance improvement from this patch
is actually far more than a factor of 6, since you probably just dropped the
n-square driver hack in your benchmarks (which had tainted your benchmark
results):

Unoptimized readdir, with n-square correction hack:
Time client spent for waiting for reply from server: 0.082539s [MOST IMPORTANT]

Optimized readdir, with n-square correction hack:
Time 9p server spent on synth_readdir() I/O only (synth driver): 0.001576s
Time 9p server spent on entire T_readdir request: 0.002244s [IMPORTANT]
Time client spent for waiting for reply from server: 0.002566s [MOST IMPORTANT]

So in this particular test run the performance improvement is around factor 32
(0.082539s / 0.002566s ≈ 32), but I also observed factors around 40 before in
my tests.

> Now I need to find time to do a decent review... :-\

Sure, take your time! But as you can see, it is really worth it.

And it's not just the performance improvement. This patch also reduces program
flow complexity significantly: e.g. there is just one lock and one unlock; the
allocated entry name is freed immediately, without any potential branch in
between; and much more. In other words: it adds safety.

Best regards,
Christian Schoenebeck
Christian Schoenebeck March 9, 2020, 2:09 p.m. UTC | #3
On Tuesday, 21 January 2020 01:30:10 CET Christian Schoenebeck wrote:
> Make top half really top half and bottom half really bottom half:
> 
> Each T_readdir request handling is hopping between threads (main
> I/O thread and background I/O driver threads) several times for
> every individual directory entry, which sums up to huge latencies
> for handling just a single T_readdir request.
> 
> Instead of doing that, collect now all required directory entries
> (including all potentially required stat buffers for each entry) in
> one rush on a background I/O thread from fs driver, then assemble
> the entire resulting network response message for the readdir
> request on main I/O thread. The fs driver is still aborting the
> directory entry retrieval loop (on the background I/O thread) as
> soon as it would exceed the client's requested maximum R_readdir
> response size. So we should not have any performance penalty by
> doing this.
> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---

PING! Idle for 7 weeks.

Could anybody help out reviewing this particular patch 10?

Complete thread:
https://lists.gnu.org/archive/html/qemu-devel/2020-01/msg04418.html

Patches 1..5 are already on master. The subsequent patches have already been
reviewed; this patch 10 is the only one not reviewed at all yet.

Test cases and benchmark patches are provided in this thread, along with
instructions to run them.

Greg Kurz March 9, 2020, 3:42 p.m. UTC | #4
On Mon, 09 Mar 2020 15:09:21 +0100
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> PING! Idle for 7 weeks.
> 

Yeah sorry for that, I couldn't find enough time yet...

> Could anybody help out reviewing this particular patch 10?
> 

The main issue I see with this patch is that it is too big. At
least from my POV, with the little time I have for 9p, that's
the first thing that pushes me back... So if you could split
this patch down to some reviewable chunks, this may help
going forward.

Anyway, here's a first high level shot.

> Complete thread:
> https://lists.gnu.org/archive/html/qemu-devel/2020-01/msg04418.html
> 
> Patches 1..5 are already on master. Subsequent patches already have been 
> reviewed, this patch 10 is the only one not being reviewed at all yet.
> 
> Test cases and benchmark patches are provided by thread, instructions to run 
> them as well.
> 
> >  hw/9pfs/9p.c    | 124 +++++++++++++++-----------------
> >  hw/9pfs/9p.h    |  23 ++++++
> >  hw/9pfs/codir.c | 183 +++++++++++++++++++++++++++++++++++++++++++++---
> >  hw/9pfs/coth.h  |   3 +
> >  4 files changed, 254 insertions(+), 79 deletions(-)
> > 
> > diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
> > index 73f9a751e1..6ce7dc8cde 100644
> > --- a/hw/9pfs/codir.c
> > +++ b/hw/9pfs/codir.c
> > @@ -18,28 +18,189 @@
> >  #include "qemu/main-loop.h"
> >  #include "coth.h"
> > 
> > +/*
> > + * This is solely executed on a background IO thread.
> > + */
> > +static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
> > +{
> > +    int err = 0;
> > +    V9fsState *s = pdu->s;
> > +    struct dirent *entry;
> > +
> > +    errno = 0;
> > +    entry = s->ops->readdir(&s->ctx, &fidp->fs);
> > +    if (!entry && errno) {
> > +        *dent = NULL;
> > +        err = -errno;
> > +    } else {
> > +        *dent = entry;
> > +    }
> > +    return err;
> > +}
> > +
> > +/*
> > + * TODO: This will be removed for performance reasons.
> > + * Use v9fs_co_readdir_lowlat() instead.

Oh, so we'd still have the current implementation being used, even
with this patch applied... This would be okay for an RFC patch, but
in the end I'd really like to merge something that also converts
v9fs_do_readdir_with_stat().

> > + */
> >  int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
> >                                   struct dirent **dent)
> >  {
> >      int err;
> > -    V9fsState *s = pdu->s;
> > 
> >      if (v9fs_request_cancelled(pdu)) {
> >          return -EINTR;
> >      }
> > -    v9fs_co_run_in_worker(
> > -        {
> > -            struct dirent *entry;
> > +    v9fs_co_run_in_worker({
> > +        err = do_readdir(pdu, fidp, dent);
> > +    });
> > +    return err;
> > +}
> > +
> > +/*
> > + * This is solely executed on a background IO thread.
> > + *
> > + * See v9fs_co_readdir_lowlat() (as its only user) below for details.

Since the only user is quite small, maybe just open-code this.

> > + */
> > +static int do_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> > +                             struct V9fsDirEnt **entries,
> > +                             int32_t maxsize, bool dostat)
> > +{
> > +    V9fsState *s = pdu->s;
> > +    V9fsString name;
> > +    int len, err = 0;
> > +    int32_t size = 0;
> > +    off_t saved_dir_pos;
> > +    struct dirent *dent;
> > +    struct V9fsDirEnt *e = NULL;
> > +    V9fsPath path;
> > +    struct stat stbuf;
> > +
> > +    *entries = NULL;
> > +    v9fs_path_init(&path);
> > +
> > +    /*
> > +     * TODO: Here should be a warn_report_once() if lock failed.
> > +     *
> > +     * With a good 9p client we should not get into concurrency here,
> > +     * because a good client would not use the same fid for concurrent
> > +     * requests. We do the lock here for safety reasons though. However
> > +     * the client would then suffer performance issues, so better log that
> > +     * issue here.
> > +     */

This should probably be done with qemu_mutex_trylock() in a loop directly
in v9fs_readdir_lock().
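
One way to realize that idea is to warn once when the lock is contended and then fall back to a blocking lock rather than spinning; a minimal sketch, assuming the existing v9fs_readdir_lock() wrapper around readdir_mutex from hw/9pfs/9p.h and QEMU's qemu_mutex_trylock() / warn_report_once() helpers (illustration only, not necessarily the final implementation):

    static inline void v9fs_readdir_lock(V9fsDir *dir)
    {
        /* qemu_mutex_trylock() returns non-zero if the mutex is already held */
        if (qemu_mutex_trylock(&dir->readdir_mutex)) {
            warn_report_once("9p: client submitted concurrent requests on the "
                             "same fid, which degrades T_readdir performance");
            qemu_mutex_lock(&dir->readdir_mutex);
        }
    }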

> > +    v9fs_readdir_lock(&fidp->fs.dir);
> > +
> > +    /* save the directory position */
> > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > +    if (saved_dir_pos < 0) {
> > +        err = saved_dir_pos;
> > +        goto out;
> > +    }
> > +
> > +    while (true) {
> > +        /* get directory entry from fs driver */
> > +        err = do_readdir(pdu, fidp, &dent);
> > +        if (err || !dent) {
> > +            break;
> > +        }
> > +
> > +        /*
> > +         * stop this loop as soon as it would exceed the allowed maximum
> > +         * response message size for the directory entries collected so far,
> > +         * because anything beyond that size would need to be discarded by
> > +         * 9p controller (main thread / top half) anyway
> > +         */
> > +        v9fs_string_init(&name);
> > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > +        len = v9fs_readdir_response_size(&name);
> > +        v9fs_string_free(&name);
> > +        if (size + len > maxsize) {
> > +            /* this is not an error case actually */
> > +            break;
> > +        }
> > +
> > +        /* append next node to result chain */
> > +        if (!e) {
> > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > +        } else {
> > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > +        }
> > +        e->dent = g_malloc0(sizeof(struct dirent));
> > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > 
> > -            errno = 0;
> > -            entry = s->ops->readdir(&s->ctx, &fidp->fs);
> > -            if (!entry && errno) {
> > +        /* perform a full stat() for directory entry if requested by caller */
> > +        if (dostat) {
> > +            err = s->ops->name_to_path(
> > +                &s->ctx, &fidp->path, dent->d_name, &path
> > +            );
> > +            if (err < 0) {
> >                  err = -errno;
> > -            } else {
> > -                *dent = entry;
> > -                err = 0;
> > +                break;
> >              }
> > -        });
> > +
> > +            err = s->ops->lstat(&s->ctx, &path, &stbuf);
> > +            if (err < 0) {
> > +                err = -errno;
> > +                break;
> > +            }
> > +
> > +            e->st = g_malloc0(sizeof(struct stat));
> > +            memcpy(e->st, &stbuf, sizeof(struct stat));
> > +        }
> > +
> > +        size += len;
> > +        saved_dir_pos = dent->d_off;
> > +    }
> > +
> > +    /* restore (last) saved position */
> > +    s->ops->seekdir(&s->ctx, &fidp->fs, saved_dir_pos);
> > +
> > +out:
> > +    v9fs_readdir_unlock(&fidp->fs.dir);
> > +    v9fs_path_free(&path);
> > +    if (err < 0) {
> > +        return err;
> > +    }
> > +    return size;
> > +}
> > +
> > +/**
> > + * @brief Low latency variant of fs driver readdir handling.
> > + *

Not sure it brings much to mention latency here... telling what this
function actually does, i.e. "read multiple directory entries in a row",
would be more informative.

> > + * Retrieves the requested (max. amount of) directory entries from the fs
> > + * driver. This function must only be called by the main IO thread (top half).
> > + * Internally this function call will be dispatched to a background IO thread
> > + * (bottom half) where it is eventually executed by the fs driver.
> > + *
> > + * The old readdir implementation above just retrieves always one dir entry
> > + * per call. The problem of that implementation above is that latency is
> > + * added for (retrieval of) each directory entry, which in practice lead to
> > + * latencies of several hundred ms for readdir of only one directory.
> > + *
> > + * This is avoided in this function by letting the fs driver retrieve all
> > + * required directory entries with only call of this function and hence with
> > + * only a single fs driver request.

True but this kind of consideration rather belongs in the changelog...
otherwise, we'll have to remember to kill these lines when the "old
readdir implementation" is no more.

> > + *
> > + * @param pdu - the causing 9p (T_readdir) client request
> > + * @param fidp - already opened directory where readdir shall be performed on
> > + * @param entries - output for directory entries (must not be NULL)
> > + * @param maxsize - maximum result message body size (in bytes)
> > + * @param dostat - whether a stat() should be performed and returned for
> > + *                 each directory entry
> > + * @returns resulting response message body size (in bytes) on success,
> > + *          negative error code otherwise
> > + */
> > +int coroutine_fn v9fs_co_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,

Not really a fan of the _lowlat (low fat ?) wording.

v9fs_co_readdir_many() or anything else that suggests we're going to
read several entries in a row.


> > +                                        struct V9fsDirEnt **entries,
> > +                                        int32_t maxsize, bool dostat)

s/maxsize/max_count since this is the wording used in the caller.

> > +{
> > +    int err = 0;
> > +
> > +    if (v9fs_request_cancelled(pdu)) {
> > +        return -EINTR;
> > +    }
> > +    v9fs_co_run_in_worker({
> > +        err = do_readdir_lowlat(pdu, fidp, entries, maxsize, dostat);
> > +    });
> >      return err;
> >  }
> > 
> > diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
> > index c2cdc7a9ea..1249dbe6df 100644
> > --- a/hw/9pfs/coth.h
> > +++ b/hw/9pfs/coth.h
> > @@ -49,6 +49,9 @@
> >  void co_run_in_worker_bh(void *);
> >  int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
> >  int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
> > +int coroutine_fn v9fs_co_readdir_lowlat(V9fsPDU *, V9fsFidState *,
> > +                                        struct V9fsDirEnt **,
> > +                                        int32_t, bool);
> >  off_t coroutine_fn v9fs_co_telldir(V9fsPDU *, V9fsFidState *);
> >  void coroutine_fn v9fs_co_seekdir(V9fsPDU *, V9fsFidState *, off_t);
> >  void coroutine_fn v9fs_co_rewinddir(V9fsPDU *, V9fsFidState *);
> 
>
Christian Schoenebeck March 10, 2020, 3:10 p.m. UTC | #5
On Montag, 9. März 2020 16:42:59 CET Greg Kurz wrote:
> The main issue I see with this patch is that it is too big. At
> least from my POV, with the little time I have for 9p, that's
> the first thing that pushes me back... So if you could split
> this patch down to some reviewable chunks, this may help
> going forward.

This patch is also too big for my preference, but I don't see a viable way to 
split it further into separate patches. I already separated all the patches I 
could. If you have suggestions, very much appreciated!

The reason for this is that fixing these issues with the current T_readdir 
implementation requires separating what's currently one task (i.e. one 
function) into two separate tasks (i.e. two functions). There is no sensible 
way to break that change up any further.
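
To illustrate what I mean, here is a rough sketch of the resulting split as 
seen from the top half. This is not the actual patch code; the surrounding 
names and the marshalling/cleanup steps are placeholders, only the 
v9fs_co_readdir_lowlat() call and the V9fsDirEnt fields are taken from the 
patch:

/* top half (main I/O thread): one bottom half call, then pure marshalling */
struct V9fsDirEnt *entries = NULL, *e;
int size = v9fs_co_readdir_lowlat(pdu, fidp, &entries, max_count, dostat);
if (size < 0) {
    return size;                  /* negative errno from the fs driver */
}
for (e = entries; e; e = e->next) {
    /* marshal e->dent (and e->st if dostat was set) into the R_readdir reply */
}
/* finally free the chain: each e->dent, each e->st and the list nodes */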

But don't be too scared about this patch; if you look just at the diff 
directly then yes, the diff is not very human readable. However if you apply 
the patch and look at the resulting code, you'll notice that there are 
actually only 2 functions (v9fs_do_readdir() in 9p.c and do_readdir_lowlat() 
in codir.c) which require careful reviewing and that their resulting code is 
actually very, very straight forward, and not long either.

The current code on master is much more tricky and error prone due to the huge 
number of potential branches, individual error/cleanup handlers, heavy thread 
dispatching and much more. In the patched version all these code 
complexities and error sources are removed.

> > > diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
> > > index 73f9a751e1..6ce7dc8cde 100644
> > > --- a/hw/9pfs/codir.c
> > > +++ b/hw/9pfs/codir.c
> > > @@ -18,28 +18,189 @@
> > > 
> > >  #include "qemu/main-loop.h"
> > >  #include "coth.h"
> > > 
> > > +/*
> > > + * This is solely executed on a background IO thread.
> > > + */
> > > +static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
> > > +{
> > > +    int err = 0;
> > > +    V9fsState *s = pdu->s;
> > > +    struct dirent *entry;
> > > +
> > > +    errno = 0;
> > > +    entry = s->ops->readdir(&s->ctx, &fidp->fs);
> > > +    if (!entry && errno) {
> > > +        *dent = NULL;
> > > +        err = -errno;
> > > +    } else {
> > > +        *dent = entry;
> > > +    }
> > > +    return err;
> > > +}
> > > +
> > > +/*
> > > + * TODO: This will be removed for performance reasons.
> > > + * Use v9fs_co_readdir_lowlat() instead.
> 
> Oh, so we'd still have the current implementation being used, even
> with this patch applied... This would be okay for a RFC patch but
> in the end I'd really like to merge something that also converts
> v9fs_do_readdir_with_stat().

Yes, I know, but I would not recommend mixing these things at this point, 
because it would be an entire effort on its own.

v9fs_do_readdir_with_stat() is used for 9P2000.u, while v9fs_do_readdir() is 
used for 9P2000.L. They behave very differently, so it would not only 
require me to update v9fs_do_readdir_with_stat() and v9fs_read(), I would also 
need to write their own test cases (plural, since there are none at all yet) 
and benchmarks, and of course somebody would need to review all that additional 
code, and who would actually test it? I am actively testing my 
9P2000.L changes, but I am not actually using 9P2000.u, so I could not 
guarantee the same quality.

Considering the current commitment situation regarding 9pfs, we can only bring 
things forward with compromises. Hence I suggest I concentrate on fixing the 
worst performance issues on 9P2000.L side first, and later on if there are 
really people interested in seeing these improvements on 9P2000.u side and 
willing to at least help out with testing, then I can definitely also adjust 
9P2000.u side accordingly as well.

But to also make this clear: this patch 10 is not introducing any behaviour 
change for 9P2000.u side.

> > >  int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
> > >                                   struct dirent **dent)
> > >  {
> > >      int err;
> > > -    V9fsState *s = pdu->s;
> > > 
> > >      if (v9fs_request_cancelled(pdu)) {
> > >          return -EINTR;
> > >      }
> > > 
> > > -    v9fs_co_run_in_worker(
> > > -        {
> > > -            struct dirent *entry;
> > > +    v9fs_co_run_in_worker({
> > > +        err = do_readdir(pdu, fidp, dent);
> > > +    });
> > > +    return err;
> > > +}
> > > +
> > > +/*
> > > + * This is solely executed on a background IO thread.
> > > + *
> > > + * See v9fs_co_readdir_lowlat() (as its only user) below for details.
> 
> Since the only user is quite small, maybe just open-code this.

Mmm... maybe later on, as a cleanup patch or something. Current version is 
tested. I would like to avoid making things more complicated than they 
already are (i.e. accidental introduction of some bug just due to this).

> 
> > > + */
> > > +static int do_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> > > +                             struct V9fsDirEnt **entries,
> > > +                             int32_t maxsize, bool dostat)
> > > +{
> > > +    V9fsState *s = pdu->s;
> > > +    V9fsString name;
> > > +    int len, err = 0;
> > > +    int32_t size = 0;
> > > +    off_t saved_dir_pos;
> > > +    struct dirent *dent;
> > > +    struct V9fsDirEnt *e = NULL;
> > > +    V9fsPath path;
> > > +    struct stat stbuf;
> > > +
> > > +    *entries = NULL;
> > > +    v9fs_path_init(&path);
> > > +
> > > +    /*
> > > +     * TODO: Here should be a warn_report_once() if lock failed.
> > > +     *
> > > +     * With a good 9p client we should not get into concurrency here,
> > > +     * because a good client would not use the same fid for concurrent
> > > +     * requests. We do the lock here for safety reasons though. However
> > > +     * the client would then suffer performance issues, so better log
> > > that
> > > +     * issue here.
> > > +     */
> 
> This should probably be done with qemu_mutex_trylock() in a loop directly
> in v9fs_readdir_lock().

No, definitely not in the loop. That's intentionally *one* lock *outside* of 
the loop. The normal case does not require a lock at all, as the comment 
describes. Doing multiple locks inside the loop would unnecessarily reduce 
performance.

About *_trylock() instead of v9fs_readdir_lock(): it might be a candidate to 
address the TODO comment, but I would like to postpone that for the same 
reasons: encountering concurrency here is an odd/ill case. So for all well- 
implemented clients this is not an issue at all, and security (concerning 
malicious clients) is already provided by this lock. Addressing this TODO 
right now would potentially introduce bugs due to the added complexity, so 
really I would postpone that.

> > > +/**
> > > + * @brief Low latency variant of fs driver readdir handling.
> > > + *
> 
> Not sure it brings much to mention latency here... telling what this
> function does actually ie. "read multiple directory entries in a row"
> would be more informative.

NP

> > > + * Retrieves the requested (max. amount of) directory entries from the fs
> > > + * driver. This function must only be called by the main IO thread (top half).
> > > + * Internally this function call will be dispatched to a background IO thread
> > > + * (bottom half) where it is eventually executed by the fs driver.
> > > + *
> > > + * The old readdir implementation above just retrieves always one dir entry
> > > + * per call. The problem of that implementation above is that latency is
> > > + * added for (retrieval of) each directory entry, which in practice lead to
> > > + * latencies of several hundred ms for readdir of only one directory.
> > > + *
> > > + * This is avoided in this function by letting the fs driver retrieve all
> > > + * required directory entries with only call of this function and hence with
> > > + * only a single fs driver request.
> 
> True but these kind of considerations rather belong to the changelog...
> otherwise, we'll have to not forget to kill these lines when the "old
> readdir implementation" is no more.

Mmm... I do think this overall latency issue should be clarified somewhere as 
a comment. Where exactly is not that important to me. For instance it could 
also be described at the v9fs_co_run_in_worker() macro definition in coth.h 
instead, as a general rule of thumb when dispatching stuff.

The thing is: for >10 years several developers obviously did not realize the 
severe negative performance impact of frivolously dispatching tasks to 
different threads too often. It looks like linear code, right? But no, it is 
not.
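
To make that concrete, compare these two schematic fragments (loop bodies 
elided; the readdir call is the one from the hunks quoted above):

/* looks linear, but hops main thread -> worker -> main thread per entry */
for (;;) {
    v9fs_co_run_in_worker({
        entry = s->ops->readdir(&s->ctx, &fidp->fs);
    });
    ...
}

/* one single hop: the whole loop runs on the worker thread */
v9fs_co_run_in_worker({
    for (;;) {
        entry = s->ops->readdir(&s->ctx, &fidp->fs);
        ...
    }
});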

> > > + *
> > > + * @param pdu - the causing 9p (T_readdir) client request
> > > + * @param fidp - already opened directory where readdir shall be performed on
> > > + * @param entries - output for directory entries (must not be NULL)
> > > + * @param maxsize - maximum result message body size (in bytes)
> > > + * @param dostat - whether a stat() should be performed and returned for
> > > + *                 each directory entry
> > > + * @returns resulting response message body size (in bytes) on success,
> > > + *          negative error code otherwise
> > > + */
> > > +int coroutine_fn v9fs_co_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> 
> Not really a fan of the _lowlat (low fat ?) wording.
> 
> v9fs_co_readdir_many() or anything else that suggests we're going to
> read several entries in a row.

NP

> 
> > > +                                        struct V9fsDirEnt **entries,
> > > +                                        int32_t maxsize, bool dostat)
> 
> s/maxsize/max_count since this is the wording use in the caller.

Wouldn't do that. This is the driver side, not the 9p protocol request 
handler. As you might have realized before, "max_count" is semantically 
completely wrong. This variable does not count integral entries; it is rather 
a maximum size (in bytes) of the destination buffer being filled.

On 9p request handler side it is ok to use "max_count" instead, because the 
protocol definition uses exactly this term for the request parameter, but on 
driver side it's IMO inappropriate.

Best regards,
Christian Schoenebeck
Greg Kurz March 10, 2020, 6:33 p.m. UTC | #6
On Tue, 10 Mar 2020 16:10:41 +0100
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Montag, 9. März 2020 16:42:59 CET Greg Kurz wrote:
> > The main issue I see with this patch is that it is too big. At
> > least from my POV, with the little time I have for 9p, that's
> > the first thing that pushes me back... So if you could split
> > this patch down to some reviewable chunks, this may help
> > going forward.
> 
> This patch is also too big for my preference, but I don't see a viable way to 
> split it further into separate patches. I already separated all the patches I 
> could. If you have suggestions, very much appreciated!
> 

Well, the patch could be split in two or three parts at least:

(1) introduce the new function that reads multiple entries in codir.c

(2) use it from 9p.c

(3) remove unused stuff if anything remains

This doesn't seem to change much but the smaller diffstats
for each individual patch make them less scary :) and with
(1) applied only it is easier to compare what the old code
in 9p.c and the new one in codir.c do.

> The reason for this is that in order to fix these issues with current 
> T_readdir implementation, it requires to separate what's currently one task 
> (i.e. one function) into two separate tasks (i.e. two functions). There is no 
> sensible way to do that.
> 

Yeah, I won't ask for finer grain.

> But don't be too scared about this patch; if you look just at the diff 
> directly then yes, the diff is not very human readable. However if you apply 
> the patch and look at the resulting code, you'll notice that there are 
> actually only 2 functions (v9fs_do_readdir() in 9p.c and do_readdir_lowlat() 
> in codir.c) which require careful reviewing and that their resulting code is 
> actually very, very straight forward, and not long either.
> 

These are personal opinions. Careful reviewing can take a lot of time :)

> Current code on master is much more tricky and error prone due to the huge 
> amount of potential branches, individual error/cleanup handlers, high amount 
> of thread dispatching and much more. In the patched version all these code 
> complexities and error sources are removed.
> 

Come on :) The 9pfs code has been a can of worms from the beginning.
It produced more than the average amount of security-related bugs,
and most sadly, due to the overall lack of interest, it bitrotted
and missed a lot of cool improvements like an appropriate support of
unlinked files, QOM-ification of fsdev, conversion of proxy fsdev to
vhost-user, hot plug/unplug support, live migration support and
"much more"... The performance aspect of things is a full time job
I never had the opportunity to even consider. So yes, your changes
are likely beneficial but the code base is still extremely fragile
and sensitive to huge changes... not breaking things that happen
to work, even in a sub-optimal way, is essentially what I care for
these days.

> > > > diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
> > > > index 73f9a751e1..6ce7dc8cde 100644
> > > > --- a/hw/9pfs/codir.c
> > > > +++ b/hw/9pfs/codir.c
> > > > @@ -18,28 +18,189 @@
> > > > 
> > > >  #include "qemu/main-loop.h"
> > > >  #include "coth.h"
> > > > 
> > > > +/*
> > > > + * This is solely executed on a background IO thread.
> > > > + */
> > > > +static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent
> > > > **dent) +{
> > > > +    int err = 0;
> > > > +    V9fsState *s = pdu->s;
> > > > +    struct dirent *entry;
> > > > +
> > > > +    errno = 0;
> > > > +    entry = s->ops->readdir(&s->ctx, &fidp->fs);
> > > > +    if (!entry && errno) {
> > > > +        *dent = NULL;
> > > > +        err = -errno;
> > > > +    } else {
> > > > +        *dent = entry;
> > > > +    }
> > > > +    return err;
> > > > +}
> > > > +
> > > > +/*
> > > > + * TODO: This will be removed for performance reasons.
> > > > + * Use v9fs_co_readdir_lowlat() instead.
> > 
> > Oh, so we'd still have the current implementation being used, even
> > with this patch applied... This would be okay for a RFC patch but
> > in the end I'd really like to merge something that also converts
> > v9fs_do_readdir_with_stat().
> 
> Yes, I know, but I would not recommend mixing these things at this point, 
> because it would be an entire effort on its own.
> 
> v9fs_do_readdir_with_stat() is used for 9P2000.u, while v9fs_do_readdir() is 
> used for 9P2000.L. They're behaving very differently, so it would not only 
> require me to update v9fs_do_readdir_with_stat() and v9fs_read(), I would also 
> need to write their own test cases (plural, since there are none at all yet) 
> and benchmarks, and of course somebody what need to review all that additional 
> amount of code, and who would actually test it? I am actively testing my 
> 9P2000.L changes, but I am not actually using 9P2000.u, so I could not 
> guarantee for the same quality.
> 
> Considering the current commitment situation regarding 9pfs, we can only bring 
> things forward with compromises. Hence I suggest I concentrate on fixing the 
> worst performance issues on 9P2000.L side first, and later on if there are 
> really people interested in seeing these improvements on 9P2000.u side and 
> willing to at least help out with testing, then I can definitely also adjust 
> 9P2000.u side accordingly as well.
> 
> But to also make this clear: this patch 10 is not introducing any behaviour 
> change for 9P2000.u side.
> 

Ok, fair enough to leave 9p2000.U behind for now but I had to ask :)
/me is even wondering if we should start deprecating 9p2000.U since
most clients can use 9p2000.L now...

> > > >  int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
> > > >  
> > > >                                   struct dirent **dent)
> > > >  
> > > >  {
> > > >  
> > > >      int err;
> > > > 
> > > > -    V9fsState *s = pdu->s;
> > > > 
> > > >      if (v9fs_request_cancelled(pdu)) {
> > > >      
> > > >          return -EINTR;
> > > >      
> > > >      }
> > > > 
> > > > -    v9fs_co_run_in_worker(
> > > > -        {
> > > > -            struct dirent *entry;
> > > > +    v9fs_co_run_in_worker({
> > > > +        err = do_readdir(pdu, fidp, dent);
> > > > +    });
> > > > +    return err;
> > > > +}
> > > > +
> > > > +/*
> > > > + * This is solely executed on a background IO thread.
> > > > + *
> > > > + * See v9fs_co_readdir_lowlat() (as its only user) below for details.
> > 
> > Since the only user is quite small, maybe just open-code this.
> 
> Mmm... maybe later on, as a cleanup patch or something. Current version is 
> tested. I would like to avoid to make things more complicated than they 
> already are (i.e. accidental introduction of some bug just due to this).
> 

It seems we might agree on not breaking things that work ;-)

> > 
> > > > + */
> > > > +static int do_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> > > > +                             struct V9fsDirEnt **entries,
> > > > +                             int32_t maxsize, bool dostat)
> > > > +{
> > > > +    V9fsState *s = pdu->s;
> > > > +    V9fsString name;
> > > > +    int len, err = 0;
> > > > +    int32_t size = 0;
> > > > +    off_t saved_dir_pos;
> > > > +    struct dirent *dent;
> > > > +    struct V9fsDirEnt *e = NULL;
> > > > +    V9fsPath path;
> > > > +    struct stat stbuf;
> > > > +
> > > > +    *entries = NULL;
> > > > +    v9fs_path_init(&path);
> > > > +
> > > > +    /*
> > > > +     * TODO: Here should be a warn_report_once() if lock failed.
> > > > +     *
> > > > +     * With a good 9p client we should not get into concurrency here,
> > > > +     * because a good client would not use the same fid for concurrent
> > > > +     * requests. We do the lock here for safety reasons though. However
> > > > +     * the client would then suffer performance issues, so better log
> > > > that
> > > > +     * issue here.
> > > > +     */
> > 
> > This should probably be done with qemu_mutex_trylock() in a loop directly
> > in v9fs_readdir_lock().
> 
> No definitely not in the loop. That's intentionally *one* lock *outside* of 

Not sure we're talking about the same loop here...

I'm just suggesting that v9fs_readdir_lock() could be something like:

static inline void v9fs_readdir_lock(V9fsDir *dir)
{
    while (qemu_mutex_trylock(&dir->readdir_mutex)) {
        warn_report_once("blah");
    }
}

> the loop. The normal case is not requiring a lock at all, like the comment 
> describes. Doing multiple locks inside the loop unnecessaririly reduces 
> performance for no reason.
> 
> About *_trylock() instead of v9fs_readdir_lock(): might be a candidate to 

Hmm... are you suggesting that do_readdir_lowlat() should return if it
can't take the lock ?

> address the TODO comment, but I would like to pospone that for the same 
> reasons: it is an odd/ill case encountering concurrency here. So for all well 
> implemented clients this is not an issue at all, and security (concerning 
> malicious clients) is provided already by this lock. Addressing this TODO 
> already now would potentially unnecessarily introduce bugs due to added 
> complexity, so really I would postpone that.
> 

:)

> > > > +/**
> > > > + * @brief Low latency variant of fs driver readdir handling.
> > > > + *
> > 
> > Not sure it brings much to mention latency here... telling what this
> > function does actually ie. "read multiple directory entries in a row"
> > would be more informative.
> 
> NP
> 
> > > > + * Retrieves the requested (max. amount of) directory entries from the fs
> > > > + * driver. This function must only be called by the main IO thread (top half).
> > > > + * Internally this function call will be dispatched to a background IO thread
> > > > + * (bottom half) where it is eventually executed by the fs driver.
> > > > + *
> > > > + * The old readdir implementation above just retrieves always one dir entry
> > > > + * per call. The problem of that implementation above is that latency is
> > > > + * added for (retrieval of) each directory entry, which in practice lead to
> > > > + * latencies of several hundred ms for readdir of only one directory.
> > > > + *
> > > > + * This is avoided in this function by letting the fs driver retrieve all
> > > > + * required directory entries with only call of this function and hence with
> > > > + * only a single fs driver request.
> > 
> > True but these kind of considerations rather belong to the changelog...
> > otherwise, we'll have to not forget to kill these lines when the "old
> > readdir implementation" is no more.
> 
> Mmm... I do think this overall latency issue should be clarified somewhere as 

The issue itself is best described in a changelog, but...

> a comment. Where exactly is not that important to me. For instance it could 
> also be described on v9fs_co_run_in_worker() macro definition in coth.h 
> instead, as general rule of thumb when dispatching stuff.

... if you have useful recommendations that don't involve referring to a
piece of code that might go away, a comment in coth.h is certainly a good
idea.

> The thing is: for >10 years several developers obviously did not realize the 
> severe negative performance impact of frivolously dispatching tasks to 

I won't dare to comment on some people I don't know not doing the obvious
right things at the time... what I do know is that for >10 years, nobody
cared for this code. Period. You're lucky it is still functional ;-)

> different threads too often. It looks like linear code, right? But no, it is 
> not.
> 
> > > > + *
> > > > + * @param pdu - the causing 9p (T_readdir) client request
> > > > + * @param fidp - already opened directory where readdir shall be performed on
> > > > + * @param entries - output for directory entries (must not be NULL)
> > > > + * @param maxsize - maximum result message body size (in bytes)
> > > > + * @param dostat - whether a stat() should be performed and returned for
> > > > + *                 each directory entry
> > > > + * @returns resulting response message body size (in bytes) on success,
> > > > + *          negative error code otherwise
> > > > + */
> > > > +int coroutine_fn v9fs_co_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> > 
> > Not really a fan of the _lowlat (low fat ?) wording.
> > 
> > v9fs_co_readdir_many() or anything else that suggests we're going to
> > read several entries in a row.
> 
> NP
> 
> > 
> > > > +                                        struct V9fsDirEnt **entries,
> > > > +                                        int32_t maxsize, bool dostat)
> > 
> > s/maxsize/max_count since this is the wording use in the caller.
> 
> Wouldn't do that. This is the driver side, not the 9p protocol request 
> handler. As you might have relized before, "max_count" is semantically 
> completely wrong. This variable does not count integral entries, it is rather 
> a maximum size (in bytes) of the destination buffer being filled.
> 
> On 9p request handler side it is ok to use "max_count" instead, because the 
> protocol definition uses exactly this term for the request parameter, but on 
> driver side it's IMO inappropriate.
> 

I agree the max_count wording sucks and I don't care that much about
naming variables according to protocol definitions, but I do care
about using the same name in caller/callee when it makes sense, for a
better grep or cscope experience.

> Best regards,
> Christian Schoenebeck
> 
>
Christian Schoenebeck March 11, 2020, 1:18 a.m. UTC | #7
On Dienstag, 10. März 2020 19:33:36 CET Greg Kurz wrote:
> > This patch is also too big for my preference, but I don't see a viable way
> > to split it further into separate patches. I already separated all the
> > patches I could. If you have suggestions, very much appreciated!
> 
> Well, the patch could be split in two or three parts at least:
> 
> (1) introduce the new function that reads multiple entries in codir.c
> 
> (2) use it from 9p.c
> 
> (3) remove unused stuff if anything remains
> 
> This doesn't seem to change much but the smaller diffstats
> for each individual patch make them less scary :) and with
> (1) applied only it is easier to compare what the old code
> in 9p.c and the new one in codir.c do.
> 
> > The reason for this is that in order to fix these issues with current
> > T_readdir implementation, it requires to separate what's currently one
> > task
> > (i.e. one function) into two separate tasks (i.e. two functions). There is
> > no sensible way to do that.
> 
> Yeah, I won't ask for finer grain.

Me confused. Does that mean your split suggestion was just theoretical, or do 
you need it?

> > But don't be too scared about this patch; if you look just at the diff
> > directly then yes, the diff is not very human readable. However if you
> > apply the patch and look at the resulting code, you'll notice that there
> > are actually only 2 functions (v9fs_do_readdir() in 9p.c and
> > do_readdir_lowlat() in codir.c) which require careful reviewing and that
> > their resulting code is actually very, very straight forward, and not
> > long either.
> 
> These are personal opinions. Careful reviewing can take a lot of time :)

Well, it depends on what precisely you mean by opinion. :) That this patch 
actually reduces code complexity is a fact (fewer branches, fewer locks, fewer 
dispatches, fewer side effects, fewer error/cleanup handlers). These are 
objectively measurable quantities.

But yes, nevertheless reviewing it costs time.

> > Current code on master is much more tricky and error prone due to the huge
> > amount of potential branches, individual error/cleanup handlers, high
> > amount of thread dispatching and much more. In the patched version all
> > these code complexities and error sources are removed.
> 
> Come on :) The 9pfs code has been a can of worms from the beginning.
> It produced more than the average amount of security-related bugs,
> and most sadly, due to the overall lack of interest, it bitrotted
> and missed a lot of cool improvements like an appropriate support of
> unlinked files, QOM-ification of fsdev, conversion of proxy fsdev to
> vhost-user, hot plug/unplug support, live migration support and
> "much more"... The performance aspect of things is a full time job

No, the performance issues are actually very manageable in the case of 9pfs.
I already addressed readdir with this patch (by far the worst performance 
issue), and then there would just be one more severe performance issue: 
walkdir.

My intention is not to squeeze out the last fractional percent of performance 
for 9pfs, but you certainly agree that a simple "ls" blocking for more than 1 
second is something that should be fixed, and fortunately the amount of 
change involved is far less than I originally feared it would be.

> I never had the opportunity to even consider. So yes, your changes
> are likely beneficial but the code base is still extremely fragile
> and sensible to huge changes... not breaking things that happen
> to work, even in a sub-optimal way, is essentially what I care for
> these days.

And I think I'm also very careful not to break anything. I carefully consider 
what to touch and what not. I wrote test cases and I am actively testing my 
changes with real installations and snapshots as well.

I think the cause of the disagreements we have is solely our use cases of 9pfs: 
your personal use case does not seem to require any performance considerations 
or multi-user aspects, whereas my use case requires at least some minimum 
performance grade for utilizing 9pfs for server applications.

> > > Oh, so we'd still have the current implementation being used, even
> > > with this patch applied... This would be okay for a RFC patch but
> > > in the end I'd really like to merge something that also converts
> > > v9fs_do_readdir_with_stat().
> > 
> > Yes, I know, but I would not recommend mixing these things at this point,
> > because it would be an entire effort on its own.
> > 
> > v9fs_do_readdir_with_stat() is used for 9P2000.u, while v9fs_do_readdir()
> > is used for 9P2000.L. They're behaving very differently, so it would not
> > only require me to update v9fs_do_readdir_with_stat() and v9fs_read(), I
> > would also need to write their own test cases (plural, since there are
> > none at all yet) and benchmarks, and of course somebody what need to
> > review all that additional amount of code, and who would actually test
> > it? I am actively testing my 9P2000.L changes, but I am not actually
> > using 9P2000.u, so I could not guarantee for the same quality.
> > 
> > Considering the current commitment situation regarding 9pfs, we can only
> > bring things forward with compromises. Hence I suggest I concentrate on
> > fixing the worst performance issues on 9P2000.L side first, and later on
> > if there are really people interested in seeing these improvements on
> > 9P2000.u side and willing to at least help out with testing, then I can
> > definitely also adjust 9P2000.u side accordingly as well.
> > 
> > But to also make this clear: this patch 10 is not introducing any
> > behaviour
> > change for 9P2000.u side.
> 
> Ok, fair enough to leave 9p2000.U behind for now but I had to ask :)
> /me is even wondering if we should start deprecating 9p2000.U since
> most clients can use 9p2000.L now...

I probably wouldn't. On macOS for instance there's only 9p2000.u. In the end 
9p2000.L is Linux specific. Deprecating 9p2000.u would mean a shift 
towards only supporting Linux guests.

> > Mmm... maybe later on, as a cleanup patch or something. Current version is
> > tested. I would like to avoid to make things more complicated than they
> > already are (i.e. accidental introduction of some bug just due to this).
> 
> It seems we might agree on not breaking things that work ;-)

Of course! All the things I work on in 9pfs are just fixes. But that's probably 
where we start to disagree. :)

> > > > > +static int do_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> > > > > +                             struct V9fsDirEnt **entries,
> > > > > +                             int32_t maxsize, bool dostat)
> > > > > +{
> > > > > +    V9fsState *s = pdu->s;
> > > > > +    V9fsString name;
> > > > > +    int len, err = 0;
> > > > > +    int32_t size = 0;
> > > > > +    off_t saved_dir_pos;
> > > > > +    struct dirent *dent;
> > > > > +    struct V9fsDirEnt *e = NULL;
> > > > > +    V9fsPath path;
> > > > > +    struct stat stbuf;
> > > > > +
> > > > > +    *entries = NULL;
> > > > > +    v9fs_path_init(&path);
> > > > > +
> > > > > +    /*
> > > > > +     * TODO: Here should be a warn_report_once() if lock failed.
> > > > > +     *
> > > > > +     * With a good 9p client we should not get into concurrency
> > > > > here,
> > > > > +     * because a good client would not use the same fid for
> > > > > concurrent
> > > > > +     * requests. We do the lock here for safety reasons though.
> > > > > However
> > > > > +     * the client would then suffer performance issues, so better
> > > > > log
> > > > > that
> > > > > +     * issue here.
> > > > > +     */
> > > 
> > > This should probably be done with qemu_mutex_trylock() in a loop
> > > directly
> > > in v9fs_readdir_lock().
> > 
> > No definitely not in the loop. That's intentionally *one* lock *outside*
> > of
> 
> Not sure we're talking about the same loop here...
> 
> I'm just suggesting that v9fs_readdir_lock() could be something like:
> 
> static inline void v9fs_readdir_lock(V9fsDir *dir)
> {
>     while (qemu_mutex_trylock(&dir->readdir_mutex)) {
>         warn_report_once("blah");
>     }
> }

Ah yeah, in fact I got you wrong on this one. I thought you meant to move the 
lock within do_readdir_lowlat(). About your actual suggestion above ...

> > the loop. The normal case is not requiring a lock at all, like the comment
> > describes. Doing multiple locks inside the loop unnecessaririly reduces
> > performance for no reason.
> > 
> > About *_trylock() instead of v9fs_readdir_lock(): might be a candidate to
> 
> Hmm... are you suggesting that do_readdir_lowlat() should return if it
> can't take the lock ?

... yep, that's what I had in mind for it later on. I would not mind running 
trylock in a loop like you suggested; if it fails, log a warning and return.
Because if the lock fails, then the client is buggy. The user can read the 
warning in the logs and report the bug so the client gets fixed. It's not 
worth handling that case in any fancy way on the server side.
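
Something along these lines, building on your snippet above (the function 
name is just a placeholder, and which errno to return to the client would 
still be an open detail):

/* needs qemu/error-report.h for warn_report_once() */
static inline int v9fs_readdir_trylock(V9fsDir *dir)
{
    if (qemu_mutex_trylock(&dir->readdir_mutex)) {
        warn_report_once("9p: concurrent readdir on the same fid");
        return -EAGAIN;  /* caller aborts the T_readdir instead of spinning */
    }
    return 0;
}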

> > > > > + * Retrieves the requested (max. amount of) directory entries from
> > > > > the
> > > > > fs
> > > > > + * driver. This function must only be called by the main IO thread
> > > > > (top
> > > > > half). + * Internally this function call will be dispatched to a
> > > > > background
> > > > > IO thread + * (bottom half) where it is eventually executed by the
> > > > > fs
> > > > > driver. + *
> > > > > + * The old readdir implementation above just retrieves always one
> > > > > dir
> > > > > entry + * per call. The problem of that implementation above is that
> > > > > latency is + * added for (retrieval of) each directory entry, which
> > > > > in
> > > > > practice lead to + * latencies of several hundred ms for readdir of
> > > > > only one directory. + * + * This is avoided in this function by
> > > > > letting
> > > > > the fs driver retrieve all + * required directory entries with only
> > > > > call of this function and hence with + * only a single fs driver
> > > > > request.
> > > 
> > > True but these kind of considerations rather belong to the changelog...
> > > otherwise, we'll have to not forget to kill these lines when the "old
> > > readdir implementation" is no more.
> > 
> > Mmm... I do think this overall latency issue should be clarified somewhere
> > as
> The issue itself is best described in a changelog, but...
> 
> > a comment. Where exactly is not that important to me. For instance it
> > could
> > also be described on v9fs_co_run_in_worker() macro definition in coth.h
> > instead, as general rule of thumb when dispatching stuff.
> 
> ... if you have useful recommendations that don't involve referring to a
> piece of code that might go away, a comment in coth.h is certainly a good
> idea.

Ok, I'll move it to coth.h then.
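
As a first draft for the wording, something like this next to the 
v9fs_co_run_in_worker() definition:

/*
 * Dispatching work to a worker thread (and back) adds scheduling latency
 * to every single call. So batch the work: run a whole loop (e.g. a
 * complete readdir) inside one v9fs_co_run_in_worker() block instead of
 * dispatching once per item, otherwise the per-item latencies add up to
 * easily noticeable delays for a single 9p request.
 */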

> > The thing is: for >10 years several developers obviously did not realize
> > the severe negative performance impact of frivolously dispatching tasks
> > to
> I won't dare to comment on some people I don't know not doing the obvious
> right things at the time... what I do know is that for >10 years, nobody
> cared for this code. Period. You're lucky it is still functional ;-)

This was not about judging individuals, just about expressing that the latency 
impact of dispatching does not seem to be obvious to people, and about trying to 
avoid similar mistakes by placing appropriate comment(s).

> > > > > +                                        struct V9fsDirEnt
> > > > > **entries,
> > > > > +                                        int32_t maxsize, bool
> > > > > dostat)
> > > 
> > > s/maxsize/max_count since this is the wording use in the caller.
> > 
> > Wouldn't do that. This is the driver side, not the 9p protocol request
> > handler. As you might have relized before, "max_count" is semantically
> > completely wrong. This variable does not count integral entries, it is
> > rather a maximum size (in bytes) of the destination buffer being filled.
> > 
> > On 9p request handler side it is ok to use "max_count" instead, because
> > the
> > protocol definition uses exactly this term for the request parameter, but
> > on driver side it's IMO inappropriate.
> 
> I agree the max_count wording sucks and I don't care that much to
> name variables according to protocol definitions, but I do care
> to use the same name in caller/callee when it makes sense, for a
> better grep or cscope experience.

So you insist on that max_count change?

Best regards,
Christian Schoenebeck
Greg Kurz March 11, 2020, 4:14 p.m. UTC | #8
On Wed, 11 Mar 2020 02:18:04 +0100
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Dienstag, 10. März 2020 19:33:36 CET Greg Kurz wrote:
> > > This patch is also too big for my preference, but I don't see a viable way
> > > to split it further into separate patches. I already separated all the
> > > patches I could. If you have suggestions, very much appreciated!
> > 
> > Well, the patch could be split in two or three parts at least:
> > 
> > (1) introduce the new function that reads multiple entries in codir.c
> > 
> > (2) use it from 9p.c
> > 
> > (3) remove unused stuff if anything remains
> > 
> > This doesn't seem to change much but the smaller diffstats
> > for each individual patch make them less scary :) and with
> > (1) applied only it is easier to compare what the old code
> > in 9p.c and the new one in codir.c do.
> > 
> > > The reason for this is that in order to fix these issues with current
> > > T_readdir implementation, it requires to separate what's currently one
> > > task
> > > (i.e. one function) into two separate tasks (i.e. two functions). There is
> > > no sensible way to do that.
> > 
> > Yeah, I won't ask for finer grain.
> 
> Me confused. Does that mean your split suggestion was just theoretical, or do 
> you need it?
> 

I need it and I won't ask for more splitting. Promised ! :)

> > > But don't be too scared about this patch; if you look just at the diff
> > > directly then yes, the diff is not very human readable. However if you
> > > apply the patch and look at the resulting code, you'll notice that there
> > > are actually only 2 functions (v9fs_do_readdir() in 9p.c and
> > > do_readdir_lowlat() in codir.c) which require careful reviewing and that
> > > their resulting code is actually very, very straight forward, and not
> > > long either.
> > 
> > These are personal opinions. Careful reviewing can take a lot of time :)
> 
> Well, depends on what precisely you mean with opinion. :) That this patch 
> actually reduces code complexity is a fact (less branches, less locks, less 
> dispatches, less side effects, less error/cleanup handlers). These are 
> objectively measurable quantities.
> 

You're likely right but that wasn't my point.

> But yes, nevertheless reviewing it costs time.
> 

That is my point: even if the resulting code is a lot better than the
old one in several aspects, it is still a non-trivial change to review
on my side.

> > > Current code on master is much more tricky and error prone due to the huge
> > > amount of potential branches, individual error/cleanup handlers, high
> > > amount of thread dispatching and much more. In the patched version all
> > > these code complexities and error sources are removed.
> > 
> > Come on :) The 9pfs code has been a can of worms from the beginning.
> > It produced more than the average amount of security-related bugs,
> > and most sadly, due to the overall lack of interest, it bitrotted
> > and missed a lot of cool improvements like an appropriate support of
> > unlinked files, QOM-ification of fsdev, conversion of proxy fsdev to
> > vhost-user, hot plug/unplug support, live migration support and
> > "much more"... The performance aspect of things is a full time job
> 
> No, the performance issues are actually very managable in case of 9pfs.
> I already addressed readdir with this patch (by far the worst performance 

They're very manageable if someone cares and spends time. Thanks again
for doing this.

> issue), and then there would just be one more severe performance issue: 
> walkdir.
> 

That doesn't surprise me. It is a big function that may end up doing a
lot of dispatching.

> My intention is not to squeeze out the last fractional percent of performance 
> for 9pfs, but you certainly agree that a simple "ls" blocking for more than 1 
> second is something that should be fixed, and fortunately the amount of 

I never observed such timeouts with my personal toy use of 9p but
you did and this motivated you to step in, which is very welcome.

> changes involved are far less than I originally feared they would.
> 

Fortunately is the word :)

> > I never had the opportunity to even consider. So yes, your changes
> > are likely beneficial but the code base is still extremely fragile
> > and sensible to huge changes... not breaking things that happen
> > to work, even in a sub-optimal way, is essentially what I care for
> > these days.
> 
> And I think I'm also very careful not breaking anything. I carefully consider 
> what to touch and what not. I wrote test cases and I am actively testing my 
> changes with real installations and snapshots as well.
> 

Everyone thinks he's also very careful :)

> I think the cause of disagreements we have are solely our use cases of 9pfs: 
> your personal use case does not seem to require any performance considerations 
> or multi-user aspects, whereas my use case requires at least some minimum 
> performance grade for utilizing 9pfs for server applications.
> 

Your point about the personal use case is right indeed but our disagreements,
if any, aren't uniquely related to that. It's more about maintainership and
available time really. I'm 100% benevolent "Odd fixer" now and I just try
to avoid being forced into fixing things after my PR is merged. If you want
to go faster, then you're welcome to upgrade to maintainer and send PRs.
This would make sense since you care for 9p, you showed a good understanding
of the code and you provided beneficial contributions so far :)

> > > > Oh, so we'd still have the current implementation being used, even
> > > > with this patch applied... This would be okay for a RFC patch but
> > > > in the end I'd really like to merge something that also converts
> > > > v9fs_do_readdir_with_stat().
> > > 
> > > Yes, I know, but I would not recommend mixing these things at this point,
> > > because it would be an entire effort on its own.
> > > 
> > > v9fs_do_readdir_with_stat() is used for 9P2000.u, while v9fs_do_readdir()
> > > is used for 9P2000.L. They're behaving very differently, so it would not
> > > only require me to update v9fs_do_readdir_with_stat() and v9fs_read(), I
> > > would also need to write their own test cases (plural, since there are
> > > none at all yet) and benchmarks, and of course somebody what need to
> > > review all that additional amount of code, and who would actually test
> > > it? I am actively testing my 9P2000.L changes, but I am not actually
> > > using 9P2000.u, so I could not guarantee for the same quality.
> > > 
> > > Considering the current commitment situation regarding 9pfs, we can only
> > > bring things forward with compromises. Hence I suggest I concentrate on
> > > fixing the worst performance issues on 9P2000.L side first, and later on
> > > if there are really people interested in seeing these improvements on
> > > 9P2000.u side and willing to at least help out with testing, then I can
> > > definitely also adjust 9P2000.u side accordingly as well.
> > > 
> > > But to also make this clear: this patch 10 is not introducing any
> > > behaviour
> > > change for 9P2000.u side.
> > 
> > Ok, fair enough to leave 9p2000.U behind for now but I had to ask :)
> > /me is even wondering if we should start deprecating 9p2000.U since
> > most clients can use 9p2000.L now...
> 
> I probably wouldn't. On macOS for instance there's only 9p2000.u. In the end 

Hmm... the only thing I've heard about is:

https://github.com/benavento/mac9p

and unless I'm missing something, this seems to only have a socket based
transport... the server in QEMU is for virtio-9p and Xen 9p devices only.
Unless mac9p seriously plans to implement a driver for either of those,
I'm still not convinced it is worth keeping .U around... and BTW, our
deprecation policy imposes a 2 QEMU versions cooling period before
actually removing the code. This would leave ample time for someone
to speak up.

> 9p2000.L is Linux specific. By deprecating 9p2000.u that would mean a shift 
> towards only supporting Linux guests.
> 
> > > Mmm... maybe later on, as a cleanup patch or something. Current version is
> > > tested. I would like to avoid to make things more complicated than they
> > > already are (i.e. accidental introduction of some bug just due to this).
> > 
> > It seems we might agree on not breaking things that work ;-)
> 
> Of course! All the things I work on 9pfs are just fixes. But that's probably 
> where we start to disagree. :)
> 

:)

> > > > > > +static int do_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
> > > > > > +                             struct V9fsDirEnt **entries,
> > > > > > +                             int32_t maxsize, bool dostat)
> > > > > > +{
> > > > > > +    V9fsState *s = pdu->s;
> > > > > > +    V9fsString name;
> > > > > > +    int len, err = 0;
> > > > > > +    int32_t size = 0;
> > > > > > +    off_t saved_dir_pos;
> > > > > > +    struct dirent *dent;
> > > > > > +    struct V9fsDirEnt *e = NULL;
> > > > > > +    V9fsPath path;
> > > > > > +    struct stat stbuf;
> > > > > > +
> > > > > > +    *entries = NULL;
> > > > > > +    v9fs_path_init(&path);
> > > > > > +
> > > > > > +    /*
> > > > > > +     * TODO: Here should be a warn_report_once() if lock failed.
> > > > > > +     *
> > > > > > +     * With a good 9p client we should not get into concurrency
> > > > > > here,
> > > > > > +     * because a good client would not use the same fid for
> > > > > > concurrent
> > > > > > +     * requests. We do the lock here for safety reasons though.
> > > > > > However
> > > > > > +     * the client would then suffer performance issues, so better
> > > > > > log
> > > > > > that
> > > > > > +     * issue here.
> > > > > > +     */
> > > > 
> > > > This should probably be done with qemu_mutex_trylock() in a loop
> > > > directly
> > > > in v9fs_readdir_lock().
> > > 
> > > No definitely not in the loop. That's intentionally *one* lock *outside*
> > > of
> > 
> > Not sure we're talking about the same loop here...
> > 
> > I'm just suggesting that v9fs_readdir_lock() could be something like:
> > 
> > static inline void v9fs_readdir_lock(V9fsDir *dir)
> > {
> >     while (qemu_mutex_trylock(&dir->readdir_mutex)) {
> >         warn_report_once("blah");
> >     }
> > }
> 
> Ah yeah, in fact I got you wrong on this one. I thought you meant to move the 
> lock within do_readdir_lowlat(). About your actual suggestion above ...
> 
> > > the loop. The normal case is not requiring a lock at all, like the comment
> > > describes. Doing multiple locks inside the loop unnecessaririly reduces
> > > performance for no reason.
> > > 
> > > About *_trylock() instead of v9fs_readdir_lock(): might be a candidate to
> > 
> > Hmm... are you suggesting that do_readdir_lowlat() should return if it
> > can't take the lock ?
> 
> ... yep, that's what I had in mind for it later on. I would not mind running 
> trylock in a loop like you suggested; if it fails, log a warning and return.
> Because if the lock fails, then client is buggy. User can read the warning in 
> the logs and report the bug for client to be fixed. Not worth to handle that 
> case in any fancy way by server.
> 

The locking is just a safety net because readdir(3) isn't necessarily
thread-safe. It was never my intention to add an error path for that.
And thinking again, sending concurrent Treaddir requests on the same
fid is certainly a weird thing to do, but is it really a bug?

> > > > > > + * Retrieves the requested (max. amount of) directory entries from
> > > > > > the
> > > > > > fs
> > > > > > + * driver. This function must only be called by the main IO thread
> > > > > > (top
> > > > > > half). + * Internally this function call will be dispatched to a
> > > > > > background
> > > > > > IO thread + * (bottom half) where it is eventually executed by the
> > > > > > fs
> > > > > > driver. + *
> > > > > > + * The old readdir implementation above just retrieves always one
> > > > > > dir
> > > > > > entry + * per call. The problem of that implementation above is that
> > > > > > latency is + * added for (retrieval of) each directory entry, which
> > > > > > in
> > > > > > practice lead to + * latencies of several hundred ms for readdir of
> > > > > > only one directory. + * + * This is avoided in this function by
> > > > > > letting
> > > > > > the fs driver retrieve all + * required directory entries with only
> > > > > > call of this function and hence with + * only a single fs driver
> > > > > > request.
> > > > 
> > > > True but these kind of considerations rather belong to the changelog...
> > > > otherwise, we'll have to not forget to kill these lines when the "old
> > > > readdir implementation" is no more.
> > > 
> > > Mmm... I do think this overall latency issue should be clarified somewhere
> > > as
> > The issue itself is best described in a changelog, but...
> > 
> > > a comment. Where exactly is not that important to me. For instance it
> > > could
> > > also be described on v9fs_co_run_in_worker() macro definition in coth.h
> > > instead, as general rule of thumb when dispatching stuff.
> > 
> > ... if you have useful recommendations that don't involve referring to a
> > piece of code that might go away, a comment in coth.h is certainly a good
> > idea.
> 
> Ok, I'll move it to coth.h then.
> 
> > > The thing is: for >10 years several developers obviously did not realize
> > > the severe negative performance impact of frivolously dispatching tasks
> > > to
> > I won't dare to comment on some people I don't know not doing the obvious
> > right things at the time... what I do know is that for >10 years, nobody
> > cared for this code. Period. You're lucky it is still functional ;-)
> 
> This was not about judging individuals, just to express that the latency 
> impact of dispatching does not seem to be obvious for people, and trying to 

Maybe because it isn't that obvious...

> avoid similar mistakes by placing appropriate comment(s).
> 
> > > > > > +                                        struct V9fsDirEnt
> > > > > > **entries,
> > > > > > +                                        int32_t maxsize, bool
> > > > > > dostat)
> > > > 
> > > > s/maxsize/max_count since this is the wording use in the caller.
> > > 
> > > Wouldn't do that. This is the driver side, not the 9p protocol request
> > > handler. As you might have relized before, "max_count" is semantically
> > > completely wrong. This variable does not count integral entries, it is
> > > rather a maximum size (in bytes) of the destination buffer being filled.
> > > 
> > > On 9p request handler side it is ok to use "max_count" instead, because
> > > the
> > > protocol definition uses exactly this term for the request parameter, but
> > > on driver side it's IMO inappropriate.
> > 
> > I agree the max_count wording sucks and I don't care that much to
> > name variables according to protocol definitions, but I do care
> > to use the same name in caller/callee when it makes sense, for a
> > better grep or cscope experience.
> 
> So you insist on that max_count change?
> 

Rather the opposite. Kill max_count and make it maxsize, in a preparatory
patch, with the reasons you've laid out.

> Best regards,
> Christian Schoenebeck
> 
> 

Cheers,

--
Greg
Christian Schoenebeck March 11, 2020, 7:54 p.m. UTC | #9
On Mittwoch, 11. März 2020 17:14:08 CET Greg Kurz wrote:
> On Wed, 11 Mar 2020 02:18:04 +0100
> 
> Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > On Dienstag, 10. März 2020 19:33:36 CET Greg Kurz wrote:
> > > > This patch is also too big for my preference, but I don't see a viable
> > > > way
> > > > to split it further into separate patches. I already separated all the
> > > > patches I could. If you have suggestions, very much appreciated!
> > > 
> > > Well, the patch could be split in two or three parts at least:
> > > 
> > > (1) introduce the new function that reads multiple entries in codir.c
> > > 
> > > (2) use it from 9p.c
> > > 
> > > (3) remove unused stuff if anything remains
> > > 
> > > This doesn't seem to change much but the smaller diffstats
> > > for each individual patch make them less scary :) and with
> > > (1) applied only it is easier to compare what the old code
> > > in 9p.c and the new one in codir.c do.
> > > 
> > > > The reason for this is that in order to fix these issues with current
> > > > T_readdir implementation, it requires to separate what's currently one
> > > > task
> > > > (i.e. one function) into two separate tasks (i.e. two functions).
> > > > There is
> > > > no sensible way to do that.
> > > 
> > > Yeah, I won't ask for finer grain.
> > 
> > Me confused. Does that mean your split suggestion was just theoretical, or
> > do you need it?
> 
> I need it and I won't ask for more splitting. Promised ! :)

Okay then. :)

> > > > Current code on master is much more tricky and error prone due to the
> > > > huge
> > > > amount of potential branches, individual error/cleanup handlers, high
> > > > amount of thread dispatching and much more. In the patched version all
> > > > these code complexities and error sources are removed.
> > > 
> > > Come on :) The 9pfs code has been a can of worms from the beginning.
> > > It produced more than the average amount of security-related bugs,
> > > and most sadly, due to the overall lack of interest, it bitrotted
> > > and missed a lot of cool improvements like an appropriate support of
> > > unlinked files, QOM-ification of fsdev, conversion of proxy fsdev to
> > > vhost-user, hot plug/unplug support, live migration support and
> > > "much more"... The performance aspect of things is a full time job
> > 
> > No, the performance issues are actually very managable in case of 9pfs.
> > I already addressed readdir with this patch (by far the worst performance
> 
> They're very manageable if someone cares and spends time. Thanks again
> for doing this.

Thanks!

> > My intention is not to squeeze out the last fractional percent of
> > performance for 9pfs, but you certainly agree that a simple "ls" blocking
> > for more than 1 second is something that should be fixed, and fortunately
> > the amount of
> I never observed such timeouts with my personal toy use of 9p but
> you did and this motivated you to step in, which is very welcome.

Probably you don't notice it much because of the dirent cache on the guest side.
If the guest's dirent cache is cold, and you do a readdir() ("ls") on some
directory with e.g. several hundred entries, you should notice it.
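
Just to illustrate what I mean by that: the latency boils down to something
like the tiny reproducer below. It is nothing more than a plain POSIX
readdir() pass with a timer around it (not part of this series, and not 9p
specific at all); compiled and run inside the guest against a 9p mount with a
few hundred entries and a cold dirent cache, it shows the delay I am talking
about.

#include <dirent.h>
#include <stdio.h>
#include <time.h>

/* Time one full readdir() pass over the directory given as argv[1]. */
int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : ".";
    struct timespec t0, t1;
    struct dirent *ent;
    long n = 0;
    DIR *dir;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    dir = opendir(path);
    if (!dir) {
        perror("opendir");
        return 1;
    }
    while ((ent = readdir(dir)) != NULL) {
        n++;    /* just count the entries, like a plain "ls" would list them */
    }
    closedir(dir);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("%ld entries in %.3f s\n", n,
           (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
    return 0;
}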

> > I think the cause of disagreements we have are solely our use cases of
> > 9pfs: your personal use case does not seem to require any performance
> > considerations or multi-user aspects, whereas my use case requires at
> > least some minimum performance grade for utilizing 9pfs for server
> > applications.
> 
> Your point about the personal use case is right indeed but our
> disagreements, if any, aren't uniquely related to that. It's more about
> maintainership and available time really. I'm 100% benevolent "Odd fixer"
> now and I just try to avoid being forced into fixing things after my PR is
> merged. If you want to go faster, then you're welcome to upgrade to
> maintainer and send PRs. This would make sense since you care for 9p, you
> showed a good understanding of the code and you provided beneficial
> contributions so far :)

That maintainership upgrade is planned. The question is just when. What was 
your idea, co-maintainership?

> > > > > Oh, so we'd still have the current implementation being used, even
> > > > > with this patch applied... This would be okay for a RFC patch but
> > > > > in the end I'd really like to merge something that also converts
> > > > > v9fs_do_readdir_with_stat().
> > > > 
> > > > Yes, I know, but I would not recommend mixing these things at this
> > > > point,
> > > > because it would be an entire effort on its own.
> > > > 
> > > > v9fs_do_readdir_with_stat() is used for 9P2000.u, while
> > > > v9fs_do_readdir()
> > > > is used for 9P2000.L. They're behaving very differently, so it would
> > > > not
> > > > only require me to update v9fs_do_readdir_with_stat() and v9fs_read(),
> > > > I
> > > > would also need to write their own test cases (plural, since there are
> > > > none at all yet) and benchmarks, and of course somebody would need to
> > > > review all that additional amount of code, and who would actually test
> > > > it? I am actively testing my 9P2000.L changes, but I am not actually
> > > > using 9P2000.u, so I could not guarantee for the same quality.
> > > > 
> > > > Considering the current commitment situation regarding 9pfs, we can
> > > > only
> > > > bring things forward with compromises. Hence I suggest I concentrate
> > > > on
> > > > fixing the worst performance issues on 9P2000.L side first, and later
> > > > on
> > > > if there are really people interested in seeing these improvements on
> > > > 9P2000.u side and willing to at least help out with testing, then I
> > > > can
> > > > definitely also adjust 9P2000.u side accordingly as well.
> > > > 
> > > > But to also make this clear: this patch 10 is not introducing any
> > > > behaviour
> > > > change for 9P2000.u side.
> > > 
> > > Ok, fair enough to leave 9p2000.U behind for now but I had to ask :)
> > > /me is even wondering if we should start deprecating 9p2000.U since
> > > most clients can use 9p2000.L now...
> > 
> > I probably wouldn't. On macOS for instance there's only 9p2000.u. In the
> > end
> Hmm... the only thing I've heard about is:
> 
> https://github.com/benavento/mac9p
> 
> and unless I'm missing something, this seems to only have a socket based
> transport... the server in QEMU is for virtio-9p and Xen 9p devices only.
> Unless mac9p seriously plans to implement a driver for either of those,
> I'm still not convinced it is worth keeping .U around... and BTW, our
> deprecation policy imposes a 2 QEMU versions cooling period before
> actually removing the code. This would leave ample time for someone
> to speak up.

Well, I leave that up to you. I could consider adding a transport for macOS 
guests, but that's definitely not going to happen in any near future.

> > > Not sure we're talking about the same loop here...
> > > 
> > > I'm just suggesting that v9fs_readdir_lock() could be something like:
> > > 
> > > static inline void v9fs_readdir_lock(V9fsDir *dir)
> > > {
> > > 
> > >     while (qemu_mutex_trylock(&dir->readdir_mutex)) {
> > >     
> > >         warn_report_once("blah");
> > >     
> > >     }
> > > 
> > > }
> > 
> > Ah yeah, in fact I got you wrong on this one. I thought you meant to move
> > the lock within do_readdir_lowlat(). About your actual suggestion above
> > ...> 
> > > > the loop. The normal case is not requiring a lock at all, like the
> > > > comment
> > > > describes. Doing multiple locks inside the loop unnecessarily
> > > > reduces
> > > > performance for no reason.
> > > > 
> > > > About *_trylock() instead of v9fs_readdir_lock(): might be a candidate
> > > > to
> > > 
> > > Hmm... are you suggesting that do_readdir_lowlat() should return if it
> > > can't take the lock ?
> > 
> > ... yep, that's what I had in mind for it later on. I would not mind
> > running trylock in a loop like you suggested; if it fails, log a warning
> > and return. Because if the lock fails, then client is buggy. User can
> > read the warning in the logs and report the bug for client to be fixed.
> > Not worth to handle that case in any fancy way by server.
> 
> The locking is just a safety net because readdir(3) isn't necessarily
> thread safe. It was never my intention to add an error path for that.
> And thinking again, sending concurrent Treaddir requests on the same
> fid is certainly a weird thing to do but is it really a bug ?

Well, I was undecided on that issue as well, hence my decision to only leave
a TODO comment for now. My expectation would be a separate fid for concurrent
readdir, but you are right of course, Treaddir (unlike its 9p2000.u 
counterpart) is reentrant by design, since it has an offset parameter, so from 
protocol perspective concurrent Treaddir is not per se impossible.

The lock would not be noticeable either with this patch, since unlike on
master, the lock is just retained on driver side now (with this patch), which 
returns very fast even on large directories, as you can see from the benchmark 
output.

So yes, returning from the function with an error is probably not the best
choice. My tendency is still to at least log a message. I don't care enough
about that case, though, to deal with trylock() in a loop or something right
now. There are more important things to take care of ATM.
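
If it helps, the shape I would have in mind for later is basically your
trylock sketch from above plus a telling message instead of "blah". Rough
sketch only (the message wording is made up; qemu_mutex_trylock() and
warn_report_once() are the existing helpers from "qemu/thread.h" and
"qemu/error-report.h"):

static inline void v9fs_readdir_lock(V9fsDir *dir)
{
    while (qemu_mutex_trylock(&dir->readdir_mutex)) {
        /*
         * A well-behaved client would not send concurrent Treaddir
         * requests on the same fid, so at least leave a hint in the log
         * instead of silently serializing them.
         */
        warn_report_once("9p: concurrent readdir on the same fid, "
                         "expect degraded performance");
    }
}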

> > avoid similar mistakes by placing appropriate comment(s).
> > 
> > > > > > > +                                        struct V9fsDirEnt
> > > > > > > **entries,
> > > > > > > +                                        int32_t maxsize, bool
> > > > > > > dostat)
> > > > > 
> > > > > s/maxsize/max_count since this is the wording use in the caller.
> > > > 
> > > > Wouldn't do that. This is the driver side, not the 9p protocol request
> > > > handler. As you might have realized before, "max_count" is semantically
> > > > completely wrong. This variable does not count integral entries, it is
> > > > rather a maximum size (in bytes) of the destination buffer being
> > > > filled.
> > > > 
> > > > On 9p request handler side it is ok to use "max_count" instead,
> > > > because
> > > > the
> > > > protocol definition uses exactly this term for the request parameter,
> > > > but
> > > > on driver side it's IMO inappropriate.
> > > 
> > > I agree the max_count wording sucks and I don't care that much to
> > > name variables according to protocol definitions, but I do care
> > > to use the same name in caller/callee when it makes sense, for a
> > > better grep or cscope experience.
> > 
> > So you insist on that max_count change?
> 
> Rather the opposite. Kill max_count and make it maxsize, in a preparatory
> patch with the reasons you've exposed.

Ah got it. OK then.

Best regards,
Christian Schoenebeck
Greg Kurz March 17, 2020, 2:14 p.m. UTC | #10
On Wed, 11 Mar 2020 20:54:58 +0100
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Mittwoch, 11. März 2020 17:14:08 CET Greg Kurz wrote:
> > On Wed, 11 Mar 2020 02:18:04 +0100
> > 
> > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > On Dienstag, 10. März 2020 19:33:36 CET Greg Kurz wrote:
> > > > > This patch is also too big for my preference, but I don't see a viable
> > > > > way
> > > > > to split it further into separate patches. I already separated all the
> > > > > patches I could. If you have suggestions, very much appreciated!
> > > > 
> > > > Well, the patch could be split in two or three parts at least:
> > > > 
> > > > (1) introduce the new function that reads multiple entries in codir.c
> > > > 
> > > > (2) use it from 9p.c
> > > > 
> > > > (3) remove unused stuff if anything remains
> > > > 
> > > > This doesn't seem to change much but the smaller diffstats
> > > > for each individual patch make them less scary :) and with
> > > > (1) applied only it is easier to compare what the old code
> > > > in 9p.c and the new one in codir.c do.
> > > > 
> > > > > The reason for this is that in order to fix these issues with current
> > > > > T_readdir implementation, it requires to separate what's currently one
> > > > > task
> > > > > (i.e. one function) into two separate tasks (i.e. two functions).
> > > > > There is
> > > > > no sensible way to do that.
> > > > 
> > > > Yeah, I won't ask for finer grain.
> > > 
> > > Me confused. Does that mean your split suggestion was just theoretical, or
> > > do you need it?
> > 
> > I need it and I won't ask for more splitting. Promised ! :)
> 
> Okay then. :)
> 
> > > > > Current code on master is much more tricky and error prone due to the
> > > > > huge
> > > > > amount of potential branches, individual error/cleanup handlers, high
> > > > > amount of thread dispatching and much more. In the patched version all
> > > > > these code complexities and error sources are removed.
> > > > 
> > > > Come on :) The 9pfs code has been a can of worms from the beginning.
> > > > It produced more than the average amount of security-related bugs,
> > > > and most sadly, due to the overall lack of interest, it bitrotted
> > > > and missed a lot of cool improvements like an appropriate support of
> > > > unlinked files, QOM-ification of fsdev, conversion of proxy fsdev to
> > > > vhost-user, hot plug/unplug support, live migration support and
> > > > "much more"... The performance aspect of things is a full time job
> > > 
> > > No, the performance issues are actually very manageable in the case of 9pfs.
> > > I already addressed readdir with this patch (by far the worst performance
> > 
> > They're very manageable if someone cares and spends time. Thanks again
> > for doing this.
> 
> Thanks!
> 
> > > My intention is not to squeeze out the last fractional percent of
> > > performance for 9pfs, but you certainly agree that a simple "ls" blocking
> > > for more than 1 second is something that should be fixed, and fortunately
> > > the amount of
> > I never observed such timeouts with my personal toy use of 9p but
> > you did and this motivated you to step in, which is very welcome.
> 
> Probably you don't notice it much because of the dirent cache on the guest side.
> If the guest's dirent cache is cold, and you do a readdir() ("ls") on some
> directory with e.g. several hundred entries, you should notice it.
> 
> > > I think the cause of disagreements we have are solely our use cases of
> > > 9pfs: your personal use case does not seem to require any performance
> > > considerations or multi-user aspects, whereas my use case requires at
> > > least some minimum performance grade for utilizing 9pfs for server
> > > applications.
> > 
> > Your point about the personal use case is right indeed but our
> > disagreements, if any, aren't uniquely related to that. It's more about
> > maintainership and available time really. I'm 100% benevolent "Odd fixer"
> > now and I just try to avoid being forced into fixing things after my PR is
> > merged. If you want to go faster, then you're welcome to upgrade to
> > maintainer and send PRs. This would make sense since you care for 9p, you
> > showed a good understanding of the code and you provided beneficial
> > contributions so far :)
> 
> That maintainership upgrade is planned. The question is just when. What was 
> your idea, co-maintainership?
> 

Basically yes.

> > > > > > Oh, so we'd still have the current implementation being used, even
> > > > > > with this patch applied... This would be okay for a RFC patch but
> > > > > > in the end I'd really like to merge something that also converts
> > > > > > v9fs_do_readdir_with_stat().
> > > > > 
> > > > > Yes, I know, but I would not recommend mixing these things at this
> > > > > point,
> > > > > because it would be an entire effort on its own.
> > > > > 
> > > > > v9fs_do_readdir_with_stat() is used for 9P2000.u, while
> > > > > v9fs_do_readdir()
> > > > > is used for 9P2000.L. They're behaving very differently, so it would
> > > > > not
> > > > > only require me to update v9fs_do_readdir_with_stat() and v9fs_read(),
> > > > > I
> > > > > would also need to write their own test cases (plural, since there are
> > > > > none at all yet) and benchmarks, and of course somebody would need to
> > > > > review all that additional amount of code, and who would actually test
> > > > > it? I am actively testing my 9P2000.L changes, but I am not actually
> > > > > using 9P2000.u, so I could not guarantee for the same quality.
> > > > > 
> > > > > Considering the current commitment situation regarding 9pfs, we can
> > > > > only
> > > > > bring things forward with compromises. Hence I suggest I concentrate
> > > > > on
> > > > > fixing the worst performance issues on 9P2000.L side first, and later
> > > > > on
> > > > > if there are really people interested in seeing these improvements on
> > > > > 9P2000.u side and willing to at least help out with testing, then I
> > > > > can
> > > > > definitely also adjust 9P2000.u side accordingly as well.
> > > > > 
> > > > > But to also make this clear: this patch 10 is not introducing any
> > > > > behaviour
> > > > > change for 9P2000.u side.
> > > > 
> > > > Ok, fair enough to leave 9p2000.U behind for now but I had to ask :)
> > > > /me is even wondering if we should start deprecating 9p2000.U since
> > > > most clients can use 9p2000.L now...
> > > 
> > > I probably wouldn't. On macOS for instance there's only 9p2000.u. In the
> > > end
> > Hmm... the only thing I've heard about is:
> > 
> > https://github.com/benavento/mac9p
> > 
> > and unless I'm missing something, this seems to only have a socket based
> > transport... the server in QEMU is for virtio-9p and Xen 9p devices only.
> > Unless mac9p seriously plans to implement a driver for either of those,
> > I'm still not convinced it is worth keeping .U around... and BTW, our
> > deprecation policy imposes a 2 QEMU versions cooling period before
> > actually removing the code. This would leave ample time for someone
> > to speak up.
> 
> Well, I leave that up to you. I could consider adding a transport for macOS 
> guests, but that's definitely not going to happen in any near future.
> 

The whole idea behind dropping support for .U is really just about
reducing the potential attack surface. But I'm also fine to keep
things as is until the next CVE since I'm confident someone will
help ;-)

> > > > Not sure we're talking about the same loop here...
> > > > 
> > > > I'm just suggesting that v9fs_readdir_lock() could be something like:
> > > > 
> > > > static inline void v9fs_readdir_lock(V9fsDir *dir)
> > > > {
> > > > 
> > > >     while (qemu_mutex_trylock(&dir->readdir_mutex)) {
> > > >     
> > > >         warn_report_once("blah");
> > > >     
> > > >     }
> > > > 
> > > > }
> > > 
> > > Ah yeah, in fact I got you wrong on this one. I thought you meant to move
> > > the lock within do_readdir_lowlat(). About your actual suggestion above
> > > ...> 
> > > > > the loop. The normal case is not requiring a lock at all, like the
> > > > > comment
> > > > > describes. Doing multiple locks inside the loop unnecessarily
> > > > > reduces
> > > > > performance for no reason.
> > > > > 
> > > > > About *_trylock() instead of v9fs_readdir_lock(): might be a candidate
> > > > > to
> > > > 
> > > > Hmm... are you suggesting that do_readdir_lowlat() should return if it
> > > > can't take the lock ?
> > > 
> > > ... yep, that's what I had in mind for it later on. I would not mind
> > > running trylock in a loop like you suggested; if it fails, log a warning
> > > and return. Because if the lock fails, then client is buggy. User can
> > > read the warning in the logs and report the bug for client to be fixed.
> > > Not worth to handle that case in any fancy way by server.
> > 
> > The locking is just a safety net because readdir(3) isn't necessarily
> > thread safe. It was never my intention to add an error path for that.
> > And thinking again, sending concurrent Treaddir requests on the same
> > fid is certainly a weird thing to do but is it really a bug ?
> 
> Well, I was undecided on that issue as well, hence my decision to only leave
> a TODO comment for now. My expectation would be a separate fid for concurrent
> readdir, but you are right of course, Treaddir (unlike its 9p2000.u 
> counterpart) is reentrant by design, since it has an offset parameter, so from 
> protocol perspective concurrent Treaddir is not per se impossible.
> 
> The lock would not be noticeable either with this patch, since unlike on
> master, the lock is just retained on driver side now (with this patch), which 
> returns very fast even on large directories, as you can see from the benchmark 
> output.
> 
> So yes, returning from the function with an error is probably not the best
> choice. My tendency is still to at least log a message. I don't care enough
> about that case, though, to deal with trylock() in a loop or something right
> now. There are more important things to take care of ATM.
> 

Yeah, that can wait.

> > > avoid similar mistakes by placing appropriate comment(s).
> > > 
> > > > > > > > +                                        struct V9fsDirEnt
> > > > > > > > **entries,
> > > > > > > > +                                        int32_t maxsize, bool
> > > > > > > > dostat)
> > > > > > 
> > > > > > s/maxsize/max_count since this is the wording use in the caller.
> > > > > 
> > > > > Wouldn't do that. This is the driver side, not the 9p protocol request
> > > > > handler. As you might have realized before, "max_count" is semantically
> > > > > completely wrong. This variable does not count integral entries, it is
> > > > > rather a maximum size (in bytes) of the destination buffer being
> > > > > filled.
> > > > > 
> > > > > On 9p request handler side it is ok to use "max_count" instead,
> > > > > because
> > > > > the
> > > > > protocol definition uses exactly this term for the request parameter,
> > > > > but
> > > > > on driver side it's IMO inappropriate.
> > > > 
> > > > I agree the max_count wording sucks and I don't care that much to
> > > > name variables according to protocol definitions, but I do care
> > > > to use the same name in caller/callee when it makes sense, for a
> > > > better grep or cscope experience.
> > > 
> > > So you insist on that max_count change?
> > 
> > Rather the opposite. Kill max_count and make it maxsize, in a preparatory
> > patch with the reasons you've exposed.
> 
> Ah got it. OK then.
> 
> Best regards,
> Christian Schoenebeck
> 
>
Christian Schoenebeck March 17, 2020, 4:09 p.m. UTC | #11
On Dienstag, 17. März 2020 15:14:23 CET Greg Kurz wrote:
> > > > I think the cause of disagreements we have are solely our use cases of
> > > > 9pfs: your personal use case does not seem to require any performance
> > > > considerations or multi-user aspects, whereas my use case requires at
> > > > least some minimum performance grade for utilizing 9pfs for server
> > > > applications.
> > > 
> > > Your point about the personal use case is right indeed but our
> > > disagreements, if any, aren't uniquely related to that. It's more about
> > > maintainership and available time really. I'm 100% benevolent "Odd
> > > fixer"
> > > now and I just try to avoid being forced into fixing things after my PR
> > > is
> > > merged. If you want to go faster, then you're welcome to upgrade to
> > > maintainer and send PRs. This would make sense since you care for 9p,
> > > you
> > > showed a good understanding of the code and you provided beneficial
> > > contributions so far :)
> > 
> > That maintainership upgrade is planned. The question is just when. What
> > was
> > your idea, co-maintainership?
> 
> Basically yes.

Ok, I'll send out a MAINTAINERS patch tomorrow or so to make that
co-maintainer upgrade official. But I obviously still need a while to learn 
the bureaucracy involved for PRs, signing, ML tools, Wiki, etc.

> > > > > Ok, fair enough to leave 9p2000.U behind for now but I had to ask :)
> > > > > /me is even wondering if we should start deprecating 9p2000.U since
> > > > > most clients can use 9p2000.L now...
> > > > 
> > > > I probably wouldn't. On macOS for instance there's only 9p2000.u. In
> > > > the
> > > > end
> > > 
> > > Hmm... the only thing I've heard about is:
> > > 
> > > https://github.com/benavento/mac9p
> > > 
> > > and unless I'm missing something, this seems to only have a socket based
> > > transport... the server in QEMU is for virtio-9p and Xen 9p devices
> > > only.
> > > Unless mac9p seriously plans to implement a driver for either of those,
> > > I'm still not convinced it is worth keeping .U around... and BTW, our
> > > deprecation policy imposes a 2 QEMU versions cooling period before
> > > actually removing the code. This would leave ample time for someone
> > > to speak up.
> > 
> > Well, I leave that up to you. I could consider adding a transport for
> > macOS
> > guests, but that's definitely not going to happen in any near future.
> 
> The whole idea behind dropping support for .U is really just about
> reducing the potential attack surface. But I'm also fine to keep
> things as is until the next CVE since I'm confident someone will
> help ;-)

Sure, sounds like a plan!

> > > > > > the loop. The normal case is not requiring a lock at all, like the
> > > > > > comment
> > > > > > describes. Doing multiple locks inside the loop unnecessarily
> > > > > > reduces
> > > > > > performance for no reason.
> > > > > > 
> > > > > > About *_trylock() instead of v9fs_readdir_lock(): might be a
> > > > > > candidate
> > > > > > to
> > > > > 
> > > > > Hmm... are you suggesting that do_readdir_lowlat() should return if
> > > > > it
> > > > > can't take the lock ?
> > > > 
> > > > ... yep, that's what I had in mind for it later on. I would not mind
> > > > running trylock in a loop like you suggested; if it fails, log a
> > > > warning
> > > > and return. Because if the lock fails, then client is buggy. User can
> > > > read the warning in the logs and report the bug for client to be
> > > > fixed.
> > > > Not worth to handle that case in any fancy way by server.
> > > 
> > > The locking is just a safety net because readdir(3) isn't necessarily
> > > thread safe. It was never my intention to add an error path for that.
> > > And thinking again, sending concurrent Treaddir requests on the same
> > > fid is certainly a weird thing to do but is it really a bug ?
> > 
> > Well, I was undecided on that issue as well, hence my decision to only
> > leave a TODO comment for now. My expectation would be a separate fid for
> > concurrent readdir, but you are right of course, Treaddir (unlike its
> > 9p2000.u counterpart) is reentrant by design, since it has an offset
> > parameter, so from protocol perspective concurrent Treaddir is not per se
> > impossible.
> > 
> > The lock would not be noticeable either with this patch, since unlike on
> > master, the lock is just retained on driver side now (with this patch),
> > which returns very fast even on large directories, as you can see from
> > the benchmark output.
> > 
> > So yes, returning from the function with an error is probably not the best
> > choice. My tendency is still to at least log a message. I don't care enough
> > about that case, though, to deal with trylock() in a loop or something
> > right now. There are more important things to take care of ATM.
> 
> Yeah, that can wait.

Okay.

The only thing I actually still need to figure out for this patch series to be 
complete (at least from my side) is how to fix the test environment bug
discussed. Probably I can reset the test environment's buffer after every test 
somehow ...

Best regards,
Christian Schoenebeck
diff mbox series

Patch

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 18370183c4..e0ca45d46b 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -971,30 +971,6 @@  static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
     return 0;
 }
 
-static int coroutine_fn dirent_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
-                                      struct dirent *dent, V9fsQID *qidp)
-{
-    struct stat stbuf;
-    V9fsPath path;
-    int err;
-
-    v9fs_path_init(&path);
-
-    err = v9fs_co_name_to_path(pdu, &fidp->path, dent->d_name, &path);
-    if (err < 0) {
-        goto out;
-    }
-    err = v9fs_co_lstat(pdu, &path, &stbuf);
-    if (err < 0) {
-        goto out;
-    }
-    err = stat_to_qid(pdu, &stbuf, qidp);
-
-out:
-    v9fs_path_free(&path);
-    return err;
-}
-
 V9fsPDU *pdu_alloc(V9fsState *s)
 {
     V9fsPDU *pdu = NULL;
@@ -2314,7 +2290,7 @@  out_nofid:
     pdu_complete(pdu, err);
 }
 
-static size_t v9fs_readdir_data_size(V9fsString *name)
+size_t v9fs_readdir_response_size(V9fsString *name)
 {
     /*
      * Size of each dirent on the wire: size of qid (13) + size of offset (8)
@@ -2323,6 +2299,18 @@  static size_t v9fs_readdir_data_size(V9fsString *name)
     return 24 + v9fs_string_size(name);
 }
 
+static void v9fs_free_dirents(struct V9fsDirEnt *e)
+{
+    struct V9fsDirEnt *next = NULL;
+
+    for (; e; e = next) {
+        next = e->next;
+        g_free(e->dent);
+        g_free(e->st);
+        g_free(e);
+    }
+}
+
 static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
                                         int32_t max_count)
 {
@@ -2331,54 +2319,53 @@  static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
     V9fsString name;
     int len, err = 0;
     int32_t count = 0;
-    off_t saved_dir_pos;
     struct dirent *dent;
+    struct stat *st;
+    struct V9fsDirEnt *entries = NULL;
 
-    /* save the directory position */
-    saved_dir_pos = v9fs_co_telldir(pdu, fidp);
-    if (saved_dir_pos < 0) {
-        return saved_dir_pos;
-    }
-
-    while (1) {
-        v9fs_readdir_lock(&fidp->fs.dir);
+    /*
+     * inode remapping requires the device id, which in turn might be
+     * different for different directory entries, so if inode remapping is
+     * enabled we have to make a full stat for each directory entry
+     */
+    const bool dostat = pdu->s->ctx.export_flags & V9FS_REMAP_INODES;
 
-        err = v9fs_co_readdir(pdu, fidp, &dent);
-        if (err || !dent) {
-            break;
-        }
-        v9fs_string_init(&name);
-        v9fs_string_sprintf(&name, "%s", dent->d_name);
-        if ((count + v9fs_readdir_data_size(&name)) > max_count) {
-            v9fs_readdir_unlock(&fidp->fs.dir);
+    /*
+     * Fetch all required directory entries altogether on a background IO
+     * thread from fs driver. We don't want to do that for each entry
+     * individually, because hopping between threads (this main IO thread
+     * and background IO driver thread) would sum up to huge latencies.
+     */
+    count = v9fs_co_readdir_lowlat(pdu, fidp, &entries, max_count, dostat);
+    if (count < 0) {
+        err = count;
+        count = 0;
+        goto out;
+    }
+    count = 0;
 
-            /* Ran out of buffer. Set dir back to old position and return */
-            v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-            v9fs_string_free(&name);
-            return count;
-        }
+    for (struct V9fsDirEnt *e = entries; e; e = e->next) {
+        dent = e->dent;
 
         if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
-            /*
-             * dirent_to_qid() implies expensive stat call for each entry,
-             * we must do that here though since inode remapping requires
-             * the device id, which in turn might be different for
-             * different entries; we cannot make any assumption to avoid
-             * that here.
-             */
-            err = dirent_to_qid(pdu, fidp, dent, &qid);
+            st = e->st;
+            /* e->st should never be NULL, but just to be sure */
+            if (!st) {
+                err = -1;
+                break;
+            }
+
+            /* remap inode */
+            err = stat_to_qid(pdu, st, &qid);
             if (err < 0) {
-                v9fs_readdir_unlock(&fidp->fs.dir);
-                v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-                v9fs_string_free(&name);
-                return err;
+                break;
             }
         } else {
             /*
              * Fill up just the path field of qid because the client uses
              * only that. To fill the entire qid structure we will have
              * to stat each dirent found, which is expensive. For the
-             * latter reason we don't call dirent_to_qid() here. Only drawback
+             * latter reason we don't call stat_to_qid() here. Only drawback
              * is that no multi-device export detection of stat_to_qid()
              * would be done and provided as error to the user here. But
              * user would get that error anyway when accessing those
@@ -2391,25 +2378,26 @@  static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
             qid.version = 0;
         }
 
+        v9fs_string_init(&name);
+        v9fs_string_sprintf(&name, "%s", dent->d_name);
+
         /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */
         len = pdu_marshal(pdu, 11 + count, "Qqbs",
                           &qid, dent->d_off,
                           dent->d_type, &name);
 
-        v9fs_readdir_unlock(&fidp->fs.dir);
+        v9fs_string_free(&name);
 
         if (len < 0) {
-            v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-            v9fs_string_free(&name);
-            return len;
+            err = len;
+            break;
         }
+
         count += len;
-        v9fs_string_free(&name);
-        saved_dir_pos = dent->d_off;
     }
 
-    v9fs_readdir_unlock(&fidp->fs.dir);
-
+out:
+    v9fs_free_dirents(entries);
     if (err < 0) {
         return err;
     }
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 6fffe44f5a..1dbb0ad189 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -215,6 +215,28 @@  static inline void v9fs_readdir_init(V9fsDir *dir)
     qemu_mutex_init(&dir->readdir_mutex);
 }
 
+/*
+ * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests,
+ * which is a chained list of directory entries.
+ */
+typedef struct V9fsDirEnt {
+    /* mandatory (must not be NULL) information for all readdir requests */
+    struct dirent *dent;
+    /*
+     * optional (may be NULL): A full stat of each directory entry is just
+     * done if explicitly told to fs driver.
+     */
+    struct stat *st;
+    /*
+     * instead of an array, directory entries are always returned as
+     * chained list, that's because the amount of entries retrieved by fs
+     * drivers is dependent on the individual entries' name (since response
+     * messages are size limited), so the final amount cannot be estimated
+     * before hand
+     */
+    struct V9fsDirEnt *next;
+} V9fsDirEnt;
+
 /*
  * Filled by fs driver on open and other
  * calls.
@@ -419,6 +441,7 @@  void v9fs_path_init(V9fsPath *path);
 void v9fs_path_free(V9fsPath *path);
 void v9fs_path_sprintf(V9fsPath *path, const char *fmt, ...);
 void v9fs_path_copy(V9fsPath *dst, const V9fsPath *src);
+size_t v9fs_readdir_response_size(V9fsString *name);
 int v9fs_name_to_path(V9fsState *s, V9fsPath *dirpath,
                       const char *name, V9fsPath *path);
 int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
index 73f9a751e1..6ce7dc8cde 100644
--- a/hw/9pfs/codir.c
+++ b/hw/9pfs/codir.c
@@ -18,28 +18,189 @@ 
 #include "qemu/main-loop.h"
 #include "coth.h"
 
+/*
+ * This is solely executed on a background IO thread.
+ */
+static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
+{
+    int err = 0;
+    V9fsState *s = pdu->s;
+    struct dirent *entry;
+
+    errno = 0;
+    entry = s->ops->readdir(&s->ctx, &fidp->fs);
+    if (!entry && errno) {
+        *dent = NULL;
+        err = -errno;
+    } else {
+        *dent = entry;
+    }
+    return err;
+}
+
+/*
+ * TODO: This will be removed for performance reasons.
+ * Use v9fs_co_readdir_lowlat() instead.
+ */
 int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
                                  struct dirent **dent)
 {
     int err;
-    V9fsState *s = pdu->s;
 
     if (v9fs_request_cancelled(pdu)) {
         return -EINTR;
     }
-    v9fs_co_run_in_worker(
-        {
-            struct dirent *entry;
+    v9fs_co_run_in_worker({
+        err = do_readdir(pdu, fidp, dent);
+    });
+    return err;
+}
+
+/*
+ * This is solely executed on a background IO thread.
+ *
+ * See v9fs_co_readdir_lowlat() (as its only user) below for details.
+ */
+static int do_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
+                             struct V9fsDirEnt **entries,
+                             int32_t maxsize, bool dostat)
+{
+    V9fsState *s = pdu->s;
+    V9fsString name;
+    int len, err = 0;
+    int32_t size = 0;
+    off_t saved_dir_pos;
+    struct dirent *dent;
+    struct V9fsDirEnt *e = NULL;
+    V9fsPath path;
+    struct stat stbuf;
+
+    *entries = NULL;
+    v9fs_path_init(&path);
+
+    /*
+     * TODO: Here should be a warn_report_once() if lock failed.
+     *
+     * With a good 9p client we should not get into concurrency here,
+     * because a good client would not use the same fid for concurrent
+     * requests. We do the lock here for safety reasons though. However
+     * the client would then suffer performance issues, so better log that
+     * issue here.
+     */
+    v9fs_readdir_lock(&fidp->fs.dir);
+
+    /* save the directory position */
+    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
+    if (saved_dir_pos < 0) {
+        err = saved_dir_pos;
+        goto out;
+    }
+
+    while (true) {
+        /* get directory entry from fs driver */
+        err = do_readdir(pdu, fidp, &dent);
+        if (err || !dent) {
+            break;
+        }
+
+        /*
+         * stop this loop as soon as it would exceed the allowed maximum
+         * response message size for the directory entries collected so far,
+         * because anything beyond that size would need to be discarded by
+         * 9p controller (main thread / top half) anyway
+         */
+        v9fs_string_init(&name);
+        v9fs_string_sprintf(&name, "%s", dent->d_name);
+        len = v9fs_readdir_response_size(&name);
+        v9fs_string_free(&name);
+        if (size + len > maxsize) {
+            /* this is not an error case actually */
+            break;
+        }
+
+        /* append next node to result chain */
+        if (!e) {
+            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
+        } else {
+            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
+        }
+        e->dent = g_malloc0(sizeof(struct dirent));
+        memcpy(e->dent, dent, sizeof(struct dirent));
 
-            errno = 0;
-            entry = s->ops->readdir(&s->ctx, &fidp->fs);
-            if (!entry && errno) {
+        /* perform a full stat() for directory entry if requested by caller */
+        if (dostat) {
+            err = s->ops->name_to_path(
+                &s->ctx, &fidp->path, dent->d_name, &path
+            );
+            if (err < 0) {
                 err = -errno;
-            } else {
-                *dent = entry;
-                err = 0;
+                break;
             }
-        });
+
+            err = s->ops->lstat(&s->ctx, &path, &stbuf);
+            if (err < 0) {
+                err = -errno;
+                break;
+            }
+
+            e->st = g_malloc0(sizeof(struct stat));
+            memcpy(e->st, &stbuf, sizeof(struct stat));
+        }
+
+        size += len;
+        saved_dir_pos = dent->d_off;
+    }
+
+    /* restore (last) saved position */
+    s->ops->seekdir(&s->ctx, &fidp->fs, saved_dir_pos);
+
+out:
+    v9fs_readdir_unlock(&fidp->fs.dir);
+    v9fs_path_free(&path);
+    if (err < 0) {
+        return err;
+    }
+    return size;
+}
+
+/**
+ * @brief Low latency variant of fs driver readdir handling.
+ *
+ * Retrieves the requested (max. amount of) directory entries from the fs
+ * driver. This function must only be called by the main IO thread (top half).
+ * Internally this function call will be dispatched to a background IO thread
+ * (bottom half) where it is eventually executed by the fs driver.
+ *
+ * The old readdir implementation above just retrieves always one dir entry
+ * per call. The problem of that implementation above is that latency is
+ * added for (retrieval of) each directory entry, which in practice lead to
+ * latencies of several hundred ms for readdir of only one directory.
+ *
+ * This is avoided in this function by letting the fs driver retrieve all
+ * required directory entries with only call of this function and hence with
+ * only a single fs driver request.
+ *
+ * @param pdu - the causing 9p (T_readdir) client request
+ * @param fidp - already opened directory where readdir shall be performed on
+ * @param entries - output for directory entries (must not be NULL)
+ * @param maxsize - maximum result message body size (in bytes)
+ * @param dostat - whether a stat() should be performed and returned for
+ *                 each directory entry
+ * @returns resulting response message body size (in bytes) on success,
+ *          negative error code otherwise
+ */
+int coroutine_fn v9fs_co_readdir_lowlat(V9fsPDU *pdu, V9fsFidState *fidp,
+                                        struct V9fsDirEnt **entries,
+                                        int32_t maxsize, bool dostat)
+{
+    int err = 0;
+
+    if (v9fs_request_cancelled(pdu)) {
+        return -EINTR;
+    }
+    v9fs_co_run_in_worker({
+        err = do_readdir_lowlat(pdu, fidp, entries, maxsize, dostat);
+    });
     return err;
 }
 
diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
index c2cdc7a9ea..1249dbe6df 100644
--- a/hw/9pfs/coth.h
+++ b/hw/9pfs/coth.h
@@ -49,6 +49,9 @@ 
 void co_run_in_worker_bh(void *);
 int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
 int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
+int coroutine_fn v9fs_co_readdir_lowlat(V9fsPDU *, V9fsFidState *,
+                                        struct V9fsDirEnt **,
+                                        int32_t, bool);
 off_t coroutine_fn v9fs_co_telldir(V9fsPDU *, V9fsFidState *);
 void coroutine_fn v9fs_co_seekdir(V9fsPDU *, V9fsFidState *, off_t);
 void coroutine_fn v9fs_co_rewinddir(V9fsPDU *, V9fsFidState *);