diff mbox series

[v4,09/11] hw/9pfs/9p-synth: avoid n-square issue in synth_readdir()

Message ID d385726be4d8146a86561703bc6d77edd39fb654.1579567020.git.qemu_oss@crudebyte.com
State New
Headers show
Series 9pfs: readdir optimization | expand

Commit Message

Christian Schoenebeck Jan. 21, 2020, 12:26 a.m. UTC
This patch is just a temporary benchmark hack, not intended
to be merged!

9pfs synth driver's readdir() implementation has a severe
n-square performance problem. This patch is a quick and dirty
hack to prevent that performance problem from tainting the
readdir() benchmark results. In its current form, this patch
is not useful for anything else than for an isolated readdir
benchmark.

NOTE: This patch would break the new readdir/split test,
because it would alter the behaviour of seekdir() required
for retrieving directory entries split over several
requests.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p-synth.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

Comments

Greg Kurz Jan. 23, 2020, 11:13 a.m. UTC | #1
On Tue, 21 Jan 2020 01:26:15 +0100
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> This patch is just a temporary benchmark hack, not intended
> to be merged!
> 
> 9pfs synth driver's readdir() implementation has a severe
> n-square performance problem. This patch is a quick and dirty
> hack to prevent that performance problem from tainting the
> readdir() benchmark results. In its current form, this patch
> is not useful for anything else than for an isolated readdir
> benchmark.
> 
> NOTE: This patch would break the new readdir/split test,
> because it would alter the behaviour of seekdir() required
> for retrieving directory entries split over several
> requests.
> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---

Honestly it doesn't seem to change anything significant for me.
Mean time calculated over 100 runs:

Without this patch:

[greg@bahia qemu-9p]$ (cd .mbuild-$(stg branch)/obj ; export QTEST_QEMU_BINARY='x86_64-softmmu/qemu-system-x86_64'; make all tests/qtest/qos-test && for i in {1..100}; do tests/qtest/qos-test -p $(tests/qtest/qos-test -l | grep readdir/basic); done) |& awk '/IMPORTANT/ { print $10 }' | sed -e 's/s//' -e 's/^/n+=1;x+=/;$ascale=6;x/n' | bc
.055654

With this patch:

[greg@bahia qemu-9p]$ (cd .mbuild-$(stg branch)/obj ; export QTEST_QEMU_BINARY='x86_64-softmmu/qemu-system-x86_64'; make all tests/qtest/qos-test && for i in {1..100}; do tests/qtest/qos-test -p $(tests/qtest/qos-test -l | grep readdir/basic); done) |& awk '/IMPORTANT/ { print $10 }' | sed -e 's/s//' -e 's/^/n+=1;x+=/;$ascale=6;x/n' | bc
.058786

>  hw/9pfs/9p-synth.c | 29 ++++++++++++++++++++++++++---
>  1 file changed, 26 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/9pfs/9p-synth.c b/hw/9pfs/9p-synth.c
> index 7eb210ffa8..54dc30f37b 100644
> --- a/hw/9pfs/9p-synth.c
> +++ b/hw/9pfs/9p-synth.c
> @@ -225,7 +225,8 @@ static void synth_direntry(V9fsSynthNode *node,
>  }
>  
>  static struct dirent *synth_get_dentry(V9fsSynthNode *dir,
> -                                            struct dirent *entry, off_t off)
> +                                       struct dirent *entry, off_t off,
> +                                       V9fsSynthNode **hack)
>  {
>      int i = 0;
>      V9fsSynthNode *node;
> @@ -243,16 +244,38 @@ static struct dirent *synth_get_dentry(V9fsSynthNode *dir,
>          /* end of directory */
>          return NULL;
>      }
> +    *hack = node;
>      synth_direntry(node, entry, off);
>      return entry;
>  }
>  
>  static struct dirent *synth_readdir(FsContext *ctx, V9fsFidOpenState *fs)
>  {
> -    struct dirent *entry;
> +    struct dirent *entry = NULL;
>      V9fsSynthOpenState *synth_open = fs->private;
>      V9fsSynthNode *node = synth_open->node;
> -    entry = synth_get_dentry(node, &synth_open->dent, synth_open->offset);
> +
> +    /*
> +     * HACK: This is just intended for benchmark, to avoid severe n-square
> +     * performance problem of synth driver's readdir implementation here which
> +     * would otherwise unncessarily taint the benchmark results. By simply
> +     * caching (globally) the previous node (of the previous synth_readdir()
> +     * call) we can simply proceed to next node in chained list efficiently.
> +     *
> +     * not a good idea for any production code ;-)
> +     */
> +    static struct V9fsSynthNode *cachedNode;
> +
> +    if (!cachedNode) {
> +        entry = synth_get_dentry(node, &synth_open->dent, synth_open->offset,
> +                                 &cachedNode);
> +    } else {
> +        cachedNode = cachedNode->sibling.le_next;
> +        if (cachedNode) {
> +            entry = &synth_open->dent;
> +            synth_direntry(cachedNode, entry, synth_open->offset + 1);
> +        }
> +    }
>      if (entry) {
>          synth_open->offset++;
>      }
Christian Schoenebeck Jan. 23, 2020, 12:40 p.m. UTC | #2
On Donnerstag, 23. Januar 2020 12:13:51 CET Greg Kurz wrote:
> Honestly it doesn't seem to change anything significant for me.
> Mean time calculated over 100 runs:
> 
> Without this patch:
> 
> [greg@bahia qemu-9p]$ (cd .mbuild-$(stg branch)/obj ; export
> QTEST_QEMU_BINARY='x86_64-softmmu/qemu-system-x86_64'; make all
> tests/qtest/qos-test && for i in {1..100}; do tests/qtest/qos-test -p
> $(tests/qtest/qos-test -l | grep readdir/basic); done) |& awk '/IMPORTANT/
> { print $10 }' | sed -e 's/s//' -e 's/^/n+=1;x+=/;$ascale=6;x/n' | bc
> .055654
> 
> With this patch:
> 
> [greg@bahia qemu-9p]$ (cd .mbuild-$(stg branch)/obj ; export
> QTEST_QEMU_BINARY='x86_64-softmmu/qemu-system-x86_64'; make all
> tests/qtest/qos-test && for i in {1..100}; do tests/qtest/qos-test -p
> $(tests/qtest/qos-test -l | grep readdir/basic); done) |& awk '/IMPORTANT/
> { print $10 }' | sed -e 's/s//' -e 's/^/n+=1;x+=/;$ascale=6;x/n' | bc
> .058786

:))) Mhmm, that's because you have run this test WITHOUT the actual readdir 
optimization patch. In this scenario the readdir latency issue is so bad that 
the driver's n-square issue does not even matter, so same here:

Unoptimized readdir, no n-square correction hack:
Time client spent for waiting for reply from server: 0.082831s [MOST 
IMPORTANT]

Unoptimized readdir, with n-square correction hack:
Time client spent for waiting for reply from server: 0.082539s [MOST 
IMPORTANT]

BUT, now look at this *with* readdir optimization applied:

Optimized readdir, no n-square correction hack:
Time 9p server spent on synth_readdir() I/O only (synth driver): 0.021304s
Time 9p server spent on entire T_readdir request: 0.022033s [IMPORTANT]
Time client spent for waiting for reply from server: 0.022408s [MOST 
IMPORTANT]

Optimized readdir, with n-square correction hack:
Time 9p server spent on synth_readdir() I/O only (synth driver): 0.001576s
Time 9p server spent on entire T_readdir request: 0.002244s [IMPORTANT]
Time client spent for waiting for reply from server: 0.002566s [MOST 
IMPORTANT]

Got it? :)

I had good reasons for printing out the time spent on raw driver only.

Best regards,
Christian Schoenebeck
diff mbox series

Patch

diff --git a/hw/9pfs/9p-synth.c b/hw/9pfs/9p-synth.c
index 7eb210ffa8..54dc30f37b 100644
--- a/hw/9pfs/9p-synth.c
+++ b/hw/9pfs/9p-synth.c
@@ -225,7 +225,8 @@  static void synth_direntry(V9fsSynthNode *node,
 }
 
 static struct dirent *synth_get_dentry(V9fsSynthNode *dir,
-                                            struct dirent *entry, off_t off)
+                                       struct dirent *entry, off_t off,
+                                       V9fsSynthNode **hack)
 {
     int i = 0;
     V9fsSynthNode *node;
@@ -243,16 +244,38 @@  static struct dirent *synth_get_dentry(V9fsSynthNode *dir,
         /* end of directory */
         return NULL;
     }
+    *hack = node;
     synth_direntry(node, entry, off);
     return entry;
 }
 
 static struct dirent *synth_readdir(FsContext *ctx, V9fsFidOpenState *fs)
 {
-    struct dirent *entry;
+    struct dirent *entry = NULL;
     V9fsSynthOpenState *synth_open = fs->private;
     V9fsSynthNode *node = synth_open->node;
-    entry = synth_get_dentry(node, &synth_open->dent, synth_open->offset);
+
+    /*
+     * HACK: This is just intended for benchmark, to avoid severe n-square
+     * performance problem of synth driver's readdir implementation here which
+     * would otherwise unncessarily taint the benchmark results. By simply
+     * caching (globally) the previous node (of the previous synth_readdir()
+     * call) we can simply proceed to next node in chained list efficiently.
+     *
+     * not a good idea for any production code ;-)
+     */
+    static struct V9fsSynthNode *cachedNode;
+
+    if (!cachedNode) {
+        entry = synth_get_dentry(node, &synth_open->dent, synth_open->offset,
+                                 &cachedNode);
+    } else {
+        cachedNode = cachedNode->sibling.le_next;
+        if (cachedNode) {
+            entry = &synth_open->dent;
+            synth_direntry(cachedNode, entry, synth_open->offset + 1);
+        }
+    }
     if (entry) {
         synth_open->offset++;
     }