Patchwork Guest latency issues due to bdrv_check_byte_request

login
register
mail settings
Submitter Stefan Hajnoczi
Date April 17, 2010, 9:32 p.m.
Message ID <y2hfbd9d3991004171432zdbe02ee9id669a6f2551000b8@mail.gmail.com>
Download mbox | patch
Permalink /patch/50387/
State New
Headers show

Comments

Stefan Hajnoczi - April 17, 2010, 9:32 p.m.
Thanks Christoph.

Cached getlength with pread/pwrite:
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 96.97    1.760111       11893       148         4 futex
  1.61    0.029209           1     46891      2217 select
  0.28    0.005047           0     64609           timer_gettime
  0.22    0.004059           0     42745      2578 rt_sigreturn
  0.22    0.003911           0     46261           timer_settime
  0.18    0.003280        1093         3           shmdt
  0.17    0.003095           0     23859           pread  <---
  0.17    0.003061           0     42800           write
  0.16    0.002916           0     47759      5151 read
  0.02    0.000285           0       645           writev
[...]
  0.00    0.000000           0        13           lseek

Note that this is a Tiny Core Linux boot from disk and shutdown; not
very I/O intensive since it only loads a kernel and ~10 MB initramfs
without touching the disk much after kernel load.

     if (bdrv_flags & BDRV_O_RDWR) {
@@ -243,19 +240,7 @@ static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
     if (ret < 0)
         return ret;

-    if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
-        ++(s->lseek_err_cnt);
-        if(s->lseek_err_cnt <= 10) {
-            DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
-                              "] lseek failed : %d = %s\n",
-                              s->fd, bs->filename, offset, buf, count,
-                              bs->total_sectors, errno, strerror(errno));
-        }
-        return -1;
-    }
-    s->lseek_err_cnt=0;
-
-    ret = read(s->fd, buf, count);
+    ret = pread(s->fd, buf, count, offset);
     if (ret == count)
         goto label__raw_read__success;

@@ -276,12 +261,10 @@ static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,

     /* Try harder for CDrom. */
     if (bs->type == BDRV_TYPE_CDROM) {
-        lseek(s->fd, offset, SEEK_SET);
-        ret = read(s->fd, buf, count);
+        ret = pread(s->fd, buf, count, offset);
         if (ret == count)
             goto label__raw_read__success;
-        lseek(s->fd, offset, SEEK_SET);
-        ret = read(s->fd, buf, count);
+        ret = pread(s->fd, buf, count, offset);
         if (ret == count)
             goto label__raw_read__success;

@@ -313,19 +296,7 @@ static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
     if (ret < 0)
         return -errno;

-    if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
-        ++(s->lseek_err_cnt);
-        if(s->lseek_err_cnt) {
-            DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%"
-                              PRId64 "] lseek failed : %d = %s\n",
-                              s->fd, bs->filename, offset, buf, count,
-                              bs->total_sectors, errno, strerror(errno));
-        }
-        return -EIO;
-    }
-    s->lseek_err_cnt = 0;
-
-    ret = write(s->fd, buf, count);
+    ret = pwrite(s->fd, buf, count, offset);
     if (ret == count)
         goto label__raw_write__success;

Stefan
Christoph Hellwig - April 18, 2010, 5:37 p.m.
You should split this up into two patches - one for the compat AIO
implementation and one for the getlength caching.
Jan Kiszka - April 18, 2010, 6:05 p.m.
Stefan Hajnoczi wrote:
> Thanks Christoph.
> 
> Cached getlength with pread/pwrite:
> % time     seconds  usecs/call     calls    errors syscall
> ------ ----------- ----------- --------- --------- ----------------
>  96.97    1.760111       11893       148         4 futex
>   1.61    0.029209           1     46891      2217 select
>   0.28    0.005047           0     64609           timer_gettime
>   0.22    0.004059           0     42745      2578 rt_sigreturn
>   0.22    0.003911           0     46261           timer_settime
>   0.18    0.003280        1093         3           shmdt
>   0.17    0.003095           0     23859           pread  <---
>   0.17    0.003061           0     42800           write
>   0.16    0.002916           0     47759      5151 read
>   0.02    0.000285           0       645           writev
> [...]
>   0.00    0.000000           0        13           lseek
> 
> Note that this is a Tiny Core Linux boot from disk and shutdown; not
> very I/O intensive since it only loads a kernel and ~10 MB initramfs
> without touching the disk much after kernel load.

Nice. Will give this a try tomorrow with "a bit" more load.

We already played with a hack to completely remove the checks from AIO
requests, thus avoiding lseek this way - effect as desired, but fragile
of course.

Jan

Patch

diff --git a/block.c b/block.c
index 0f6be17..5c1652c 100644
--- a/block.c
+++ b/block.c
@@ -363,6 +363,7 @@  static int bdrv_open_common(BlockDriverState *bs, const char *filename,
     assert(drv != NULL);

     bs->file = NULL;
+    bs->total_sectors = 0;
     bs->is_temporary = 0;
     bs->encrypted = 0;
     bs->valid_key = 0;
@@ -416,9 +417,7 @@  static int bdrv_open_common(BlockDriverState *bs, const char *filename,
     }

     bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
-    if (drv->bdrv_getlength) {
-        bs->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
-    }
+    bs->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
 #ifndef _WIN32
     if (bs->is_temporary) {
         unlink(filename);
@@ -957,13 +956,26 @@  int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
 {
     BlockDriver *drv = bs->drv;
+    int ret;
     if (!drv)
         return -ENOMEDIUM;
     if (!drv->bdrv_truncate)
         return -ENOTSUP;
     if (bs->read_only)
         return -EACCES;
-    return drv->bdrv_truncate(bs, offset);
+    ret = drv->bdrv_truncate(bs, offset);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* refresh total sectors */
+    if (drv->bdrv_getlength) {
+        bs->total_sectors = 0; /* discard cached value */
+        bs->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    } else {
+        bs->total_sectors = offset >> BDRV_SECTOR_BITS;
+    }
+    return ret;
 }

 /**
@@ -974,8 +986,12 @@  int64_t bdrv_getlength(BlockDriverState *bs)
     BlockDriver *drv = bs->drv;
     if (!drv)
         return -ENOMEDIUM;
-    if (!drv->bdrv_getlength) {
-        /* legacy mode */
+
+    /* Fixed size devices use the total_sectors value for speed instead of
+       issuing a length query (like lseek) on each call.  Also, legacy block
+       drivers don't provide a bdrv_getlength function and must use
+       total_sectors. */
+    if ((bs->total_sectors && !bs->growable) || !drv->bdrv_getlength) {
         return bs->total_sectors * BDRV_SECTOR_SIZE;
     }
     return drv->bdrv_getlength(bs);
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 598ea19..7541ed2 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -105,7 +105,6 @@ 
 typedef struct BDRVRawState {
     int fd;
     int type;
-    unsigned int lseek_err_cnt;
     int open_flags;
 #if defined(__linux__)
     /* linux floppy specific */
@@ -134,8 +133,6 @@  static int raw_open_common(BlockDriverState *bs, const char *filename,
     BDRVRawState *s = bs->opaque;
     int fd, ret;

-    s->lseek_err_cnt = 0;
-
     s->open_flags = open_flags | O_BINARY;
     s->open_flags &= ~O_ACCMODE;