diff mbox

[2/3] block: explicit I/O accounting

Message ID 20110821222606.GB22064@lst.de
State New
Headers show

Commit Message

Christoph Hellwig Aug. 21, 2011, 10:26 p.m. UTC
Decouple the I/O accounting from bdrv_aio_readv/writev/flush and
make the hardware models call directly into the accounting helpers.

This means:
 - we do not count internal requests from image formats in addition
   to guest originating I/O
 - we do not double count I/O ops if the device model handles it
   chunk wise
 - we only account I/O once it actuall is done
 - can extent I/O accounting to synchronous or coroutine I/O easily
 - implement I/O latency tracking easily (see the next patch)

I've conveted the existing device model callers to the new model,
device models that are using synchronous I/O and weren't accounted
before haven't been updated yet.  Also scsi hasn't been converted
to the end-to-end accounting as I want to defer that after the pending
scsi layer overhaul.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Comments

Kevin Wolf Aug. 22, 2011, 3:48 p.m. UTC | #1
Am 22.08.2011 00:26, schrieb Christoph Hellwig:
> Decouple the I/O accounting from bdrv_aio_readv/writev/flush and
> make the hardware models call directly into the accounting helpers.
> 
> This means:
>  - we do not count internal requests from image formats in addition
>    to guest originating I/O
>  - we do not double count I/O ops if the device model handles it
>    chunk wise
>  - we only account I/O once it actuall is done
>  - can extent I/O accounting to synchronous or coroutine I/O easily
>  - implement I/O latency tracking easily (see the next patch)
> 
> I've conveted the existing device model callers to the new model,
> device models that are using synchronous I/O and weren't accounted
> before haven't been updated yet.  Also scsi hasn't been converted
> to the end-to-end accounting as I want to defer that after the pending
> scsi layer overhaul.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Index: qemu/block.c
> ===================================================================
> --- qemu.orig/block.c	2011-08-21 14:53:09.684226561 -0700
> +++ qemu/block.c	2011-08-21 14:57:09.507558463 -0700
> @@ -1915,13 +1915,13 @@ static QObject* bdrv_info_stats_bs(Block
>                               "'wr_highest_offset': %" PRId64 ","
>                               "'flush_operations': %" PRId64
>                               "} }",
> -                             bs->rd_bytes,
> -                             bs->wr_bytes,
> -                             bs->rd_ops,
> -                             bs->wr_ops,
> +                             bs->nr_bytes[BDRV_ACCT_READ],
> +                             bs->nr_bytes[BDRV_ACCT_WRITE],
> +                             bs->nr_ops[BDRV_ACCT_READ],
> +                             bs->nr_ops[BDRV_ACCT_WRITE],
>                               bs->wr_highest_sector *
>                               (uint64_t)BDRV_SECTOR_SIZE,
> -                             bs->flush_ops);
> +                             bs->nr_ops[BDRV_ACCT_FLUSH]);
>      dict  = qobject_to_qdict(res);
>  
>      if (*bs->device_name) {
> @@ -2235,7 +2235,6 @@ char *bdrv_snapshot_dump(char *buf, int
>      return buf;
>  }
>  
> -
>  /**************************************************************/
>  /* async I/Os */
>  
> @@ -2244,7 +2243,6 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
>                                   BlockDriverCompletionFunc *cb, void *opaque)
>  {
>      BlockDriver *drv = bs->drv;
> -    BlockDriverAIOCB *ret;
>  
>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>  
> @@ -2253,16 +2251,7 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
>      if (bdrv_check_request(bs, sector_num, nb_sectors))
>          return NULL;
>  
> -    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
> -                              cb, opaque);
> -
> -    if (ret) {
> -	/* Update stats even though technically transfer has not happened. */
> -	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> -	bs->rd_ops ++;
> -    }
> -
> -    return ret;
> +    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, cb, opaque);
>  }
>  
>  typedef struct BlockCompleteData {
> @@ -2329,9 +2318,6 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockD
>                                 cb, opaque);
>  
>      if (ret) {
> -        /* Update stats even though technically transfer has not happened. */
> -        bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> -        bs->wr_ops ++;
>          if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
>              bs->wr_highest_sector = sector_num + nb_sectors - 1;
>          }
> @@ -2585,8 +2571,6 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDr
>  
>      trace_bdrv_aio_flush(bs, opaque);
>  
> -    bs->flush_ops++;
> -
>      if (bs->open_flags & BDRV_O_NO_FLUSH) {
>          return bdrv_aio_noop_em(bs, cb, opaque);
>      }
> Index: qemu/block_int.h
> ===================================================================
> --- qemu.orig/block_int.h	2011-08-21 14:53:09.000000000 -0700
> +++ qemu/block_int.h	2011-08-21 14:57:09.510891797 -0700
> @@ -148,6 +148,13 @@ struct BlockDriver {
>      QLIST_ENTRY(BlockDriver) list;
>  };
>  
> +enum BlockIOType {
> +    BDRV_ACCT_READ,
> +    BDRV_ACCT_WRITE,
> +    BDRV_ACCT_FLUSH,
> +    BDRV_MAX_IOTYPE,
> +};
> +
>  struct BlockDriverState {
>      int64_t total_sectors; /* if we are reading a disk image, give its
>                                size in sectors */
> @@ -184,11 +191,8 @@ struct BlockDriverState {
>      void *sync_aiocb;
>  
>      /* I/O stats (display with "info blockstats"). */
> -    uint64_t rd_bytes;
> -    uint64_t wr_bytes;
> -    uint64_t rd_ops;
> -    uint64_t wr_ops;
> -    uint64_t flush_ops;
> +    uint64_t nr_bytes[BDRV_MAX_IOTYPE];
> +    uint64_t nr_ops[BDRV_MAX_IOTYPE];
>      uint64_t wr_highest_sector;
>  
>      /* Whether the disk can expand beyond total_sectors */
> @@ -212,6 +216,27 @@ struct BlockDriverState {
>      void *private;
>  };
>  
> +typedef struct BlockAcctCookie {
> +    int64_t bytes;
> +    enum BlockIOType type;
> +} BlockAcctCookie;
> +
> +static inline void
> +bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
> +        enum BlockIOType type)
> +{
> +    cookie->bytes = bytes;
> +    cookie->type = type;
> +}
> +
> +static inline void
> +bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
> +{
> +    bs->nr_bytes[cookie->type] += cookie->bytes;
> +    bs->nr_ops[cookie->type]++;
> +}

Device models actually shouldn't include block_int.h, so this isn't very
nice. Moving it to block.h would lose the inline, does it matter?

>  #define CHANGE_MEDIA    0x01
>  #define CHANGE_SIZE     0x02
>  
> Index: qemu/hw/ide/ahci.c
> ===================================================================
> --- qemu.orig/hw/ide/ahci.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/ide/ahci.c	2011-08-21 14:57:09.510891797 -0700
> @@ -710,6 +710,7 @@ static void ncq_cb(void *opaque, int ret
>      DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n",
>              ncq_tfs->tag);
>  
> +    bdrv_acct_done(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct);
>      qemu_sglist_destroy(&ncq_tfs->sglist);
>      ncq_tfs->used = 0;
>  }
> @@ -755,7 +756,11 @@ static void process_ncq_command(AHCIStat
>                      ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag);
>              ncq_tfs->is_read = 1;
>  
> -            DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
> +	    DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);

The patch is adding tabs here and in other places. Please fix.

> +
> +            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
> +                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
> +                            BDRV_ACCT_READ);
>              ncq_tfs->aiocb = dma_bdrv_read(ncq_tfs->drive->port.ifs[0].bs,
>                                             &ncq_tfs->sglist, ncq_tfs->lba,
>                                             ncq_cb, ncq_tfs);
> @@ -766,6 +771,10 @@ static void process_ncq_command(AHCIStat
>              ncq_tfs->is_read = 0;
>  
>              DPRINTF(port, "tag %d aio write %ld\n", ncq_tfs->tag, ncq_tfs->lba);
> +
> +            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
> +                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
> +                            BDRV_ACCT_WRITE);
>              ncq_tfs->aiocb = dma_bdrv_write(ncq_tfs->drive->port.ifs[0].bs,
>                                              &ncq_tfs->sglist, ncq_tfs->lba,
>                                              ncq_cb, ncq_tfs);
> Index: qemu/hw/ide/atapi.c
> ===================================================================
> --- qemu.orig/hw/ide/atapi.c	2011-08-21 11:49:17.000000000 -0700
> +++ qemu/hw/ide/atapi.c	2011-08-21 14:57:09.510891797 -0700
> @@ -250,6 +250,7 @@ static void ide_atapi_cmd_reply(IDEState
>      s->io_buffer_index = 0;
>  
>      if (s->atapi_dma) {
> +        bdrv_acct_start(s->bs, &s->acct, size, BDRV_ACCT_READ);
>          s->status = READY_STAT | SEEK_STAT | DRQ_STAT;
>          s->bus->dma->ops->start_dma(s->bus->dma, s,
>                                     ide_atapi_cmd_read_dma_cb);
> @@ -322,10 +323,7 @@ static void ide_atapi_cmd_read_dma_cb(vo
>          s->status = READY_STAT | SEEK_STAT;
>          s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
>          ide_set_irq(s->bus);
> -    eot:
> -        s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
> -        ide_set_inactive(s);
> -        return;
> +        goto eot;
>      }
>  
>      s->io_buffer_index = 0;
> @@ -343,9 +341,11 @@ static void ide_atapi_cmd_read_dma_cb(vo
>  #ifdef DEBUG_AIO
>      printf("aio_read_cd: lba=%u n=%d\n", s->lba, n);
>  #endif
> +
>      s->bus->dma->iov.iov_base = (void *)(s->io_buffer + data_offset);
>      s->bus->dma->iov.iov_len = n * 4 * 512;
>      qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
> +
>      s->bus->dma->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2,
>                                         &s->bus->dma->qiov, n * 4,
>                                         ide_atapi_cmd_read_dma_cb, s);
> @@ -355,6 +355,12 @@ static void ide_atapi_cmd_read_dma_cb(vo
>                              ASC_MEDIUM_NOT_PRESENT);
>          goto eot;
>      }
> +
> +    return;
> +eot:
> +    bdrv_acct_done(s->bs, &s->acct);
> +    s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
> +    ide_set_inactive(s);
>  }
>  
>  /* start a CD-CDROM read command with DMA */
> @@ -368,6 +374,8 @@ static void ide_atapi_cmd_read_dma(IDESt
>      s->io_buffer_size = 0;
>      s->cd_sector_size = sector_size;
>  
> +    bdrv_acct_start(s->bs, &s->acct, s->packet_transfer_size, BDRV_ACCT_READ);
> +
>      /* XXX: check if BUSY_STAT should be set */
>      s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
>      s->bus->dma->ops->start_dma(s->bus->dma, s,
> Index: qemu/hw/ide/core.c
> ===================================================================
> --- qemu.orig/hw/ide/core.c	2011-08-21 11:49:17.000000000 -0700
> +++ qemu/hw/ide/core.c	2011-08-21 14:57:09.510891797 -0700
> @@ -473,7 +473,10 @@ void ide_sector_read(IDEState *s)
>  #endif
>          if (n > s->req_nb_sectors)
>              n = s->req_nb_sectors;
> +
> +        bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
>          ret = bdrv_read(s->bs, sector_num, s->io_buffer, n);
> +        bdrv_acct_done(s->bs, &s->acct);

The commit message says that you're only converting bdrv_aio_readv. I
think it makes sense to add the accounting to the sychronous calls, too,
so that devices have complete accounting or none at all.

If your plan is to convert all bdrv_read calls, I think you've missed
some in atapi.c.

>          if (ret != 0) {
>              if (ide_handle_rw_error(s, -ret,
>                  BM_STATUS_PIO_RETRY | BM_STATUS_RETRY_READ))
> @@ -610,7 +613,8 @@ handle_rw_error:
>      return;
>  
>  eot:
> -   ide_set_inactive(s);
> +    bdrv_acct_done(s->bs, &s->acct);
> +    ide_set_inactive(s);
>  }
>  
>  static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd)
> @@ -619,6 +623,20 @@ static void ide_sector_start_dma(IDEStat
>      s->io_buffer_index = 0;
>      s->io_buffer_size = 0;
>      s->dma_cmd = dma_cmd;
> +
> +    switch (dma_cmd) {
> +    case IDE_DMA_READ:
> +        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
> +                        BDRV_ACCT_READ);
> +        break;
> +    case IDE_DMA_WRITE:
> +        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
> +                        BDRV_ACCT_WRITE);
> +	break;
> +    default:
> +        break;
> +    }

So should the semantics of bdrv_acct_done be that it does nothing if no
bdrv_acct_start has been called before? If so, its implementation is
broken. Otherwise, the default case of this switch statement looks
broken to me.

> +
>      s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb);
>  }
>  
> @@ -641,7 +659,10 @@ void ide_sector_write(IDEState *s)
>      n = s->nsector;
>      if (n > s->req_nb_sectors)
>          n = s->req_nb_sectors;
> +
> +    bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
>      ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
> +    bdrv_acct_done(s->bs, &s->acct);
>  
>      if (ret != 0) {
>          if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY))
> @@ -685,6 +706,7 @@ static void ide_flush_cb(void *opaque, i
>          }
>      }
>  
> +    bdrv_acct_done(s->bs, &s->acct);
>      s->status = READY_STAT | SEEK_STAT;
>      ide_set_irq(s->bus);
>  }
> @@ -698,6 +720,7 @@ void ide_flush_cache(IDEState *s)
>          return;
>      }
>  
> +    bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH);
>      acb = bdrv_aio_flush(s->bs, ide_flush_cb, s);
>      if (acb == NULL) {
>          ide_flush_cb(s, -EIO);
> Index: qemu/hw/ide/macio.c
> ===================================================================
> --- qemu.orig/hw/ide/macio.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/ide/macio.c	2011-08-21 14:57:09.514225130 -0700
> @@ -52,8 +52,7 @@ static void pmac_ide_atapi_transfer_cb(v
>          m->aiocb = NULL;
>          qemu_sglist_destroy(&s->sg);
>          ide_atapi_io_error(s, ret);
> -        io->dma_end(opaque);
> -        return;
> +        goto done;
>      }
>  
>      if (s->io_buffer_size > 0) {
> @@ -71,8 +70,7 @@ static void pmac_ide_atapi_transfer_cb(v
>          ide_atapi_cmd_ok(s);
>  
>      if (io->len == 0) {
> -        io->dma_end(opaque);
> -        return;
> +        goto done;
>      }
>  
>      /* launch next transfer */
> @@ -92,9 +90,14 @@ static void pmac_ide_atapi_transfer_cb(v
>          /* Note: media not present is the most likely case */
>          ide_atapi_cmd_error(s, SENSE_NOT_READY,
>                              ASC_MEDIUM_NOT_PRESENT);
> -        io->dma_end(opaque);
> -        return;
> +        goto done;
>      }
> +    return;
> +
> +done:
> +    bdrv_acct_done(s->bs, &s->acct);
> +    io->dma_end(opaque);
> +    return;
>  }
>  
>  static void pmac_ide_transfer_cb(void *opaque, int ret)
> @@ -109,8 +112,7 @@ static void pmac_ide_transfer_cb(void *o
>          m->aiocb = NULL;
>          qemu_sglist_destroy(&s->sg);
>  	ide_dma_error(s);
> -        io->dma_end(io);
> -        return;
> +        goto done;
>      }
>  
>      sector_num = ide_get_sector(s);
> @@ -130,10 +132,8 @@ static void pmac_ide_transfer_cb(void *o
>      }
>  
>      /* end of DMA ? */
> -
>      if (io->len == 0) {
> -        io->dma_end(io);
> -	return;
> +        goto done;
>      }
>  
>      /* launch next transfer */
> @@ -163,6 +163,10 @@ static void pmac_ide_transfer_cb(void *o
>  
>      if (!m->aiocb)
>          pmac_ide_transfer_cb(io, -1);
> +    return;
> +done:
> +    bdrv_acct_done(s->bs, &s->acct);
> +    io->dma_end(io);
>  }
>  
>  static void pmac_ide_transfer(DBDMA_io *io)
> @@ -172,10 +176,22 @@ static void pmac_ide_transfer(DBDMA_io *
>  
>      s->io_buffer_size = 0;
>      if (s->drive_kind == IDE_CD) {
> +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
>          pmac_ide_atapi_transfer_cb(io, 0);
>          return;
>      }
>  
> +    switch (s->dma_cmd) {
> +    case IDE_DMA_READ:
> +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
> +        break;
> +    case IDE_DMA_WRITE:
> +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE);
> +        break;
> +    default:
> +        break;

Can it happen? If yes, see above. If no, abort() is better.

> +    }
> +
>      pmac_ide_transfer_cb(io, 0);
>  }
>  
> Index: qemu/hw/scsi-disk.c
> ===================================================================
> --- qemu.orig/hw/scsi-disk.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/scsi-disk.c	2011-08-21 14:57:09.514225130 -0700
> @@ -57,6 +57,7 @@ typedef struct SCSIDiskReq {
>      struct iovec iov;
>      QEMUIOVector qiov;
>      uint32_t status;
> +    BlockAcctCookie acct;
>  } SCSIDiskReq;
>  
>  struct SCSIDiskState
> @@ -107,6 +108,7 @@ static void scsi_cancel_io(SCSIRequest *
>  static void scsi_read_complete(void * opaque, int ret)
>  {
>      SCSIDiskReq *r = (SCSIDiskReq *)opaque;
> +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
>      int n;
>  
>      r->req.aiocb = NULL;
> @@ -117,6 +119,8 @@ static void scsi_read_complete(void * op
>          }
>      }
>  
> +    bdrv_acct_done(s->bs, &r->acct);
> +
>      DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);

scsi-disk doesn't account for failed requests. IDE does. I don't have a
strong preference on which way we handle it, but I think it should be
consistent.

>  
>      n = r->iov.iov_len / 512;
> @@ -161,6 +165,8 @@ static void scsi_read_data(SCSIRequest *
>  
>      r->iov.iov_len = n * 512;
>      qemu_iovec_init_external(&r->qiov, &r->iov, 1);
> +
> +    bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
>      r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
>                                scsi_read_complete, r);
>      if (r->req.aiocb == NULL) {
> @@ -207,6 +213,7 @@ static int scsi_handle_rw_error(SCSIDisk
>  static void scsi_write_complete(void * opaque, int ret)
>  {
>      SCSIDiskReq *r = (SCSIDiskReq *)opaque;
> +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
>      uint32_t len;
>      uint32_t n;
>  
> @@ -218,6 +225,8 @@ static void scsi_write_complete(void * o
>          }
>      }
>  
> +    bdrv_acct_done(s->bs, &r->acct);
> +
>      n = r->iov.iov_len / 512;
>      r->sector += n;
>      r->sector_count -= n;
> @@ -252,6 +261,8 @@ static void scsi_write_data(SCSIRequest
>      n = r->iov.iov_len / 512;
>      if (n) {
>          qemu_iovec_init_external(&r->qiov, &r->iov, 1);
> +
> +        bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE);
>          r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
>                                     scsi_write_complete, r);
>          if (r->req.aiocb == NULL) {
> @@ -854,13 +865,19 @@ static int scsi_disk_emulate_command(SCS
>          buflen = 8;
>          break;
>      case SYNCHRONIZE_CACHE:
> +{
> +        BlockAcctCookie acct;
> +
> +        bdrv_acct_start(s->bs, &acct, 0, BDRV_ACCT_FLUSH);
>          ret = bdrv_flush(s->bs);
>          if (ret < 0) {
>              if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_FLUSH)) {
>                  return -1;
>              }
>          }
> +        bdrv_acct_done(s->bs, &acct);
>          break;
> +}
>      case GET_CONFIGURATION:
>          memset(outbuf, 0, 8);
>          /* ??? This should probably return much more information.  For now
> Index: qemu/hw/virtio-blk.c
> ===================================================================
> --- qemu.orig/hw/virtio-blk.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/virtio-blk.c	2011-08-21 14:57:44.887558247 -0700
> @@ -47,6 +47,7 @@ typedef struct VirtIOBlockReq
>      struct virtio_scsi_inhdr *scsi;
>      QEMUIOVector qiov;
>      struct VirtIOBlockReq *next;
> +    BlockAcctCookie acct;
>  } VirtIOBlockReq;
>  
>  static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
> @@ -59,6 +60,7 @@ static void virtio_blk_req_complete(Virt
>      virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
>      virtio_notify(&s->vdev, s->vq);
>  
> +    bdrv_acct_done(s->bs, &req->acct);
>      g_free(req);
>  }
>  
> @@ -266,6 +268,8 @@ static void virtio_blk_handle_flush(Virt
>  {
>      BlockDriverAIOCB *acb;
>  
> +    bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH);
> +
>      /*
>       * Make sure all outstanding writes are posted to the backing device.
>       */
> @@ -284,6 +288,8 @@ static void virtio_blk_handle_write(Virt
>  
>      sector = ldq_p(&req->out->sector);
>  
> +    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
> +
>      trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
>  
>      if (sector & req->dev->sector_mask) {
> @@ -317,6 +323,8 @@ static void virtio_blk_handle_read(VirtI
>  
>      sector = ldq_p(&req->out->sector);
>  
> +    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
> +
>      if (sector & req->dev->sector_mask) {
>          virtio_blk_rw_complete(req, -EIO);
>          return;
> Index: qemu/hw/xen_disk.c
> ===================================================================
> --- qemu.orig/hw/xen_disk.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/xen_disk.c	2011-08-21 14:57:09.517558463 -0700
> @@ -79,6 +79,7 @@ struct ioreq {
>  
>      struct XenBlkDev    *blkdev;
>      QLIST_ENTRY(ioreq)   list;
> +    BlockAcctCookie     acct;

Indentation is off.

Kevin
Christoph Hellwig Aug. 24, 2011, 6:22 p.m. UTC | #2
On Mon, Aug 22, 2011 at 05:48:37PM +0200, Kevin Wolf wrote:
> > +static inline void
> > +bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
> > +        enum BlockIOType type)
> > +{
> > +    cookie->bytes = bytes;
> > +    cookie->type = type;
> > +}
> > +
> > +static inline void
> > +bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
> > +{
> > +    bs->nr_bytes[cookie->type] += cookie->bytes;
> > +    bs->nr_ops[cookie->type]++;
> > +}
> 
> Device models actually shouldn't include block_int.h, so this isn't very
> nice. Moving it to block.h would lose the inline, does it matter?

I have moved it out of line, we can change it if anyone cares.

> > +
> > +        bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
> >          ret = bdrv_read(s->bs, sector_num, s->io_buffer, n);
> > +        bdrv_acct_done(s->bs, &s->acct);
> 
> The commit message says that you're only converting bdrv_aio_readv. I
> think it makes sense to add the accounting to the sychronous calls, too,
> so that devices have complete accounting or none at all.

That was the plan, I just worded it incorrectly.  The idea is to fully
convert device modles that were using bdrv_aio_* in some places, but
leave those that never had any accounting out for now.

> If your plan is to convert all bdrv_read calls, I think you've missed
> some in atapi.c.

Added.


> > +    switch (dma_cmd) {
> > +    case IDE_DMA_READ:
> > +        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
> > +                        BDRV_ACCT_READ);
> > +        break;
> > +    case IDE_DMA_WRITE:
> > +        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
> > +                        BDRV_ACCT_WRITE);
> > +	break;
> > +    default:
> > +        break;
> > +    }
> 
> So should the semantics of bdrv_acct_done be that it does nothing if no
> bdrv_acct_start has been called before? If so, its implementation is
> broken. Otherwise, the default case of this switch statement looks
> broken to me.

I have fixed ide_dma_cb to only do the accounting for read and write requests.

> > +    switch (s->dma_cmd) {
> > +    case IDE_DMA_READ:
> > +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
> > +        break;
> > +    case IDE_DMA_WRITE:
> > +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE);
> > +        break;
> > +    default:
> > +        break;
> 
> Can it happen? If yes, see above. If no, abort() is better.

We can get a trim request, so it can happen.

> >  static void scsi_read_complete(void * opaque, int ret)
> >  {
> >      SCSIDiskReq *r = (SCSIDiskReq *)opaque;
> > +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
> >      int n;
> >  
> >      r->req.aiocb = NULL;
> > @@ -117,6 +119,8 @@ static void scsi_read_complete(void * op
> >          }
> >      }
> >  
> > +    bdrv_acct_done(s->bs, &r->acct);
> > +
> >      DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);
> 
> scsi-disk doesn't account for failed requests. IDE does. I don't have a
> strong preference on which way we handle it, but I think it should be
> consistent.

The idea is to call bdrv_acct_done even for failed request to make sure
the calls are balanced.  In the future we might want to pass the error
code to it to e.g. count failed requests separately.

> > Index: qemu/hw/xen_disk.c
> > ===================================================================
> > --- qemu.orig/hw/xen_disk.c	2011-08-21 11:49:29.000000000 -0700
> > +++ qemu/hw/xen_disk.c	2011-08-21 14:57:09.517558463 -0700
> > @@ -79,6 +79,7 @@ struct ioreq {
> >  
> >      struct XenBlkDev    *blkdev;
> >      QLIST_ENTRY(ioreq)   list;
> > +    BlockAcctCookie     acct;
> 
> Indentation is off.

Actually only the indentation of list if off compared to the rest of
the structure and everything else in the file.
diff mbox

Patch

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-08-21 14:53:09.684226561 -0700
+++ qemu/block.c	2011-08-21 14:57:09.507558463 -0700
@@ -1915,13 +1915,13 @@  static QObject* bdrv_info_stats_bs(Block
                              "'wr_highest_offset': %" PRId64 ","
                              "'flush_operations': %" PRId64
                              "} }",
-                             bs->rd_bytes,
-                             bs->wr_bytes,
-                             bs->rd_ops,
-                             bs->wr_ops,
+                             bs->nr_bytes[BDRV_ACCT_READ],
+                             bs->nr_bytes[BDRV_ACCT_WRITE],
+                             bs->nr_ops[BDRV_ACCT_READ],
+                             bs->nr_ops[BDRV_ACCT_WRITE],
                              bs->wr_highest_sector *
                              (uint64_t)BDRV_SECTOR_SIZE,
-                             bs->flush_ops);
+                             bs->nr_ops[BDRV_ACCT_FLUSH]);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
@@ -2235,7 +2235,6 @@  char *bdrv_snapshot_dump(char *buf, int
     return buf;
 }
 
-
 /**************************************************************/
 /* async I/Os */
 
@@ -2244,7 +2243,6 @@  BlockDriverAIOCB *bdrv_aio_readv(BlockDr
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
-    BlockDriverAIOCB *ret;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
@@ -2253,16 +2251,7 @@  BlockDriverAIOCB *bdrv_aio_readv(BlockDr
     if (bdrv_check_request(bs, sector_num, nb_sectors))
         return NULL;
 
-    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
-                              cb, opaque);
-
-    if (ret) {
-	/* Update stats even though technically transfer has not happened. */
-	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-	bs->rd_ops ++;
-    }
-
-    return ret;
+    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, cb, opaque);
 }
 
 typedef struct BlockCompleteData {
@@ -2329,9 +2318,6 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockD
                                cb, opaque);
 
     if (ret) {
-        /* Update stats even though technically transfer has not happened. */
-        bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-        bs->wr_ops ++;
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
@@ -2585,8 +2571,6 @@  BlockDriverAIOCB *bdrv_aio_flush(BlockDr
 
     trace_bdrv_aio_flush(bs, opaque);
 
-    bs->flush_ops++;
-
     if (bs->open_flags & BDRV_O_NO_FLUSH) {
         return bdrv_aio_noop_em(bs, cb, opaque);
     }
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2011-08-21 14:53:09.000000000 -0700
+++ qemu/block_int.h	2011-08-21 14:57:09.510891797 -0700
@@ -148,6 +148,13 @@  struct BlockDriver {
     QLIST_ENTRY(BlockDriver) list;
 };
 
+enum BlockIOType {
+    BDRV_ACCT_READ,
+    BDRV_ACCT_WRITE,
+    BDRV_ACCT_FLUSH,
+    BDRV_MAX_IOTYPE,
+};
+
 struct BlockDriverState {
     int64_t total_sectors; /* if we are reading a disk image, give its
                               size in sectors */
@@ -184,11 +191,8 @@  struct BlockDriverState {
     void *sync_aiocb;
 
     /* I/O stats (display with "info blockstats"). */
-    uint64_t rd_bytes;
-    uint64_t wr_bytes;
-    uint64_t rd_ops;
-    uint64_t wr_ops;
-    uint64_t flush_ops;
+    uint64_t nr_bytes[BDRV_MAX_IOTYPE];
+    uint64_t nr_ops[BDRV_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
@@ -212,6 +216,27 @@  struct BlockDriverState {
     void *private;
 };
 
+typedef struct BlockAcctCookie {
+    int64_t bytes;
+    enum BlockIOType type;
+} BlockAcctCookie;
+
+static inline void
+bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
+        enum BlockIOType type)
+{
+    cookie->bytes = bytes;
+    cookie->type = type;
+}
+
+static inline void
+bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
+{
+    bs->nr_bytes[cookie->type] += cookie->bytes;
+    bs->nr_ops[cookie->type]++;
+}
+
+
 #define CHANGE_MEDIA    0x01
 #define CHANGE_SIZE     0x02
 
Index: qemu/hw/ide/ahci.c
===================================================================
--- qemu.orig/hw/ide/ahci.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/ide/ahci.c	2011-08-21 14:57:09.510891797 -0700
@@ -710,6 +710,7 @@  static void ncq_cb(void *opaque, int ret
     DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n",
             ncq_tfs->tag);
 
+    bdrv_acct_done(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct);
     qemu_sglist_destroy(&ncq_tfs->sglist);
     ncq_tfs->used = 0;
 }
@@ -755,7 +756,11 @@  static void process_ncq_command(AHCIStat
                     ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag);
             ncq_tfs->is_read = 1;
 
-            DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+	    DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+
+            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
+                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
+                            BDRV_ACCT_READ);
             ncq_tfs->aiocb = dma_bdrv_read(ncq_tfs->drive->port.ifs[0].bs,
                                            &ncq_tfs->sglist, ncq_tfs->lba,
                                            ncq_cb, ncq_tfs);
@@ -766,6 +771,10 @@  static void process_ncq_command(AHCIStat
             ncq_tfs->is_read = 0;
 
             DPRINTF(port, "tag %d aio write %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+
+            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
+                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
+                            BDRV_ACCT_WRITE);
             ncq_tfs->aiocb = dma_bdrv_write(ncq_tfs->drive->port.ifs[0].bs,
                                             &ncq_tfs->sglist, ncq_tfs->lba,
                                             ncq_cb, ncq_tfs);
Index: qemu/hw/ide/atapi.c
===================================================================
--- qemu.orig/hw/ide/atapi.c	2011-08-21 11:49:17.000000000 -0700
+++ qemu/hw/ide/atapi.c	2011-08-21 14:57:09.510891797 -0700
@@ -250,6 +250,7 @@  static void ide_atapi_cmd_reply(IDEState
     s->io_buffer_index = 0;
 
     if (s->atapi_dma) {
+        bdrv_acct_start(s->bs, &s->acct, size, BDRV_ACCT_READ);
         s->status = READY_STAT | SEEK_STAT | DRQ_STAT;
         s->bus->dma->ops->start_dma(s->bus->dma, s,
                                    ide_atapi_cmd_read_dma_cb);
@@ -322,10 +323,7 @@  static void ide_atapi_cmd_read_dma_cb(vo
         s->status = READY_STAT | SEEK_STAT;
         s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
         ide_set_irq(s->bus);
-    eot:
-        s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
-        ide_set_inactive(s);
-        return;
+        goto eot;
     }
 
     s->io_buffer_index = 0;
@@ -343,9 +341,11 @@  static void ide_atapi_cmd_read_dma_cb(vo
 #ifdef DEBUG_AIO
     printf("aio_read_cd: lba=%u n=%d\n", s->lba, n);
 #endif
+
     s->bus->dma->iov.iov_base = (void *)(s->io_buffer + data_offset);
     s->bus->dma->iov.iov_len = n * 4 * 512;
     qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
+
     s->bus->dma->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2,
                                        &s->bus->dma->qiov, n * 4,
                                        ide_atapi_cmd_read_dma_cb, s);
@@ -355,6 +355,12 @@  static void ide_atapi_cmd_read_dma_cb(vo
                             ASC_MEDIUM_NOT_PRESENT);
         goto eot;
     }
+
+    return;
+eot:
+    bdrv_acct_done(s->bs, &s->acct);
+    s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
+    ide_set_inactive(s);
 }
 
 /* start a CD-CDROM read command with DMA */
@@ -368,6 +374,8 @@  static void ide_atapi_cmd_read_dma(IDESt
     s->io_buffer_size = 0;
     s->cd_sector_size = sector_size;
 
+    bdrv_acct_start(s->bs, &s->acct, s->packet_transfer_size, BDRV_ACCT_READ);
+
     /* XXX: check if BUSY_STAT should be set */
     s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
     s->bus->dma->ops->start_dma(s->bus->dma, s,
Index: qemu/hw/ide/core.c
===================================================================
--- qemu.orig/hw/ide/core.c	2011-08-21 11:49:17.000000000 -0700
+++ qemu/hw/ide/core.c	2011-08-21 14:57:09.510891797 -0700
@@ -473,7 +473,10 @@  void ide_sector_read(IDEState *s)
 #endif
         if (n > s->req_nb_sectors)
             n = s->req_nb_sectors;
+
+        bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
         ret = bdrv_read(s->bs, sector_num, s->io_buffer, n);
+        bdrv_acct_done(s->bs, &s->acct);
         if (ret != 0) {
             if (ide_handle_rw_error(s, -ret,
                 BM_STATUS_PIO_RETRY | BM_STATUS_RETRY_READ))
@@ -610,7 +613,8 @@  handle_rw_error:
     return;
 
 eot:
-   ide_set_inactive(s);
+    bdrv_acct_done(s->bs, &s->acct);
+    ide_set_inactive(s);
 }
 
 static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd)
@@ -619,6 +623,20 @@  static void ide_sector_start_dma(IDEStat
     s->io_buffer_index = 0;
     s->io_buffer_size = 0;
     s->dma_cmd = dma_cmd;
+
+    switch (dma_cmd) {
+    case IDE_DMA_READ:
+        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
+                        BDRV_ACCT_READ);
+        break;
+    case IDE_DMA_WRITE:
+        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
+                        BDRV_ACCT_WRITE);
+	break;
+    default:
+        break;
+    }
+
     s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb);
 }
 
@@ -641,7 +659,10 @@  void ide_sector_write(IDEState *s)
     n = s->nsector;
     if (n > s->req_nb_sectors)
         n = s->req_nb_sectors;
+
+    bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+    bdrv_acct_done(s->bs, &s->acct);
 
     if (ret != 0) {
         if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY))
@@ -685,6 +706,7 @@  static void ide_flush_cb(void *opaque, i
         }
     }
 
+    bdrv_acct_done(s->bs, &s->acct);
     s->status = READY_STAT | SEEK_STAT;
     ide_set_irq(s->bus);
 }
@@ -698,6 +720,7 @@  void ide_flush_cache(IDEState *s)
         return;
     }
 
+    bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH);
     acb = bdrv_aio_flush(s->bs, ide_flush_cb, s);
     if (acb == NULL) {
         ide_flush_cb(s, -EIO);
Index: qemu/hw/ide/macio.c
===================================================================
--- qemu.orig/hw/ide/macio.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/ide/macio.c	2011-08-21 14:57:09.514225130 -0700
@@ -52,8 +52,7 @@  static void pmac_ide_atapi_transfer_cb(v
         m->aiocb = NULL;
         qemu_sglist_destroy(&s->sg);
         ide_atapi_io_error(s, ret);
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
 
     if (s->io_buffer_size > 0) {
@@ -71,8 +70,7 @@  static void pmac_ide_atapi_transfer_cb(v
         ide_atapi_cmd_ok(s);
 
     if (io->len == 0) {
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
 
     /* launch next transfer */
@@ -92,9 +90,14 @@  static void pmac_ide_atapi_transfer_cb(v
         /* Note: media not present is the most likely case */
         ide_atapi_cmd_error(s, SENSE_NOT_READY,
                             ASC_MEDIUM_NOT_PRESENT);
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
+    return;
+
+done:
+    bdrv_acct_done(s->bs, &s->acct);
+    io->dma_end(opaque);
+    return;
 }
 
 static void pmac_ide_transfer_cb(void *opaque, int ret)
@@ -109,8 +112,7 @@  static void pmac_ide_transfer_cb(void *o
         m->aiocb = NULL;
         qemu_sglist_destroy(&s->sg);
 	ide_dma_error(s);
-        io->dma_end(io);
-        return;
+        goto done;
     }
 
     sector_num = ide_get_sector(s);
@@ -130,10 +132,8 @@  static void pmac_ide_transfer_cb(void *o
     }
 
     /* end of DMA ? */
-
     if (io->len == 0) {
-        io->dma_end(io);
-	return;
+        goto done;
     }
 
     /* launch next transfer */
@@ -163,6 +163,10 @@  static void pmac_ide_transfer_cb(void *o
 
     if (!m->aiocb)
         pmac_ide_transfer_cb(io, -1);
+    return;
+done:
+    bdrv_acct_done(s->bs, &s->acct);
+    io->dma_end(io);
 }
 
 static void pmac_ide_transfer(DBDMA_io *io)
@@ -172,10 +176,22 @@  static void pmac_ide_transfer(DBDMA_io *
 
     s->io_buffer_size = 0;
     if (s->drive_kind == IDE_CD) {
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
         pmac_ide_atapi_transfer_cb(io, 0);
         return;
     }
 
+    switch (s->dma_cmd) {
+    case IDE_DMA_READ:
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
+        break;
+    case IDE_DMA_WRITE:
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE);
+        break;
+    default:
+        break;
+    }
+
     pmac_ide_transfer_cb(io, 0);
 }
 
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/scsi-disk.c	2011-08-21 14:57:09.514225130 -0700
@@ -57,6 +57,7 @@  typedef struct SCSIDiskReq {
     struct iovec iov;
     QEMUIOVector qiov;
     uint32_t status;
+    BlockAcctCookie acct;
 } SCSIDiskReq;
 
 struct SCSIDiskState
@@ -107,6 +108,7 @@  static void scsi_cancel_io(SCSIRequest *
 static void scsi_read_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     int n;
 
     r->req.aiocb = NULL;
@@ -117,6 +119,8 @@  static void scsi_read_complete(void * op
         }
     }
 
+    bdrv_acct_done(s->bs, &r->acct);
+
     DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);
 
     n = r->iov.iov_len / 512;
@@ -161,6 +165,8 @@  static void scsi_read_data(SCSIRequest *
 
     r->iov.iov_len = n * 512;
     qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+    bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
                               scsi_read_complete, r);
     if (r->req.aiocb == NULL) {
@@ -207,6 +213,7 @@  static int scsi_handle_rw_error(SCSIDisk
 static void scsi_write_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     uint32_t len;
     uint32_t n;
 
@@ -218,6 +225,8 @@  static void scsi_write_complete(void * o
         }
     }
 
+    bdrv_acct_done(s->bs, &r->acct);
+
     n = r->iov.iov_len / 512;
     r->sector += n;
     r->sector_count -= n;
@@ -252,6 +261,8 @@  static void scsi_write_data(SCSIRequest
     n = r->iov.iov_len / 512;
     if (n) {
         qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+        bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE);
         r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
                                    scsi_write_complete, r);
         if (r->req.aiocb == NULL) {
@@ -854,13 +865,19 @@  static int scsi_disk_emulate_command(SCS
         buflen = 8;
         break;
     case SYNCHRONIZE_CACHE:
+{
+        BlockAcctCookie acct;
+
+        bdrv_acct_start(s->bs, &acct, 0, BDRV_ACCT_FLUSH);
         ret = bdrv_flush(s->bs);
         if (ret < 0) {
             if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_FLUSH)) {
                 return -1;
             }
         }
+        bdrv_acct_done(s->bs, &acct);
         break;
+}
     case GET_CONFIGURATION:
         memset(outbuf, 0, 8);
         /* ??? This should probably return much more information.  For now
Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/virtio-blk.c	2011-08-21 14:57:44.887558247 -0700
@@ -47,6 +47,7 @@  typedef struct VirtIOBlockReq
     struct virtio_scsi_inhdr *scsi;
     QEMUIOVector qiov;
     struct VirtIOBlockReq *next;
+    BlockAcctCookie acct;
 } VirtIOBlockReq;
 
 static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
@@ -59,6 +60,7 @@  static void virtio_blk_req_complete(Virt
     virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
     virtio_notify(&s->vdev, s->vq);
 
+    bdrv_acct_done(s->bs, &req->acct);
     g_free(req);
 }
 
@@ -266,6 +268,8 @@  static void virtio_blk_handle_flush(Virt
 {
     BlockDriverAIOCB *acb;
 
+    bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH);
+
     /*
      * Make sure all outstanding writes are posted to the backing device.
      */
@@ -284,6 +288,8 @@  static void virtio_blk_handle_write(Virt
 
     sector = ldq_p(&req->out->sector);
 
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
+
     trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
 
     if (sector & req->dev->sector_mask) {
@@ -317,6 +323,8 @@  static void virtio_blk_handle_read(VirtI
 
     sector = ldq_p(&req->out->sector);
 
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
+
     if (sector & req->dev->sector_mask) {
         virtio_blk_rw_complete(req, -EIO);
         return;
Index: qemu/hw/xen_disk.c
===================================================================
--- qemu.orig/hw/xen_disk.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/xen_disk.c	2011-08-21 14:57:09.517558463 -0700
@@ -79,6 +79,7 @@  struct ioreq {
 
     struct XenBlkDev    *blkdev;
     QLIST_ENTRY(ioreq)   list;
+    BlockAcctCookie     acct;
 };
 
 struct XenBlkDev {
@@ -401,6 +402,7 @@  static void qemu_aio_complete(void *opaq
     ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
     ioreq_unmap(ioreq);
     ioreq_finish(ioreq);
+    bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct);
     qemu_bh_schedule(ioreq->blkdev->bh);
 }
 
@@ -419,6 +421,7 @@  static int ioreq_runio_qemu_aio(struct i
 
     switch (ioreq->req.operation) {
     case BLKIF_OP_READ:
+        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ);
         ioreq->aio_inflight++;
         bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
@@ -429,6 +432,8 @@  static int ioreq_runio_qemu_aio(struct i
         if (!ioreq->req.nr_segments) {
             break;
         }
+
+        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE);
         ioreq->aio_inflight++;
         bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                         &ioreq->v, ioreq->v.size / BLOCK_SIZE,
Index: qemu/hw/ide/internal.h
===================================================================
--- qemu.orig/hw/ide/internal.h	2011-08-21 11:49:17.000000000 -0700
+++ qemu/hw/ide/internal.h	2011-08-21 14:57:09.517558463 -0700
@@ -440,6 +440,7 @@  struct IDEState {
     int lba;
     int cd_sector_size;
     int atapi_dma; /* true if dma is requested for the packet cmd */
+    BlockAcctCookie acct;
     /* ATA DMA state */
     int io_buffer_size;
     QEMUSGList sg;
Index: qemu/hw/ide/ahci.h
===================================================================
--- qemu.orig/hw/ide/ahci.h	2011-08-21 11:49:17.000000000 -0700
+++ qemu/hw/ide/ahci.h	2011-08-21 14:57:09.517558463 -0700
@@ -258,6 +258,7 @@  typedef struct NCQTransferState {
     AHCIDevice *drive;
     BlockDriverAIOCB *aiocb;
     QEMUSGList sglist;
+    BlockAcctCookie acct;
     int is_read;
     uint16_t sector_count;
     uint64_t lba;