[4/5] ide: add TRIM support

Submitter: Christoph Hellwig
Date: Dec. 1, 2010, 3:35 p.m.
Message ID: <20101201153543.GD6310@lst.de>
Permalink: /patch/73838/
State: New

Comments

Christoph Hellwig - Dec. 1, 2010, 3:35 p.m.
Add support for the Data Set Management command and its TRIM sub-function.

Signed-off-by: Christoph Hellwig <hch@lst.de>
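
For context: each 8-byte entry in the DSM/TRIM payload is little-endian,
with the starting LBA in the low six bytes and the sector count in the
high two; a count of zero terminates the range list. A minimal,
self-contained sketch of that decoding (names are illustrative; it
mirrors the loop in ide_issue_trim below):

#include <stdint.h>
#include <stdio.h>

/* One DSM TRIM range entry, assembled from its 8 little-endian bytes:
 * bytes 0..5 hold the starting LBA, bytes 6..7 the sector count.
 * A count of zero terminates the range list. */
static void decode_trim_entry(const uint8_t b[8])
{
    uint64_t entry = 0;
    int i;

    for (i = 7; i >= 0; i--) {
        entry = (entry << 8) | b[i];    /* little-endian load */
    }

    uint64_t lba = entry & 0x0000ffffffffffffULL;
    uint16_t count = entry >> 48;

    if (count == 0) {
        printf("end of range list\n");
    } else {
        printf("trim %u sectors starting at LBA %llu\n",
               (unsigned)count, (unsigned long long)lba);
    }
}

int main(void)
{
    /* Trim 8 sectors starting at LBA 0x1000 (4096). */
    uint8_t entry[8] = { 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00 };
    decode_trim_entry(entry);
    return 0;
}
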
Kevin Wolf - Dec. 2, 2010, 2:07 p.m.
On 01.12.2010 16:35, Christoph Hellwig wrote:
> Add support for the Data Set Management command and its TRIM
> sub-function.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Index: qemu/hw/ide/core.c
> ===================================================================
> --- qemu.orig/hw/ide/core.c	2010-11-30 23:12:59.513132702 +0100
> +++ qemu/hw/ide/core.c	2010-12-01 12:02:47.347023889 +0100
> @@ -145,6 +145,8 @@ static void ide_identify(IDEState *s)
>      put_le16(p + 66, 120);
>      put_le16(p + 67, 120);
>      put_le16(p + 68, 120);
> +    if (dev && dev->conf.discard_granularity)
> +        put_le16(p + 69, (1 << 14)); /* determinate TRIM behavior */

Braces
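
That is, QEMU coding style wants braces even around a single-statement
body:

if (dev && dev->conf.discard_granularity) {
    put_le16(p + 69, (1 << 14)); /* deterministic TRIM behavior */
}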

>      put_le16(p + 80, 0xf0); /* ata3 -> ata6 supported */
>      put_le16(p + 81, 0x16); /* conforms to ata5 */
>      /* 14=NOP supported, 5=WCACHE supported, 0=SMART supported */
> @@ -171,6 +173,8 @@ static void ide_identify(IDEState *s)
>      dev = s->unit ? s->bus->slave : s->bus->master;
>      if (dev && dev->conf.physical_block_size)
>          put_le16(p + 106, 0x6000 | get_physical_block_exp(&dev->conf));
> +    if (dev && dev->conf.discard_granularity)
> +        put_le16(p + 169, 1); /* TRIM support */
>  
>      memcpy(s->identify_data, p, sizeof(s->identify_data));
>      s->identify_set = 1;
> @@ -1788,6 +1792,128 @@ static void ide_clear_hob(IDEBus *bus)
>      bus->ifs[1].select &= ~(1 << 7);
>  }
>  
> +typedef struct TrimAIOCB {
> +    BlockDriverAIOCB common;
> +    QEMUBH *bh;
> +    int ret;
> +} TrimAIOCB;
> +
> +static void trim_aio_cancel(BlockDriverAIOCB *acb)
> +{
> +    TrimAIOCB *iocb = container_of(acb, TrimAIOCB, common);
> +
> +    qemu_bh_delete(iocb->bh);
> +    iocb->bh = NULL;
> +    qemu_aio_release(iocb);
> +}
> +
> +static AIOPool trim_aio_pool = {
> +    .aiocb_size         = sizeof(TrimAIOCB),
> +    .cancel             = trim_aio_cancel,
> +};
> +
> +static void ide_trim_bh_cb(void *opaque)
> +{
> +    TrimAIOCB *iocb = opaque;
> +
> +    iocb->common.cb(iocb->common.opaque, iocb->ret);
> +
> +    qemu_bh_delete(iocb->bh);
> +    iocb->bh = NULL;
> +
> +    qemu_aio_release(iocb);
> +}
> +
> +static BlockDriverAIOCB *ide_issue_trim(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    TrimAIOCB *iocb;
> +    int i, j, ret;
> +
> +    iocb = qemu_aio_get(&trim_aio_pool, bs, cb, opaque);
> +    iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb);
> +    iocb->ret = 0;
> +
> +    for (j = 0; j < qiov->niov; j++) {
> +        uint64_t *buffer = qiov->iov[j].iov_base;
> +
> +        for (i = 0; i < qiov->iov[j].iov_len / 8; i++) {
> +            /* 6-byte LBA + 2-byte range per entry */
> +            uint64_t entry = le64_to_cpu(buffer[i]);
> +            uint64_t sector = entry & 0x0000ffffffffffffULL;
> +            uint16_t count = entry >> 48;
> +
> +            if (count == 0)
> +                break;
> +
> +            ret = bdrv_discard(bs, sector * 512, count * 512);
> +            if (!iocb->ret)
> +                iocb->ret = ret;
> +        }
> +    }
> +
> +    qemu_bh_schedule(iocb->bh);
> +
> +    return &iocb->common;
> +}
> +
> +static void ide_trim_dma_cb(void *opaque, int ret)
> +{
> +    BMDMAState *bm = opaque;
> +    IDEState *s = bmdma_active_if(bm);
> +    int n;
> +    int64_t sector_num;
> +
> +    if (ret < 0) {
> +        if (ide_handle_rw_error(s, -ret,  BM_STATUS_DMA_RETRY))
> +            return;

This looks wrong. Wouldn't werror=stop cause the request to be retried
as a write when the VM is resumed?

But having a copy&paste error gives just about the right reason to mention
that, after read and write, this is the third almost unchanged copy of
this code. Eventually we'll want to refactor this.

> +    }
> +
> +    n = s->io_buffer_size >> 9;
> +    sector_num = ide_get_sector(s);
> +    if (n > 0) {
> +        dma_buf_commit(s, 0);
> +        sector_num += n;
> +        ide_set_sector(s, sector_num);
> +        s->nsector -= n;
> +    }
> +
> +    /* end of transfer ? */
> +    if (s->nsector == 0) {
> +        s->status = READY_STAT | SEEK_STAT;
> +        ide_set_irq(s->bus);
> +    eot:
> +        bm->status &= ~BM_STATUS_DMAING;
> +        bm->status |= BM_STATUS_INT;
> +        bm->dma_cb = NULL;
> +        bm->unit = -1;
> +        bm->aiocb = NULL;

You can use ide_dma_set_inactive() here.
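
(For reference, that helper bundles exactly this teardown; approximately,
as it stood in the tree at the time, treat this as a paraphrase rather
than an exact quote:

static void ide_dma_set_inactive(BMDMAState *bm)
{
    bm->status &= ~BM_STATUS_DMAING;
    bm->dma_cb = NULL;
    bm->unit = -1;
    bm->aiocb = NULL;
}

so the eot: block above would shrink to the BM_STATUS_INT update plus
one call.)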

While we're at it, do you know why in the eot: case we set
BM_STATUS_INT, but don't actually call ide_set_irq? From what I
understand, those two should always be coupled, but I might be wrong.

Kevin
Christoph Hellwig - Dec. 10, 2010, 1:39 p.m.
On Thu, Dec 02, 2010 at 03:07:49PM +0100, Kevin Wolf wrote:
> This looks wrong. Wouldn't werror=stop cause the request to be retried
> as a write when the VM is resumed?

Indeed.

> But having a copy&paste error gives just about the right reason to mention
> that, after read and write, this is the third almost unchanged copy of
> this code. Eventually we'll want to refactor this.

I've added a patch that refactors the DMA code to the next iteration
of the patch series.

> While we're at it, do you know why in the eot: case we set
> BM_STATUS_INT, but don't actually call ide_set_irq? From what I
> understand, those two should always be coupled, but I might be wrong.

No idea, sorry.
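
One plausible shape for such a refactor (a hypothetical sketch, not
necessarily what was merged; the dma_cmd field and the
dma_cmd_to_retry_flag() helper are assumptions for illustration): keep a
single DMA completion callback and record the command kind on the state,
branching only where read, write and TRIM actually differ:

/* Hypothetical: tag the in-flight DMA operation on the IDEState. */
enum ide_dma_cmd {
    IDE_DMA_READ,
    IDE_DMA_WRITE,
    IDE_DMA_TRIM,
};

static void ide_dma_cb(void *opaque, int ret)
{
    BMDMAState *bm = opaque;
    IDEState *s = bmdma_active_if(bm);

    if (ret < 0) {
        /* Record which command to retry, so that werror=stop cannot
         * replay a failed TRIM as a plain write on VM resume. */
        if (ide_handle_rw_error(s, -ret,
                                dma_cmd_to_retry_flag(s->dma_cmd))) {
            return;
        }
    }

    /* ... the sector accounting, end-of-transfer handling and
     * next-transfer launch shared by all three commands follow here ... */
}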

Patch

Index: qemu/hw/ide/core.c
===================================================================
--- qemu.orig/hw/ide/core.c	2010-11-30 23:12:59.513132702 +0100
+++ qemu/hw/ide/core.c	2010-12-01 12:02:47.347023889 +0100
@@ -145,6 +145,8 @@  static void ide_identify(IDEState *s)
     put_le16(p + 66, 120);
     put_le16(p + 67, 120);
     put_le16(p + 68, 120);
+    if (dev && dev->conf.discard_granularity)
+        put_le16(p + 69, (1 << 14)); /* determinate TRIM behavior */
     put_le16(p + 80, 0xf0); /* ata3 -> ata6 supported */
     put_le16(p + 81, 0x16); /* conforms to ata5 */
     /* 14=NOP supported, 5=WCACHE supported, 0=SMART supported */
@@ -171,6 +173,8 @@  static void ide_identify(IDEState *s)
     dev = s->unit ? s->bus->slave : s->bus->master;
     if (dev && dev->conf.physical_block_size)
         put_le16(p + 106, 0x6000 | get_physical_block_exp(&dev->conf));
+    if (dev && dev->conf.discard_granularity)
+        put_le16(p + 169, 1); /* TRIM support */
 
     memcpy(s->identify_data, p, sizeof(s->identify_data));
     s->identify_set = 1;
@@ -1788,6 +1792,128 @@  static void ide_clear_hob(IDEBus *bus)
     bus->ifs[1].select &= ~(1 << 7);
 }
 
+typedef struct TrimAIOCB {
+    BlockDriverAIOCB common;
+    QEMUBH *bh;
+    int ret;
+} TrimAIOCB;
+
+static void trim_aio_cancel(BlockDriverAIOCB *acb)
+{
+    TrimAIOCB *iocb = container_of(acb, TrimAIOCB, common);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_release(iocb);
+}
+
+static AIOPool trim_aio_pool = {
+    .aiocb_size         = sizeof(TrimAIOCB),
+    .cancel             = trim_aio_cancel,
+};
+
+static void ide_trim_bh_cb(void *opaque)
+{
+    TrimAIOCB *iocb = opaque;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+
+    qemu_aio_release(iocb);
+}
+
+static BlockDriverAIOCB *ide_issue_trim(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    TrimAIOCB *iocb;
+    int i, j, ret;
+
+    iocb = qemu_aio_get(&trim_aio_pool, bs, cb, opaque);
+    iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb);
+    iocb->ret = 0;
+
+    for (j = 0; j < qiov->niov; j++) {
+        uint64_t *buffer = qiov->iov[j].iov_base;
+
+        for (i = 0; i < qiov->iov[j].iov_len / 8; i++) {
+            /* 6-byte LBA + 2-byte range per entry */
+            uint64_t entry = le64_to_cpu(buffer[i]);
+            uint64_t sector = entry & 0x0000ffffffffffffULL;
+            uint16_t count = entry >> 48;
+
+            if (count == 0)
+                break;
+
+            ret = bdrv_discard(bs, sector * 512, count * 512);
+            if (!iocb->ret)
+                iocb->ret = ret;
+        }
+    }
+
+    qemu_bh_schedule(iocb->bh);
+
+    return &iocb->common;
+}
+
+static void ide_trim_dma_cb(void *opaque, int ret)
+{
+    BMDMAState *bm = opaque;
+    IDEState *s = bmdma_active_if(bm);
+    int n;
+    int64_t sector_num;
+
+    if (ret < 0) {
+        if (ide_handle_rw_error(s, -ret,  BM_STATUS_DMA_RETRY))
+            return;
+    }
+
+    n = s->io_buffer_size >> 9;
+    sector_num = ide_get_sector(s);
+    if (n > 0) {
+        dma_buf_commit(s, 0);
+        sector_num += n;
+        ide_set_sector(s, sector_num);
+        s->nsector -= n;
+    }
+
+    /* end of transfer ? */
+    if (s->nsector == 0) {
+        s->status = READY_STAT | SEEK_STAT;
+        ide_set_irq(s->bus);
+    eot:
+        bm->status &= ~BM_STATUS_DMAING;
+        bm->status |= BM_STATUS_INT;
+        bm->dma_cb = NULL;
+        bm->unit = -1;
+        bm->aiocb = NULL;
+        return;
+    }
+
+    n = s->nsector;
+    s->io_buffer_size = n * 512;
+    /* launch next transfer */
+    if (dma_buf_prepare(bm, 0) == 0)
+        goto eot;
+#ifdef DEBUG_AIO
+    printf("aio_write: sector_num=%" PRId64 " n=%d\n", sector_num, n);
+#endif
+    bm->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num, ide_issue_trim,
+                    ide_trim_dma_cb, bm, 1);
+    ide_dma_submit_check(s, ide_trim_dma_cb, bm);
+}
+
+static void ide_trim(IDEState *s)
+{
+    s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
+    s->io_buffer_index = 0;
+    s->io_buffer_size = 0;
+    s->is_read = 0;
+    ide_dma_start(s, ide_trim_dma_cb);
+}
+
 void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 {
     IDEBus *bus = opaque;
@@ -1867,6 +1993,17 @@  void ide_ioport_write(void *opaque, uint
             break;
 
         switch(val) {
+        case WIN_DSM:
+            switch (s->feature) {
+            case DSM_TRIM:
+                if (!s->bs)
+                   goto abort_cmd;
+                ide_trim(s);
+                break;
+            default:
+                goto abort_cmd;
+            }
+            break;
         case WIN_IDENTIFY:
             if (s->bs && s->drive_kind != IDE_CD) {
                 if (s->drive_kind != IDE_CFATA)
Index: qemu/hw/ide/internal.h
===================================================================
--- qemu.orig/hw/ide/internal.h	2010-11-30 23:12:59.515265122 +0100
+++ qemu/hw/ide/internal.h	2010-12-01 12:02:47.352634169 +0100
@@ -60,7 +60,11 @@  typedef struct BMDMAState BMDMAState;
  */
 #define CFA_REQ_EXT_ERROR_CODE		0x03 /* CFA Request Extended Error Code */
 /*
- *	0x04->0x07 Reserved
+ *      0x04->0x05 Reserved
+ */
+#define WIN_DSM                         0x06
+/*
+ *      0x07 Reserved
  */
 #define WIN_SRST			0x08 /* ATAPI soft reset command */
 #define WIN_DEVICE_RESET		0x08
@@ -188,6 +192,9 @@  typedef struct BMDMAState BMDMAState;
 
 #define IDE_DMA_BUF_SECTORS 256
 
+/* feature values for Data Set Management */
+#define DSM_TRIM                        0x01
+
 #if (IDE_DMA_BUF_SECTORS < MAX_MULT_SECTORS)
 #error "IDE_DMA_BUF_SECTORS must be bigger or equal to MAX_MULT_SECTORS"
 #endif
Index: qemu/hw/ide/qdev.c
===================================================================
--- qemu.orig/hw/ide/qdev.c	2010-11-30 23:12:56.601004402 +0100
+++ qemu/hw/ide/qdev.c	2010-12-01 12:02:47.354276228 +0100
@@ -110,6 +110,11 @@  static int ide_drive_initfn(IDEDevice *d
     const char *serial;
     DriveInfo *dinfo;
 
+    if (dev->conf.discard_granularity && dev->conf.discard_granularity != 512) {
+        error_report("discard_granularity must be 512 for ide");
+        return -1;
+    }
+
     serial = dev->serial;
     if (!serial) {
         /* try to fall back to value set with legacy -drive serial=... */
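
With the discard_granularity qdev property from earlier in this series,
enabling TRIM on an emulated IDE disk would then look something like
this (illustrative invocation; the exact property spelling depends on
the rest of the series):

qemu -drive if=none,id=hd0,file=disk.img \
     -device ide-drive,drive=hd0,discard_granularity=512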