diff mbox

[06/11] scsi-disk: correctly implement WRITE SAME

Message ID 1384271389-20716-7-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini Nov. 12, 2013, 3:49 p.m. UTC
The WRITE SAME command is implemented incorrectly.  WRITE SAME with the
UNMAP bit set should _not_ unmap the sectors unless the written data
matches the payload of the WRITE SAME command; currently, QEMU is not
looking at the payload at all.

Thus, fetch the data to be written from the input buffer.  If it is
all zeroes, we can use the write_zeroes call (possibly with the new
MAY_UNMAP flag).  Otherwise, do as many write cycles as needed, covering
512k at a time to avoid allocating lots of memory for the bounce
buffer.

Strictly speaking, this is still incorrect because a zero cluster should
only be written if the MAY_UNMAP flag is set.  But this is a bug in the
block layer, not here.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/scsi-disk.c | 140 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 116 insertions(+), 24 deletions(-)

Comments

Peter Lieven Nov. 13, 2013, 6:18 a.m. UTC | #1
Am 12.11.2013 um 16:49 schrieb Paolo Bonzini <pbonzini@redhat.com>:

> The WRITE SAME command is implemented incorrectly.  WRITE SAME with the
> UNMAP bit set should _not_ unmap the sectors unless the written data
> matches the payload of the WRITE SAME command; currently, QEMU is not
> looking at the payload at all.
> 
> Thus, fetch the data to be written from the input buffer.  If it is
> all zeroes, we can use the write_zeroes call (possibly with the new
> MAY_UNMAP flag).  Otherwise, do as many write cycles as needed, covering
> 512k at a time to avoid allocating lots of memory for the bounce
> buffer.

Would it make sense to add a bdrv_write_same or is the use case for
WRITE SAME with non-zero payload too rare?

And secondly would it make sense to add an optimal request size field
to the BlockLimits?

> 
> Strictly speaking, this is still incorrect because a zero cluster should
> only be written if the MAY_UNMAP flag is set.  But this is a bug in the
> block layer, not here.

Can you explain what exactly you mean?

> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> hw/scsi/scsi-disk.c | 140 +++++++++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 116 insertions(+), 24 deletions(-)
> 
> diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
> index 7e29760..cd5116c 100644
> --- a/hw/scsi/scsi-disk.c
> +++ b/hw/scsi/scsi-disk.c
> @@ -41,6 +41,7 @@ do { printf("scsi-disk: " fmt , ## __VA_ARGS__); } while (0)
> #include <scsi/sg.h>
> #endif
> 
> +#define SCSI_WRITE_SAME_MAX         524288

I would call this SCSI_MAX_WRITE_SAME_LEN (like max_ws_len)

> #define SCSI_DMA_BUF_SIZE           131072
> #define SCSI_MAX_INQUIRY_LEN        256
> #define SCSI_MAX_MODE_LEN           256
> @@ -634,6 +635,8 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
>             buflen = 0x40;
>             memset(outbuf + 4, 0, buflen - 4);
> 
> +            outbuf[4] = 0x1; /* wsnz */
> +
>             /* optimal transfer length granularity */
>             outbuf[6] = (min_io_size >> 8) & 0xff;
>             outbuf[7] = min_io_size & 0xff;
> @@ -1589,6 +1592,111 @@ invalid_field:
>     scsi_check_condition(r, SENSE_CODE(INVALID_FIELD));
> }
> 
> +typedef struct WriteSameCBData {
> +    SCSIDiskReq *r;
> +    int64_t sector;
> +    int nb_sectors;
> +    QEMUIOVector qiov;
> +    struct iovec iov;
> +} WriteSameCBData;
> +
> +static void scsi_write_same_complete(void *opaque, int ret)
> +{
> +    WriteSameCBData *data = opaque;
> +    SCSIDiskReq *r = data->r;
> +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
> +
> +    assert(r->req.aiocb != NULL);
> +    r->req.aiocb = NULL;
> +    bdrv_acct_done(s->qdev.conf.bs, &r->acct);
> +    if (r->req.io_canceled) {
> +        goto done;
> +    }
> +
> +    if (ret < 0) {
> +        if (scsi_handle_rw_error(r, -ret)) {
> +            goto done;
> +        }
> +    }
> +
> +    data->nb_sectors -= data->iov.iov_len / 512;
> +    data->sector += data->iov.iov_len / 512;
> +    data->iov.iov_len = MIN(data->nb_sectors * 512, data->iov.iov_len);
> +    if (data->iov.iov_len) {
> +        bdrv_acct_start(s->qdev.conf.bs, &r->acct, data->iov.iov_len, BDRV_ACCT_WRITE);
> +        r->req.aiocb = bdrv_aio_writev(s->qdev.conf.bs, data->sector,
> +                                       &data->qiov, data->iov.iov_len / 512,
> +                                       scsi_write_same_complete, r);
> +        return;
> +    }
> +
> +    scsi_req_complete(&r->req, GOOD);
> +
> +done:
> +    if (!r->req.io_canceled) {
> +        scsi_req_unref(&r->req);
> +    }
> +    g_free (data->iov.iov_base);
> +    g_free (data);
> +}
> +
> +static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
> +{
> +    SCSIRequest *req = &r->req;
> +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
> +    uint32_t nb_sectors = scsi_data_cdb_length(r->req.cmd.buf);
> +    WriteSameCBData *data;
> +    uint8_t *buf;
> +    int i;
> +
> +    /* Fail if PBDATA=1 or LBDATA=1 or ANCHOR=1.  */
> +    if (nb_sectors == 0 || (req->cmd.buf[1] & 0x16)) {
> +        scsi_check_condition(r, SENSE_CODE(INVALID_FIELD));
> +        return;
> +    }
> +
> +    if (bdrv_is_read_only(s->qdev.conf.bs)) {
> +        scsi_check_condition(r, SENSE_CODE(WRITE_PROTECTED));
> +        return;
> +    }
> +    if (!check_lba_range(s, r->req.cmd.lba, nb_sectors)) {
> +        scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
> +        return;
> +    }
> +
> +    if (buffer_is_zero(inbuf, s->qdev.blocksize)) {
> +        int flags = (req->cmd.buf[1] & 0x8) ? BDRV_REQ_MAY_UNMAP : 0;
> +
> +        /* The request is used as the AIO opaque value, so add a ref.  */
> +        scsi_req_ref(&r->req);
> +        bdrv_acct_start(s->qdev.conf.bs, &r->acct, nb_sectors * s->qdev.blocksize,
> +                        BDRV_ACCT_WRITE);
> +        r->req.aiocb = bdrv_aio_write_zeroes(s->qdev.conf.bs,
> +                                             r->req.cmd.lba * (s->qdev.blocksize / 512),
> +                                             nb_sectors * (s->qdev.blocksize / 512),
> +                                             flags, scsi_aio_complete, r);
> +        return;
> +    }
> +
> +    data = g_new0(WriteSameCBData, 1);
> +    data->r = r;
> +    data->sector = r->req.cmd.lba * (s->qdev.blocksize / 512);
> +    data->nb_sectors = nb_sectors * (s->qdev.blocksize / 512);
> +    data->iov.iov_len = MIN(data->nb_sectors * 512, SCSI_WRITE_SAME_MAX);
> +    data->iov.iov_base = buf = g_malloc(data->iov.iov_len);
> +    qemu_iovec_init_external(&data->qiov, &data->iov, 1);
> +
> +    for (i = 0; i < data->iov.iov_len; i += s->qdev.blocksize) {
> +         memcpy(&buf[i], inbuf, s->qdev.blocksize);
> +    }
> +
> +    scsi_req_ref(&r->req);
> +    bdrv_acct_start(s->qdev.conf.bs, &r->acct, data->iov.iov_len, BDRV_ACCT_WRITE);
> +    r->req.aiocb = bdrv_aio_writev(s->qdev.conf.bs, data->sector,
> +                                   &data->qiov, data->iov.iov_len / 512,
> +                                   scsi_write_same_complete, data);
> +}

I have often asked myself whether to use 512 or BDRV_SECTOR_SIZE. Both are
used heavily throughout the block code.

> +
> static void scsi_disk_emulate_write_data(SCSIRequest *req)
> {
>     SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
> @@ -1612,6 +1720,10 @@ static void scsi_disk_emulate_write_data(SCSIRequest *req)
>         scsi_disk_emulate_unmap(r, r->iov.iov_base);
>         break;
> 
> +    case WRITE_SAME_10:
> +    case WRITE_SAME_16:
> +        scsi_disk_emulate_write_same(r, r->iov.iov_base);
> +        break;
>     default:
>         abort();
>     }
> @@ -1854,30 +1966,10 @@ static int32_t scsi_disk_emulate_command(SCSIRequest *req, uint8_t *buf)
>         break;
>     case WRITE_SAME_10:
>     case WRITE_SAME_16:
> -        nb_sectors = scsi_data_cdb_length(r->req.cmd.buf);
> -        if (bdrv_is_read_only(s->qdev.conf.bs)) {
> -            scsi_check_condition(r, SENSE_CODE(WRITE_PROTECTED));
> -            return 0;
> -        }
> -        if (!check_lba_range(s, r->req.cmd.lba, nb_sectors)) {
> -            goto illegal_lba;
> -        }
> -
> -        /*
> -         * We only support WRITE SAME with the unmap bit set for now.
> -	 * Reject UNMAP=0 or ANCHOR=1.
> -         */
> -        if (!(req->cmd.buf[1] & 0x8) || (req->cmd.buf[1] & 0x10)) {
> -            goto illegal_request;
> -        }
> -
> -        /* The request is used as the AIO opaque value, so add a ref.  */
> -        scsi_req_ref(&r->req);
> -        r->req.aiocb = bdrv_aio_discard(s->qdev.conf.bs,
> -                                        r->req.cmd.lba * (s->qdev.blocksize / 512),
> -                                        nb_sectors * (s->qdev.blocksize / 512),
> -                                        scsi_aio_complete, r);
> -        return 0;
> +        DPRINTF("WRITE SAME %d (len %lu)\n",
> +                req->cmd.buf[0] == WRITE_SAME_10 ? 10 : 16,
> +                (long)r->req.cmd.xfer);
> +        break;
>     default:
>         DPRINTF("Unknown SCSI command (%2.2x)\n", buf[0]);
>         scsi_check_condition(r, SENSE_CODE(INVALID_OPCODE));
> -- 
> 1.8.4.2
> 
> 

Peter
Paolo Bonzini Nov. 13, 2013, 9:38 a.m. UTC | #2
Il 13/11/2013 07:18, Peter Lieven ha scritto:
>> > The WRITE SAME command is implemented incorrectly.  WRITE SAME with the
>> > UNMAP bit set should _not_ unmap the sectors unless the written data
>> > matches the payload of the WRITE SAME command; currently, QEMU is not
>> > looking at the payload at all.
>> > 
>> > Thus, fetch the data to be written from the input buffer.  If it is
>> > all zeroes, we can use the write_zeroes call (possibly with the new
>> > MAY_UNMAP flag).  Otherwise, do as many write cycles as needed, covering
>> > 512k at a time to avoid allocating lots of memory for the bounce
>> > buffer.
> 
> Would it make sense to add a bdrv_write_same or is the use case for
> WRITE SAME with non-zero payload too rare?

It would, but it is definitely very rare, probably so much that we need
not care.  Linux only invokes it for zero payloads.

Also, for zero payload there are additional benefits.  First, supporting
WRITE SAME with UNMAP if the host has LBPRZ=1 or analogous.  Second,
using zero clusters in qcow2/qed/vmdk.

> And secondly would it make sense to add an optimal request size field
> to the BlockLimits?

The optimal request size is not particularly useful if it is not visible
to the guest, unfortunately.  But we cannot pass values arbitrarily to
the guest because they would change if the backing storage changed (e.g.
from NFS to local, or from raw to qcow2).

So I'm not sure who would actually use the optimal request size.

Paolo
diff mbox

Patch

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 7e29760..cd5116c 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -41,6 +41,7 @@  do { printf("scsi-disk: " fmt , ## __VA_ARGS__); } while (0)
 #include <scsi/sg.h>
 #endif
 
+#define SCSI_WRITE_SAME_MAX         524288 /* cap, in bytes, on the WRITE SAME bounce buffer (512 KiB) */
 #define SCSI_DMA_BUF_SIZE           131072
 #define SCSI_MAX_INQUIRY_LEN        256
 #define SCSI_MAX_MODE_LEN           256
@@ -634,6 +635,8 @@  static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
             buflen = 0x40;
             memset(outbuf + 4, 0, buflen - 4);
 
+            outbuf[4] = 0x1; /* wsnz */
+
             /* optimal transfer length granularity */
             outbuf[6] = (min_io_size >> 8) & 0xff;
             outbuf[7] = min_io_size & 0xff;
@@ -1589,6 +1592,111 @@  invalid_field:
     scsi_check_condition(r, SENSE_CODE(INVALID_FIELD));
 }
 
+typedef struct WriteSameCBData {   /* state for a WRITE SAME split into chained writes */
+    SCSIDiskReq *r;                /* request being served */
+    int64_t sector;                /* start sector of the current chunk, advanced on completion */
+    int nb_sectors;                /* sectors not yet completed, in 512-byte units */
+    QEMUIOVector qiov;             /* wraps iov; passed to bdrv_aio_writev */
+    struct iovec iov;              /* bounce buffer; iov_len is the current chunk size in bytes */
+} WriteSameCBData;
+
+/*
+ * AIO completion callback for one chunk of a non-zero WRITE SAME.
+ *
+ * @opaque: the WriteSameCBData describing the transfer in progress.
+ * @ret:    result of the chunk that just finished; negative means error.
+ *
+ * While sectors remain, the next chunk is submitted with this function as
+ * its callback.  Otherwise the request is completed with GOOD status.  On
+ * every terminal path the bounce buffer and @opaque are freed.
+ */
+static void scsi_write_same_complete(void *opaque, int ret)
+{
+    WriteSameCBData *data = opaque;
+    SCSIDiskReq *r = data->r;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+
+    assert(r->req.aiocb != NULL);
+    r->req.aiocb = NULL;
+    bdrv_acct_done(s->qdev.conf.bs, &r->acct);
+    if (r->req.io_canceled) {
+        goto done;
+    }
+
+    if (ret < 0) {
+        if (scsi_handle_rw_error(r, -ret)) {
+            goto done;
+        }
+    }
+
+    /* Step past the chunk just written; the final chunk may be shorter. */
+    data->nb_sectors -= data->iov.iov_len / 512;
+    data->sector += data->iov.iov_len / 512;
+    /* Widen before multiplying: nb_sectors * 512 can overflow an int. */
+    data->iov.iov_len = MIN((uint64_t)data->nb_sectors * 512,
+                            data->iov.iov_len);
+    if (data->iov.iov_len) {
+        bdrv_acct_start(s->qdev.conf.bs, &r->acct, data->iov.iov_len,
+                        BDRV_ACCT_WRITE);
+        /*
+         * The opaque value must be data, not r: this callback casts its
+         * opaque argument to WriteSameCBData.
+         */
+        r->req.aiocb = bdrv_aio_writev(s->qdev.conf.bs, data->sector,
+                                       &data->qiov, data->iov.iov_len / 512,
+                                       scsi_write_same_complete, data);
+        return;
+    }
+
+    scsi_req_complete(&r->req, GOOD);
+
+done:
+    /* Drop the reference taken at submission, unless the I/O was canceled. */
+    if (!r->req.io_canceled) {
+        scsi_req_unref(&r->req);
+    }
+    g_free(data->iov.iov_base);
+    g_free(data);
+}
+
+/*
+ * Emulate WRITE SAME(10)/WRITE SAME(16) once the data-out payload is in.
+ *
+ * @r:     the request; the LBA and block count come from its CDB.
+ * @inbuf: the payload; only its first s->qdev.blocksize bytes are read.
+ *
+ * An all-zero payload is handed to bdrv_aio_write_zeroes, with
+ * BDRV_REQ_MAY_UNMAP when the UNMAP bit (0x8) is set in CDB byte 1.  Any
+ * other payload is replicated into a bounce buffer of at most
+ * SCSI_WRITE_SAME_MAX bytes and written chunk by chunk; later chunks are
+ * issued from scsi_write_same_complete.
+ */
+static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
+{
+    SCSIRequest *req = &r->req;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
+    uint32_t nb_sectors = scsi_data_cdb_length(r->req.cmd.buf);
+    WriteSameCBData *data;
+    uint8_t *buf;
+    size_t i;
+
+    /* Fail if the block count is zero, or PBDATA=1, LBDATA=1 or ANCHOR=1.  */
+    if (nb_sectors == 0 || (req->cmd.buf[1] & 0x16)) {
+        scsi_check_condition(r, SENSE_CODE(INVALID_FIELD));
+        return;
+    }
+
+    if (bdrv_is_read_only(s->qdev.conf.bs)) {
+        scsi_check_condition(r, SENSE_CODE(WRITE_PROTECTED));
+        return;
+    }
+    if (!check_lba_range(s, r->req.cmd.lba, nb_sectors)) {
+        scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
+        return;
+    }
+
+    if (buffer_is_zero(inbuf, s->qdev.blocksize)) {
+        int flags = (req->cmd.buf[1] & 0x8) ? BDRV_REQ_MAY_UNMAP : 0;
+
+        /* The request is used as the AIO opaque value, so add a ref.  */
+        scsi_req_ref(&r->req);
+        bdrv_acct_start(s->qdev.conf.bs, &r->acct,
+                        nb_sectors * s->qdev.blocksize, BDRV_ACCT_WRITE);
+        r->req.aiocb = bdrv_aio_write_zeroes(s->qdev.conf.bs,
+                                             r->req.cmd.lba * (s->qdev.blocksize / 512),
+                                             nb_sectors * (s->qdev.blocksize / 512),
+                                             flags, scsi_aio_complete, r);
+        return;
+    }
+
+    data = g_new0(WriteSameCBData, 1);
+    data->r = r;
+    data->sector = r->req.cmd.lba * (s->qdev.blocksize / 512);
+    data->nb_sectors = nb_sectors * (s->qdev.blocksize / 512);
+    /*
+     * Widen before multiplying: the count is chosen by the guest, and
+     * nb_sectors * 512 overflows an int for transfers larger than 2 GiB,
+     * which would defeat the SCSI_WRITE_SAME_MAX clamp below.
+     */
+    data->iov.iov_len = MIN((uint64_t)data->nb_sectors * 512,
+                            SCSI_WRITE_SAME_MAX);
+    data->iov.iov_base = buf = g_malloc(data->iov.iov_len);
+    qemu_iovec_init_external(&data->qiov, &data->iov, 1);
+
+    /* Fill the bounce buffer with back-to-back copies of the payload. */
+    for (i = 0; i < data->iov.iov_len; i += s->qdev.blocksize) {
+        memcpy(&buf[i], inbuf, s->qdev.blocksize);
+    }
+
+    /* data is the AIO opaque value; it holds r, so add a ref.  */
+    scsi_req_ref(&r->req);
+    bdrv_acct_start(s->qdev.conf.bs, &r->acct, data->iov.iov_len,
+                    BDRV_ACCT_WRITE);
+    r->req.aiocb = bdrv_aio_writev(s->qdev.conf.bs, data->sector,
+                                   &data->qiov, data->iov.iov_len / 512,
+                                   scsi_write_same_complete, data);
+}
+
 static void scsi_disk_emulate_write_data(SCSIRequest *req)
 {
     SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
@@ -1612,6 +1720,10 @@  static void scsi_disk_emulate_write_data(SCSIRequest *req)
         scsi_disk_emulate_unmap(r, r->iov.iov_base);
         break;
 
+    case WRITE_SAME_10:
+    case WRITE_SAME_16:
+        scsi_disk_emulate_write_same(r, r->iov.iov_base);
+        break;
     default:
         abort();
     }
@@ -1854,30 +1966,10 @@  static int32_t scsi_disk_emulate_command(SCSIRequest *req, uint8_t *buf)
         break;
     case WRITE_SAME_10:
     case WRITE_SAME_16:
-        nb_sectors = scsi_data_cdb_length(r->req.cmd.buf);
-        if (bdrv_is_read_only(s->qdev.conf.bs)) {
-            scsi_check_condition(r, SENSE_CODE(WRITE_PROTECTED));
-            return 0;
-        }
-        if (!check_lba_range(s, r->req.cmd.lba, nb_sectors)) {
-            goto illegal_lba;
-        }
-
-        /*
-         * We only support WRITE SAME with the unmap bit set for now.
-	 * Reject UNMAP=0 or ANCHOR=1.
-         */
-        if (!(req->cmd.buf[1] & 0x8) || (req->cmd.buf[1] & 0x10)) {
-            goto illegal_request;
-        }
-
-        /* The request is used as the AIO opaque value, so add a ref.  */
-        scsi_req_ref(&r->req);
-        r->req.aiocb = bdrv_aio_discard(s->qdev.conf.bs,
-                                        r->req.cmd.lba * (s->qdev.blocksize / 512),
-                                        nb_sectors * (s->qdev.blocksize / 512),
-                                        scsi_aio_complete, r);
-        return 0;
+        DPRINTF("WRITE SAME %d (len %lu)\n",
+                req->cmd.buf[0] == WRITE_SAME_10 ? 10 : 16,
+                (long)r->req.cmd.xfer);
+        break;
     default:
         DPRINTF("Unknown SCSI command (%2.2x)\n", buf[0]);
         scsi_check_condition(r, SENSE_CODE(INVALID_OPCODE));