diff mbox

[-qemu] nvme: support Google vendor extension

Message ID 1448007096.3473.10.camel@hasee
State New
Headers show

Commit Message

Ming Lin Nov. 20, 2015, 8:11 a.m. UTC
On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
> 
> On 18/11/2015 06:47, Ming Lin wrote:
> > @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >          }
> >  
> >          start_sqs = nvme_cq_full(cq) ? 1 : 0;
> > -        cq->head = new_head;
> > +        /* When the mapped pointer memory area is setup, we don't rely on
> > +         * the MMIO written values to update the head pointer. */
> > +        if (!cq->db_addr) {
> > +            cq->head = new_head;
> > +        }
> 
> You are still checking
> 
>         if (new_head >= cq->size) {
>             return;
>         }
> 
> above.  I think this is incorrect when the extension is present, and
> furthermore it's the only case where val is being used.
> 
> If you're not using val, you could use ioeventfd for the MMIO.  An
> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
> quick and dirty measurements from kvm-unit-tests's vmexit.flat
> benchmark, on two very different machines:
> 
> 			Haswell-EP		Ivy Bridge i7
>   MMIO memory write	5100 -> 2250 (55%)	7000 -> 3000 (58%)
>   I/O port write	3800 -> 1150 (70%)	4100 -> 1800 (57%)
> 
> You would need to allocate two eventfds for each qid, one for the sq and
> one for the cq.  Also, processing the queues is now bounced to the QEMU
> iothread, so you can probably get rid of sq->timer and cq->timer.

Here is a quick try.
Too late now; I'll test it in the morning.

Do you see obvious problem?


> 
> Paolo

Comments

Paolo Bonzini Nov. 20, 2015, 8:58 a.m. UTC | #1
On 20/11/2015 09:11, Ming Lin wrote:
> On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
>>
>> On 18/11/2015 06:47, Ming Lin wrote:
>>> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>>>          }
>>>  
>>>          start_sqs = nvme_cq_full(cq) ? 1 : 0;
>>> -        cq->head = new_head;
>>> +        /* When the mapped pointer memory area is setup, we don't rely on
>>> +         * the MMIO written values to update the head pointer. */
>>> +        if (!cq->db_addr) {
>>> +            cq->head = new_head;
>>> +        }
>>
>> You are still checking
>>
>>         if (new_head >= cq->size) {
>>             return;
>>         }
>>
>> above.  I think this is incorrect when the extension is present, and
>> furthermore it's the only case where val is being used.
>>
>> If you're not using val, you could use ioeventfd for the MMIO.  An
>> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
>> quick and dirty measurements from kvm-unit-tests's vmexit.flat
>> benchmark, on two very different machines:
>>
>> 			Haswell-EP		Ivy Bridge i7
>>   MMIO memory write	5100 -> 2250 (55%)	7000 -> 3000 (58%)
>>   I/O port write	3800 -> 1150 (70%)	4100 -> 1800 (57%)
>>
>> You would need to allocate two eventfds for each qid, one for the sq and
>> one for the cq.  Also, processing the queues is now bounced to the QEMU
>> iothread, so you can probably get rid of sq->timer and cq->timer.
> 
> Here is a quick try.
> Too late now, I'll test it morning.
> 
> Do you see obvious problem?
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 3e1c38d..d28690d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      return NVME_SUCCESS;
>  }
>  
> +static void nvme_cq_notifier(EventNotifier *e)
> +{
> +    NvmeCQueue *cq =
> +        container_of(e, NvmeCQueue, notifier);
> +
> +    nvme_post_cqes(cq);
> +}
> +
> +static void nvme_init_cq_eventfd(NvmeCQueue *cq)
> +{
> +    NvmeCtrl *n = cq->ctrl;
> +    uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
> +
> +    event_notifier_init(&cq->notifier, 0);
> +    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> +    memory_region_add_eventfd(&n->iomem,
> +        0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);

should be 0x1000 + offset, 4, false, 0, &cq->notifier

> +}
> +
> +static void nvme_sq_notifier(EventNotifier *e)
> +{
> +    NvmeSQueue *sq =
> +        container_of(e, NvmeSQueue, notifier);
> +
> +    nvme_process_sq(sq);
> +}
> +
> +static void nvme_init_sq_eventfd(NvmeSQueue *sq)
> +{
> +    NvmeCtrl *n = sq->ctrl;
> +    uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
> +
> +    event_notifier_init(&sq->notifier, 0);
> +    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> +    memory_region_add_eventfd(&n->iomem,
> +        0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);

likewise should be 0x1000 + offset, 4, false, 0, &sq->notifier

Otherwise looks good!

Paolo

> +}


> +
>  static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
>  {
>      uint64_t db_addr = le64_to_cpu(cmd->prp1);
> @@ -565,6 +603,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
>              /* Submission queue tail pointer location, 2 * QID * stride. */
>              sq->db_addr = db_addr + 2 * i * 4;
>              sq->eventidx_addr = eventidx_addr + 2 * i * 4;
> +            nvme_init_sq_eventfd(sq);
>          }
>  
>          if (cq != NULL) {
> @@ -572,6 +611,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
>               */
>              cq->db_addr = db_addr + (2 * i + 1) * 4;
>              cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
> +            nvme_init_cq_eventfd(cq);
>          }
>      }
>      return NVME_SUCCESS;
> @@ -793,7 +833,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>          }
>  
>          cq = n->cq[qid];
> -        if (new_head >= cq->size) {
> +        if (!cq->db_addr && new_head >= cq->size) {
>              return;
>          }
>  
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 82aeab4..608f202 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -667,6 +667,7 @@ typedef struct NvmeSQueue {
>       * do not go over this value will not result in MMIO writes (but will
>       * still write the tail pointer to the "db_addr" location above). */
>      uint64_t    eventidx_addr;
> +    EventNotifier notifier;
>  } NvmeSQueue;
>  
>  typedef struct NvmeCQueue {
> @@ -689,6 +690,7 @@ typedef struct NvmeCQueue {
>       * do not go over this value will not result in MMIO writes (but will
>       * still write the head pointer to the "db_addr" location above). */
>      uint64_t    eventidx_addr;
> +    EventNotifier notifier;
>  } NvmeCQueue;
>  
>  typedef struct NvmeNamespace {
> 
>>
>> Paolo
Ming Lin Nov. 20, 2015, 11:05 p.m. UTC | #2
On Fri, 2015-11-20 at 09:58 +0100, Paolo Bonzini wrote:
> 
> On 20/11/2015 09:11, Ming Lin wrote:
> > On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
> >>
> >> On 18/11/2015 06:47, Ming Lin wrote:
> >>> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >>>          }
> >>>  
> >>>          start_sqs = nvme_cq_full(cq) ? 1 : 0;
> >>> -        cq->head = new_head;
> >>> +        /* When the mapped pointer memory area is setup, we don't rely on
> >>> +         * the MMIO written values to update the head pointer. */
> >>> +        if (!cq->db_addr) {
> >>> +            cq->head = new_head;
> >>> +        }
> >>
> >> You are still checking
> >>
> >>         if (new_head >= cq->size) {
> >>             return;
> >>         }
> >>
> >> above.  I think this is incorrect when the extension is present, and
> >> furthermore it's the only case where val is being used.
> >>
> >> If you're not using val, you could use ioeventfd for the MMIO.  An
> >> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
> >> quick and dirty measurements from kvm-unit-tests's vmexit.flat
> >> benchmark, on two very different machines:
> >>
> >> 			Haswell-EP		Ivy Bridge i7
> >>   MMIO memory write	5100 -> 2250 (55%)	7000 -> 3000 (58%)
> >>   I/O port write	3800 -> 1150 (70%)	4100 -> 1800 (57%)
> >>
> >> You would need to allocate two eventfds for each qid, one for the sq and
> >> one for the cq.  Also, processing the queues is now bounced to the QEMU
> >> iothread, so you can probably get rid of sq->timer and cq->timer.
> > 
> > Here is a quick try.
> > Too late now, I'll test it morning.
> > 
> > Do you see obvious problem?
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 3e1c38d..d28690d 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >      return NVME_SUCCESS;
> >  }
> >  
> > +static void nvme_cq_notifier(EventNotifier *e)
> > +{
> > +    NvmeCQueue *cq =
> > +        container_of(e, NvmeCQueue, notifier);
> > +
> > +    nvme_post_cqes(cq);
> > +}
> > +
> > +static void nvme_init_cq_eventfd(NvmeCQueue *cq)
> > +{
> > +    NvmeCtrl *n = cq->ctrl;
> > +    uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
> > +
> > +    event_notifier_init(&cq->notifier, 0);
> > +    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> > +    memory_region_add_eventfd(&n->iomem,
> > +        0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
> 
> should be 0x1000 + offset, 4, false, 0, &cq->notifier
> 
> > +}
> > +
> > +static void nvme_sq_notifier(EventNotifier *e)
> > +{
> > +    NvmeSQueue *sq =
> > +        container_of(e, NvmeSQueue, notifier);
> > +
> > +    nvme_process_sq(sq);
> > +}
> > +
> > +static void nvme_init_sq_eventfd(NvmeSQueue *sq)
> > +{
> > +    NvmeCtrl *n = sq->ctrl;
> > +    uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
> > +
> > +    event_notifier_init(&sq->notifier, 0);
> > +    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> > +    memory_region_add_eventfd(&n->iomem,
> > +        0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
> 
> likewise should be 0x1000 + offset, 4, false, 0, &sq->notifier

It works. But for some unknown reason, when the guest kernel boots, it gets stuck
for almost 1 minute.

[    1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
[    1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
[    1.988187] clocksource: Switched to clocksource tsc
[    3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
[    3.358713] clocksource:                       'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
[    3.410013] clocksource:                       'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
[    3.450026] clocksource: Switched to clocksource refined-jiffies
[    7.696769] Adding 392188k swap on /dev/vda5.  Priority:-1 extents:1 across:392188k 
[    7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
[    8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro

Then it doesn't respond to input for almost 1 minute.
Without this patch, kernel loads quickly.

[    1.351095] Freeing unused kernel memory: 2968K (ffffffff81d49000 - ffffffff8202f000)
[    1.352039] Write protecting the kernel read-only data: 12288k
[    1.353340] Freeing unused kernel memory: 216K (ffff8800017ca000 - ffff880001800000)
[    1.354670] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
[    1.796272] Adding 392188k swap on /dev/vda5.  Priority:-1 extents:1 across:392188k 
[    1.809579] EXT4-fs (vda1): re-mounted. Opts: (null)
[    1.864834] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
[    1.964181] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e58d9c595, max_idle_ns: 440795220169 ns
[    1.965610] clocksource: Switched to clocksource tsc
[    2.377965] random: dd urandom read with 19 bits of entropy available



void memory_region_add_eventfd(MemoryRegion *mr,
                               hwaddr addr,
                               unsigned size,
                               bool match_data,
                               uint64_t data,
                               EventNotifier *e)

Could you help to explain what "match_data" and "data" mean?

I use a real NVMe device as backend.

-drive file=/dev/nvme0n1,format=raw,if=none,id=D22 \
-device nvme,drive=D22,serial=1234

Here is the test results:

local NVMe: 860MB/s
qemu-nvme: 108MB/s
qemu-nvme+google-ext: 140MB/s
qemu-nvme-google-ext+eventfd: 190MB/s

root@wheezy:~# cat test.job 
[global]
bs=4k
ioengine=libaio
iodepth=64
direct=1
runtime=60
time_based
norandommap
group_reporting
gtod_reduce=1
numjobs=8

[job1]
filename=/dev/nvme0n1
rw=read
Paolo Bonzini Nov. 21, 2015, 12:56 p.m. UTC | #3
On 21/11/2015 00:05, Ming Lin wrote:
> [    1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> [    1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> [    1.988187] clocksource: Switched to clocksource tsc
> [    3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> [    3.358713] clocksource:                       'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> [    3.410013] clocksource:                       'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> [    3.450026] clocksource: Switched to clocksource refined-jiffies
> [    7.696769] Adding 392188k swap on /dev/vda5.  Priority:-1 extents:1 across:392188k 
> [    7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> [    8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
> 
> Then it doesn't response input for almost 1 minute.
> Without this patch, kernel loads quickly.

Interesting.  I guess there's time to debug it, since QEMU 2.6 is still 
a few months away.  In the meanwhile we can apply your patch as is, 
apart from disabling the "if (new_head >= cq->size)" and the similar 
one for "if (new_tail >= sq->size)".

But, I have a possible culprit.  In your nvme_cq_notifier you are not doing the 
equivalent of:

	start_sqs = nvme_cq_full(cq) ? 1 : 0;
        cq->head = new_head;
        if (start_sqs) {
            NvmeSQueue *sq;
            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
                timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
            }
            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
        }

Instead, you are just calling nvme_post_cqes, which is the equivalent of

	timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);

Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
fix the weird 1-minute delay.

Paolo

> void memory_region_add_eventfd(MemoryRegion *mr,
>                                hwaddr addr,
>                                unsigned size,
>                                bool match_data,
>                                uint64_t data,
>                                EventNotifier *e)
> 
> Could you help to explain what "match_data" and "data" mean?

If match_data is true, the eventfd is only signalled if "data" is being written to memory.

Paolo
diff mbox

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3e1c38d..d28690d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -543,6 +543,44 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static void nvme_cq_notifier(EventNotifier *e)
+{
+    NvmeCQueue *cq =
+        container_of(e, NvmeCQueue, notifier);
+
+    nvme_post_cqes(cq);
+}
+
+static void nvme_init_cq_eventfd(NvmeCQueue *cq)
+{
+    NvmeCtrl *n = cq->ctrl;
+    uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
+
+    event_notifier_init(&cq->notifier, 0);
+    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+    memory_region_add_eventfd(&n->iomem,
+        0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+    NvmeSQueue *sq =
+        container_of(e, NvmeSQueue, notifier);
+
+    nvme_process_sq(sq);
+}
+
+static void nvme_init_sq_eventfd(NvmeSQueue *sq)
+{
+    NvmeCtrl *n = sq->ctrl;
+    uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
+
+    event_notifier_init(&sq->notifier, 0);
+    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+    memory_region_add_eventfd(&n->iomem,
+        0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
+}
+
 static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
 {
     uint64_t db_addr = le64_to_cpu(cmd->prp1);
@@ -565,6 +603,7 @@  static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
             /* Submission queue tail pointer location, 2 * QID * stride. */
             sq->db_addr = db_addr + 2 * i * 4;
             sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+            nvme_init_sq_eventfd(sq);
         }
 
         if (cq != NULL) {
@@ -572,6 +611,7 @@  static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
              */
             cq->db_addr = db_addr + (2 * i + 1) * 4;
             cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+            nvme_init_cq_eventfd(cq);
         }
     }
     return NVME_SUCCESS;
@@ -793,7 +833,7 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         }
 
         cq = n->cq[qid];
-        if (new_head >= cq->size) {
+        if (!cq->db_addr && new_head >= cq->size) {
             return;
         }
 
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 82aeab4..608f202 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -667,6 +667,7 @@  typedef struct NvmeSQueue {
      * do not go over this value will not result in MMIO writes (but will
      * still write the tail pointer to the "db_addr" location above). */
     uint64_t    eventidx_addr;
+    EventNotifier notifier;
 } NvmeSQueue;
 
 typedef struct NvmeCQueue {
@@ -689,6 +690,7 @@  typedef struct NvmeCQueue {
      * do not go over this value will not result in MMIO writes (but will
      * still write the head pointer to the "db_addr" location above). */
     uint64_t    eventidx_addr;
+    EventNotifier notifier;
 } NvmeCQueue;
 
 typedef struct NvmeNamespace {