Patchwork [-V3,09/32] virtio-9p: Implement P9_TWRITE

login
register
mail settings
Submitter Aneesh Kumar K.V
Date March 25, 2010, 4:43 p.m.
Message ID <1269535420-31206-10-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/48555/
State New
Headers show

Comments

Aneesh Kumar K.V - March 25, 2010, 4:43 p.m.
From: Anthony Liguori <aliguori@us.ibm.com>

This gets write to file to work

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 hw/virtio-9p-local.c |    7 ++++
 hw/virtio-9p.c       |   97 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 102 insertions(+), 2 deletions(-)
jvrao - March 29, 2010, 6:36 a.m.
Aneesh Kumar K.V wrote:
> From: Anthony Liguori <aliguori@us.ibm.com>

We have implemented all the vfs calls in state machine model so that we are prepared
for the model where the VCPU thread(s) does the initial work until it needs to block then it
submits that work (via a function pointer) to a thread pool.  A thread in that thread pool
picks up the work, and completes the blocking call, when blocking call returns a callback is
invoked in the IO thread.  Then the IO thread runs until the next blocking function, and goto start.

Basically the VCPU/IO threads does all the non-blocking work, and let the threads in the
thread pool work on the blocking calls like mkdir() stat() etc.

My question is, why not let the whole work done by the thread in the thread pool?
VCPU thread receives the PDU and hands over the entire job to worker thread.
When all work is completed, either the worker thread or the IO thread(we can switch back at this point if needed) marks the request as completed in the virtqueue and injects an
interrupt to notify the guest.

We can still keep the same number of threads in the thread pool. 
This way, we are not increasing #of threads employed by QEMU...also it makes code lot 
more easy to read/maintain.

I may be missing something..but would like to know more on the advantages of this model.

Thanks,
JV

> 
> This gets write to file to work
> 
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
> Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  hw/virtio-9p-local.c |    7 ++++
>  hw/virtio-9p.c       |   97 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 102 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/virtio-9p-local.c b/hw/virtio-9p-local.c
> index d77ecc2..c5d1db3 100644
> --- a/hw/virtio-9p-local.c
> +++ b/hw/virtio-9p-local.c
> @@ -129,6 +129,12 @@ static off_t local_lseek(void *opaque, int fd, off_t offset, int whence)
>      return lseek(fd, offset, whence);
>  }
> 
> +static ssize_t local_writev(void *opaque, int fd, const struct iovec *iov,
> +			    int iovcnt)
> +{
> +    return writev(fd, iov, iovcnt);
> +}
> +
>  static V9fsPosixFileOperations ops = {
>      .lstat = local_lstat,
>      .setuid = local_setuid,
> @@ -143,6 +149,7 @@ static V9fsPosixFileOperations ops = {
>      .seekdir = local_seekdir,
>      .readv = local_readv,
>      .lseek = local_lseek,
> +    .writev = local_writev,
>  };
> 
>  V9fsPosixFileOperations *virtio_9p_init_local(const char *path)
> diff --git a/hw/virtio-9p.c b/hw/virtio-9p.c
> index 3ac6255..bc26d66 100644
> --- a/hw/virtio-9p.c
> +++ b/hw/virtio-9p.c
> @@ -168,6 +168,12 @@ static off_t posix_lseek(V9fsState *s, int fd, off_t offset, int whence)
>      return s->ops->lseek(s->ops->opaque, fd, offset, whence);
>  }
> 
> +static int posix_writev(V9fsState *s, int fd, const struct iovec *iov,
> +                       int iovcnt)
> +{
> +    return s->ops->writev(s->ops->opaque, fd, iov, iovcnt);
> +}
> +
>  static void v9fs_string_init(V9fsString *str)
>  {
>      str->data = NULL;
> @@ -1319,10 +1325,97 @@ out:
>      complete_pdu(s, pdu, err);
>  }
> 
> +typedef struct V9fsWriteState {
> +    V9fsPDU *pdu;
> +    size_t offset;
> +    int32_t fid;
> +    int32_t len;
> +    int32_t count;
> +    int32_t total;
> +    int64_t off;
> +    V9fsFidState *fidp;
> +    struct iovec iov[128]; /* FIXME: bad, bad, bad */
> +    struct iovec *sg;
> +    int cnt;
> +} V9fsWriteState;
> +
> +static void v9fs_write_post_writev(V9fsState *s, V9fsWriteState *vs,
> +                                   ssize_t err)
> +{
> +    BUG_ON(vs->len < 0);
> +    vs->total += vs->len;
> +    vs->sg = adjust_sg(vs->sg, vs->len, &vs->cnt);
> +    if (vs->total < vs->count && vs->len > 0) {
> +        do {
> +            if (0)
> +                print_sg(vs->sg, vs->cnt);
> +            vs->len =  posix_writev(s, vs->fidp->fd, vs->sg, vs->cnt);
> +        } while (vs->len == -1 && errno == EINTR);
> +        v9fs_write_post_writev(s, vs, err);
> +    }
> +    vs->offset += pdu_marshal(vs->pdu, vs->offset, "d", vs->total);
> +
> +    err = vs->offset;
> +    complete_pdu(s, vs->pdu, err);
> +    qemu_free(vs);
> +}
> +
> +static void v9fs_write_post_lseek(V9fsState *s, V9fsWriteState *vs, ssize_t err)
> +{
> +    BUG_ON(err == -1);
> +
> +    vs->sg = cap_sg(vs->sg, vs->count, &vs->cnt);
> +
> +    if (vs->total < vs->count) {
> +        do {
> +            if (0)
> +                print_sg(vs->sg, vs->cnt);
> +            vs->len = posix_writev(s, vs->fidp->fd, vs->sg, vs->cnt);
> +        } while (vs->len == -1 && errno == EINTR);
> +
> +        v9fs_write_post_writev(s, vs, err);
> +        return;
> +    }
> +
> +    complete_pdu(s, vs->pdu, err);
> +    qemu_free(vs);
> +}
> +
>  static void v9fs_write(V9fsState *s, V9fsPDU *pdu)
>  {
> -    if (debug_9p_pdu)
> -        pprint_pdu(pdu);
> +    V9fsWriteState *vs;
> +    ssize_t err;
> +
> +    vs = qemu_malloc(sizeof(*vs));
> +
> +    vs->pdu = pdu;
> +    vs->offset = 7;
> +    vs->sg = vs->iov;
> +    vs->total = 0;
> +    vs->len = 0;
> +
> +    pdu_unmarshal(vs->pdu, vs->offset, "dqdv", &vs->fid, &vs->off, &vs->count,
> +                    vs->sg, &vs->cnt);
> +
> +    vs->fidp = lookup_fid(s, vs->fid);
> +    if (vs->fidp == NULL) {
> +        err = -EINVAL;
> +        goto out;
> +    }
> +
> +    if (vs->fidp->fd == -1) {
> +        err = -EINVAL;
> +        goto out;
> +    }
> +
> +    err = posix_lseek(s, vs->fidp->fd, vs->off, SEEK_SET);
> +
> +    v9fs_write_post_lseek(s, vs, err);
> +    return;
> +
> +out:
> +    complete_pdu(s, vs->pdu, err);
> +    qemu_free(vs);
>  }
> 
>  static void v9fs_create(V9fsState *s, V9fsPDU *pdu)
Anthony Liguori - March 29, 2010, 3 p.m.
On 03/29/2010 01:36 AM, jvrao wrote:
> Aneesh Kumar K.V wrote:
>    
>> From: Anthony Liguori<aliguori@us.ibm.com>
>>      
> We have implemented all the vfs calls in state machine model so that we are prepared
> for the model where the VCPU thread(s) does the initial work until it needs to block then it
> submits that work (via a function pointer) to a thread pool.  A thread in that thread pool
> picks up the work, and completes the blocking call, when blocking call returns a callback is
> invoked in the IO thread.  Then the IO thread runs until the next blocking function, and goto start.
>
> Basically the VCPU/IO threads does all the non-blocking work, and let the threads in the
> thread pool work on the blocking calls like mkdir() stat() etc.
>
> My question is, why not let the whole work done by the thread in the thread pool?
> VCPU thread receives the PDU and hands over the entire job to worker thread.
> When all work is completed, either the worker thread or the IO thread(we can switch back at this point if needed) marks the request as completed in the virtqueue and injects an
> interrupt to notify the guest.
>
> We can still keep the same number of threads in the thread pool.
> This way, we are not increasing #of threads employed by QEMU...also it makes code lot
> more easy to read/maintain.
>
> I may be missing something..but would like to know more on the advantages of this model.
>    

In qemu, we tend to prefer state-machine based code.

There are a few different models that we have discussed at different points:

1) state machines with a thread pool to make blocking functions 
asynchronous (what we have today)

2) co-operative threading

3) pre-emptive threading

All three models are independent of making device models re-entrant.  In 
order to allow VCPU threads to run in qemu simultaneously, we need each 
device to carry a lock and for that lock to be acquired upon any of the 
public functions for the device model.

For individual device models or host services, I think (3) is probably 
the worst model overall.  I personally think that (1) is better in the 
long run but ultimately would need an existence proof to compare against 
(2).  (2) looks appealing until you actually try to have the device 
handle multiple requests at a time.

For virtio-9p, I don't think (1) is much of a burden to be honest.  In 
particular, when the 9p2000.L dialect is used, there should be a 1-1 
correlation between protocol operations and system calls which means 
that for the most part, there's really no advantage to threading from a 
complexity point of view.

Regards,

Anthony Liguori

> Thanks,
> JV
>
Avi Kivity - March 29, 2010, 8:31 p.m.
On 03/29/2010 06:00 PM, Anthony Liguori wrote:
>
> In qemu, we tend to prefer state-machine based code.
>
> There are a few different models that we have discussed at different 
> points:
>
> 1) state machines with a thread pool to make blocking functions 
> asynchronous (what we have today)
>
> 2) co-operative threading
>
> 3) pre-emptive threading
>
> All three models are independent of making device models re-entrant.  
> In order to allow VCPU threads to run in qemu simultaneously, we need 
> each device to carry a lock and for that lock to be acquired upon any 
> of the public functions for the device model.
>
> For individual device models or host services, I think (3) is probably 
> the worst model overall.  I personally think that (1) is better in the 
> long run but ultimately would need an existence proof to compare 
> against (2).  (2) looks appealing until you actually try to have the 
> device handle multiple requests at a time.

Sooner or later nature and the ever more complicated code will force us 
towards (3).  As an example, we've observed live migration to throttle 
vcpus when sending a large guest's zeroed memory over; the bandwidth 
control doesn't kick in since zero pages are compressed, so the iothread 
spends large amounts of time reading memory.

We could fix this by yielding every so often (and a patch will be posted 
soon), but it remains an issue.  We have too much work in the I/O thread 
and that defers I/O completion and timers.

> For virtio-9p, I don't think (1) is much of a burden to be honest.  In 
> particular, when the 9p2000.L dialect is used, there should be a 1-1 
> correlation between protocol operations and system calls which means 
> that for the most part, there's really no advantage to threading from 
> a complexity point of view.

But if those system calls are blocking, you need a thread?

On a philosophical note, threads may be easier to model complex hardware 
that includes a processor, for example our scsi card (and how about 
using tcg as a jit to boost it :)
Anthony Liguori - March 29, 2010, 8:42 p.m.
On 03/29/2010 03:31 PM, Avi Kivity wrote:
> On 03/29/2010 06:00 PM, Anthony Liguori wrote:
>>
>> In qemu, we tend to prefer state-machine based code.
>>
>> There are a few different models that we have discussed at different 
>> points:
>>
>> 1) state machines with a thread pool to make blocking functions 
>> asynchronous (what we have today)
>>
>> 2) co-operative threading
>>
>> 3) pre-emptive threading
>>
>> All three models are independent of making device models re-entrant.  
>> In order to allow VCPU threads to run in qemu simultaneously, we need 
>> each device to carry a lock and for that lock to be acquired upon any 
>> of the public functions for the device model.
>>
>> For individual device models or host services, I think (3) is 
>> probably the worst model overall.  I personally think that (1) is 
>> better in the long run but ultimately would need an existence proof 
>> to compare against (2).  (2) looks appealing until you actually try 
>> to have the device handle multiple requests at a time.
>
> Sooner or later nature and the ever more complicated code will force 
> us towards (3).  As an example, we've observed live migration to 
> throttle vcpus when sending a large guest's zeroed memory over; the 
> bandwidth control doesn't kick in since zero pages are compressed, so 
> the iothread spends large amounts of time reading memory.

Making things re-entrant is different than (3) in my mind.

There's no reason that VCPU threads should run in lock-step with live 
migration during the live phase.  Making device models re-entrant and 
making live migration depend not depend on the big global lock is a good 
thing to do.

What I'm skeptical of, is whether converting virtio-9p or qcow2 to 
handle each request in a separate thread is really going to improve 
things.  The VNC server is another area that I think multithreading 
would be a bad idea.

> We could fix this by yielding every so often (and a patch will be 
> posted soon), but it remains an issue.  We have too much work in the 
> I/O thread and that defers I/O completion and timers.
>
>> For virtio-9p, I don't think (1) is much of a burden to be honest.  
>> In particular, when the 9p2000.L dialect is used, there should be a 
>> 1-1 correlation between protocol operations and system calls which 
>> means that for the most part, there's really no advantage to 
>> threading from a complexity point of view.
>
> But if those system calls are blocking, you need a thread?

You can dispatch just the system call to a thread pool.  The advantage 
of doing that is that you don't need to worry about locking since the 
system calls are not (usually) handling shared state.

> On a philosophical note, threads may be easier to model complex 
> hardware that includes a processor, for example our scsi card (and how 
> about using tcg as a jit to boost it :)

Yeah, it's hard to argue that script evaluation shouldn't be done in a 
thread.  But that doesn't prevent me from being very cautious about how 
and where we use threading :-)

Regards,

Anthony Liguori
Avi Kivity - March 29, 2010, 8:54 p.m.
On 03/29/2010 11:42 PM, Anthony Liguori wrote:
>>> For individual device models or host services, I think (3) is 
>>> probably the worst model overall.  I personally think that (1) is 
>>> better in the long run but ultimately would need an existence proof 
>>> to compare against (2).  (2) looks appealing until you actually try 
>>> to have the device handle multiple requests at a time.
>>
>> Sooner or later nature and the ever more complicated code will force 
>> us towards (3).  As an example, we've observed live migration to 
>> throttle vcpus when sending a large guest's zeroed memory over; the 
>> bandwidth control doesn't kick in since zero pages are compressed, so 
>> the iothread spends large amounts of time reading memory.
>
>
> Making things re-entrant is different than (3) in my mind.
>
> There's no reason that VCPU threads should run in lock-step with live 
> migration during the live phase.  Making device models re-entrant and 
> making live migration depend not depend on the big global lock is a 
> good thing to do.

It's not sufficient.  If you have a single thread that runs both live 
migrations and timers, then timers will be backlogged behind live 
migration, or you'll have to yield often.  This is regardless of the 
locking model (and of course having threads without fixing the locking 
is insufficient as well, live migration accesses guest memory so it 
needs the big qemu lock).

> What I'm skeptical of, is whether converting virtio-9p or qcow2 to 
> handle each request in a separate thread is really going to improve 
> things. 

Currently qcow2 isn't even fullly asynchronous, so it can't fail to 
improve things.

> The VNC server is another area that I think multithreading would be a 
> bad idea.

If the vnc server is stuffing a few megabytes of screen into a socket, 
then timers will be delayed behind it, unless you litter the code with 
calls to bottom halves.  Even worse if it does complicated compression 
and encryption.

>>
>> But if those system calls are blocking, you need a thread?
>
> You can dispatch just the system call to a thread pool.  The advantage 
> of doing that is that you don't need to worry about locking since the 
> system calls are not (usually) handling shared state.

There is always implied shared state.  If you're doing direct guest 
memory access, you need to lock memory against hotunplug, or the syscall 
will end up writing into freed memory.  If the device can be 
hotunplugged, you need to make sure all threads have returned before 
unplugging it.

>> On a philosophical note, threads may be easier to model complex 
>> hardware that includes a processor, for example our scsi card (and 
>> how about using tcg as a jit to boost it :)
>
> Yeah, it's hard to argue that script evaluation shouldn't be done in a 
> thread.  But that doesn't prevent me from being very cautious about 
> how and where we use threading :-)

Caution where threads are involved is a good thing.  They are inevitable 
however, IMO.
jvrao - March 29, 2010, 9:17 p.m.
Anthony Liguori wrote:
> On 03/29/2010 03:31 PM, Avi Kivity wrote:
>> On 03/29/2010 06:00 PM, Anthony Liguori wrote:
>>>
>>> In qemu, we tend to prefer state-machine based code.
>>>
>>> There are a few different models that we have discussed at different
>>> points:
>>>
>>> 1) state machines with a thread pool to make blocking functions
>>> asynchronous (what we have today)
>>>
>>> 2) co-operative threading
>>>
>>> 3) pre-emptive threading
>>>
>>> All three models are independent of making device models re-entrant. 
>>> In order to allow VCPU threads to run in qemu simultaneously, we need
>>> each device to carry a lock and for that lock to be acquired upon any
>>> of the public functions for the device model.
>>>
>>> For individual device models or host services, I think (3) is
>>> probably the worst model overall.  I personally think that (1) is
>>> better in the long run but ultimately would need an existence proof
>>> to compare against (2).  (2) looks appealing until you actually try
>>> to have the device handle multiple requests at a time.
>>
>> Sooner or later nature and the ever more complicated code will force
>> us towards (3).  As an example, we've observed live migration to
>> throttle vcpus when sending a large guest's zeroed memory over; the
>> bandwidth control doesn't kick in since zero pages are compressed, so
>> the iothread spends large amounts of time reading memory.
> 
> Making things re-entrant is different than (3) in my mind.
> 
> There's no reason that VCPU threads should run in lock-step with live
> migration during the live phase.  Making device models re-entrant and
> making live migration depend not depend on the big global lock is a good
> thing to do.
> 
> What I'm skeptical of, is whether converting virtio-9p or qcow2 to
> handle each request in a separate thread is really going to improve
> things.  The VNC server is another area that I think multithreading
> would be a bad idea.

Excuse me for some basic questions..still trying to understand QEMU concepts..

How does IO thread is accounted for? 

If I start a 2 CPU QEMU, we will be occupying two physical CPUs with two VCPU threads
and IO thread runs on other CPUs owned by the host right? If the answer is yes, then
we are good as the fileserver load has to be on the host CPUs. 

> 
>> We could fix this by yielding every so often (and a patch will be
>> posted soon), but it remains an issue.  We have too much work in the
>> I/O thread and that defers I/O completion and timers.
>>
>>> For virtio-9p, I don't think (1) is much of a burden to be honest. 
>>> In particular, when the 9p2000.L dialect is used, there should be a
>>> 1-1 correlation between protocol operations and system calls which
>>> means that for the most part, there's really no advantage to
>>> threading from a complexity point of view.
>>
>> But if those system calls are blocking, you need a thread?
> 
> You can dispatch just the system call to a thread pool.  The advantage
> of doing that is that you don't need to worry about locking since the
> system calls are not (usually) handling shared state.

How is locking avoided in the IO thread model? do we just have 1 IO thread 
irrespective of #VCPU threads? Is is that how the locking is avoided?

I think IO thread is used to do disk/network IO right? 
Putting 9P also on that will have any performance/scalability issues?

Thanks,
JV

> 
>> On a philosophical note, threads may be easier to model complex
>> hardware that includes a processor, for example our scsi card (and how
>> about using tcg as a jit to boost it :)
> 
> Yeah, it's hard to argue that script evaluation shouldn't be done in a
> thread.  But that doesn't prevent me from being very cautious about how
> and where we use threading :-)
> 
> Regards,
> 
> Anthony Liguori
>
Anthony Liguori - March 29, 2010, 9:23 p.m.
On 03/29/2010 03:54 PM, Avi Kivity wrote:
> On 03/29/2010 11:42 PM, Anthony Liguori wrote:
>>>> For individual device models or host services, I think (3) is 
>>>> probably the worst model overall.  I personally think that (1) is 
>>>> better in the long run but ultimately would need an existence proof 
>>>> to compare against (2).  (2) looks appealing until you actually try 
>>>> to have the device handle multiple requests at a time.
>>>
>>> Sooner or later nature and the ever more complicated code will force 
>>> us towards (3).  As an example, we've observed live migration to 
>>> throttle vcpus when sending a large guest's zeroed memory over; the 
>>> bandwidth control doesn't kick in since zero pages are compressed, 
>>> so the iothread spends large amounts of time reading memory.
>>
>>
>> Making things re-entrant is different than (3) in my mind.
>>
>> There's no reason that VCPU threads should run in lock-step with live 
>> migration during the live phase.  Making device models re-entrant and 
>> making live migration depend not depend on the big global lock is a 
>> good thing to do.
>
> It's not sufficient.  If you have a single thread that runs both live 
> migrations and timers, then timers will be backlogged behind live 
> migration, or you'll have to yield often.  This is regardless of the 
> locking model (and of course having threads without fixing the locking 
> is insufficient as well, live migration accesses guest memory so it 
> needs the big qemu lock).

But what's the solution?  Sending every timer in a separate thread?  
We'll hit the same problem if we implement an arbitrary limit to number 
of threads.

>> What I'm skeptical of, is whether converting virtio-9p or qcow2 to 
>> handle each request in a separate thread is really going to improve 
>> things. 
>
> Currently qcow2 isn't even fullly asynchronous, so it can't fail to 
> improve things.

Unless it introduces more data corruptions which is my concern with any 
significant change to qcow2.

>> The VNC server is another area that I think multithreading would be a 
>> bad idea.
>
> If the vnc server is stuffing a few megabytes of screen into a socket, 
> then timers will be delayed behind it, unless you litter the code with 
> calls to bottom halves.  Even worse if it does complicated compression 
> and encryption.

Sticking the VNC server in it's own thread would be fine.  Trying to 
make the VNC server multithreaded though would be problematic.

Basically, sticking isolated components in a single thread should be 
pretty reasonable.

>>>
>>> But if those system calls are blocking, you need a thread?
>>
>> You can dispatch just the system call to a thread pool.  The 
>> advantage of doing that is that you don't need to worry about locking 
>> since the system calls are not (usually) handling shared state.
>
> There is always implied shared state.  If you're doing direct guest 
> memory access, you need to lock memory against hotunplug, or the 
> syscall will end up writing into freed memory.  If the device can be 
> hotunplugged, you need to make sure all threads have returned before 
> unplugging it.

There are other ways to handle hot unplug (like reference counting) that 
avoid this problem.

Ultimately, this comes down to a question of lock granularity and thread 
granularity.  I don't think it's a good idea to start with the 
assumption that we want extremely fine granularity.  There's certainly 
very low hanging fruit with respect to threading.

>>> On a philosophical note, threads may be easier to model complex 
>>> hardware that includes a processor, for example our scsi card (and 
>>> how about using tcg as a jit to boost it :)
>>
>> Yeah, it's hard to argue that script evaluation shouldn't be done in 
>> a thread.  But that doesn't prevent me from being very cautious about 
>> how and where we use threading :-)
>
> Caution where threads are involved is a good thing.  They are 
> inevitable however, IMO.

We already are using threads so they aren't just inevitable, they're 
reality.  I still don't think using threads would significantly simplify 
virtio-9p.

Regards,

Anthony Liguori
Avi Kivity - March 30, 2010, 10:24 a.m.
On 03/30/2010 12:23 AM, Anthony Liguori wrote:
>> It's not sufficient.  If you have a single thread that runs both live 
>> migrations and timers, then timers will be backlogged behind live 
>> migration, or you'll have to yield often.  This is regardless of the 
>> locking model (and of course having threads without fixing the 
>> locking is insufficient as well, live migration accesses guest memory 
>> so it needs the big qemu lock).
>
>
> But what's the solution?  Sending every timer in a separate thread?  
> We'll hit the same problem if we implement an arbitrary limit to 
> number of threads.

A completion that's expected to take a couple of microseconds at most 
can live in the iothread.  A completion that's expected to take a couple 
of milliseconds wants its own thread.  We'll have to think about 
anything in between.

vnc and migration can perform large amounts of work in a single 
completion; they're limited only by the socket send rate and our 
internal rate-limiting which are both outside our control.  Most device 
timers are O(1).  virtio completions probably fall into the annoying 
"have to think about it" department.

>>> What I'm skeptical of, is whether converting virtio-9p or qcow2 to 
>>> handle each request in a separate thread is really going to improve 
>>> things. 
>>
>> Currently qcow2 isn't even fullly asynchronous, so it can't fail to 
>> improve things.
>
> Unless it introduces more data corruptions which is my concern with 
> any significant change to qcow2.

It's possible to move qcow2 to a thread without any significant change 
to it (simply run the current code in its own thread, protected by a 
mutex).  Further changes would be very incremental.

>>> The VNC server is another area that I think multithreading would be 
>>> a bad idea.
>>
>> If the vnc server is stuffing a few megabytes of screen into a 
>> socket, then timers will be delayed behind it, unless you litter the 
>> code with calls to bottom halves.  Even worse if it does complicated 
>> compression and encryption.
>
> Sticking the VNC server in it's own thread would be fine.  Trying to 
> make the VNC server multithreaded though would be problematic.

Why would it be problematic?  Each client gets its own threads, they 
don't interact at all do they?

I don't see a need to do it though (beyond dropping it into a thread).

> Basically, sticking isolated components in a single thread should be 
> pretty reasonable.

Now you're doomed.  It's easy to declare things "isolated components" 
one by one, pretty soon the main loop will be gone.

>>>>
>>>> But if those system calls are blocking, you need a thread?
>>>
>>> You can dispatch just the system call to a thread pool.  The 
>>> advantage of doing that is that you don't need to worry about 
>>> locking since the system calls are not (usually) handling shared state.
>>
>> There is always implied shared state.  If you're doing direct guest 
>> memory access, you need to lock memory against hotunplug, or the 
>> syscall will end up writing into freed memory.  If the device can be 
>> hotunplugged, you need to make sure all threads have returned before 
>> unplugging it.
>
> There are other ways to handle hot unplug (like reference counting) 
> that avoid this problem.

That's just more clever locking.

> Ultimately, this comes down to a question of lock granularity and 
> thread granularity.  I don't think it's a good idea to start with the 
> assumption that we want extremely fine granularity.  There's certainly 
> very low hanging fruit with respect to threading.

Sure.  Currently the hotspots are block devices (except raw) and hpet 
(seen with large Windows guests).  The latter includes the bus lookup 
and hpet itself, hpet reads can be performed locklessly if we're clever.

>>>> On a philosophical note, threads may be easier to model complex 
>>>> hardware that includes a processor, for example our scsi card (and 
>>>> how about using tcg as a jit to boost it :)
>>>
>>> Yeah, it's hard to argue that script evaluation shouldn't be done in 
>>> a thread.  But that doesn't prevent me from being very cautious 
>>> about how and where we use threading :-)
>>
>> Caution where threads are involved is a good thing.  They are 
>> inevitable however, IMO.
>
> We already are using threads so they aren't just inevitable, they're 
> reality.  I still don't think using threads would significantly 
> simplify virtio-9p.
>

I meant, exposing qemu core to the threads instead of pretending they 
aren't there.  I'm not familiar with 9p so don't hold much of an 
opinion, but didn't you say you need threads in order to handle async 
syscalls?  That may not be the deep threading we're discussing here.

btw, IIUC currently disk hotunplug will stall a guest, no?  We need 
async aio_flush().
Avi Kivity - March 30, 2010, 10:28 a.m.
On 03/30/2010 12:17 AM, jvrao wrote:
>
> Excuse me for some basic questions..still trying to understand QEMU concepts..
>
> How does IO thread is accounted for?
>    

It's just another thread, usually a lightly loaded one.

> If I start a 2 CPU QEMU, we will be occupying two physical CPUs with two VCPU threads
> and IO thread runs on other CPUs owned by the host right? If the answer is yes, then
> we are good as the fileserver load has to be on the host CPUs.
>    

It's completely up to the scheduler and determined by how the iothread, 
vcpu threads, and other processes on the host are loaded.

>> You can dispatch just the system call to a thread pool.  The advantage
>> of doing that is that you don't need to worry about locking since the
>> system calls are not (usually) handling shared state.
>>      
> How is locking avoided in the IO thread model? do we just have 1 IO thread
> irrespective of #VCPU threads? Is is that how the locking is avoided?
>    

When you are running qemu code you hold the qemu mutex, except when you 
are in the thread pool.  So the iothread and vcpu threads are all 
mutually excluded, and defer any blocking work to other threads.

> I think IO thread is used to do disk/network IO right?
>    

It's used for completions (disk, network, timers, anything that 
completes asynchronously to a vcpu).

> Putting 9P also on that will have any performance/scalability issues?
>    

Yes.
Anthony Liguori - March 30, 2010, 1:13 p.m.
On 03/30/2010 05:24 AM, Avi Kivity wrote:
> On 03/30/2010 12:23 AM, Anthony Liguori wrote:
>>> It's not sufficient.  If you have a single thread that runs both 
>>> live migrations and timers, then timers will be backlogged behind 
>>> live migration, or you'll have to yield often.  This is regardless 
>>> of the locking model (and of course having threads without fixing 
>>> the locking is insufficient as well, live migration accesses guest 
>>> memory so it needs the big qemu lock).
>>
>>
>> But what's the solution?  Sending every timer in a separate thread?  
>> We'll hit the same problem if we implement an arbitrary limit to 
>> number of threads.
>
> A completion that's expected to take a couple of microseconds at most 
> can live in the iothread.  A completion that's expected to take a 
> couple of milliseconds wants its own thread.  We'll have to think 
> about anything in between.
>
> vnc and migration can perform large amounts of work in a single 
> completion; they're limited only by the socket send rate and our 
> internal rate-limiting which are both outside our control.  Most 
> device timers are O(1).  virtio completions probably fall into the 
> annoying "have to think about it" department.

I think it may make more sense to have vcpu completions vs. io thread 
completions and make vcpu completions target short lived operations.

>>>> What I'm skeptical of, is whether converting virtio-9p or qcow2 to 
>>>> handle each request in a separate thread is really going to improve 
>>>> things. 
>>>
>>> Currently qcow2 isn't even fullly asynchronous, so it can't fail to 
>>> improve things.
>>
>> Unless it introduces more data corruptions which is my concern with 
>> any significant change to qcow2.
>
> It's possible to move qcow2 to a thread without any significant change 
> to it (simply run the current code in its own thread, protected by a 
> mutex).  Further changes would be very incremental.

But that offers no advantage to what we have which fails the 
proof-by-example that threading makes the situation better.  To convert 
qcow2 to be threaded, I think you would have to wrap the whole thing in 
a lock, then convert the current asynchronous functions to synchronous 
(that's the whole point, right).  At this point, you've regressed 
performance because you can only handle one read/write outstanding at a 
given time.  So now you have to make the locking more granular but 
because we do layered block devices, you've got to make most of the core 
block driver functions thread safe.

Once you get basic data operations concurrent, which I expect won't be 
so bad, to get an improvement over the current code, you have to allow 
simultaneous access to metadata which is where I think the vast majority 
of the complexity will come from.

You could argue that we stick qcow2 into a thread and stop there and 
that fixes the problems with synchronous data access.  If that's the 
argument, then let's not even bother doing at the qcow layer, let's just 
switch the block aio emulation to use a dedicated thread.

>> Sticking the VNC server in it's own thread would be fine.  Trying to 
>> make the VNC server multithreaded though would be problematic.
>
> Why would it be problematic?  Each client gets its own threads, they 
> don't interact at all do they?

Dealing with locking of the core display which each client uses for 
rendering.  Things like CopyRect will get ugly quickly.Ultimately, this 
comes down to a question of lock granularity and thread granularity.  I 
don't think it's a good idea to start with the assumption that we want 
extremely fine granularity.  There's certainly very low hanging fruit 
with respect to threading.

> Sure.  Currently the hotspots are block devices (except raw) and hpet 
> (seen with large Windows guests).  The latter includes the bus lookup 
> and hpet itself, hpet reads can be performed locklessly if we're clever.

I'm all for making devices thread safe and the hpet is probably a good 
candidate for initial converting.

>
> I meant, exposing qemu core to the threads instead of pretending they 
> aren't there.  I'm not familiar with 9p so don't hold much of an 
> opinion, but didn't you say you need threads in order to handle async 
> syscalls?  That may not be the deep threading we're discussing here.
>
> btw, IIUC currently disk hotunplug will stall a guest, no?  We need 
> async aio_flush().

But aio_flush() never takes a very long time, right :-)

We had this discussion in the past re: live migration because we do an 
aio_flush() in the critical stage.

Regards,

Anthony Liguori
Avi Kivity - March 30, 2010, 1:28 p.m.
On 03/30/2010 04:13 PM, Anthony Liguori wrote:
> On 03/30/2010 05:24 AM, Avi Kivity wrote:
>> On 03/30/2010 12:23 AM, Anthony Liguori wrote:
>>>> It's not sufficient.  If you have a single thread that runs both 
>>>> live migrations and timers, then timers will be backlogged behind 
>>>> live migration, or you'll have to yield often.  This is regardless 
>>>> of the locking model (and of course having threads without fixing 
>>>> the locking is insufficient as well, live migration accesses guest 
>>>> memory so it needs the big qemu lock).
>>>
>>>
>>> But what's the solution?  Sending every timer in a separate thread?  
>>> We'll hit the same problem if we implement an arbitrary limit to 
>>> number of threads.
>>
>> A completion that's expected to take a couple of microseconds at most 
>> can live in the iothread.  A completion that's expected to take a 
>> couple of milliseconds wants its own thread.  We'll have to think 
>> about anything in between.
>>
>> vnc and migration can perform large amounts of work in a single 
>> completion; they're limited only by the socket send rate and our 
>> internal rate-limiting which are both outside our control.  Most 
>> device timers are O(1).  virtio completions probably fall into the 
>> annoying "have to think about it" department.
>
> I think it may make more sense to have vcpu completions vs. io thread 
> completions and make vcpu completions target short lived operations.

vcpu completions make sense when you can tell that a completion will 
cause an interrupt injection and you have a good idea which cpu will be 
interrupted.

>
>>>>> What I'm skeptical of, is whether converting virtio-9p or qcow2 to 
>>>>> handle each request in a separate thread is really going to 
>>>>> improve things. 
>>>>
>>>> Currently qcow2 isn't even fullly asynchronous, so it can't fail to 
>>>> improve things.
>>>
>>> Unless it introduces more data corruptions which is my concern with 
>>> any significant change to qcow2.
>>
>> It's possible to move qcow2 to a thread without any significant 
>> change to it (simply run the current code in its own thread, 
>> protected by a mutex).  Further changes would be very incremental.
>
> But that offers no advantage to what we have which fails the 
> proof-by-example that threading makes the situation better. 

It has an advantage, qcow2 is currently synchronous in parts:

block/qcow2-cluster.c:    ret = bdrv_write(s->hd, (cluster_offset >> 9) 
+ n_start,
block/qcow2.c:        bdrv_write(s->hd, (meta.cluster_offset >> 9) + num 
- 1, buf, 1);
block/qcow2.c:        bdrv_write(bs, sector_num, buf, s->cluster_sectors);
block/qcow2-cluster.c:                    ret = 
bdrv_read(bs->backing_hd, sector_num, buf, n1);
block/qcow2-cluster.c:        ret = bdrv_read(s->hd, coffset >> 9, 
s->cluster_data, nb_csectors);

> To convert qcow2 to be threaded, I think you would have to wrap the 
> whole thing in a lock, then convert the current asynchronous functions 
> to synchronous (that's the whole point, right).  At this point, you've 
> regressed performance because you can only handle one read/write 
> outstanding at a given time.  So now you have to make the locking more 
> granular but because we do layered block devices, you've got to make 
> most of the core block driver functions thread safe.

Not at all.  The first conversion will be to keep the current code as 
is, operating asynchronously, but running in its own thread.  It will 
still support multiple outstanding requests using the current state 
machine code; the synchronous parts will be remain synchronous relative 
to the block device, but async relative to everything else.  The second 
stage will convert the state machine code to threaded code.  This is 
more difficult but not overly so - turn every dependency list into a mutex.

> Once you get basic data operations concurrent, which I expect won't be 
> so bad, to get an improvement over the current code, you have to allow 
> simultaneous access to metadata which is where I think the vast 
> majority of the complexity will come from.

I have no plans to do that, all I want is qcow2 not to block vcpus.  
btw, I don't think it's all that complicated, it's simple to lock 
individual L2 blocks and the L1 block.

> You could argue that we stick qcow2 into a thread and stop there and 
> that fixes the problems with synchronous data access.  If that's the 
> argument, then let's not even bother doing at the qcow layer, let's 
> just switch the block aio emulation to use a dedicated thread.

That's certainly the plan for vmdk and friends which are today useless.  
qcow2 deserves better treatment.

>>> Sticking the VNC server in it's own thread would be fine.  Trying to 
>>> make the VNC server multithreaded though would be problematic.
>>
>> Why would it be problematic?  Each client gets its own threads, they 
>> don't interact at all do they?
>
> Dealing with locking of the core display which each client uses for 
> rendering.  Things like CopyRect will get ugly quickly.Ultimately, 
> this comes down to a question of lock granularity and thread 
> granularity.  I don't think it's a good idea to start with the 
> assumption that we want extremely fine granularity.  There's certainly 
> very low hanging fruit with respect to threading.

Not familiar with the code, but doesn't vnc access the display core 
through an API?  Slap a lot onto that.

>>
>> I meant, exposing qemu core to the threads instead of pretending they 
>> aren't there.  I'm not familiar with 9p so don't hold much of an 
>> opinion, but didn't you say you need threads in order to handle async 
>> syscalls?  That may not be the deep threading we're discussing here.
>>
>> btw, IIUC currently disk hotunplug will stall a guest, no?  We need 
>> async aio_flush().
>
> But aio_flush() never takes a very long time, right :-)
>
> We had this discussion in the past re: live migration because we do an 
> aio_flush() in the critical stage.
>

Live migration will stall a guest anyway.  It doesn't matter if 
aio_flush blocks for a few ms, since the final stage will dominate it.
Anthony Liguori - March 30, 2010, 1:54 p.m.
On 03/30/2010 08:28 AM, Avi Kivity wrote:
>> But that offers no advantage to what we have which fails the 
>> proof-by-example that threading makes the situation better. 
>
>
> It has an advantage, qcow2 is currently synchronous in parts:
>
> block/qcow2-cluster.c:    ret = bdrv_write(s->hd, (cluster_offset >> 
> 9) + n_start,
 > block/qcow2-cluster.c:                    ret = 
bdrv_read(bs->backing_hd, sector_num, buf, n1);

The two of these happen in copy_sectors().  copy_sectors() runs from 
qcow2_alloc_cluster_link_l2() which is called from qcow_aio_write_cb() 
and preallocate().

> block/qcow2.c:        bdrv_write(s->hd, (meta.cluster_offset >> 9) + 
> num - 1, buf, 1);

This only happens during creation (for preallocation).

> block/qcow2.c:        bdrv_write(bs, sector_num, buf, 
> s->cluster_sectors);
> block/qcow2-cluster.c:        ret = bdrv_read(s->hd, coffset >> 9, 
> s->cluster_data, nb_csectors);

These two are only for compressed images.

So it looks like we really only have one operation 
(qcow2_alloc_cluster_link_l2) that blocks.  Do we really think that it's 
sufficiently difficult to make this function asynchronous that it 
justifies threading the block layer?

Regards,

Anthony Liguori
Avi Kivity - March 30, 2010, 2:03 p.m.
On 03/30/2010 04:54 PM, Anthony Liguori wrote:
> On 03/30/2010 08:28 AM, Avi Kivity wrote:
>>> But that offers no advantage to what we have which fails the 
>>> proof-by-example that threading makes the situation better. 
>>
>>
>> It has an advantage, qcow2 is currently synchronous in parts:
>>
>> block/qcow2-cluster.c:    ret = bdrv_write(s->hd, (cluster_offset >> 
>> 9) + n_start,
> > block/qcow2-cluster.c:                    ret = 
> bdrv_read(bs->backing_hd, sector_num, buf, n1);
>
> The two of these happen in copy_sectors().  copy_sectors() runs from 
> qcow2_alloc_cluster_link_l2() which is called from qcow_aio_write_cb() 
> and preallocate().
>
>> block/qcow2.c:        bdrv_write(s->hd, (meta.cluster_offset >> 9) + 
>> num - 1, buf, 1);
>
> This only happens during creation (for preallocation).
>
>> block/qcow2.c:        bdrv_write(bs, sector_num, buf, 
>> s->cluster_sectors);
>> block/qcow2-cluster.c:        ret = bdrv_read(s->hd, coffset >> 9, 
>> s->cluster_data, nb_csectors);
>
> These two are only for compressed images.

Are compressed images supposed to block vcpus?

>
> So it looks like we really only have one operation 
> (qcow2_alloc_cluster_link_l2) that blocks.  Do we really think that 
> it's sufficiently difficult to make this function asynchronous that it 
> justifies threading the block layer?

There are also tons of bdrv_pread()s and bdrv_pwrite()s.  Isn't growing 
some of the tables synchronous? how about creating a snapshot?

If it were just one operation in qcow2, threading would be a big 
overkill.  But threading also fixes all of the other format drivers, and 
makes working on qcow2 easier.
Anthony Liguori - March 30, 2010, 2:07 p.m.
On 03/30/2010 09:03 AM, Avi Kivity wrote:
>>
>> So it looks like we really only have one operation 
>> (qcow2_alloc_cluster_link_l2) that blocks.  Do we really think that 
>> it's sufficiently difficult to make this function asynchronous that 
>> it justifies threading the block layer?
>
> There are also tons of bdrv_pread()s and bdrv_pwrite()s.  Isn't 
> growing some of the tables synchronous? how about creating a snapshot?

Eh, that ruins my whole argument.

>
> If it were just one operation in qcow2, threading would be a big 
> overkill.  But threading also fixes all of the other format drivers, 
> and makes working on qcow2 easier.
>

Yeah, I think the only question is whether to stick threading in the 
generic block layer or within qcow2.

Regards,

Anthony Liguori
Avi Kivity - March 30, 2010, 2:23 p.m.
On 03/30/2010 05:07 PM, Anthony Liguori wrote:
> On 03/30/2010 09:03 AM, Avi Kivity wrote:
>>>
>>> So it looks like we really only have one operation 
>>> (qcow2_alloc_cluster_link_l2) that blocks.  Do we really think that 
>>> it's sufficiently difficult to make this function asynchronous that 
>>> it justifies threading the block layer?
>>
>> There are also tons of bdrv_pread()s and bdrv_pwrite()s.  Isn't 
>> growing some of the tables synchronous? how about creating a snapshot?
>
> Eh, that ruins my whole argument.


But we can keep on arguing regardless, yes?

>
>>
>> If it were just one operation in qcow2, threading would be a big 
>> overkill.  But threading also fixes all of the other format drivers, 
>> and makes working on qcow2 easier.
>>
>
> Yeah, I think the only question is whether to stick threading in the 
> generic block layer or within qcow2.

Both.  My plan is:

(1) some generic infrastructure
(2) kit that turns a synchronous block format driver into an 
asynchronous one using (1), limited to one outstanding request
(3) adaptations to qcow2 using (1) to make it fully asynchronous, 
capable of multiple outstanding requests (except when it issues a sync 
request)

I've mostly done (1) and am contemplating (2).
Anthony Liguori - March 30, 2010, 2:59 p.m.
On 03/30/2010 09:23 AM, Avi Kivity wrote:
> On 03/30/2010 05:07 PM, Anthony Liguori wrote:
>> On 03/30/2010 09:03 AM, Avi Kivity wrote:
>>>>
>>>> So it looks like we really only have one operation 
>>>> (qcow2_alloc_cluster_link_l2) that blocks.  Do we really think that 
>>>> it's sufficiently difficult to make this function asynchronous that 
>>>> it justifies threading the block layer?
>>>
>>> There are also tons of bdrv_pread()s and bdrv_pwrite()s.  Isn't 
>>> growing some of the tables synchronous? how about creating a snapshot?
>>
>> Eh, that ruins my whole argument.
>
>
> But we can keep on arguing regardless, yes?

Of course :-)

> Both.  My plan is:
>
> (1) some generic infrastructure
> (2) kit that turns a synchronous block format driver into an 
> asynchronous one using (1), limited to one outstanding request
> (3) adaptations to qcow2 using (1) to make it fully asynchronous, 
> capable of multiple outstanding requests (except when it issues a sync 
> request)
>
> I've mostly done (1) and am contemplating (2).

Sounds reasonable.

Regards,

Anthony Liguori
Paul Brook - April 1, 2010, 1:14 p.m.
>>> 1) state machines with a thread pool to make blocking functions
>>> asynchronous (what we have today)
>>>
>>> 2) co-operative threading
>>>
>>> 3) pre-emptive threading

> > On a philosophical note, threads may be easier to model complex
> > hardware that includes a processor, for example our scsi card (and how
> > about using tcg as a jit to boost it :)
> 
> Yeah, it's hard to argue that script evaluation shouldn't be done in a
> thread.  But that doesn't prevent me from being very cautious about how
> and where we use threading :-)

I agree that making script execution asynchronous is a good thing, however I'm 
not convinced that (3) is the best way to do this. It's certainly the most 
flexible model, however it also places responsibility for locking/correctness 
on the device developer.

A more limited scheme (e.g. asynchronous callbacks) can actually be 
significant advantage. By forcing the developer to solve the problem in a 
particular way we significantly reduce the scope for race conditions, and 
hopefully restrict all locking concerns to common code/APIs.

Paul

[1] This issue may come from accidental misuse of terminology, but it's an 
important distinction.
Avi Kivity - April 1, 2010, 2:34 p.m.
On 04/01/2010 04:14 PM, Paul Brook wrote:
>
>>> On a philosophical note, threads may be easier to model complex
>>> hardware that includes a processor, for example our scsi card (and how
>>> about using tcg as a jit to boost it :)
>>>        
>> Yeah, it's hard to argue that script evaluation shouldn't be done in a
>> thread.  But that doesn't prevent me from being very cautious about how
>> and where we use threading :-)
>>      
> I agree that making script execution asynchronous is a good thing, however I'm
> not convinced that (3) is the best way to do this. It's certainly the most
> flexible model, however it also places responsibility for locking/correctness
> on the device developer.
>
> A more limited scheme (e.g. asynchronous callbacks) can actually be
> significant advantage. By forcing the developer to solve the problem in a
> particular way we significantly reduce the scope for race conditions, and
> hopefully restrict all locking concerns to common code/APIs.
>    

Can you elaborate?  When would the callbacks be called?  When would the 
callbacks yield back?

If the script isn't run to completion (a halt instruction), there isn't 
really a good answer to these questions.

> [1] This issue may come from accidental misuse of terminology, but it's an
> important distinction.
>    

Missing reference to footnote.
Paul Brook - April 1, 2010, 3:30 p.m.
> On 04/01/2010 04:14 PM, Paul Brook wrote:
> >>> On a philosophical note, threads may be easier to model complex
> >>> hardware that includes a processor, for example our scsi card (and how
> >>> about using tcg as a jit to boost it :)
> >>
> >> Yeah, it's hard to argue that script evaluation shouldn't be done in a
> >> thread.  But that doesn't prevent me from being very cautious about how
> >> and where we use threading :-)
> >
> > I agree that making script execution asynchronous is a good thing,
> > however I'm not convinced that (3) is the best way to do this. It's
> > certainly the most flexible model, however it also places responsibility
> > for locking/correctness on the device developer.
> >
> > A more limited scheme (e.g. asynchronous callbacks) can actually be
> > significant advantage. By forcing the developer to solve the problem in a
> > particular way we significantly reduce the scope for race conditions, and
> > hopefully restrict all locking concerns to common code/APIs.
> 
> Can you elaborate?  When would the callbacks be called?  When would the
> callbacks yield back? If the script isn't run to completion (a halt 
instruction), there isn't really a good answer to these questions.

In practice I don't think pre-emtive multithreading actually solves all that 
many problems. If a script enters an indefinite loop then you want to be 
terminating or throttling it anyway.  The difference is whether you 
yield/return or drop locks and performing a blocking operation.

I suppose what I'm getting at is the difference between explicit 
multithreading and event based programming.  In the latter case I believe it's 
much easier to have individual devices use a sequential model, but have the 
whole system be concurrent.

> > [1] This issue may come from accidental misuse of terminology, but it's
> > an important distinction.
> 
> Missing reference to footnote.

I was going to comment on the difference between re-entrancy and concurrency, 
but decided against it.

Paul

Patch

diff --git a/hw/virtio-9p-local.c b/hw/virtio-9p-local.c
index d77ecc2..c5d1db3 100644
--- a/hw/virtio-9p-local.c
+++ b/hw/virtio-9p-local.c
@@ -129,6 +129,12 @@  static off_t local_lseek(void *opaque, int fd, off_t offset, int whence)
     return lseek(fd, offset, whence);
 }
 
+static ssize_t local_writev(void *opaque, int fd, const struct iovec *iov,
+			    int iovcnt)
+{
+    return writev(fd, iov, iovcnt);
+}
+
 static V9fsPosixFileOperations ops = {
     .lstat = local_lstat,
     .setuid = local_setuid,
@@ -143,6 +149,7 @@  static V9fsPosixFileOperations ops = {
     .seekdir = local_seekdir,
     .readv = local_readv,
     .lseek = local_lseek,
+    .writev = local_writev,
 };
 
 V9fsPosixFileOperations *virtio_9p_init_local(const char *path)
diff --git a/hw/virtio-9p.c b/hw/virtio-9p.c
index 3ac6255..bc26d66 100644
--- a/hw/virtio-9p.c
+++ b/hw/virtio-9p.c
@@ -168,6 +168,12 @@  static off_t posix_lseek(V9fsState *s, int fd, off_t offset, int whence)
     return s->ops->lseek(s->ops->opaque, fd, offset, whence);
 }
 
+static int posix_writev(V9fsState *s, int fd, const struct iovec *iov,
+                       int iovcnt)
+{
+    return s->ops->writev(s->ops->opaque, fd, iov, iovcnt);
+}
+
 static void v9fs_string_init(V9fsString *str)
 {
     str->data = NULL;
@@ -1319,10 +1325,97 @@  out:
     complete_pdu(s, pdu, err);
 }
 
+typedef struct V9fsWriteState {
+    V9fsPDU *pdu;
+    size_t offset;
+    int32_t fid;
+    int32_t len;
+    int32_t count;
+    int32_t total;
+    int64_t off;
+    V9fsFidState *fidp;
+    struct iovec iov[128]; /* FIXME: bad, bad, bad */
+    struct iovec *sg;
+    int cnt;
+} V9fsWriteState;
+
+static void v9fs_write_post_writev(V9fsState *s, V9fsWriteState *vs,
+                                   ssize_t err)
+{
+    BUG_ON(vs->len < 0);
+    vs->total += vs->len;
+    vs->sg = adjust_sg(vs->sg, vs->len, &vs->cnt);
+    if (vs->total < vs->count && vs->len > 0) {
+        do {
+            if (0)
+                print_sg(vs->sg, vs->cnt);
+            vs->len =  posix_writev(s, vs->fidp->fd, vs->sg, vs->cnt);
+        } while (vs->len == -1 && errno == EINTR);
+        v9fs_write_post_writev(s, vs, err);
+    }
+    vs->offset += pdu_marshal(vs->pdu, vs->offset, "d", vs->total);
+
+    err = vs->offset;
+    complete_pdu(s, vs->pdu, err);
+    qemu_free(vs);
+}
+
+static void v9fs_write_post_lseek(V9fsState *s, V9fsWriteState *vs, ssize_t err)
+{
+    BUG_ON(err == -1);
+
+    vs->sg = cap_sg(vs->sg, vs->count, &vs->cnt);
+
+    if (vs->total < vs->count) {
+        do {
+            if (0)
+                print_sg(vs->sg, vs->cnt);
+            vs->len = posix_writev(s, vs->fidp->fd, vs->sg, vs->cnt);
+        } while (vs->len == -1 && errno == EINTR);
+
+        v9fs_write_post_writev(s, vs, err);
+        return;
+    }
+
+    complete_pdu(s, vs->pdu, err);
+    qemu_free(vs);
+}
+
 static void v9fs_write(V9fsState *s, V9fsPDU *pdu)
 {
-    if (debug_9p_pdu)
-        pprint_pdu(pdu);
+    V9fsWriteState *vs;
+    ssize_t err;
+
+    vs = qemu_malloc(sizeof(*vs));
+
+    vs->pdu = pdu;
+    vs->offset = 7;
+    vs->sg = vs->iov;
+    vs->total = 0;
+    vs->len = 0;
+
+    pdu_unmarshal(vs->pdu, vs->offset, "dqdv", &vs->fid, &vs->off, &vs->count,
+                    vs->sg, &vs->cnt);
+
+    vs->fidp = lookup_fid(s, vs->fid);
+    if (vs->fidp == NULL) {
+        err = -EINVAL;
+        goto out;
+    }
+
+    if (vs->fidp->fd == -1) {
+        err = -EINVAL;
+        goto out;
+    }
+
+    err = posix_lseek(s, vs->fidp->fd, vs->off, SEEK_SET);
+
+    v9fs_write_post_lseek(s, vs, err);
+    return;
+
+out:
+    complete_pdu(s, vs->pdu, err);
+    qemu_free(vs);
 }
 
 static void v9fs_create(V9fsState *s, V9fsPDU *pdu)