diff mbox

[1/3] Introduce threadlets

Message ID 20101019174245.16514.14542.stgit@localhost6.localdomain6
State New
Headers show

Commit Message

Arun Bharadwaj Oct. 19, 2010, 5:42 p.m. UTC
From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

This patch creates a generic asynchronous-task-offloading infrastructure named
threadlets. The core idea has been borrowed from the threading framework that
is being used by paio.

The reason for creating this generic infrastructure is so that other subsystems,
such as virtio-9p could make use of it for offloading tasks that could block.

The patch creates a global queue on-to which subsystems can queue their tasks to
be executed asynchronously.

The patch also provides API's that allow a subsystem to create a private queue
with an associated pool of threads.

[ego@in.ibm.com: Facelift of the code, Documentation, cancel_threadlet
and other helpers]

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
 Makefile.objs          |    3 +
 docs/async-support.txt |  141 +++++++++++++++++++++++++++++++++++++++++
 qemu-threadlets.c      |  165 ++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-threadlets.h      |   48 ++++++++++++++
 4 files changed, 356 insertions(+), 1 deletions(-)
 create mode 100644 docs/async-support.txt
 create mode 100644 qemu-threadlets.c
 create mode 100644 qemu-threadlets.h

Comments

Balbir Singh Oct. 19, 2010, 6:36 p.m. UTC | #1
* Arun R B <arun@linux.vnet.ibm.com> [2010-10-19 23:12:45]:

> From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> 
> This patch creates a generic asynchronous-task-offloading infrastructure named
> threadlets. The core idea has been borrowed from the threading framework that
> is being used by paio.
> 
> The reason for creating this generic infrastructure is so that other subsystems,
> such as virtio-9p could make use of it for offloading tasks that could block.
> 
> The patch creates a global queue on-to which subsystems can queue their tasks to
> be executed asynchronously.
> 
> The patch also provides API's that allow a subsystem to create a private queue
> with an associated pool of threads.
> 
> [ego@in.ibm.com: Facelift of the code, Documentation, cancel_threadlet
> and other helpers]
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
> Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
> Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
> ---
>  Makefile.objs          |    3 +
>  docs/async-support.txt |  141 +++++++++++++++++++++++++++++++++++++++++
>  qemu-threadlets.c      |  165 ++++++++++++++++++++++++++++++++++++++++++++++++
>  qemu-threadlets.h      |   48 ++++++++++++++
>  4 files changed, 356 insertions(+), 1 deletions(-)
>  create mode 100644 docs/async-support.txt
>  create mode 100644 qemu-threadlets.c
>  create mode 100644 qemu-threadlets.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index cd5a24b..2cf8aba 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -9,6 +9,8 @@ qobject-obj-y += qerror.o
> 
>  block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
>  block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
> +block-obj-$(CONFIG_POSIX) += qemu-thread.o
> +block-obj-$(CONFIG_POSIX) += qemu-threadlets.o
>  block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
>  block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
> 
> @@ -124,7 +126,6 @@ endif
>  common-obj-y += $(addprefix ui/, $(ui-obj-y))
> 
>  common-obj-y += iov.o acl.o
> -common-obj-$(CONFIG_THREAD) += qemu-thread.o
>  common-obj-y += notify.o event_notifier.o
>  common-obj-y += qemu-timer.o
> 
> diff --git a/docs/async-support.txt b/docs/async-support.txt
> new file mode 100644
> index 0000000..9f22b9a
> --- /dev/null
> +++ b/docs/async-support.txt
> @@ -0,0 +1,141 @@
> +== How to use the threadlets infrastructure supported in Qemu ==
> +
> +== Threadlets ==
> +
> +Q.1: What are threadlets ?
> +A.1: Threadlets is an infrastructure within QEMU that allows other subsystems
> +     to offload possibly blocking work to a queue to be processed by a pool
> +     of threads asynchronously.
> +
> +Q.2: When would one want to use threadlets ?
> +A.2: Threadlets are useful when there are operations that can be performed
> +     outside the context of the VCPU/IO threads inorder to free these latter
> +     to service any other guest requests.
> +
> +Q.3: I have some work that can be executed in an asynchronous context. How
> +     should I go about it ?
> +A.3: One could follow the steps listed below:
> +
> +     - Define a function which would do the asynchronous work.
> +	static void my_threadlet_func(ThreadletWork *work)
> +	{
> +	}
> +
> +     - Declare an object of type ThreadletWork;
> +	ThreadletWork work;
> +
> +
> +     - Assign a value to the "func" member of ThreadletWork object.
> +	work.func = my_threadlet_func;
> +
> +     - Submit the threadlet to the global queue.
> +	submit_threadletwork(&work);
> +
> +     - Continue servicing some other guest operations.
> +
> +Q.4: I want to my_threadlet_func to access some non-global data. How do I do
> +     that ?
> +A.4: Suppose you want my_threadlet_func to access some non-global data-object
> +     of type myPrivateData. In that case one could follow the following steps.
> +
> +     - Define a member of the type ThreadletWork within myPrivateData.
> +	typedef struct MyPrivateData {
> +		...;
> +		...;
> +		...;
> +		ThreadletWork work;
> +	} MyPrivateData;
> +
> +	MyPrivateData my_data;
> +
> +     - Initialize myData.work as described in A.3
> +	myData.work.func = my_threadlet_func;
> +	submit_threadletwork(&myData.work);
> +
> +     - Access the myData object inside my_threadlet_func() using container_of
> +       primitive
> +	static void my_threadlet_func(ThreadletWork *work)
> +	{
> +		myPrivateData *mydata_ptr;
> +		mydata_ptr = container_of(work, myPrivateData, work);
> +
> +		/* mydata_ptr now points to myData object */
> +	}
> +
> +Q.5: Are there any precautions one must take while sharing data with the
> +     Asynchronous thread-pool ?
> +A.5: Yes, make sure that the helper function of the type my_threadlet_func()
> +     does not access/modify data when it can be accessed or modified in the
> +     context of VCPU thread or IO thread. This is because the asynchronous
> +     threads in the pool can run in parallel with the VCPU/IOThreads as shown
> +     in the figure.
> +
> +     A typical workflow is as follows:
> +
> +              VCPU/IOThread
> +                   |
> +                   | (1)
> +                   |
> +                   V
> +                Offload work              (2)
> +      |-------> to threadlets -----------------------------> Helper thread
> +      |            |                                               |
> +      |            |                                               |
> +      |            | (3)                                           | (4)
> +      |            |                                               |
> +      |         Handle other Guest requests                        |
> +      |            |                                               |
> +      |            |                                               V
> +      |            | (3)                                  Signal the I/O Thread
> +      |(6)         |                                               |
> +      |            |                                              /
> +      |            |                                             /
> +      |            V                                            /
> +      |          Do the post <---------------------------------/
> +      |          processing               (5)
> +      |            |
> +      |            | (6)
> +      |            V
> +      |-Yes------ More async work?
> +                   |
> +                   | (7)
> +	           No
> +                   |
> +                   |
> +                   .
> +                   .
> +
> +    Hence one needs to make sure that in the steps (3) and (4) which run in
> +    parallel, any global data is accessed within only one context.
> +
> +Q.6: I have queued a threadlet which I want to cancel. How do I do that ?
> +A.6: Threadlets framework provides the API cancel_threadlet:
> +       - int cancel_threadletwork(ThreadletWork *work)
> +
> +     The API scans the ThreadletQueue to see if (work) is present. If it finds
> +     work, it'll dequeue work and return 0.
> +
> +     On the other hand, if it does not find the (work) in the ThreadletQueue,
> +     then it'll return 1. This can imply two things. Either the work is being
> +     processed by one of the helper threads or it has been processed. The
> +     threadlet infrastructure currently _does_not_ distinguish between these
> +     two and the onus is on the caller to do that.
> +
> +Q.7: Apart from the global pool of threads, can I have my own private Queue ?
> +A.7: Yes, the threadlets framework allows subsystems to create their own private
> +     queues with associated pools of threads.
> +
> +     - Define a PrivateQueue
> +       ThreadletQueue myQueue;
> +
> +     - Initialize it:
> +       threadlet_queue_init(&myQueue, my_max_threads, my_min_threads);
> +       where my_max_threads is the maximum number of threads that can be in the
> +       thread pool and my_min_threads is the minimum number of active threads
> +       that can be in the thread-pool.
> +
> +     - Submit work:
> +       submit_threadletwork_to_queue(&myQueue, &my_work);
> +
> +     - Cancel work:
> +       cancel_threadletwork_on_queue(&myQueue, &my_work);
> diff --git a/qemu-threadlets.c b/qemu-threadlets.c
> new file mode 100644
> index 0000000..fd33752
> --- /dev/null
> +++ b/qemu-threadlets.c
> @@ -0,0 +1,165 @@
> +/*
> + * Threadlet support for offloading tasks to be executed asynchronously
> + *
> + * Copyright IBM, Corp. 2008
> + * Copyright IBM, Corp. 2010
> + *
> + * Authors:
> + *  Anthony Liguori     <aliguori@us.ibm.com>
> + *  Aneesh Kumar K.V    <aneesh.kumar@linux.vnet.ibm.com>
> + *  Gautham R Shenoy    <ego@in.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu-threadlets.h"
> +#include "osdep.h"
> +
> +#define MAX_GLOBAL_THREADS  64
> +#define MIN_GLOBAL_THREADS  64
> +static ThreadletQueue globalqueue;
> +static int globalqueue_init;
> +
> +static void *threadlet_worker(void *data)
> +{
> +    ThreadletQueue *queue = data;
> +
Ideally you need

        s = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);

But qemu will need to wrap this around as well.
> +    qemu_mutex_lock(&(queue->lock));
> +    while (1) {
> +        ThreadletWork *work;
> +        int ret = 0;
> +
> +        while (QTAILQ_EMPTY(&(queue->request_list)) &&
> +               (ret != ETIMEDOUT)) {
> +            ret = qemu_cond_timedwait(&(queue->cond),
> +					 &(queue->lock), 10*100000);

Ewww... what is 10*100000, can we use something more meaningful
please?

> +        }
> +
> +        assert(queue->idle_threads != 0);

This assertion holds because we believe one of the idle_threads
actually did the dequeuing, right?

> +        if (QTAILQ_EMPTY(&(queue->request_list))) {
> +            if (queue->cur_threads > queue->min_threads) {
> +                /* We retain the minimum number of threads */
> +                break;
> +            }
> +        } else {
> +            work = QTAILQ_FIRST(&(queue->request_list));
> +            QTAILQ_REMOVE(&(queue->request_list), work, node);
> +
> +            queue->idle_threads--;
> +            qemu_mutex_unlock(&(queue->lock));
> +
> +            /* execute the work function */
> +            work->func(work);
> +
> +            qemu_mutex_lock(&(queue->lock));
> +            queue->idle_threads++;
> +        }
> +    }
> +
> +    queue->idle_threads--;
> +    queue->cur_threads--;
> +    qemu_mutex_unlock(&(queue->lock));
> +
> +    return NULL;
Does anybody do a join on the exiting thread from the pool?

> +}
> +
> +static void spawn_threadlet(ThreadletQueue *queue)
> +{
> +    QemuThread thread;
> +
> +    queue->cur_threads++;
> +    queue->idle_threads++;
> +
> +    qemu_thread_create(&thread, threadlet_worker, queue);

> +}
> +
> +/**
> + * submit_threadletwork_to_queue: Submit a new task to a private queue to be
> + *                            executed asynchronously.
> + * @queue: Per-subsystem private queue to which the new task needs
> + *         to be submitted.
> + * @work: Contains information about the task that needs to be submitted.
> + */
> +void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
> +{
> +    qemu_mutex_lock(&(queue->lock));
> +    if (queue->idle_threads == 0 && queue->cur_threads < queue->max_threads) {
> +        spawn_threadlet(queue);

So we hold queue->lock, spawn the thread, the spawned thread tries to
acquire queue->lock

> +    }
> +    QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
> +    qemu_mutex_unlock(&(queue->lock));
> +    qemu_cond_signal(&(queue->cond));

In the case that we just spawned the threadlet, the cond_signal is
spurious. If we need predictable scheduling behaviour,
qemu_cond_signal needs to happen with queue->lock held.

I'd rewrite the function as 

/**
 * submit_threadletwork_to_queue: Submit a new task to a private queue to be
 *                            executed asynchronously.
 * @queue: Per-subsystem private queue to which the new task needs
 *         to be submitted.
 * @work: Contains information about the task that needs to be submitted.
 */
void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
{
    qemu_mutex_lock(&(queue->lock));
    if (queue->idle_threads == 0 && (queue->cur_threads < queue->max_threads)) {
        spawn_threadlet(queue);
    } else {
        qemu_cond_signal(&(queue->cond));
    }
    QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
    qemu_mutex_unlock(&(queue->lock));
}

> +/**
> + * submit_threadletwork: Submit to the global queue a new task to be executed
> + *                   asynchronously.
> + * @work: Contains information about the task that needs to be submitted.
> + */
> +void submit_threadletwork(ThreadletWork *work)
> +{
> +    if (unlikely(!globalqueue_init)) {
> +        threadlet_queue_init(&globalqueue, MAX_GLOBAL_THREADS,
> +                                MIN_GLOBAL_THREADS);
> +        globalqueue_init = 1;
> +    }

What protects globalqueue_init?

> +
> +    submit_threadletwork_to_queue(&globalqueue, work);
> +}
> +
> +/**
> + * cancel_threadletwork_on_queue: Cancel a task queued on a Queue.
> + * @queue: The queue containing the task to be cancelled.
> + * @work: Contains the information of the task that needs to be cancelled.
> + *
> + * Returns: 0 if the task is successfully cancelled.
> + *          1 otherwise.
> + */
> +int cancel_threadletwork_on_queue(ThreadletQueue *queue, ThreadletWork *work)
> +{
> +    ThreadletWork *ret_work;
> +    int ret = 0;
> +
> +    qemu_mutex_lock(&(queue->lock));
> +    QTAILQ_FOREACH(ret_work, &(queue->request_list), node) {
> +        if (ret_work == work) {
> +            QTAILQ_REMOVE(&(queue->request_list), ret_work, node);
> +            ret = 1;
> +            break;
> +        }
> +    }
> +    qemu_mutex_unlock(&(queue->lock));
> +
> +    return ret;
> +}
> +
> +/**
> + * cancel_threadletwork: Cancel a task queued on the global queue.

NOTE: cancel is a confusing term, thread cancel is different from
cancelling a job on the global queue, I'd preferrably call this
dequeue_threadletwork

Generic question, is thread a reason to use threadletwork as one word,
instead of threadlet_work? Specially since the data structure is
called ThreadletWork.

> + * @work: Contains the information of the task that needs to be cancelled.
> + *
> + * Returns: 0 if the task is successfully cancelled.
> + *          1 otherwise.
> + */
> +int cancel_threadletwork(ThreadletWork *work)
> +{
> +    return cancel_threadletwork_on_queue(&globalqueue, work);
> +}
> +
> +/**
> + * threadlet_queue_init: Initialize a threadlet queue.
> + * @queue: The threadlet queue to be initialized.
> + * @max_threads: Maximum number of threads processing the queue.
> + * @min_threads: Minimum number of threads processing the queue.
> + */
> +void threadlet_queue_init(ThreadletQueue *queue,
> +				    int max_threads, int min_threads)
> +{
> +    queue->cur_threads  = 0;
> +    queue->idle_threads = 0;
> +    queue->max_threads  = max_threads;
> +    queue->min_threads  = min_threads;
> +    QTAILQ_INIT(&(queue->request_list));
> +    qemu_mutex_init(&(queue->lock));
> +    qemu_cond_init(&(queue->cond));
> +}
> diff --git a/qemu-threadlets.h b/qemu-threadlets.h
> new file mode 100644
> index 0000000..9c8f9e5
> --- /dev/null
> +++ b/qemu-threadlets.h
> @@ -0,0 +1,48 @@
> +/*
> + * Threadlet support for offloading tasks to be executed asynchronously
> + *
> + * Copyright IBM, Corp. 2008
> + * Copyright IBM, Corp. 2010
> + *
> + * Authors:
> + *  Anthony Liguori     <aliguori@us.ibm.com>
> + *  Aneesh Kumar K.V    <aneesh.kumar@linux.vnet.ibm.com>
> + *  Gautham R Shenoy    <ego@in.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + */
> +
> +#ifndef QEMU_ASYNC_WORK_H
> +#define QEMU_ASYNC_WORK_H
> +
> +#include "qemu-queue.h"
> +#include "qemu-common.h"
> +#include "qemu-thread.h"
> +
> +typedef struct ThreadletQueue
> +{
> +    QemuMutex lock;
> +    QemuCond cond;
> +    int max_threads;
> +    int min_threads;
> +    int cur_threads;
> +    int idle_threads;
> +    QTAILQ_HEAD(, ThreadletWork) request_list;
> +} ThreadletQueue;
> +
> +typedef struct ThreadletWork
> +{
> +    QTAILQ_ENTRY(ThreadletWork) node;
> +    void (*func)(struct ThreadletWork *work);
> +} ThreadletWork;
> +
> +extern void submit_threadletwork_to_queue(ThreadletQueue *queue,
> +                                      ThreadletWork *work);
> +extern void submit_threadletwork(ThreadletWork *work);
> +extern int cancel_threadletwork_on_queue(ThreadletQueue *queue,
> +                                        ThreadletWork *work);
> +extern int cancel_threadletwork(ThreadletWork *work);
> +extern void threadlet_queue_init(ThreadletQueue *queue, int max_threads,
> +                                 int min_threads);
> +#endif
> 
>
Paolo Bonzini Oct. 19, 2010, 7:01 p.m. UTC | #2
On 10/19/2010 08:36 PM, Balbir Singh wrote:
> Ideally you need
>
>          s = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
>
> But qemu will need to wrap this around as well.

Why?  QEMU is never using thread cancellation.

Paolo
Balbir Singh Oct. 19, 2010, 7:12 p.m. UTC | #3
* Paolo Bonzini <pbonzini@redhat.com> [2010-10-19 21:01:03]:

> On 10/19/2010 08:36 PM, Balbir Singh wrote:
> >Ideally you need
> >
> >         s = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
> >
> >But qemu will need to wrap this around as well.
> 
> Why?  QEMU is never using thread cancellation.
>

Yes, I agree, in the longer run, cancellation is a good way to kill
threads, specially in a thread pool. My comment was more along the
lines of good practices and potential use of pthread_cancel(), not a
strict comment on something urgent or broken.
Paolo Bonzini Oct. 19, 2010, 7:29 p.m. UTC | #4
On 10/19/2010 09:12 PM, Balbir Singh wrote:
> > >Ideally you need
> > >
> > >           s = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
> > >
> > > But qemu will need to wrap this around as well.
> >
> >  Why?  QEMU is never using thread cancellation.
>
> Yes, I agree, in the longer run, cancellation is a good way to kill
> threads, specially in a thread pool. My comment was more along the
> lines of good practices and potential use of pthread_cancel(), not a
> strict comment on something urgent or broken.

But there is no such use; as long as we keep ourselves to the 
qemu-thread API, we know that nothing will use cancellation.

The day qemu-thread will introduce cancellation functions we'll care 
about enabling/disabling it in some threads.

Paolo
jvrao Oct. 19, 2010, 9 p.m. UTC | #5
On 10/19/2010 11:36 AM, Balbir Singh wrote:
> * Arun R B <arun@linux.vnet.ibm.com> [2010-10-19 23:12:45]:
> 
>> From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>
>> This patch creates a generic asynchronous-task-offloading infrastructure named
>> threadlets. The core idea has been borrowed from the threading framework that
>> is being used by paio.
>>
>> The reason for creating this generic infrastructure is so that other subsystems,
>> such as virtio-9p could make use of it for offloading tasks that could block.
>>
>> The patch creates a global queue on-to which subsystems can queue their tasks to
>> be executed asynchronously.
>>
>> The patch also provides API's that allow a subsystem to create a private queue
>> with an associated pool of threads.
>>
>> [ego@in.ibm.com: Facelift of the code, Documentation, cancel_threadlet
>> and other helpers]
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
>> Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
>> Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
>> ---
>>  Makefile.objs          |    3 +
>>  docs/async-support.txt |  141 +++++++++++++++++++++++++++++++++++++++++
>>  qemu-threadlets.c      |  165 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  qemu-threadlets.h      |   48 ++++++++++++++
>>  4 files changed, 356 insertions(+), 1 deletions(-)
>>  create mode 100644 docs/async-support.txt
>>  create mode 100644 qemu-threadlets.c
>>  create mode 100644 qemu-threadlets.h
>>
>> diff --git a/Makefile.objs b/Makefile.objs
>> index cd5a24b..2cf8aba 100644
>> --- a/Makefile.objs
>> +++ b/Makefile.objs
>> @@ -9,6 +9,8 @@ qobject-obj-y += qerror.o
>>
>>  block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
>>  block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
>> +block-obj-$(CONFIG_POSIX) += qemu-thread.o
>> +block-obj-$(CONFIG_POSIX) += qemu-threadlets.o
>>  block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
>>  block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>
>> @@ -124,7 +126,6 @@ endif
>>  common-obj-y += $(addprefix ui/, $(ui-obj-y))
>>
>>  common-obj-y += iov.o acl.o
>> -common-obj-$(CONFIG_THREAD) += qemu-thread.o
>>  common-obj-y += notify.o event_notifier.o
>>  common-obj-y += qemu-timer.o
>>
>> diff --git a/docs/async-support.txt b/docs/async-support.txt
>> new file mode 100644
>> index 0000000..9f22b9a
>> --- /dev/null
>> +++ b/docs/async-support.txt
>> @@ -0,0 +1,141 @@
>> +== How to use the threadlets infrastructure supported in Qemu ==
>> +
>> +== Threadlets ==
>> +
>> +Q.1: What are threadlets ?
>> +A.1: Threadlets is an infrastructure within QEMU that allows other subsystems
>> +     to offload possibly blocking work to a queue to be processed by a pool
>> +     of threads asynchronously.
>> +
>> +Q.2: When would one want to use threadlets ?
>> +A.2: Threadlets are useful when there are operations that can be performed
>> +     outside the context of the VCPU/IO threads inorder to free these latter
>> +     to service any other guest requests.
>> +
>> +Q.3: I have some work that can be executed in an asynchronous context. How
>> +     should I go about it ?
>> +A.3: One could follow the steps listed below:
>> +
>> +     - Define a function which would do the asynchronous work.
>> +	static void my_threadlet_func(ThreadletWork *work)
>> +	{
>> +	}
>> +
>> +     - Declare an object of type ThreadletWork;
>> +	ThreadletWork work;
>> +
>> +
>> +     - Assign a value to the "func" member of ThreadletWork object.
>> +	work.func = my_threadlet_func;
>> +
>> +     - Submit the threadlet to the global queue.
>> +	submit_threadletwork(&work);
>> +
>> +     - Continue servicing some other guest operations.
>> +
>> +Q.4: I want to my_threadlet_func to access some non-global data. How do I do
>> +     that ?
>> +A.4: Suppose you want my_threadlet_func to access some non-global data-object
>> +     of type myPrivateData. In that case one could follow the following steps.
>> +
>> +     - Define a member of the type ThreadletWork within myPrivateData.
>> +	typedef struct MyPrivateData {
>> +		...;
>> +		...;
>> +		...;
>> +		ThreadletWork work;
>> +	} MyPrivateData;
>> +
>> +	MyPrivateData my_data;
>> +
>> +     - Initialize myData.work as described in A.3
>> +	myData.work.func = my_threadlet_func;
>> +	submit_threadletwork(&myData.work);
>> +
>> +     - Access the myData object inside my_threadlet_func() using container_of
>> +       primitive
>> +	static void my_threadlet_func(ThreadletWork *work)
>> +	{
>> +		myPrivateData *mydata_ptr;
>> +		mydata_ptr = container_of(work, myPrivateData, work);
>> +
>> +		/* mydata_ptr now points to myData object */
>> +	}
>> +
>> +Q.5: Are there any precautions one must take while sharing data with the
>> +     Asynchronous thread-pool ?
>> +A.5: Yes, make sure that the helper function of the type my_threadlet_func()
>> +     does not access/modify data when it can be accessed or modified in the
>> +     context of VCPU thread or IO thread. This is because the asynchronous
>> +     threads in the pool can run in parallel with the VCPU/IOThreads as shown
>> +     in the figure.
>> +
>> +     A typical workflow is as follows:
>> +
>> +              VCPU/IOThread
>> +                   |
>> +                   | (1)
>> +                   |
>> +                   V
>> +                Offload work              (2)
>> +      |-------> to threadlets -----------------------------> Helper thread
>> +      |            |                                               |
>> +      |            |                                               |
>> +      |            | (3)                                           | (4)
>> +      |            |                                               |
>> +      |         Handle other Guest requests                        |
>> +      |            |                                               |
>> +      |            |                                               V
>> +      |            | (3)                                  Signal the I/O Thread
>> +      |(6)         |                                               |
>> +      |            |                                              /
>> +      |            |                                             /
>> +      |            V                                            /
>> +      |          Do the post <---------------------------------/
>> +      |          processing               (5)
>> +      |            |
>> +      |            | (6)
>> +      |            V
>> +      |-Yes------ More async work?
>> +                   |
>> +                   | (7)
>> +	           No
>> +                   |
>> +                   |
>> +                   .
>> +                   .
>> +
>> +    Hence one needs to make sure that in the steps (3) and (4) which run in
>> +    parallel, any global data is accessed within only one context.
>> +
>> +Q.6: I have queued a threadlet which I want to cancel. How do I do that ?
>> +A.6: Threadlets framework provides the API cancel_threadlet:
>> +       - int cancel_threadletwork(ThreadletWork *work)
>> +
>> +     The API scans the ThreadletQueue to see if (work) is present. If it finds
>> +     work, it'll dequeue work and return 0.
>> +
>> +     On the other hand, if it does not find the (work) in the ThreadletQueue,
>> +     then it'll return 1. This can imply two things. Either the work is being
>> +     processed by one of the helper threads or it has been processed. The
>> +     threadlet infrastructure currently _does_not_ distinguish between these
>> +     two and the onus is on the caller to do that.
>> +
>> +Q.7: Apart from the global pool of threads, can I have my own private Queue ?
>> +A.7: Yes, the threadlets framework allows subsystems to create their own private
>> +     queues with associated pools of threads.
>> +
>> +     - Define a PrivateQueue
>> +       ThreadletQueue myQueue;
>> +
>> +     - Initialize it:
>> +       threadlet_queue_init(&myQueue, my_max_threads, my_min_threads);
>> +       where my_max_threads is the maximum number of threads that can be in the
>> +       thread pool and my_min_threads is the minimum number of active threads
>> +       that can be in the thread-pool.
>> +
>> +     - Submit work:
>> +       submit_threadletwork_to_queue(&myQueue, &my_work);
>> +
>> +     - Cancel work:
>> +       cancel_threadletwork_on_queue(&myQueue, &my_work);
>> diff --git a/qemu-threadlets.c b/qemu-threadlets.c
>> new file mode 100644
>> index 0000000..fd33752
>> --- /dev/null
>> +++ b/qemu-threadlets.c
>> @@ -0,0 +1,165 @@
>> +/*
>> + * Threadlet support for offloading tasks to be executed asynchronously
>> + *
>> + * Copyright IBM, Corp. 2008
>> + * Copyright IBM, Corp. 2010
>> + *
>> + * Authors:
>> + *  Anthony Liguori     <aliguori@us.ibm.com>
>> + *  Aneesh Kumar K.V    <aneesh.kumar@linux.vnet.ibm.com>
>> + *  Gautham R Shenoy    <ego@in.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu-threadlets.h"
>> +#include "osdep.h"
>> +
>> +#define MAX_GLOBAL_THREADS  64
>> +#define MIN_GLOBAL_THREADS  64
>> +static ThreadletQueue globalqueue;
>> +static int globalqueue_init;
>> +
>> +static void *threadlet_worker(void *data)
>> +{
>> +    ThreadletQueue *queue = data;
>> +
> Ideally you need
> 
>         s = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
> 
> But qemu will need to wrap this around as well.
>> +    qemu_mutex_lock(&(queue->lock));
>> +    while (1) {
>> +        ThreadletWork *work;
>> +        int ret = 0;
>> +
>> +        while (QTAILQ_EMPTY(&(queue->request_list)) &&
>> +               (ret != ETIMEDOUT)) {
>> +            ret = qemu_cond_timedwait(&(queue->cond),
>> +					 &(queue->lock), 10*100000);
> 
> Ewww... what is 10*100000, can we use something more meaningful
> please?

Or at least some comment...
> 
>> +        }
>> +
>> +        assert(queue->idle_threads != 0);
> 
> This assertion holds because we believe one of the idle_threads
> actually did the dequeuing, right?

Correct.. or there is no work..and we are woken up by the time out.
Of course in that case #of worker threads should be min_threads.
> 
>> +        if (QTAILQ_EMPTY(&(queue->request_list))) {
>> +            if (queue->cur_threads > queue->min_threads) {
>> +                /* We retain the minimum number of threads */
>> +                break;
>> +            }
>> +        } else {
>> +            work = QTAILQ_FIRST(&(queue->request_list));
>> +            QTAILQ_REMOVE(&(queue->request_list), work, node);
>> +
>> +            queue->idle_threads--;
>> +            qemu_mutex_unlock(&(queue->lock));
>> +
>> +            /* execute the work function */
>> +            work->func(work);
>> +
>> +            qemu_mutex_lock(&(queue->lock));
>> +            queue->idle_threads++;
>> +        }
>> +    }
>> +
>> +    queue->idle_threads--;
>> +    queue->cur_threads--;
>> +    qemu_mutex_unlock(&(queue->lock));
>> +
>> +    return NULL;
> Does anybody do a join on the exiting thread from the pool?
Not sure what you mean here. We keep min threads and float
threads up to max limit on need basis.
> 
>> +}
>> +
>> +static void spawn_threadlet(ThreadletQueue *queue)
>> +{
>> +    QemuThread thread;
>> +
>> +    queue->cur_threads++;
>> +    queue->idle_threads++;
>> +
>> +    qemu_thread_create(&thread, threadlet_worker, queue);
> 
>> +}
>> +
>> +/**
>> + * submit_threadletwork_to_queue: Submit a new task to a private queue to be
>> + *                            executed asynchronously.
>> + * @queue: Per-subsystem private queue to which the new task needs
>> + *         to be submitted.
>> + * @work: Contains information about the task that needs to be submitted.
>> + */
>> +void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
>> +{
>> +    qemu_mutex_lock(&(queue->lock));
>> +    if (queue->idle_threads == 0 && queue->cur_threads < queue->max_threads) {
>> +        spawn_threadlet(queue);
> 
> So we hold queue->lock, spawn the thread, the spawned thread tries to
> acquire queue->lock
> 
>> +    }
>> +    QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
>> +    qemu_mutex_unlock(&(queue->lock));
>> +    qemu_cond_signal(&(queue->cond));
> 
> In the case that we just spawned the threadlet, the cond_signal is
> spurious. If we need predictable scheduling behaviour,
> qemu_cond_signal needs to happen with queue->lock held.
> 
> I'd rewrite the function as 
> 
> /**
>  * submit_threadletwork_to_queue: Submit a new task to a private queue to be
>  *                            executed asynchronously.
>  * @queue: Per-subsystem private queue to which the new task needs
>  *         to be submitted.
>  * @work: Contains information about the task that needs to be submitted.
>  */
> void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
> {
>     qemu_mutex_lock(&(queue->lock));
>     if (queue->idle_threads == 0 && (queue->cur_threads < queue->max_threads)) {
>         spawn_threadlet(queue);
>     } else {
>         qemu_cond_signal(&(queue->cond));
>     }
>     QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
>     qemu_mutex_unlock(&(queue->lock));
> }
> 
This looks fine to me. may be more cleaner than the previous one..but functionally
not much different.

>> +/**
>> + * submit_threadletwork: Submit to the global queue a new task to be executed
>> + *                   asynchronously.
>> + * @work: Contains information about the task that needs to be submitted.
>> + */
>> +void submit_threadletwork(ThreadletWork *work)
>> +{
>> +    if (unlikely(!globalqueue_init)) {
>> +        threadlet_queue_init(&globalqueue, MAX_GLOBAL_THREADS,
>> +                                MIN_GLOBAL_THREADS);
>> +        globalqueue_init = 1;
>> +    }
> 
> What protects globalqueue_init?

It should be called in vCPU thread context which is serialized by definition.

- JV

> 
>> +
>> +    submit_threadletwork_to_queue(&globalqueue, work);
>> +}
>> +
>> +/**
>> + * cancel_threadletwork_on_queue: Cancel a task queued on a Queue.
>> + * @queue: The queue containing the task to be cancelled.
>> + * @work: Contains the information of the task that needs to be cancelled.
>> + *
>> + * Returns: 0 if the task is successfully cancelled.
>> + *          1 otherwise.
>> + */
>> +int cancel_threadletwork_on_queue(ThreadletQueue *queue, ThreadletWork *work)
>> +{
>> +    ThreadletWork *ret_work;
>> +    int ret = 0;
>> +
>> +    qemu_mutex_lock(&(queue->lock));
>> +    QTAILQ_FOREACH(ret_work, &(queue->request_list), node) {
>> +        if (ret_work == work) {
>> +            QTAILQ_REMOVE(&(queue->request_list), ret_work, node);
>> +            ret = 1;
>> +            break;
>> +        }
>> +    }
>> +    qemu_mutex_unlock(&(queue->lock));
>> +
>> +    return ret;
>> +}
>> +
>> +/**
>> + * cancel_threadletwork: Cancel a task queued on the global queue.
> 
> NOTE: cancel is a confusing term, thread cancel is different from
> cancelling a job on the global queue, I'd preferrably call this
> dequeue_threadletwork
> 
> Generic question, is thread a reason to use threadletwork as one word,
> instead of threadlet_work? Specially since the data structure is
> called ThreadletWork.
> 
>> + * @work: Contains the information of the task that needs to be cancelled.
>> + *
>> + * Returns: 0 if the task is successfully cancelled.
>> + *          1 otherwise.
>> + */
>> +int cancel_threadletwork(ThreadletWork *work)
>> +{
>> +    return cancel_threadletwork_on_queue(&globalqueue, work);
>> +}
>> +
>> +/**
>> + * threadlet_queue_init: Initialize a threadlet queue.
>> + * @queue: The threadlet queue to be initialized.
>> + * @max_threads: Maximum number of threads processing the queue.
>> + * @min_threads: Minimum number of threads processing the queue.
>> + */
>> +void threadlet_queue_init(ThreadletQueue *queue,
>> +				    int max_threads, int min_threads)
>> +{
>> +    queue->cur_threads  = 0;
>> +    queue->idle_threads = 0;
>> +    queue->max_threads  = max_threads;
>> +    queue->min_threads  = min_threads;
>> +    QTAILQ_INIT(&(queue->request_list));
>> +    qemu_mutex_init(&(queue->lock));
>> +    qemu_cond_init(&(queue->cond));
>> +}
>> diff --git a/qemu-threadlets.h b/qemu-threadlets.h
>> new file mode 100644
>> index 0000000..9c8f9e5
>> --- /dev/null
>> +++ b/qemu-threadlets.h
>> @@ -0,0 +1,48 @@
>> +/*
>> + * Threadlet support for offloading tasks to be executed asynchronously
>> + *
>> + * Copyright IBM, Corp. 2008
>> + * Copyright IBM, Corp. 2010
>> + *
>> + * Authors:
>> + *  Anthony Liguori     <aliguori@us.ibm.com>
>> + *  Aneesh Kumar K.V    <aneesh.kumar@linux.vnet.ibm.com>
>> + *  Gautham R Shenoy    <ego@in.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + */
>> +
>> +#ifndef QEMU_ASYNC_WORK_H
>> +#define QEMU_ASYNC_WORK_H
>> +
>> +#include "qemu-queue.h"
>> +#include "qemu-common.h"
>> +#include "qemu-thread.h"
>> +
>> +typedef struct ThreadletQueue
>> +{
>> +    QemuMutex lock;
>> +    QemuCond cond;
>> +    int max_threads;
>> +    int min_threads;
>> +    int cur_threads;
>> +    int idle_threads;
>> +    QTAILQ_HEAD(, ThreadletWork) request_list;
>> +} ThreadletQueue;
>> +
>> +typedef struct ThreadletWork
>> +{
>> +    QTAILQ_ENTRY(ThreadletWork) node;
>> +    void (*func)(struct ThreadletWork *work);
>> +} ThreadletWork;
>> +
>> +extern void submit_threadletwork_to_queue(ThreadletQueue *queue,
>> +                                      ThreadletWork *work);
>> +extern void submit_threadletwork(ThreadletWork *work);
>> +extern int cancel_threadletwork_on_queue(ThreadletQueue *queue,
>> +                                        ThreadletWork *work);
>> +extern int cancel_threadletwork(ThreadletWork *work);
>> +extern void threadlet_queue_init(ThreadletQueue *queue, int max_threads,
>> +                                 int min_threads);
>> +#endif
>>
>>
>
Anthony Liguori Oct. 19, 2010, 9:36 p.m. UTC | #6
On 10/19/2010 01:36 PM, Balbir Singh wrote:
>> +    qemu_mutex_lock(&(queue->lock));
>> +    while (1) {
>> +        ThreadletWork *work;
>> +        int ret = 0;
>> +
>> +        while (QTAILQ_EMPTY(&(queue->request_list))&&
>> +               (ret != ETIMEDOUT)) {
>> +            ret = qemu_cond_timedwait(&(queue->cond),
>> +					&(queue->lock), 10*100000);
>>      
> Ewww... what is 10*100000, can we use something more meaningful
> please?
>    

A define is fine but honestly, it's pretty darn obvious what it means...

>> +        }
>> +
>> +        assert(queue->idle_threads != 0);
>>      
> This assertion holds because we believe one of the idle_threads
> actually did the dequeuing, right?
>    

An idle thread is a thread is one that is not doing work.  At this point 
in the code, we are not doing any work (yet) so if idle_threads count is 
zero, something is horribly wrong.  We're also going to unconditionally 
decrement in the future code path which means that if idle_threads is 0, 
it's going to become -1.

The use of idle_thread is to detect whether it's necessary to spawn an 
additional thread.

>> +        if (QTAILQ_EMPTY(&(queue->request_list))) {
>> +            if (queue->cur_threads>  queue->min_threads) {
>> +                /* We retain the minimum number of threads */
>> +                break;
>> +            }
>> +        } else {
>> +            work = QTAILQ_FIRST(&(queue->request_list));
>> +            QTAILQ_REMOVE(&(queue->request_list), work, node);
>> +
>> +            queue->idle_threads--;
>> +            qemu_mutex_unlock(&(queue->lock));
>> +
>> +            /* execute the work function */
>> +            work->func(work);
>> +
>> +            qemu_mutex_lock(&(queue->lock));
>> +            queue->idle_threads++;
>> +        }
>> +    }
>> +
>> +    queue->idle_threads--;
>> +    queue->cur_threads--;
>> +    qemu_mutex_unlock(&(queue->lock));
>> +
>> +    return NULL;
>>      
> Does anybody do a join on the exiting thread from the pool?
>    

No.  The thread is created in a detached state.

>> +}
>> +
>> +static void spawn_threadlet(ThreadletQueue *queue)
>> +{
>> +    QemuThread thread;
>> +
>> +    queue->cur_threads++;
>> +    queue->idle_threads++;
>> +
>> +    qemu_thread_create(&thread, threadlet_worker, queue);
>>      
>    
>> +}
>> +
>> +/**
>> + * submit_threadletwork_to_queue: Submit a new task to a private queue to be
>> + *                            executed asynchronously.
>> + * @queue: Per-subsystem private queue to which the new task needs
>> + *         to be submitted.
>> + * @work: Contains information about the task that needs to be submitted.
>> + */
>> +void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
>> +{
>> +    qemu_mutex_lock(&(queue->lock));
>> +    if (queue->idle_threads == 0&&  queue->cur_threads<  queue->max_threads) {
>> +        spawn_threadlet(queue);
>>      
> So we hold queue->lock, spawn the thread, the spawned thread tries to
> acquire queue->lock
>    

Yup.

>> +    }
>> +    QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
>> +    qemu_mutex_unlock(&(queue->lock));
>> +    qemu_cond_signal(&(queue->cond));
>>      
> In the case that we just spawned the threadlet, the cond_signal is
> spurious. If we need predictable scheduling behaviour,
> qemu_cond_signal needs to happen with queue->lock held.
>    

It doesn't really affect predictability..

> I'd rewrite the function as
>
> /**
>   * submit_threadletwork_to_queue: Submit a new task to a private queue to be
>   *                            executed asynchronously.
>   * @queue: Per-subsystem private queue to which the new task needs
>   *         to be submitted.
>   * @work: Contains information about the task that needs to be submitted.
>   */
> void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
> {
>      qemu_mutex_lock(&(queue->lock));
>      if (queue->idle_threads == 0&&  (queue->cur_threads<  queue->max_threads)) {
>          spawn_threadlet(queue);
>      } else {
>          qemu_cond_signal(&(queue->cond));
>      }
>      QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
>      qemu_mutex_unlock(&(queue->lock));
> }
>    

I think this is a lot more fragile.  You're relying on the fact that 
signal will not cause the signalled thread to actually awaken until we 
release the lock and doing work after signalling that the signalled 
thread needs to be completed before it wakes up.

I think you're a lot more robust in the long term if you treat condition 
signalling as a hand off point because it makes the code a lot more 
explicit about what's happening.

>> +/**
>> + * submit_threadletwork: Submit to the global queue a new task to be executed
>> + *                   asynchronously.
>> + * @work: Contains information about the task that needs to be submitted.
>> + */
>> +void submit_threadletwork(ThreadletWork *work)
>> +{
>> +    if (unlikely(!globalqueue_init)) {
>> +        threadlet_queue_init(&globalqueue, MAX_GLOBAL_THREADS,
>> +                                MIN_GLOBAL_THREADS);
>> +        globalqueue_init = 1;
>> +    }
>>      
> What protects globalqueue_init?
>    

qemu_mutex, and that unlikely is almost certainly a premature optimization.

Regards,

Anthony Liguori
Balbir Singh Oct. 20, 2010, 2:22 a.m. UTC | #7
* Anthony Liguori <anthony@codemonkey.ws> [2010-10-19 16:36:31]:

> On 10/19/2010 01:36 PM, Balbir Singh wrote:
> >>+    qemu_mutex_lock(&(queue->lock));
> >>+    while (1) {
> >>+        ThreadletWork *work;
> >>+        int ret = 0;
> >>+
> >>+        while (QTAILQ_EMPTY(&(queue->request_list))&&
> >>+               (ret != ETIMEDOUT)) {
> >>+            ret = qemu_cond_timedwait(&(queue->cond),
> >>+					&(queue->lock), 10*100000);
> >Ewww... what is 10*100000, can we use something more meaningful
> >please?
> 
> A define is fine but honestly, it's pretty darn obvious what it means...
> 
> >>+        }
> >>+
> >>+        assert(queue->idle_threads != 0);
> >This assertion holds because we believe one of the idle_threads
> >actually did the dequeuing, right?
> 
> An idle thread is a thread is one that is not doing work.  At this
> point in the code, we are not doing any work (yet) so if
> idle_threads count is zero, something is horribly wrong.  We're also
> going to unconditionally decrement in the future code path which
> means that if idle_threads is 0, it's going to become -1.
> 
> The use of idle_thread is to detect whether it's necessary to spawn
> an additional thread.
>

We can hit this assert if pthread_cond_signal() is called outside of
the mutex, let me try and explain below
 
> >>+        if (QTAILQ_EMPTY(&(queue->request_list))) {
> >>+            if (queue->cur_threads>  queue->min_threads) {
> >>+                /* We retain the minimum number of threads */
> >>+                break;
> >>+            }
> >>+        } else {
> >>+            work = QTAILQ_FIRST(&(queue->request_list));
> >>+            QTAILQ_REMOVE(&(queue->request_list), work, node);
> >>+
> >>+            queue->idle_threads--;
> >>+            qemu_mutex_unlock(&(queue->lock));
> >>+
> >>+            /* execute the work function */
> >>+            work->func(work);
> >>+
> >>+            qemu_mutex_lock(&(queue->lock));
> >>+            queue->idle_threads++;
> >>+        }
> >>+    }
> >>+
> >>+    queue->idle_threads--;
> >>+    queue->cur_threads--;
> >>+    qemu_mutex_unlock(&(queue->lock));
> >>+
> >>+    return NULL;
> >Does anybody do a join on the exiting thread from the pool?
> 
> No.  The thread is created in a detached state.
>

That makes sense, thanks for clarifying
 
> >>+}
> >>+
> >>+static void spawn_threadlet(ThreadletQueue *queue)
> >>+{
> >>+    QemuThread thread;
> >>+
> >>+    queue->cur_threads++;
> >>+    queue->idle_threads++;
> >>+
> >>+    qemu_thread_create(&thread, threadlet_worker, queue);
> >>+}
> >>+
> >>+/**
> >>+ * submit_threadletwork_to_queue: Submit a new task to a private queue to be
> >>+ *                            executed asynchronously.
> >>+ * @queue: Per-subsystem private queue to which the new task needs
> >>+ *         to be submitted.
> >>+ * @work: Contains information about the task that needs to be submitted.
> >>+ */
> >>+void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
> >>+{
> >>+    qemu_mutex_lock(&(queue->lock));
> >>+    if (queue->idle_threads == 0&&  queue->cur_threads<  queue->max_threads) {
> >>+        spawn_threadlet(queue);
> >So we hold queue->lock, spawn the thread, the spawned thread tries to
> >acquire queue->lock
> 
> Yup.
> 
> >>+    }
> >>+    QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
> >>+    qemu_mutex_unlock(&(queue->lock));
> >>+    qemu_cond_signal(&(queue->cond));
> >In the case that we just spawned the threadlet, the cond_signal is
> >spurious. If we need predictable scheduling behaviour,
> >qemu_cond_signal needs to happen with queue->lock held.
> 
> It doesn't really affect predictability..
> 
> >I'd rewrite the function as
> >
> >/**
> >  * submit_threadletwork_to_queue: Submit a new task to a private queue to be
> >  *                            executed asynchronously.
> >  * @queue: Per-subsystem private queue to which the new task needs
> >  *         to be submitted.
> >  * @work: Contains information about the task that needs to be submitted.
> >  */
> >void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
> >{
> >     qemu_mutex_lock(&(queue->lock));
> >     if (queue->idle_threads == 0&&  (queue->cur_threads<  queue->max_threads)) {
> >         spawn_threadlet(queue);
> >     } else {
> >         qemu_cond_signal(&(queue->cond));
> >     }
> >     QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
> >     qemu_mutex_unlock(&(queue->lock));
> >}
> 
> I think this is a lot more fragile.  You're relying on the fact that
> signal will not cause the signalled thread to actually awaken until
> we release the lock and doing work after signalling that the
> signalled thread needs to be completed before it wakes up.
> 
> I think you're a lot more robust in the long term if you treat
> condition signalling as a hand off point because it makes the code a
> lot more explicit about what's happening.
>

OK, here is a situation that can happen

T1                              T2
---                             ---
threadlet                       submit_threadletwork_to_queue
(sees condition as no work)     mutex_lock
qemu_cond_timedwait             add_work
...                             mutex_unlock

T3
--
cancel_threadlet_work_on_queue
mutex_lock (grabs it) before T1 can
cancels the work


                                qemu_cond_signal

T1
--
Grabs mutex_lock (from within cond_timedwait)
Now there is no work to do, the condition
has changed before the thread wakes up


The man page also states

"however, if predictable scheduling behavior is required, then that
mutex shall be locked by the thread calling pthread_cond_broadcast()
or pthread_cond_signal()"

> >>+/**
> >>+ * submit_threadletwork: Submit to the global queue a new task to be executed
> >>+ *                   asynchronously.
> >>+ * @work: Contains information about the task that needs to be submitted.
> >>+ */
> >>+void submit_threadletwork(ThreadletWork *work)
> >>+{
> >>+    if (unlikely(!globalqueue_init)) {
> >>+        threadlet_queue_init(&globalqueue, MAX_GLOBAL_THREADS,
> >>+                                MIN_GLOBAL_THREADS);
> >>+        globalqueue_init = 1;
> >>+    }
> >What protects globalqueue_init?
> 
> qemu_mutex, and that unlikely is almost certainly a premature optimization.
> 
> Regards,
> 
> Anthony Liguori
>
Balbir Singh Oct. 20, 2010, 2:26 a.m. UTC | #8
* Venkateswararao Jujjuri (JV) <jvrao@linux.vnet.ibm.com> [2010-10-19 14:00:24]:

> > 
> > In the case that we just spawned the threadlet, the cond_signal is
> > spurious. If we need predictable scheduling behaviour,
> > qemu_cond_signal needs to happen with queue->lock held.
> > 
> > I'd rewrite the function as 
> > 
> > /**
> >  * submit_threadletwork_to_queue: Submit a new task to a private queue to be
> >  *                            executed asynchronously.
> >  * @queue: Per-subsystem private queue to which the new task needs
> >  *         to be submitted.
> >  * @work: Contains information about the task that needs to be submitted.
> >  */
> > void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
> > {
> >     qemu_mutex_lock(&(queue->lock));
> >     if (queue->idle_threads == 0 && (queue->cur_threads < queue->max_threads)) {
> >         spawn_threadlet(queue);
> >     } else {
> >         qemu_cond_signal(&(queue->cond));
> >     }
> >     QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
> >     qemu_mutex_unlock(&(queue->lock));
> > }
> > 
> This looks fine to me. may be more cleaner than the previous one..but functionally
> not much different.
>

It potentially does better at avoiding the spurious wakeup problem
(reduces the window). In another email I mentioned the man page says

"however, if predictable scheduling behavior is required, then that
mutex shall be locked by the thread calling pthread_cond_broadcast()
or pthread_cond_signal()"
jvrao Oct. 20, 2010, 3:19 a.m. UTC | #9
On 10/19/2010 2:36 PM, Anthony Liguori wrote:
> On 10/19/2010 01:36 PM, Balbir Singh wrote:
>>> +    qemu_mutex_lock(&(queue->lock));
>>> +    while (1) {
>>> +        ThreadletWork *work;
>>> +        int ret = 0;
>>> +
>>> +        while (QTAILQ_EMPTY(&(queue->request_list))&&
>>> +               (ret != ETIMEDOUT)) {
>>> +            ret = qemu_cond_timedwait(&(queue->cond),
>>> +                    &(queue->lock), 10*100000);
>>>      
>> Ewww... what is 10*100000, can we use something more meaningful
>> please?
>>    
> 
> A define is fine but honestly, it's pretty darn obvious what it means...
> 
>>> +        }
>>> +
>>> +        assert(queue->idle_threads != 0);
>>>      
>> This assertion holds because we believe one of the idle_threads
>> actually did the dequeuing, right?
>>    
> 
> An idle thread is a thread is one that is not doing work.  At this point in the
> code, we are not doing any work (yet) so if idle_threads count is zero,
> something is horribly wrong.  We're also going to unconditionally decrement in
> the future code path which means that if idle_threads is 0, it's going to become
> -1.
> 
> The use of idle_thread is to detect whether it's necessary to spawn an
> additional thread.
> 
>>> +        if (QTAILQ_EMPTY(&(queue->request_list))) {
>>> +            if (queue->cur_threads>  queue->min_threads) {
>>> +                /* We retain the minimum number of threads */
>>> +                break;
>>> +            }
>>> +        } else {
>>> +            work = QTAILQ_FIRST(&(queue->request_list));
>>> +            QTAILQ_REMOVE(&(queue->request_list), work, node);
>>> +
>>> +            queue->idle_threads--;
>>> +            qemu_mutex_unlock(&(queue->lock));
>>> +
>>> +            /* execute the work function */
>>> +            work->func(work);
>>> +
>>> +            qemu_mutex_lock(&(queue->lock));
>>> +            queue->idle_threads++;
>>> +        }
>>> +    }
>>> +
>>> +    queue->idle_threads--;
>>> +    queue->cur_threads--;
>>> +    qemu_mutex_unlock(&(queue->lock));
>>> +
>>> +    return NULL;
>>>      
>> Does anybody do a join on the exiting thread from the pool?
>>    
> 
> No.  The thread is created in a detached state.
> 
>>> +}
>>> +
>>> +static void spawn_threadlet(ThreadletQueue *queue)
>>> +{
>>> +    QemuThread thread;
>>> +
>>> +    queue->cur_threads++;
>>> +    queue->idle_threads++;
>>> +
>>> +    qemu_thread_create(&thread, threadlet_worker, queue);
>>>      
>>   
>>> +}
>>> +
>>> +/**
>>> + * submit_threadletwork_to_queue: Submit a new task to a private queue to be
>>> + *                            executed asynchronously.
>>> + * @queue: Per-subsystem private queue to which the new task needs
>>> + *         to be submitted.
>>> + * @work: Contains information about the task that needs to be submitted.
>>> + */
>>> +void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
>>> +{
>>> +    qemu_mutex_lock(&(queue->lock));
>>> +    if (queue->idle_threads == 0&&  queue->cur_threads<  queue->max_threads) {
>>> +        spawn_threadlet(queue);
>>>      
>> So we hold queue->lock, spawn the thread, the spawned thread tries to
>> acquire queue->lock
>>    
> 
> Yup.
> 
>>> +    }
>>> +    QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
>>> +    qemu_mutex_unlock(&(queue->lock));
>>> +    qemu_cond_signal(&(queue->cond));
>>>      
>> In the case that we just spawned the threadlet, the cond_signal is
>> spurious. If we need predictable scheduling behaviour,
>> qemu_cond_signal needs to happen with queue->lock held.
>>    
> 
> It doesn't really affect predictability..
> 
>> I'd rewrite the function as
>>
>> /**
>>   * submit_threadletwork_to_queue: Submit a new task to a private queue to be
>>   *                            executed asynchronously.
>>   * @queue: Per-subsystem private queue to which the new task needs
>>   *         to be submitted.
>>   * @work: Contains information about the task that needs to be submitted.
>>   */
>> void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
>> {
>>      qemu_mutex_lock(&(queue->lock));
>>      if (queue->idle_threads == 0&&  (queue->cur_threads<  queue->max_threads)) {
>>          spawn_threadlet(queue);
>>      } else {
>>          qemu_cond_signal(&(queue->cond));
>>      }
>>      QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
>>      qemu_mutex_unlock(&(queue->lock));
>> }
>>    
> 
> I think this is a lot more fragile.  You're relying on the fact that signal will
> not cause the signalled thread to actually awaken until we release the lock and
> doing work after signalling that the signalled thread needs to be completed
> before it wakes up.

Given that qemu_cond_timedwait() need to get the queue->lock before returning
the singalled thread will wakeup and wait on the queue->lock.

- JV

> 
> I think you're a lot more robust in the long term if you treat condition
> signalling as a hand off point because it makes the code a lot more explicit
> about what's happening.
> 
>>> +/**
>>> + * submit_threadletwork: Submit to the global queue a new task to be executed
>>> + *                   asynchronously.
>>> + * @work: Contains information about the task that needs to be submitted.
>>> + */
>>> +void submit_threadletwork(ThreadletWork *work)
>>> +{
>>> +    if (unlikely(!globalqueue_init)) {
>>> +        threadlet_queue_init(&globalqueue, MAX_GLOBAL_THREADS,
>>> +                                MIN_GLOBAL_THREADS);
>>> +        globalqueue_init = 1;
>>> +    }
>>>      
>> What protects globalqueue_init?
>>    
> 
> qemu_mutex, and that unlikely is almost certainly a premature optimization.
> 
> Regards,
> 
> Anthony Liguori
> 
>
jvrao Oct. 20, 2010, 3:46 a.m. UTC | #10
On 10/19/2010 7:22 PM, Balbir Singh wrote:
> * Anthony Liguori <anthony@codemonkey.ws> [2010-10-19 16:36:31]:
> 
>> On 10/19/2010 01:36 PM, Balbir Singh wrote:
>>>> +    qemu_mutex_lock(&(queue->lock));
>>>> +    while (1) {
>>>> +        ThreadletWork *work;
>>>> +        int ret = 0;
>>>> +
>>>> +        while (QTAILQ_EMPTY(&(queue->request_list))&&
>>>> +               (ret != ETIMEDOUT)) {
>>>> +            ret = qemu_cond_timedwait(&(queue->cond),
>>>> +					&(queue->lock), 10*100000);
>>> Ewww... what is 10*100000, can we use something more meaningful
>>> please?
>>
>> A define is fine but honestly, it's pretty darn obvious what it means...
>>
>>>> +        }
>>>> +
>>>> +        assert(queue->idle_threads != 0);
>>> This assertion holds because we believe one of the idle_threads
>>> actually did the dequeuing, right?
>>
>> An idle thread is a thread is one that is not doing work.  At this
>> point in the code, we are not doing any work (yet) so if
>> idle_threads count is zero, something is horribly wrong.  We're also
>> going to unconditionally decrement in the future code path which
>> means that if idle_threads is 0, it's going to become -1.
>>
>> The use of idle_thread is to detect whether it's necessary to spawn
>> an additional thread.
>>
> 
> We can hit this assert if pthread_cond_signal() is called outside of
> the mutex, let me try and explain below
> 
>>>> +        if (QTAILQ_EMPTY(&(queue->request_list))) {
>>>> +            if (queue->cur_threads>  queue->min_threads) {
>>>> +                /* We retain the minimum number of threads */
>>>> +                break;
>>>> +            }
>>>> +        } else {
>>>> +            work = QTAILQ_FIRST(&(queue->request_list));
>>>> +            QTAILQ_REMOVE(&(queue->request_list), work, node);
>>>> +
>>>> +            queue->idle_threads--;
>>>> +            qemu_mutex_unlock(&(queue->lock));
>>>> +
>>>> +            /* execute the work function */
>>>> +            work->func(work);
>>>> +
>>>> +            qemu_mutex_lock(&(queue->lock));
>>>> +            queue->idle_threads++;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    queue->idle_threads--;
>>>> +    queue->cur_threads--;
>>>> +    qemu_mutex_unlock(&(queue->lock));
>>>> +
>>>> +    return NULL;
>>> Does anybody do a join on the exiting thread from the pool?
>>
>> No.  The thread is created in a detached state.
>>
> 
> That makes sense, thanks for clarifying
> 
>>>> +}
>>>> +
>>>> +static void spawn_threadlet(ThreadletQueue *queue)
>>>> +{
>>>> +    QemuThread thread;
>>>> +
>>>> +    queue->cur_threads++;
>>>> +    queue->idle_threads++;
>>>> +
>>>> +    qemu_thread_create(&thread, threadlet_worker, queue);
>>>> +}
>>>> +
>>>> +/**
>>>> + * submit_threadletwork_to_queue: Submit a new task to a private queue to be
>>>> + *                            executed asynchronously.
>>>> + * @queue: Per-subsystem private queue to which the new task needs
>>>> + *         to be submitted.
>>>> + * @work: Contains information about the task that needs to be submitted.
>>>> + */
>>>> +void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
>>>> +{
>>>> +    qemu_mutex_lock(&(queue->lock));
>>>> +    if (queue->idle_threads == 0&&  queue->cur_threads<  queue->max_threads) {
>>>> +        spawn_threadlet(queue);
>>> So we hold queue->lock, spawn the thread, the spawned thread tries to
>>> acquire queue->lock
>>
>> Yup.
>>
>>>> +    }
>>>> +    QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
>>>> +    qemu_mutex_unlock(&(queue->lock));
>>>> +    qemu_cond_signal(&(queue->cond));
>>> In the case that we just spawned the threadlet, the cond_signal is
>>> spurious. If we need predictable scheduling behaviour,
>>> qemu_cond_signal needs to happen with queue->lock held.
>>
>> It doesn't really affect predictability..
>>
>>> I'd rewrite the function as
>>>
>>> /**
>>>  * submit_threadletwork_to_queue: Submit a new task to a private queue to be
>>>  *                            executed asynchronously.
>>>  * @queue: Per-subsystem private queue to which the new task needs
>>>  *         to be submitted.
>>>  * @work: Contains information about the task that needs to be submitted.
>>>  */
>>> void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
>>> {
>>>     qemu_mutex_lock(&(queue->lock));
>>>     if (queue->idle_threads == 0&&  (queue->cur_threads<  queue->max_threads)) {
>>>         spawn_threadlet(queue);
>>>     } else {
>>>         qemu_cond_signal(&(queue->cond));
>>>     }
>>>     QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
>>>     qemu_mutex_unlock(&(queue->lock));
>>> }
>>
>> I think this is a lot more fragile.  You're relying on the fact that
>> signal will not cause the signalled thread to actually awaken until
>> we release the lock and doing work after signalling that the
>> signalled thread needs to be completed before it wakes up.
>>
>> I think you're a lot more robust in the long term if you treat
>> condition signalling as a hand off point because it makes the code a
>> lot more explicit about what's happening.
>>
> 
> OK, here is a situation that can happen
> 
> T1                              T2
> ---                             ---
> threadlet                       submit_threadletwork_to_queue
> (sees condition as no work)     mutex_lock
> qemu_cond_timedwait             add_work
> ...                             mutex_unlock
> 
> T3
> --
> cancel_threadlet_work_on_queue
> mutex_lock (grabs it) before T1 can
> cancels the work
> 
> 
>                                 qemu_cond_signal
> 
> T1
> --
> Grabs mutex_lock (from within cond_timedwait)
> Now there is no work to do, the condition
> has changed before the thread wakes up

So what? It won't find any work and goes back to sleep or exits.

idle_threads is decremented only in threadlet_worker(). Given that
we have a threadlet that is not doing anywork the assert should never hit unless
something horribly wrong .

- JV

> 
> 
> The man page also states
> 
> "however, if predictable scheduling behavior is required, then that
> mutex shall be locked by the thread calling pthread_cond_broadcast()
> or pthread_cond_signal()"
> 
>>>> +/**
>>>> + * submit_threadletwork: Submit to the global queue a new task to be executed
>>>> + *                   asynchronously.
>>>> + * @work: Contains information about the task that needs to be submitted.
>>>> + */
>>>> +void submit_threadletwork(ThreadletWork *work)
>>>> +{
>>>> +    if (unlikely(!globalqueue_init)) {
>>>> +        threadlet_queue_init(&globalqueue, MAX_GLOBAL_THREADS,
>>>> +                                MIN_GLOBAL_THREADS);
>>>> +        globalqueue_init = 1;
>>>> +    }
>>> What protects globalqueue_init?
>>
>> qemu_mutex, and that unlikely is almost certainly a premature optimization.
>>
>> Regards,
>>
>> Anthony Liguori
>>
>
Stefan Hajnoczi Oct. 20, 2010, 8:16 a.m. UTC | #11
On Tue, Oct 19, 2010 at 6:42 PM, Arun R Bharadwaj
<arun@linux.vnet.ibm.com> wrote:
> +/**
> + * cancel_threadletwork_on_queue: Cancel a task queued on a Queue.
> + * @queue: The queue containing the task to be cancelled.
> + * @work: Contains the information of the task that needs to be cancelled.
> + *
> + * Returns: 0 if the task is successfully cancelled.
> + *          1 otherwise.

The return value comment doesn't correspond to how I read the code.
If the work was cancelled the code returns 1.  Otherwise it returns 0.

> + */
> +int cancel_threadletwork_on_queue(ThreadletQueue *queue, ThreadletWork *work)
> +{
> +    ThreadletWork *ret_work;
> +    int ret = 0;
> +
> +    qemu_mutex_lock(&(queue->lock));
> +    QTAILQ_FOREACH(ret_work, &(queue->request_list), node) {
> +        if (ret_work == work) {
> +            QTAILQ_REMOVE(&(queue->request_list), ret_work, node);
> +            ret = 1;
> +            break;
> +        }
> +    }
> +    qemu_mutex_unlock(&(queue->lock));
> +
> +    return ret;
> +}

Stefan
Balbir Singh Oct. 20, 2010, 1:05 p.m. UTC | #12
* Venkateswararao Jujjuri (JV) <jvrao@linux.vnet.ibm.com> [2010-10-19 20:46:35]:

> >> I think this is a lot more fragile.  You're relying on the fact that
> >> signal will not cause the signalled thread to actually awaken until
> >> we release the lock and doing work after signalling that the
> >> signalled thread needs to be completed before it wakes up.
> >>
> >> I think you're a lot more robust in the long term if you treat
> >> condition signalling as a hand off point because it makes the code a
> >> lot more explicit about what's happening.
> >>
> > 
> > OK, here is a situation that can happen
> > 
> > T1                              T2
> > ---                             ---
> > threadlet                       submit_threadletwork_to_queue
> > (sees condition as no work)     mutex_lock
> > qemu_cond_timedwait             add_work
> > ...                             mutex_unlock
> > 
> > T3
> > --
> > cancel_threadlet_work_on_queue
> > mutex_lock (grabs it) before T1 can
> > cancels the work
> > 
> > 
> >                                 qemu_cond_signal
> > 
> > T1
> > --
> > Grabs mutex_lock (from within cond_timedwait)
> > Now there is no work to do, the condition
> > has changed before the thread wakes up
> 
> So what? It won't find any work and goes back to sleep or exits.
>

Spurious wakeups are not good - they waste CPU cycles, consume energy.
Beyond that if we look at generic design

a. We want the thread condition to not change before it wakes up
(reduce that window at-least)
b. Although we don't care about thread priorities today in threadlet,
if we ever did and by good design you'd want the thread your waking up
to be contending for the mutex as soon the notifier releases the lock,
otherwise a lower priority thread can starve the original sleeper.

The code as posted today, does not have functional issues except for
opening up the window for spurious wakeups.
 
> idle_threads is decremented only in threadlet_worker(). Given that
> we have a threadlet that is not doing anywork the assert should never hit unless
> something horribly wrong .
>
Anthony Liguori Oct. 20, 2010, 1:13 p.m. UTC | #13
On 10/19/2010 09:22 PM, Balbir Singh wrote:
>
> OK, here is a situation that can happen
>
> T1                              T2
> ---                             ---
> threadlet                       submit_threadletwork_to_queue
> (sees condition as no work)     mutex_lock
> qemu_cond_timedwait             add_work
> ...                             mutex_unlock
>
> T3
> --
> cancel_threadlet_work_on_queue
> mutex_lock (grabs it) before T1 can
> cancels the work
>
>
>                                  qemu_cond_signal
>
> T1
> --
> Grabs mutex_lock (from within cond_timedwait)
> Now there is no work to do, the condition
> has changed before the thread wakes up
>
>
> The man page also states
>
> "however, if predictable scheduling behavior is required, then that
> mutex shall be locked by the thread calling pthread_cond_broadcast()
> or pthread_cond_signal()"
>    

The scenario you're describing is a spurious wakeup.  Any code that uses 
conditions ought to handle spurious wakeups.  The typical idiom for this is:

while (no_work_available()) {
    pthread_cond_wait(cond, lock);
}

So yes, pthread_cond_timedwait() will return but the while loop 
condition will be checked first.  In the scenario you describe, we'll go 
immediately back to sleep and the assert will not be triggered.

As I mentioned originally, in the absence of performance data, code 
readability trumps premature optimization.  I think the code is a lot 
more readable if the signaling point is outside of the mutex.

Regards,

Anthony Liguori
diff mbox

Patch

diff --git a/Makefile.objs b/Makefile.objs
index cd5a24b..2cf8aba 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -9,6 +9,8 @@  qobject-obj-y += qerror.o
 
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
 block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
+block-obj-$(CONFIG_POSIX) += qemu-thread.o
+block-obj-$(CONFIG_POSIX) += qemu-threadlets.o
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
@@ -124,7 +126,6 @@  endif
 common-obj-y += $(addprefix ui/, $(ui-obj-y))
 
 common-obj-y += iov.o acl.o
-common-obj-$(CONFIG_THREAD) += qemu-thread.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o
 
diff --git a/docs/async-support.txt b/docs/async-support.txt
new file mode 100644
index 0000000..9f22b9a
--- /dev/null
+++ b/docs/async-support.txt
@@ -0,0 +1,141 @@ 
+== How to use the threadlets infrastructure supported in Qemu ==
+
+== Threadlets ==
+
+Q.1: What are threadlets ?
+A.1: Threadlets is an infrastructure within QEMU that allows other subsystems
+     to offload possibly blocking work to a queue to be processed by a pool
+     of threads asynchronously.
+
+Q.2: When would one want to use threadlets ?
+A.2: Threadlets are useful when there are operations that can be performed
+     outside the context of the VCPU/IO threads inorder to free these latter
+     to service any other guest requests.
+
+Q.3: I have some work that can be executed in an asynchronous context. How
+     should I go about it ?
+A.3: One could follow the steps listed below:
+
+     - Define a function which would do the asynchronous work.
+	static void my_threadlet_func(ThreadletWork *work)
+	{
+	}
+
+     - Declare an object of type ThreadletWork;
+	ThreadletWork work;
+
+
+     - Assign a value to the "func" member of ThreadletWork object.
+	work.func = my_threadlet_func;
+
+     - Submit the threadlet to the global queue.
+	submit_threadletwork(&work);
+
+     - Continue servicing some other guest operations.
+
+Q.4: I want to my_threadlet_func to access some non-global data. How do I do
+     that ?
+A.4: Suppose you want my_threadlet_func to access some non-global data-object
+     of type myPrivateData. In that case one could follow the following steps.
+
+     - Define a member of the type ThreadletWork within myPrivateData.
+	typedef struct MyPrivateData {
+		...;
+		...;
+		...;
+		ThreadletWork work;
+	} MyPrivateData;
+
+	MyPrivateData my_data;
+
+     - Initialize myData.work as described in A.3
+	myData.work.func = my_threadlet_func;
+	submit_threadletwork(&myData.work);
+
+     - Access the myData object inside my_threadlet_func() using container_of
+       primitive
+	static void my_threadlet_func(ThreadletWork *work)
+	{
+		myPrivateData *mydata_ptr;
+		mydata_ptr = container_of(work, myPrivateData, work);
+
+		/* mydata_ptr now points to myData object */
+	}
+
+Q.5: Are there any precautions one must take while sharing data with the
+     Asynchronous thread-pool ?
+A.5: Yes, make sure that the helper function of the type my_threadlet_func()
+     does not access/modify data when it can be accessed or modified in the
+     context of VCPU thread or IO thread. This is because the asynchronous
+     threads in the pool can run in parallel with the VCPU/IOThreads as shown
+     in the figure.
+
+     A typical workflow is as follows:
+
+              VCPU/IOThread
+                   |
+                   | (1)
+                   |
+                   V
+                Offload work              (2)
+      |-------> to threadlets -----------------------------> Helper thread
+      |            |                                               |
+      |            |                                               |
+      |            | (3)                                           | (4)
+      |            |                                               |
+      |         Handle other Guest requests                        |
+      |            |                                               |
+      |            |                                               V
+      |            | (3)                                  Signal the I/O Thread
+      |(6)         |                                               |
+      |            |                                              /
+      |            |                                             /
+      |            V                                            /
+      |          Do the post <---------------------------------/
+      |          processing               (5)
+      |            |
+      |            | (6)
+      |            V
+      |-Yes------ More async work?
+                   |
+                   | (7)
+	           No
+                   |
+                   |
+                   .
+                   .
+
+    Hence one needs to make sure that in the steps (3) and (4) which run in
+    parallel, any global data is accessed within only one context.
+
+Q.6: I have queued a threadlet which I want to cancel. How do I do that ?
+A.6: Threadlets framework provides the API cancel_threadlet:
+       - int cancel_threadletwork(ThreadletWork *work)
+
+     The API scans the ThreadletQueue to see if (work) is present. If it finds
+     work, it'll dequeue work and return 0.
+
+     On the other hand, if it does not find the (work) in the ThreadletQueue,
+     then it'll return 1. This can imply two things. Either the work is being
+     processed by one of the helper threads or it has been processed. The
+     threadlet infrastructure currently _does_not_ distinguish between these
+     two and the onus is on the caller to do that.
+
+Q.7: Apart from the global pool of threads, can I have my own private Queue ?
+A.7: Yes, the threadlets framework allows subsystems to create their own private
+     queues with associated pools of threads.
+
+     - Define a PrivateQueue
+       ThreadletQueue myQueue;
+
+     - Initialize it:
+       threadlet_queue_init(&myQueue, my_max_threads, my_min_threads);
+       where my_max_threads is the maximum number of threads that can be in the
+       thread pool and my_min_threads is the minimum number of active threads
+       that can be in the thread-pool.
+
+     - Submit work:
+       submit_threadletwork_to_queue(&myQueue, &my_work);
+
+     - Cancel work:
+       cancel_threadletwork_on_queue(&myQueue, &my_work);
diff --git a/qemu-threadlets.c b/qemu-threadlets.c
new file mode 100644
index 0000000..fd33752
--- /dev/null
+++ b/qemu-threadlets.c
@@ -0,0 +1,165 @@ 
+/*
+ * Threadlet support for offloading tasks to be executed asynchronously
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *  Anthony Liguori     <aliguori@us.ibm.com>
+ *  Aneesh Kumar K.V    <aneesh.kumar@linux.vnet.ibm.com>
+ *  Gautham R Shenoy    <ego@in.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu-threadlets.h"
+#include "osdep.h"
+
+#define MAX_GLOBAL_THREADS  64
+#define MIN_GLOBAL_THREADS  64
+static ThreadletQueue globalqueue;
+static int globalqueue_init;
+
+static void *threadlet_worker(void *data)
+{
+    ThreadletQueue *queue = data;
+
+    qemu_mutex_lock(&(queue->lock));
+    while (1) {
+        ThreadletWork *work;
+        int ret = 0;
+
+        while (QTAILQ_EMPTY(&(queue->request_list)) &&
+               (ret != ETIMEDOUT)) {
+            ret = qemu_cond_timedwait(&(queue->cond),
+					 &(queue->lock), 10*100000);
+        }
+
+        assert(queue->idle_threads != 0);
+        if (QTAILQ_EMPTY(&(queue->request_list))) {
+            if (queue->cur_threads > queue->min_threads) {
+                /* We retain the minimum number of threads */
+                break;
+            }
+        } else {
+            work = QTAILQ_FIRST(&(queue->request_list));
+            QTAILQ_REMOVE(&(queue->request_list), work, node);
+
+            queue->idle_threads--;
+            qemu_mutex_unlock(&(queue->lock));
+
+            /* execute the work function */
+            work->func(work);
+
+            qemu_mutex_lock(&(queue->lock));
+            queue->idle_threads++;
+        }
+    }
+
+    queue->idle_threads--;
+    queue->cur_threads--;
+    qemu_mutex_unlock(&(queue->lock));
+
+    return NULL;
+}
+
+static void spawn_threadlet(ThreadletQueue *queue)
+{
+    QemuThread thread;
+
+    queue->cur_threads++;
+    queue->idle_threads++;
+
+    qemu_thread_create(&thread, threadlet_worker, queue);
+}
+
+/**
+ * submit_threadletwork_to_queue: Submit a new task to a private queue to be
+ *                            executed asynchronously.
+ * @queue: Per-subsystem private queue to which the new task needs
+ *         to be submitted.
+ * @work: Contains information about the task that needs to be submitted.
+ */
+void submit_threadletwork_to_queue(ThreadletQueue *queue, ThreadletWork *work)
+{
+    qemu_mutex_lock(&(queue->lock));
+    if (queue->idle_threads == 0 && queue->cur_threads < queue->max_threads) {
+        spawn_threadlet(queue);
+    }
+    QTAILQ_INSERT_TAIL(&(queue->request_list), work, node);
+    qemu_mutex_unlock(&(queue->lock));
+    qemu_cond_signal(&(queue->cond));
+}
+
+/**
+ * submit_threadletwork: Submit to the global queue a new task to be executed
+ *                   asynchronously.
+ * @work: Contains information about the task that needs to be submitted.
+ */
+void submit_threadletwork(ThreadletWork *work)
+{
+    if (unlikely(!globalqueue_init)) {
+        threadlet_queue_init(&globalqueue, MAX_GLOBAL_THREADS,
+                                MIN_GLOBAL_THREADS);
+        globalqueue_init = 1;
+    }
+
+    submit_threadletwork_to_queue(&globalqueue, work);
+}
+
+/**
+ * cancel_threadletwork_on_queue: Cancel a task queued on a Queue.
+ * @queue: The queue containing the task to be cancelled.
+ * @work: Contains the information of the task that needs to be cancelled.
+ *
+ * Returns: 0 if the task is successfully cancelled.
+ *          1 otherwise.
+ */
+int cancel_threadletwork_on_queue(ThreadletQueue *queue, ThreadletWork *work)
+{
+    ThreadletWork *ret_work;
+    int ret = 0;
+
+    qemu_mutex_lock(&(queue->lock));
+    QTAILQ_FOREACH(ret_work, &(queue->request_list), node) {
+        if (ret_work == work) {
+            QTAILQ_REMOVE(&(queue->request_list), ret_work, node);
+            ret = 1;
+            break;
+        }
+    }
+    qemu_mutex_unlock(&(queue->lock));
+
+    return ret;
+}
+
+/**
+ * cancel_threadletwork: Cancel a task queued on the global queue.
+ * @work: Contains the information of the task that needs to be cancelled.
+ *
+ * Returns: 0 if the task is successfully cancelled.
+ *          1 otherwise.
+ */
+int cancel_threadletwork(ThreadletWork *work)
+{
+    return cancel_threadletwork_on_queue(&globalqueue, work);
+}
+
+/**
+ * threadlet_queue_init: Initialize a threadlet queue.
+ * @queue: The threadlet queue to be initialized.
+ * @max_threads: Maximum number of threads processing the queue.
+ * @min_threads: Minimum number of threads processing the queue.
+ */
+void threadlet_queue_init(ThreadletQueue *queue,
+				    int max_threads, int min_threads)
+{
+    queue->cur_threads  = 0;
+    queue->idle_threads = 0;
+    queue->max_threads  = max_threads;
+    queue->min_threads  = min_threads;
+    QTAILQ_INIT(&(queue->request_list));
+    qemu_mutex_init(&(queue->lock));
+    qemu_cond_init(&(queue->cond));
+}
diff --git a/qemu-threadlets.h b/qemu-threadlets.h
new file mode 100644
index 0000000..9c8f9e5
--- /dev/null
+++ b/qemu-threadlets.h
@@ -0,0 +1,48 @@ 
+/*
+ * Threadlet support for offloading tasks to be executed asynchronously
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *  Anthony Liguori     <aliguori@us.ibm.com>
+ *  Aneesh Kumar K.V    <aneesh.kumar@linux.vnet.ibm.com>
+ *  Gautham R Shenoy    <ego@in.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_ASYNC_WORK_H
+#define QEMU_ASYNC_WORK_H
+
+#include "qemu-queue.h"
+#include "qemu-common.h"
+#include "qemu-thread.h"
+
+typedef struct ThreadletQueue
+{
+    QemuMutex lock;
+    QemuCond cond;
+    int max_threads;
+    int min_threads;
+    int cur_threads;
+    int idle_threads;
+    QTAILQ_HEAD(, ThreadletWork) request_list;
+} ThreadletQueue;
+
+typedef struct ThreadletWork
+{
+    QTAILQ_ENTRY(ThreadletWork) node;
+    void (*func)(struct ThreadletWork *work);
+} ThreadletWork;
+
+extern void submit_threadletwork_to_queue(ThreadletQueue *queue,
+                                      ThreadletWork *work);
+extern void submit_threadletwork(ThreadletWork *work);
+extern int cancel_threadletwork_on_queue(ThreadletQueue *queue,
+                                        ThreadletWork *work);
+extern int cancel_threadletwork(ThreadletWork *work);
+extern void threadlet_queue_init(ThreadletQueue *queue, int max_threads,
+                                 int min_threads);
+#endif