Patchwork [UPDATED,1/3] vhost: replace vhost_workqueue with per-vhost kthread

login
register
mail settings
Submitter Michael S. Tsirkin
Date July 22, 2010, 3:58 p.m.
Message ID <20100722155840.GA1743@redhat.com>
Download mbox | patch
Permalink /patch/59590/
State Awaiting Upstream
Delegated to: David Miller
Headers show

Comments

Michael S. Tsirkin - July 22, 2010, 3:58 p.m.
On Wed, Jun 02, 2010 at 08:40:00PM +0200, Tejun Heo wrote:
> Replace vhost_workqueue with per-vhost kthread.  Other than callback
> argument change from struct work_struct * to struct vhost_work *,
> there's no visible change to vhost_poll_*() interface.
> 
> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.
> 
> Partially based on Sridhar Samudrala's patch.
> 
> * Updated to use sub structure vhost_work instead of directly using
>   vhost_poll at Michael's suggestion.
> 
> * Added flusher wake_up() optimization at Michael's suggestion.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>

All the tricky barrier pairing made me uncomfortable.  So I came up with
this on top (untested): if we do all operations under the spinlock, we
can get by without barriers and atomics.  And since we need the lock for
list operations anyway, this should have no paerformance impact.

What do you think?


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - July 22, 2010, 9:21 p.m.
Hello,

On 07/22/2010 05:58 PM, Michael S. Tsirkin wrote:
> All the tricky barrier pairing made me uncomfortable.  So I came up with
> this on top (untested): if we do all operations under the spinlock, we
> can get by without barriers and atomics.  And since we need the lock for
> list operations anyway, this should have no paerformance impact.
> 
> What do you think?

I've created kthread_worker in wq#for-next tree and already converted
ivtv to use it.  Once this lands in mainline, I think converting vhost
to use it would be better choice.  kthread worker code uses basically
the same logic used in the vhost_workqueue code but is better
organized and documented.  So, I think it would be better to stick
with the original implementation, as otherwise we're likely to just
decrease test coverage without much gain.

  http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439

> @@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  static int vhost_worker(void *data)
>  {
>  	struct vhost_dev *dev = data;
> -	struct vhost_work *work;
> +	struct vhost_work *work = NULL;
>  
> -repeat:
> -	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> +	for (;;) {
> +		set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
>  
> -	if (kthread_should_stop()) {
> -		__set_current_state(TASK_RUNNING);
> -		return 0;
> -	}
> +		if (kthread_should_stop()) {
> +			__set_current_state(TASK_RUNNING);
> +			return 0;
> +		}
>  
> -	work = NULL;
> -	spin_lock_irq(&dev->work_lock);
> -	if (!list_empty(&dev->work_list)) {
> -		work = list_first_entry(&dev->work_list,
> -					struct vhost_work, node);
> -		list_del_init(&work->node);
> -	}
> -	spin_unlock_irq(&dev->work_lock);
> +		spin_lock_irq(&dev->work_lock);
> +		if (work) {
> +			work->done_seq = work->queue_seq;
> +			if (work->flushing)
> +				wake_up_all(&work->done);

I don't think doing this before executing the function is correct, so
you'll have to release the lock, execute the function, regrab the lock
and then do the flush processing.

Thanks.
Michael S. Tsirkin - July 24, 2010, 7:14 p.m.
On Thu, Jul 22, 2010 at 11:21:40PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/22/2010 05:58 PM, Michael S. Tsirkin wrote:
> > All the tricky barrier pairing made me uncomfortable.  So I came up with
> > this on top (untested): if we do all operations under the spinlock, we
> > can get by without barriers and atomics.  And since we need the lock for
> > list operations anyway, this should have no paerformance impact.
> > 
> > What do you think?
> 
> I've created kthread_worker in wq#for-next tree and already converted
> ivtv to use it.  Once this lands in mainline, I think converting vhost
> to use it would be better choice.  kthread worker code uses basically
> the same logic used in the vhost_workqueue code but is better
> organized and documented.  So, I think it would be better to stick
> with the original implementation, as otherwise we're likely to just
> decrease test coverage without much gain.
> 
>   http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439

Sure, if we keep using workqueue. But I'd like to investigate this
direction a bit more because there's discussion to switching from kthread to
regular threads altogether.

> > @@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> >  static int vhost_worker(void *data)
> >  {
> >  	struct vhost_dev *dev = data;
> > -	struct vhost_work *work;
> > +	struct vhost_work *work = NULL;
> >  
> > -repeat:
> > -	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> > +	for (;;) {
> > +		set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> >  
> > -	if (kthread_should_stop()) {
> > -		__set_current_state(TASK_RUNNING);
> > -		return 0;
> > -	}
> > +		if (kthread_should_stop()) {
> > +			__set_current_state(TASK_RUNNING);
> > +			return 0;
> > +		}
> >  
> > -	work = NULL;
> > -	spin_lock_irq(&dev->work_lock);
> > -	if (!list_empty(&dev->work_list)) {
> > -		work = list_first_entry(&dev->work_list,
> > -					struct vhost_work, node);
> > -		list_del_init(&work->node);
> > -	}
> > -	spin_unlock_irq(&dev->work_lock);
> > +		spin_lock_irq(&dev->work_lock);
> > +		if (work) {
> > +			work->done_seq = work->queue_seq;
> > +			if (work->flushing)
> > +				wake_up_all(&work->done);
> 
> I don't think doing this before executing the function is correct,

Well, before I execute the function work is NULL, so this is skipped.
Correct?

> so
> you'll have to release the lock, execute the function, regrab the lock
> and then do the flush processing.
> 
> Thanks.

It's done in the loop, so I thought we can reuse the locking
done for the sake of processing the next work item.
Makes sense?


> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - July 25, 2010, 7:41 a.m.
Hello,

On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
>> I've created kthread_worker in wq#for-next tree and already converted
>> ivtv to use it.  Once this lands in mainline, I think converting vhost
>> to use it would be better choice.  kthread worker code uses basically
>> the same logic used in the vhost_workqueue code but is better
>> organized and documented.  So, I think it would be better to stick
>> with the original implementation, as otherwise we're likely to just
>> decrease test coverage without much gain.
>>
>>   http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
> 
> Sure, if we keep using workqueue. But I'd like to investigate this
> direction a bit more because there's discussion to switching from kthread to
> regular threads altogether.

Hmmm? It doesn't have much to do with workqueue.  kthread_worker is a
simple wrapper around kthread.  It now assumes kthread but changing it
to be useable with any thread shouldn't be too hard.  Wouldn't that be
better?

>> I don't think doing this before executing the function is correct,
> 
> Well, before I execute the function work is NULL, so this is skipped.
> Correct?
>
>> so
>> you'll have to release the lock, execute the function, regrab the lock
>> and then do the flush processing.
>>
>> Thanks.
> 
> It's done in the loop, so I thought we can reuse the locking
> done for the sake of processing the next work item.
> Makes sense?

Yeap, right.  I think it would make much more sense to use common code
when it becomes available but if you think the posted change is
necessary till then, please feel free to go ahead.

Thanks.
Michael S. Tsirkin - July 25, 2010, 10:04 a.m.
On Sun, Jul 25, 2010 at 09:41:22AM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
> >> I've created kthread_worker in wq#for-next tree and already converted
> >> ivtv to use it.  Once this lands in mainline, I think converting vhost
> >> to use it would be better choice.  kthread worker code uses basically
> >> the same logic used in the vhost_workqueue code but is better
> >> organized and documented.  So, I think it would be better to stick
> >> with the original implementation, as otherwise we're likely to just
> >> decrease test coverage without much gain.
> >>
> >>   http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
> > 
> > Sure, if we keep using workqueue. But I'd like to investigate this
> > direction a bit more because there's discussion to switching from kthread to
> > regular threads altogether.
> 
> Hmmm? It doesn't have much to do with workqueue.  kthread_worker is a
> simple wrapper around kthread.  It now assumes kthread but changing it
> to be useable with any thread shouldn't be too hard.  Wouldn't that be
> better?

Yes, of course, when common code becomes available we should
switch to that.

> >> I don't think doing this before executing the function is correct,
> > 
> > Well, before I execute the function work is NULL, so this is skipped.
> > Correct?
> >
> >> so
> >> you'll have to release the lock, execute the function, regrab the lock
> >> and then do the flush processing.
> >>
> >> Thanks.
> > 
> > It's done in the loop, so I thought we can reuse the locking
> > done for the sake of processing the next work item.
> > Makes sense?
> 
> Yeap, right.  I think it would make much more sense to use common code
> when it becomes available but if you think the posted change is
> necessary till then, please feel free to go ahead.
> 
> Thanks.
> 
> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - July 26, 2010, 3:25 p.m.
On Sun, Jul 25, 2010 at 09:41:22AM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
> >> I've created kthread_worker in wq#for-next tree and already converted
> >> ivtv to use it.  Once this lands in mainline, I think converting vhost
> >> to use it would be better choice.  kthread worker code uses basically
> >> the same logic used in the vhost_workqueue code but is better
> >> organized and documented.  So, I think it would be better to stick
> >> with the original implementation, as otherwise we're likely to just
> >> decrease test coverage without much gain.
> >>
> >>   http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
> > 
> > Sure, if we keep using workqueue. But I'd like to investigate this
> > direction a bit more because there's discussion to switching from kthread to
> > regular threads altogether.
> 
> Hmmm? It doesn't have much to do with workqueue.  kthread_worker is a
> simple wrapper around kthread.  It now assumes kthread but changing it
> to be useable with any thread shouldn't be too hard.  Wouldn't that be
> better?

BTW, kthread_worker would benefit from the optimization I implemented
here as well.
Tejun Heo - July 26, 2010, 3:34 p.m.
Hello,

On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> BTW, kthread_worker would benefit from the optimization I implemented
> here as well.

Hmmm... I'm not quite sure whether it's an optimization.  I thought
the patch was due to feeling uncomfortable about using barriers?  Is
it an optimization?

Thanks.
Tejun Heo - July 26, 2010, 3:46 p.m.
On 07/26/2010 05:34 PM, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
>> BTW, kthread_worker would benefit from the optimization I implemented
>> here as well.
> 
> Hmmm... I'm not quite sure whether it's an optimization.  I thought
> the patch was due to feeling uncomfortable about using barriers?  Is
> it an optimization?

Yeah, one less smp_mb() in execution path.  The lock dancing in
flush() is ugly but then again mucking with barriers could be harder
to understand.  Care to send a patch against wq#for-next tree?

Thanks.
Michael S. Tsirkin - July 26, 2010, 3:50 p.m.
On Mon, Jul 26, 2010 at 05:34:44PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> > BTW, kthread_worker would benefit from the optimization I implemented
> > here as well.
> 
> Hmmm... I'm not quite sure whether it's an optimization.  I thought
> the patch was due to feeling uncomfortable about using barriers?

Oh yes. But getting rid of barriers is what motivated me originally.

>  Is it an optimization?
> 
> Thanks.

Yes, sure. This removes atomic read and 2 barrier operations on data path.  And
it does not add any new synchronization: instead, we reuse the lock that we
take anyway.  The relevant part is:


+               if (work) {
+                       __set_current_state(TASK_RUNNING);
+                       work->fn(work);
+               } else
+                       schedule();

-       if (work) {
-               __set_current_state(TASK_RUNNING);
-               work->fn(work);
-               smp_wmb();      /* wmb worker-b0 paired with flush-b1 */
-               work->done_seq = work->queue_seq;
-               smp_mb();       /* mb worker-b1 paired with flush-b0 */
-               if (atomic_read(&work->flushing))
-                       wake_up_all(&work->done);
-       } else
-               schedule();
-
-       goto repeat;

Is there a git tree with kthread_worker applied?
I might do this just for fun ...


> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - July 26, 2010, 3:51 p.m.
On Mon, Jul 26, 2010 at 05:46:30PM +0200, Tejun Heo wrote:
> On 07/26/2010 05:34 PM, Tejun Heo wrote:
> > Hello,
> > 
> > On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> >> BTW, kthread_worker would benefit from the optimization I implemented
> >> here as well.
> > 
> > Hmmm... I'm not quite sure whether it's an optimization.  I thought
> > the patch was due to feeling uncomfortable about using barriers?  Is
> > it an optimization?
> 
> Yeah, one less smp_mb() in execution path.  The lock dancing in
> flush() is ugly but then again mucking with barriers could be harder
> to understand.  Care to send a patch against wq#for-next tree?
> 
> Thanks.

Sure. Where's that, exactly?

> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - July 26, 2010, 4:05 p.m.
Hello,

On 07/26/2010 05:50 PM, Michael S. Tsirkin wrote:
>> Hmmm... I'm not quite sure whether it's an optimization.  I thought
>> the patch was due to feeling uncomfortable about using barriers?
> 
> Oh yes. But getting rid of barriers is what motivated me originally.

Yeah, getting rid of barriers is always good.  :-)

> Is there a git tree with kthread_worker applied?
> I might do this just for fun ...

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-next

For the original implementaiton, please take a look at commit
b56c0d8937e665a27d90517ee7a746d0aa05af46.

* Can you please keep the outer goto repeat loop?  I just don't like
  outermost for (;;).

* Placing try_to_freeze() could be a bit annoying.  It shouldn't be
  executed when there's a work to flush.

* I think A - B <= 0 test would be more familiar.  At least
  time_before/after() are implemented that way.

Thanks.
Tejun Heo - July 26, 2010, 4:14 p.m.
Just one more thing.

On 07/26/2010 06:05 PM, Tejun Heo wrote:
> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
>   executed when there's a work to flush.

* Similar issue exists for kthread_stop().  The kthread shouldn't exit
  while there's a work to flush (please note that kthread_worker
  interface allows detaching / attaching worker kthread during
  operation, so it should remain in consistent state with regard to
  flushing).

Thanks.
Michael S. Tsirkin - July 26, 2010, 4:23 p.m.
On Mon, Jul 26, 2010 at 06:05:27PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 05:50 PM, Michael S. Tsirkin wrote:
> >> Hmmm... I'm not quite sure whether it's an optimization.  I thought
> >> the patch was due to feeling uncomfortable about using barriers?
> > 
> > Oh yes. But getting rid of barriers is what motivated me originally.
> 
> Yeah, getting rid of barriers is always good.  :-)
> 
> > Is there a git tree with kthread_worker applied?
> > I might do this just for fun ...
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-next
> 
> For the original implementaiton, please take a look at commit
> b56c0d8937e665a27d90517ee7a746d0aa05af46.
> 
> * Can you please keep the outer goto repeat loop?  I just don't like
>   outermost for (;;).

Okay ... can we put the code in a {} scope to make it clear
where does the loop starts and ends?

> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
>   executed when there's a work to flush.

It currently seems to be executed when there is work to flush.
Is this wrong?

> * I think A - B <= 0 test would be more familiar.  At least
>   time_before/after() are implemented that way.

I am concerned that this overflows a signed integer -
which I seem to remeber that C99 disallows.
timer macros are on data path so might be worth the risk there,
but flush is slow path so better be safe?

> Thanks.
> 
> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - July 26, 2010, 4:31 p.m.
On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
> Just one more thing.
> 
> On 07/26/2010 06:05 PM, Tejun Heo wrote:
> > * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
> >   executed when there's a work to flush.

BTW why is this important?
We could always get another work and flush right after
try_to_freeze, and then flush would block for a long time.


BTW the vhost patch you sent does not do this at all.
I am guessing it is because our thread is not freezable?

> * Similar issue exists for kthread_stop().  The kthread shouldn't exit
>   while there's a work to flush (please note that kthread_worker
>   interface allows detaching / attaching worker kthread during
>   operation, so it should remain in consistent state with regard to
>   flushing).
> 
> Thanks.

Not sure I agree here. Users must synchronise flush and stop calls.
Otherwise a work might get queued after stop is called, and
you won't be able to flush it.


> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - July 26, 2010, 4:51 p.m.
On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
> Just one more thing.

I noticed that with vhost, flush_work was getting the worker
pointer as well. Can we live with this API change?
Tejun Heo - July 26, 2010, 6:51 p.m.
Hello,

On 07/26/2010 06:31 PM, Michael S. Tsirkin wrote:
>> On 07/26/2010 06:05 PM, Tejun Heo wrote:
>>> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
>>>   executed when there's a work to flush.
> 
> BTW why is this important?
> We could always get another work and flush right after
> try_to_freeze, and then flush would block for a long time.
> 
> BTW the vhost patch you sent does not do this at all.
> I am guessing it is because our thread is not freezable?

Yeap, I think so.

>> * Similar issue exists for kthread_stop().  The kthread shouldn't exit
>>   while there's a work to flush (please note that kthread_worker
>>   interface allows detaching / attaching worker kthread during
>>   operation, so it should remain in consistent state with regard to
>>   flushing).
> 
> Not sure I agree here. Users must synchronise flush and stop calls.
> Otherwise a work might get queued after stop is called, and
> you won't be able to flush it.

For freeze, it probably is okay but for stop, I think it's better to
keep the semantics straight forward.  It may be okay to do otherwise
but having such oddity in generic interface is nasty and may lead to
surprises which can be pretty difficult to track down later on.  It's
just a bit more of annoyance while writing the generic code, so...

Thanks.
Tejun Heo - July 26, 2010, 7:04 p.m.
On 07/26/2010 06:23 PM, Michael S. Tsirkin wrote:
>> * Can you please keep the outer goto repeat loop?  I just don't like
>>   outermost for (;;).
> 
> Okay ... can we put the code in a {} scope to make it clear
> where does the loop starts and ends?

If we're gonna do that, it would be better to put it inside a loop
construct.  The reason why I don't like it is that loops like that
don't really help read/writeability much while indenting the whole
logic unnecessarily and look more like a result of obsession against
goto rather than any practical reason.  It's just a cosmetic
preference and I might as well be the weirdo here, so if you feel
strong about it, please feel free to put everything in a loop.

>> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
>>   executed when there's a work to flush.
> 
> It currently seems to be executed when there is work to flush.
> Is this wrong?

Oh, does it?  As I wrote in the other mail, things like that wouldn't
necessarily break correctness but I think it would be better to avoid
surprises in the generic code if not too difficult.

>> * I think A - B <= 0 test would be more familiar.  At least
>>   time_before/after() are implemented that way.
> 
> I am concerned that this overflows a signed integer -
> which I seem to remeber that C99 disallows.

Really?  Overflows of pointer isn't expected and that's why we have
weird RELOC_HIDE() macro for such calculations but integers not
expected to overflow is a news to me.  Are you sure?  That basically
means time_before/after() aren't safe either.

> timer macros are on data path so might be worth the risk there,
> but flush is slow path so better be safe?

I don't think performance matters much here.  I just think the sign
test is clearer / more familiar for the logic.

Thanks.
Tejun Heo - July 26, 2010, 7:14 p.m.
On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
>> Just one more thing.
> 
> I noticed that with vhost, flush_work was getting the worker
> pointer as well. Can we live with this API change?

Yeah, the flushing mechanism wouldn't work reliably if the work is
queued to a different worker without flushing, so yeah passing in
@worker might actually be better.

Thanks.
Tejun Heo - July 26, 2010, 7:31 p.m.
Hello,

On 07/26/2010 09:14 PM, Tejun Heo wrote:
> On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
>> I noticed that with vhost, flush_work was getting the worker
>> pointer as well. Can we live with this API change?
> 
> Yeah, the flushing mechanism wouldn't work reliably if the work is
> queued to a different worker without flushing, so yeah passing in
> @worker might actually be better.

Thinking a bit more about it, it kind of sucks that queueing to
another worker from worker->func() breaks flush.  Maybe the right
thing to do there is using atomic_t for done_seq?  It pays a bit more
overhead but maybe that's justifiable to keep the API saner?  It would
be great if it can be fixed somehow even if it means that the work has
to be separately flushed for each worker it has been on before being
destroyed.

Or, if flushing has to be associated with a specific worker anyway,
maybe it would be better to move the sequence counter to
kthread_worker and do it similarly with the original workqueue so that
work can be destroyed once execution starts?  Then, it can at least
remain semantically identical to the original workqueue.

Thanks.
Michael S. Tsirkin - July 26, 2010, 7:57 p.m.
On Mon, Jul 26, 2010 at 08:51:50PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 06:31 PM, Michael S. Tsirkin wrote:
> >> On 07/26/2010 06:05 PM, Tejun Heo wrote:
> >>> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
> >>>   executed when there's a work to flush.
> > 
> > BTW why is this important?
> > We could always get another work and flush right after
> > try_to_freeze, and then flush would block for a long time.
> > 
> > BTW the vhost patch you sent does not do this at all.
> > I am guessing it is because our thread is not freezable?
> 
> Yeap, I think so.
> 
> >> * Similar issue exists for kthread_stop().  The kthread shouldn't exit
> >>   while there's a work to flush (please note that kthread_worker
> >>   interface allows detaching / attaching worker kthread during
> >>   operation, so it should remain in consistent state with regard to
> >>   flushing).
> > 
> > Not sure I agree here. Users must synchronise flush and stop calls.
> > Otherwise a work might get queued after stop is called, and
> > you won't be able to flush it.
> 
> For freeze, it probably is okay but for stop, I think it's better to
> keep the semantics straight forward.

What are the semantics then? What do we want stop followed
by queue and flush to do?

>  It may be okay to do otherwise
> but having such oddity in generic interface is nasty and may lead to
> surprises which can be pretty difficult to track down later on.  It's
> just a bit more of annoyance while writing the generic code, so...
> 
> Thanks.
> 
> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - July 26, 2010, 7:59 p.m.
On Mon, Jul 26, 2010 at 09:31:58PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 09:14 PM, Tejun Heo wrote:
> > On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> >> I noticed that with vhost, flush_work was getting the worker
> >> pointer as well. Can we live with this API change?
> > 
> > Yeah, the flushing mechanism wouldn't work reliably if the work is
> > queued to a different worker without flushing, so yeah passing in
> > @worker might actually be better.
> 
> Thinking a bit more about it, it kind of sucks that queueing to
> another worker from worker->func() breaks flush.  Maybe the right
> thing to do there is using atomic_t for done_seq?  It pays a bit more
> overhead but maybe that's justifiable to keep the API saner?  It would
> be great if it can be fixed somehow even if it means that the work has
> to be separately flushed for each worker it has been on before being
> destroyed.
> 
> Or, if flushing has to be associated with a specific worker anyway,
> maybe it would be better to move the sequence counter to
> kthread_worker and do it similarly with the original workqueue so that
> work can be destroyed once execution starts?  Then, it can at least
> remain semantically identical to the original workqueue.
> 
> Thanks.

This last sounds sane: in fact I didn't know there is any difference.

> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - July 26, 2010, 8:19 p.m.
On Mon, Jul 26, 2010 at 09:04:17PM +0200, Tejun Heo wrote:
> On 07/26/2010 06:23 PM, Michael S. Tsirkin wrote:
> >> * Can you please keep the outer goto repeat loop?  I just don't like
> >>   outermost for (;;).
> > 
> > Okay ... can we put the code in a {} scope to make it clear
> > where does the loop starts and ends?
> 
> If we're gonna do that, it would be better to put it inside a loop
> construct.  The reason why I don't like it is that loops like that
> don't really help read/writeability much while indenting the whole
> logic unnecessarily and look more like a result of obsession against
> goto rather than any practical reason.  It's just a cosmetic
> preference and I might as well be the weirdo here, so if you feel
> strong about it, please feel free to put everything in a loop.
> 
> >> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
> >>   executed when there's a work to flush.
> > 
> > It currently seems to be executed when there is work to flush.
> > Is this wrong?
> 
> Oh, does it?  As I wrote in the other mail, things like that wouldn't
> necessarily break correctness but I think it would be better to avoid
> surprises in the generic code if not too difficult.

Let's try to define what do we want to achieve then.
Do you want code that flushes workers not to block
when workers are frozen? How will we handle work
submitted when worker is frozen?


> >> * I think A - B <= 0 test would be more familiar.  At least
> >>   time_before/after() are implemented that way.
> > 
> > I am concerned that this overflows a signed integer -
> > which I seem to remeber that C99 disallows.
> 
> Really?  Overflows of pointer isn't expected and that's why we have
> weird RELOC_HIDE() macro for such calculations but integers not
> expected to overflow is a news to me.  Are you sure?  That basically
> means time_before/after() aren't safe either.

As I said, in C99.
However,  the kernel is built with -fno-strict-overflow, so it will work.

> > timer macros are on data path so might be worth the risk there,
> > but flush is slow path so better be safe?
> 
> I don't think performance matters much here.  I just think the sign
> test is clearer / more familiar for the logic.
> 
> Thanks.
> 
> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - July 27, 2010, 8:18 a.m.
Hello,

On 07/26/2010 09:57 PM, Michael S. Tsirkin wrote:
>> For freeze, it probably is okay but for stop, I think it's better to
>> keep the semantics straight forward.
> 
> What are the semantics then? What do we want stop followed
> by queue and flush to do?

One scenario I can think of is the following.

 kthread_worker allows kthreads to be attached and stopped anytime, so
 if the caller stops the current worker while flushing is pending and
 attaches a new worker, the flushing which was pending will never
 happen.

But, in general, it's nasty to allow execution and its completion to
be separated.  Things like that are likely to bite us back in obscure
ways.  I think it would be silly to have such oddity in generic code
when it can be avoided without too much trouble.

Thanks.
Tejun Heo - July 27, 2010, 8:21 a.m.
Hello,

On 07/26/2010 10:19 PM, Michael S. Tsirkin wrote:
> Let's try to define what do we want to achieve then.  Do you want
> code that flushes workers not to block when workers are frozen? How
> will we handle work submitted when worker is frozen?

As I wrote earlier, it's not necessarily about correctness but rather
avoiding unnecessary surprises and of course flushing can and should
stall if the queue is frozen but let's not separate execution of a
work and its completion with something which can take undeterminate
amount of time.

Thanks.
Michael S. Tsirkin - July 27, 2010, 7:19 p.m.
On Mon, Jul 26, 2010 at 09:31:58PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 09:14 PM, Tejun Heo wrote:
> > On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> >> I noticed that with vhost, flush_work was getting the worker
> >> pointer as well. Can we live with this API change?
> > 
> > Yeah, the flushing mechanism wouldn't work reliably if the work is
> > queued to a different worker without flushing, so yeah passing in
> > @worker might actually be better.
> 
> Thinking a bit more about it, it kind of sucks that queueing to
> another worker from worker->func() breaks flush.  Maybe the right
> thing to do there is using atomic_t for done_seq?

I don't believe it will help: we might have:

worker1 runs work
work requeues itself queued index = 1
worker1 reads queued index = 1
worker2 runs work
work requeues itself queued index = 2
worker2 runs work
worker2 reads queued index = 2
worker2 writes done index = 2
worker1 writes done index = 1

As you see, done index got moved back.



>  It pays a bit more
> overhead but maybe that's justifiable to keep the API saner?  It would
> be great if it can be fixed somehow even if it means that the work has
> to be separately flushed for each worker it has been on before being
> destroyed.
> 
> Or, if flushing has to be associated with a specific worker anyway,
> maybe it would be better to move the sequence counter to
> kthread_worker and do it similarly with the original workqueue so that
> work can be destroyed once execution starts?  Then, it can at least
> remain semantically identical to the original workqueue.
> 
> Thanks.
> 
> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - July 28, 2010, 7:48 a.m.
On 07/27/2010 09:19 PM, Michael S. Tsirkin wrote:
>> Thinking a bit more about it, it kind of sucks that queueing to
>> another worker from worker->func() breaks flush.  Maybe the right
>> thing to do there is using atomic_t for done_seq?
> 
> I don't believe it will help: we might have:
> 
> worker1 runs work
> work requeues itself queued index = 1
> worker1 reads queued index = 1
> worker2 runs work
> work requeues itself queued index = 2
> worker2 runs work
> worker2 reads queued index = 2
> worker2 writes done index = 2
> worker1 writes done index = 1
> 
> As you see, done index got moved back.

Yeah, I think the flushing logic should be moved to the worker.  Are
you interested in doing it w/ your change?

Thanks.
Michael S. Tsirkin - July 28, 2010, 10:48 a.m.
On Wed, Jul 28, 2010 at 09:48:31AM +0200, Tejun Heo wrote:
> On 07/27/2010 09:19 PM, Michael S. Tsirkin wrote:
> >> Thinking a bit more about it, it kind of sucks that queueing to
> >> another worker from worker->func() breaks flush.  Maybe the right
> >> thing to do there is using atomic_t for done_seq?
> > 
> > I don't believe it will help: we might have:
> > 
> > worker1 runs work
> > work requeues itself queued index = 1
> > worker1 reads queued index = 1
> > worker2 runs work
> > work requeues itself queued index = 2
> > worker2 runs work
> > worker2 reads queued index = 2
> > worker2 writes done index = 2
> > worker1 writes done index = 1
> > 
> > As you see, done index got moved back.
> 
> Yeah, I think the flushing logic should be moved to the worker.
> Are you interested in doing it w/ your change?
> 
> Thanks.

I'm unsure how flush_work operates under these conditions.  E.g. in
workqueue.c, this seems to work by keeping a pointer to current
workqueue in the work.  But what prevents us from destroying the
workqueue when work might not be running?

Is this currently broken if you use multiple workqueues
for the same work? If yes, I propose we do as I did,
making flush_work get worker pointer, and only flushing
on that worker.

> -- 
> tejun
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - July 28, 2010, noon
Hello,

On 07/28/2010 12:48 PM, Michael S. Tsirkin wrote:
> I'm unsure how flush_work operates under these conditions.  E.g. in
> workqueue.c, this seems to work by keeping a pointer to current
> workqueue in the work.  But what prevents us from destroying the
> workqueue when work might not be running?

In cmwq, work points to the gcwq it was on, which keeps track of all
the works in progress, so flushing work which is on a destroyed
workqueue should be fine, but in the original implementation, it would
end up accessing freed memory.

> Is this currently broken if you use multiple workqueues
> for the same work? If yes, I propose we do as I did,
> making flush_work get worker pointer, and only flushing
> on that worker.

The original semantics of workqueue is that flush_work() guarantees
that the work has finished executing on the workqueue it was last
queued on.  Adding @worker to flush_work() is okay, I think.

Thanks.

Patch

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0c6b533..7730a30 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -73,7 +73,7 @@  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 	INIT_LIST_HEAD(&work->node);
 	work->fn = fn;
 	init_waitqueue_head(&work->done);
-	atomic_set(&work->flushing, 0);
+	work->flushing = 0;
 	work->queue_seq = work->done_seq = 0;
 }
 
@@ -99,13 +99,23 @@  void vhost_poll_stop(struct vhost_poll *poll)
 void vhost_poll_flush(struct vhost_poll *poll)
 {
 	struct vhost_work *work = &poll->work;
-	int seq = work->queue_seq;
+	unsigned seq, left;
+	int flushing;
 
-	atomic_inc(&work->flushing);
-	smp_mb__after_atomic_inc();	/* mb flush-b0 paired with worker-b1 */
-	wait_event(work->done, seq - work->done_seq <= 0);
-	atomic_dec(&work->flushing);
-	smp_mb__after_atomic_dec();	/* rmb flush-b1 paired with worker-b0 */
+	spin_lock_irq(&dev->work_lock);
+	seq = work->queue_seq;
+	work->flushing++;
+	spin_unlock_irq(&dev->work_lock);
+	wait_event(work->done, {
+		   spin_lock_irq(&dev->work_lock);
+		   left = work->done_seq - seq;
+		   spin_unlock_irq(&dev->work_lock);
+		   left < UINT_MAX / 2;
+	});
+	spin_lock_irq(&dev->work_lock);
+	flushing = --work->flushing;
+	spin_unlock_irq(&dev->work_lock);
+	BUG_ON(flushing < 0);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
@@ -151,37 +161,37 @@  static void vhost_vq_reset(struct vhost_dev *dev,
 static int vhost_worker(void *data)
 {
 	struct vhost_dev *dev = data;
-	struct vhost_work *work;
+	struct vhost_work *work = NULL;
 
-repeat:
-	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
 
-	if (kthread_should_stop()) {
-		__set_current_state(TASK_RUNNING);
-		return 0;
-	}
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
 
-	work = NULL;
-	spin_lock_irq(&dev->work_lock);
-	if (!list_empty(&dev->work_list)) {
-		work = list_first_entry(&dev->work_list,
-					struct vhost_work, node);
-		list_del_init(&work->node);
-	}
-	spin_unlock_irq(&dev->work_lock);
+		spin_lock_irq(&dev->work_lock);
+		if (work) {
+			work->done_seq = work->queue_seq;
+			if (work->flushing)
+				wake_up_all(&work->done);
+		}
+		if (!list_empty(&dev->work_list)) {
+			work = list_first_entry(&dev->work_list,
+						struct vhost_work, node);
+			list_del_init(&work->node);
+		} else
+			work = NULL;
+		spin_unlock_irq(&dev->work_lock);
+
+		if (work) {
+			__set_current_state(TASK_RUNNING);
+			work->fn(work);
+		} else
+			schedule();
 
-	if (work) {
-		__set_current_state(TASK_RUNNING);
-		work->fn(work);
-		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
-		work->done_seq = work->queue_seq;
-		smp_mb();	/* mb worker-b1 paired with flush-b0 */
-		if (atomic_read(&work->flushing))
-			wake_up_all(&work->done);
-	} else
-		schedule();
-
-	goto repeat;
+	}
 }
 
 long vhost_dev_init(struct vhost_dev *dev,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0e63091..3693327 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -27,9 +27,9 @@  struct vhost_work {
 	struct list_head	  node;
 	vhost_work_fn_t		  fn;
 	wait_queue_head_t	  done;
-	atomic_t		  flushing;
-	int			  queue_seq;
-	int			  done_seq;
+	int			  flushing;
+	unsigned		  queue_seq;
+	unsigned		  done_seq;
 };
 
 /* Poll a file (eventfd or socket) */