Message ID | 1416343409-13502-1-git-send-email-tim.gardner@canonical.com |
---|---|
State | New |
Headers | show |
I think this is the safest option for fixing this in 3.13. --chris On 11/18/2014 02:43 PM, tim.gardner@canonical.com wrote: > From: Tim Gardner <tim.gardner@canonical.com> > > BugLink: http://bugs.launchpad.net/bugs/1383921 > > This reverts commit f60adf42ad55405d1b17e9e5c33fdb63f1eb8861. > > This commit was originally intended as a code improvement. However, > it appears to have either opened a locking race (which I cannot find), or > has abused HW in such a way as to cause it to stop. Given that this was > intended as an improvement, the most expedient approach appears to be to > revert it until we develop an upstream solution. > > Signed-off-by: Tim Gardner <tim.gardner@canonical.com> > --- > drivers/char/ipmi/ipmi_si_intf.c | 54 ++++++++++++++++++++++++---------------- > 1 file changed, 33 insertions(+), 21 deletions(-) > > diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c > index 4afc8b4..8c0ffdd 100644 > --- a/drivers/char/ipmi/ipmi_si_intf.c > +++ b/drivers/char/ipmi/ipmi_si_intf.c > @@ -175,6 +175,7 @@ struct smi_info { > struct si_sm_handlers *handlers; > enum si_type si_type; > spinlock_t si_lock; > + spinlock_t msg_lock; > struct list_head xmit_msgs; > struct list_head hp_xmit_msgs; > struct ipmi_smi_msg *curr_msg; > @@ -356,6 +357,13 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info) > struct timeval t; > #endif > > + /* > + * No need to save flags, we aleady have interrupts off and we > + * already hold the SMI lock. > + */ > + if (!smi_info->run_to_completion) > + spin_lock(&(smi_info->msg_lock)); > + > /* Pick the high priority queue first. 
*/ > if (!list_empty(&(smi_info->hp_xmit_msgs))) { > entry = smi_info->hp_xmit_msgs.next; > @@ -393,6 +401,9 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info) > rv = SI_SM_CALL_WITHOUT_DELAY; > } > out: > + if (!smi_info->run_to_completion) > + spin_unlock(&(smi_info->msg_lock)); > + > return rv; > } > > @@ -879,6 +890,19 @@ static void sender(void *send_info, > printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec); > #endif > > + /* > + * last_timeout_jiffies is updated here to avoid > + * smi_timeout() handler passing very large time_diff > + * value to smi_event_handler() that causes > + * the send command to abort. > + */ > + smi_info->last_timeout_jiffies = jiffies; > + > + mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); > + > + if (smi_info->thread) > + wake_up_process(smi_info->thread); > + > if (smi_info->run_to_completion) { > /* > * If we are running to completion, then throw it in > @@ -901,26 +925,15 @@ static void sender(void *send_info, > return; > } > > - spin_lock_irqsave(&smi_info->si_lock, flags); > + spin_lock_irqsave(&smi_info->msg_lock, flags); > if (priority > 0) > list_add_tail(&msg->link, &smi_info->hp_xmit_msgs); > else > list_add_tail(&msg->link, &smi_info->xmit_msgs); > + spin_unlock_irqrestore(&smi_info->msg_lock, flags); > > + spin_lock_irqsave(&smi_info->si_lock, flags); > if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL) { > - /* > - * last_timeout_jiffies is updated here to avoid > - * smi_timeout() handler passing very large time_diff > - * value to smi_event_handler() that causes > - * the send command to abort. 
> - */ > - smi_info->last_timeout_jiffies = jiffies; > - > - mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); > - > - if (smi_info->thread) > - wake_up_process(smi_info->thread); > - > start_next_msg(smi_info); > smi_event_handler(smi_info, 0); > } > @@ -1024,19 +1037,16 @@ static int ipmi_thread(void *data) > static void poll(void *send_info) > { > struct smi_info *smi_info = send_info; > - unsigned long flags = 0; > - int run_to_completion = smi_info->run_to_completion; > + unsigned long flags; > > /* > * Make sure there is some delay in the poll loop so we can > * drive time forward and timeout things. > */ > udelay(10); > - if (!run_to_completion) > - spin_lock_irqsave(&smi_info->si_lock, flags); > + spin_lock_irqsave(&smi_info->si_lock, flags); > smi_event_handler(smi_info, 10); > - if (!run_to_completion) > - spin_unlock_irqrestore(&smi_info->si_lock, flags); > + spin_unlock_irqrestore(&smi_info->si_lock, flags); > } > > static void request_events(void *send_info) > @@ -1702,8 +1712,10 @@ static struct smi_info *smi_info_alloc(void) > { > struct smi_info *info = kzalloc(sizeof(*info), GFP_KERNEL); > > - if (info) > + if (info) { > spin_lock_init(&info->si_lock); > + spin_lock_init(&info->msg_lock); > + } > return info; > } > >
That patch is in the kernel since 3.4 and it does not look like we even waited for the feedback about 3.16 kernels. Before randomly ripping out a change from the beginning of the delta between 3.2 and 3.13 and maybe risking other problems I would at least wait for the feedback about 3.16.
On 11/19/2014 03:12 AM, Stefan Bader wrote: > That patch is in the kernel since 3.4 and it does not look like we > even waited for the feedback about 3.16 kernels. Before randomly > ripping out a change from the beginning of the delta between 3.2 > and 3.13 and maybe risking other problems I would at least wait for > the feedback about 3.16. > Stefan, Fair enough, I suspect there are about 3-4 patches that may be needed to backport into 3.13. If that is too complex I still think reverting f60adf42a makes sense because it's a reduction in locking vs removing a necessary feature. --chris
On 19.11.2014 15:47, Chris J Arges wrote: > On 11/19/2014 03:12 AM, Stefan Bader wrote: >> That patch is in the kernel since 3.4 and it does not look like we >> even waited for the feedback about 3.16 kernels. Before randomly >> ripping out a change from the beginning of the delta between 3.2 >> and 3.13 and maybe risking other problems I would at least wait for >> the feedback about 3.16. >> > Stefan, > Fair enough, I suspect there are about 3-4 patches that may be needed > to backport into 3.13. If that is too complex I still think reverting > f60adf42a makes sense because its a reduction in locking vs removing a > necessary feature. > --chris > My major worries would be that since there are several more patches since 3.4 and 3.14, that might subtly rely on the change to revert. And that a positive feedback from one tester (iirc the same code covers multiple ways to expose the ipmi interface as well as methods to access it, some need polling some have interrupts) may not uncover all possibilities. Having looked at the code again in more detail, I find the locking not very easy to grasp. While comments for smi_event_handler say must be called with the lock and interrupts disabled, it is not always obvious that that is done. My hopes would be that it's something on top like commit 48e8ac2979920ffa39117e2d725afa3a749bfe8d Author: Bodo Stroesser <bstroesser@ts.fujitsu.com> Date: Mon Apr 14 09:46:51 2014 -0500 ipmi: Fix a race restarting the timer That way we would at least go forward and not deviate from upstream in a way we most certainly will forget for future releases. If it's still broken upstream maybe we could get it fixed... Stefan
On 19.11.2014 10:12, Stefan Bader wrote: > That patch is in the kernel since 3.4 and it does not look like we even waited > for the feedback about 3.16 kernels. Before randomly ripping out a change from > the beginning of the delta between 3.2 and 3.13 and maybe risking other problems > I would at least wait for the feedback about 3.16. > I have spent a bit more time on the driver and at least feel now more comfortable with the revert not causing problems for changes that came later. With 3.16 there may be a proper fix. The final remaining danger would be that future stable patches on that driver would depend on it. Hopefully such a thing will break when we apply it. So for now ACK for reverting in Trusty. -Stefan
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 4afc8b4..8c0ffdd 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -175,6 +175,7 @@ struct smi_info { struct si_sm_handlers *handlers; enum si_type si_type; spinlock_t si_lock; + spinlock_t msg_lock; struct list_head xmit_msgs; struct list_head hp_xmit_msgs; struct ipmi_smi_msg *curr_msg; @@ -356,6 +357,13 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info) struct timeval t; #endif + /* + * No need to save flags, we aleady have interrupts off and we + * already hold the SMI lock. + */ + if (!smi_info->run_to_completion) + spin_lock(&(smi_info->msg_lock)); + /* Pick the high priority queue first. */ if (!list_empty(&(smi_info->hp_xmit_msgs))) { entry = smi_info->hp_xmit_msgs.next; @@ -393,6 +401,9 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info) rv = SI_SM_CALL_WITHOUT_DELAY; } out: + if (!smi_info->run_to_completion) + spin_unlock(&(smi_info->msg_lock)); + return rv; } @@ -879,6 +890,19 @@ static void sender(void *send_info, printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec); #endif + /* + * last_timeout_jiffies is updated here to avoid + * smi_timeout() handler passing very large time_diff + * value to smi_event_handler() that causes + * the send command to abort. 
+ */ + smi_info->last_timeout_jiffies = jiffies; + + mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); + + if (smi_info->thread) + wake_up_process(smi_info->thread); + if (smi_info->run_to_completion) { /* * If we are running to completion, then throw it in @@ -901,26 +925,15 @@ static void sender(void *send_info, return; } - spin_lock_irqsave(&smi_info->si_lock, flags); + spin_lock_irqsave(&smi_info->msg_lock, flags); if (priority > 0) list_add_tail(&msg->link, &smi_info->hp_xmit_msgs); else list_add_tail(&msg->link, &smi_info->xmit_msgs); + spin_unlock_irqrestore(&smi_info->msg_lock, flags); + spin_lock_irqsave(&smi_info->si_lock, flags); if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL) { - /* - * last_timeout_jiffies is updated here to avoid - * smi_timeout() handler passing very large time_diff - * value to smi_event_handler() that causes - * the send command to abort. - */ - smi_info->last_timeout_jiffies = jiffies; - - mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); - - if (smi_info->thread) - wake_up_process(smi_info->thread); - start_next_msg(smi_info); smi_event_handler(smi_info, 0); } @@ -1024,19 +1037,16 @@ static int ipmi_thread(void *data) static void poll(void *send_info) { struct smi_info *smi_info = send_info; - unsigned long flags = 0; - int run_to_completion = smi_info->run_to_completion; + unsigned long flags; /* * Make sure there is some delay in the poll loop so we can * drive time forward and timeout things. 
*/ udelay(10); - if (!run_to_completion) - spin_lock_irqsave(&smi_info->si_lock, flags); + spin_lock_irqsave(&smi_info->si_lock, flags); smi_event_handler(smi_info, 10); - if (!run_to_completion) - spin_unlock_irqrestore(&smi_info->si_lock, flags); + spin_unlock_irqrestore(&smi_info->si_lock, flags); } static void request_events(void *send_info) @@ -1702,8 +1712,10 @@ static struct smi_info *smi_info_alloc(void) { struct smi_info *info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info) + if (info) { spin_lock_init(&info->si_lock); + spin_lock_init(&info->msg_lock); + } return info; }