[5/6] powerpc/powernv: implement opal_put_chars_nonatomic

Message ID 20180409054056.27292-6-npiggin@gmail.com
State New
Headers show
Series
  • improve OPAL cosole flushing and locking
Related show

Commit Message

Nicholas Piggin April 9, 2018, 5:40 a.m.
The RAW console does not need writes to be atomic, so implement a
_nonatomic variant which does not take a spinlock. This API is used
in xmon, so the less locking thta's used, the better chance there is
that a crash can be debugged.

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/opal.h       |  1 +
 arch/powerpc/platforms/powernv/opal.c | 35 +++++++++++++++++++--------
 drivers/tty/hvc/hvc_opal.c            |  4 +--
 3 files changed, 28 insertions(+), 12 deletions(-)

Comments

Benjamin Herrenschmidt April 9, 2018, 5:57 a.m. | #1
On Mon, 2018-04-09 at 15:40 +1000, Nicholas Piggin wrote:
> The RAW console does not need writes to be atomic, so implement a
> _nonatomic variant which does not take a spinlock. This API is used
> in xmon, so the less locking thta's used, the better chance there is
> that a crash can be debugged.

I find the term "nonatomic" confusing... don't we have a problem if we
start hitting OPAL without a lock where we can't trust
opal_console_write_buffer_space anymore ? I think we need to handle
partial writes in that case. Maybe we should return how much was
written and leave the caller to deal with it.

I was hoping (but that isn't the case) that by nonatomic you actually
meant calls that could be done in a non-atomic context, where we can do
msleep instead of mdelay. That would be handy for the console coming
from the hvc thread (the tty one).

Cheers,
Ben.

> 
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/powerpc/include/asm/opal.h       |  1 +
>  arch/powerpc/platforms/powernv/opal.c | 35 +++++++++++++++++++--------
>  drivers/tty/hvc/hvc_opal.c            |  4 +--
>  3 files changed, 28 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index bbff49fab0e5..66954d671831 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -303,6 +303,7 @@ extern void opal_configure_cores(void);
>  
>  extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
>  extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
> +extern int opal_put_chars_nonatomic(uint32_t vtermno, const char *buf, int total_len);
>  extern int opal_flush_console(uint32_t vtermno);
>  
>  extern void hvc_opal_init_early(void);
> diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
> index b05500a70f58..dc77fc57d1e9 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -344,9 +344,9 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count)
>  	return 0;
>  }
>  
> -int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
> +static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic)
>  {
> -	unsigned long flags;
> +	unsigned long flags = 0 /* shut up gcc */;
>  	int written;
>  	__be64 olen;
>  	s64 rc;
> @@ -354,11 +354,8 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
>  	if (!opal.entry)
>  		return -ENODEV;
>  
> -	/* We want put_chars to be atomic to avoid mangling of hvsi
> -	 * packets. To do that, we first test for room and return
> -	 * -EAGAIN if there isn't enough.
> -	 */
> -	spin_lock_irqsave(&opal_write_lock, flags);
> +	if (atomic)
> +		spin_lock_irqsave(&opal_write_lock, flags);
>  	rc = opal_console_write_buffer_space(vtermno, &olen);
>  	if (rc || be64_to_cpu(olen) < total_len) {
>  		/* Closed -> drop characters */
> @@ -391,14 +388,18 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
>  
>  	written = be64_to_cpu(olen);
>  	if (written < total_len) {
> -		/* Should not happen */
> -		pr_warn("atomic console write returned partial len=%d written=%d\n", total_len, written);
> +		if (atomic) {
> +			/* Should not happen */
> +			pr_warn("atomic console write returned partial "
> +				"len=%d written=%d\n", total_len, written);
> +		}
>  		if (!written)
>  			written = -EAGAIN;
>  	}
>  
>  out:
> -	spin_unlock_irqrestore(&opal_write_lock, flags);
> +	if (atomic)
> +		spin_unlock_irqrestore(&opal_write_lock, flags);
>  
>  	/* In the -EAGAIN case, callers loop, so we have to flush the console
>  	 * here in case they have interrupts off (and we don't want to wait
> @@ -412,6 +413,20 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
>  	return written;
>  }
>  
> +int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
> +{
> +	/* We want put_chars to be atomic to avoid mangling of hvsi
> +	 * packets. To do that, we first test for room and return
> +	 * -EAGAIN if there isn't enough.
> +	 */
> +	return __opal_put_chars(vtermno, data, total_len, true);
> +}
> +
> +int opal_put_chars_nonatomic(uint32_t vtermno, const char *data, int total_len)
> +{
> +	return __opal_put_chars(vtermno, data, total_len, false);
> +}
> +
>  int opal_flush_console(uint32_t vtermno)
>  {
>  	s64 rc;
> diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c
> index af122ad7f06d..e151cfacf2a7 100644
> --- a/drivers/tty/hvc/hvc_opal.c
> +++ b/drivers/tty/hvc/hvc_opal.c
> @@ -51,7 +51,7 @@ static u32 hvc_opal_boot_termno;
>  
>  static const struct hv_ops hvc_opal_raw_ops = {
>  	.get_chars = opal_get_chars,
> -	.put_chars = opal_put_chars,
> +	.put_chars = opal_put_chars_nonatomic,
>  	.notifier_add = notifier_add_irq,
>  	.notifier_del = notifier_del_irq,
>  	.notifier_hangup = notifier_hangup_irq,
> @@ -269,7 +269,7 @@ static void udbg_opal_putc(char c)
>  	do {
>  		switch(hvc_opal_boot_priv.proto) {
>  		case HV_PROTOCOL_RAW:
> -			count = opal_put_chars(termno, &c, 1);
> +			count = opal_put_chars_nonatomic(termno, &c, 1);
>  			break;
>  		case HV_PROTOCOL_HVSI:
>  			count = hvc_opal_hvsi_put_chars(termno, &c, 1);
Nicholas Piggin April 9, 2018, 6:23 a.m. | #2
On Mon, 09 Apr 2018 15:57:55 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Mon, 2018-04-09 at 15:40 +1000, Nicholas Piggin wrote:
> > The RAW console does not need writes to be atomic, so implement a
> > _nonatomic variant which does not take a spinlock. This API is used
> > in xmon, so the less locking thta's used, the better chance there is
> > that a crash can be debugged.  
> 
> I find the term "nonatomic" confusing...

I guess it is to go with the "atomic" comment for the hvsi console
case -- all characters must get to the console together or not at
all.

> don't we have a problem if we
> start hitting OPAL without a lock where we can't trust
> opal_console_write_buffer_space anymore ? I think we need to handle
> partial writes in that case. Maybe we should return how much was
> written and leave the caller to deal with it.

Yes, the _nonatomic variant doesn't use opal_console_write_buffer_space
and it does handle partial writes by returning written bytes (although
callers generally tend to loop at the moment, we might do something
smarter with them later).

> I was hoping (but that isn't the case) that by nonatomic you actually
> meant calls that could be done in a non-atomic context, where we can do
> msleep instead of mdelay. That would be handy for the console coming
> from the hvc thread (the tty one).

Ah right, no. However we no longer loop until everything is written, so
the hvc console driver (or the console layer) should be able to deal with
that with sleeping. I don't think we need to put it at this level of the
driver, but I don't know much about the console code.

Thanks,
Nick
Benjamin Herrenschmidt April 9, 2018, 8:24 a.m. | #3
On Mon, 2018-04-09 at 16:23 +1000, Nicholas Piggin wrote:
> On Mon, 09 Apr 2018 15:57:55 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Mon, 2018-04-09 at 15:40 +1000, Nicholas Piggin wrote:
> > > The RAW console does not need writes to be atomic, so implement a
> > > _nonatomic variant which does not take a spinlock. This API is used
> > > in xmon, so the less locking thta's used, the better chance there is
> > > that a crash can be debugged.  
> > 
> > I find the term "nonatomic" confusing...
> 
> I guess it is to go with the "atomic" comment for the hvsi console
> case -- all characters must get to the console together or not at
> all.

Yeah ok, it's just that in Linux "atomic" usually means something else
:-) Why not just call it "unlocked" which is what it's about and
matches existing practices thorough the kernel ?

> > don't we have a problem if we
> > start hitting OPAL without a lock where we can't trust
> > opal_console_write_buffer_space anymore ? I think we need to handle
> > partial writes in that case. Maybe we should return how much was
> > written and leave the caller to deal with it.
> 
> Yes, the _nonatomic variant doesn't use opal_console_write_buffer_space
> and it does handle partial writes by returning written bytes (although
> callers generally tend to loop at the moment, we might do something
> smarter with them later).
> 
> > I was hoping (but that isn't the case) that by nonatomic you actually
> > meant calls that could be done in a non-atomic context, where we can do
> > msleep instead of mdelay. That would be handy for the console coming
> > from the hvc thread (the tty one).
> 
> Ah right, no. However we no longer loop until everything is written, so
> the hvc console driver (or the console layer) should be able to deal with
> that with sleeping. I don't think we need to put it at this level of the
> driver, but I don't know much about the console code.

Ok, so hopefully we shouldn't be hitting the delay..

Cheers,
Ben.
Nicholas Piggin April 9, 2018, 9:02 a.m. | #4
On Mon, 09 Apr 2018 18:24:46 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Mon, 2018-04-09 at 16:23 +1000, Nicholas Piggin wrote:
> > On Mon, 09 Apr 2018 15:57:55 +1000
> > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> >   
> > > On Mon, 2018-04-09 at 15:40 +1000, Nicholas Piggin wrote:  
> > > > The RAW console does not need writes to be atomic, so implement a
> > > > _nonatomic variant which does not take a spinlock. This API is used
> > > > in xmon, so the less locking thta's used, the better chance there is
> > > > that a crash can be debugged.    
> > > 
> > > I find the term "nonatomic" confusing...  
> > 
> > I guess it is to go with the "atomic" comment for the hvsi console
> > case -- all characters must get to the console together or not at
> > all.  
> 
> Yeah ok, it's just that in Linux "atomic" usually means something else
> :-) Why not just call it "unlocked" which is what it's about and
> matches existing practices thorough the kernel ?

Sure, I'll change it.
 
> > > don't we have a problem if we
> > > start hitting OPAL without a lock where we can't trust
> > > opal_console_write_buffer_space anymore ? I think we need to handle
> > > partial writes in that case. Maybe we should return how much was
> > > written and leave the caller to deal with it.  
> > 
> > Yes, the _nonatomic variant doesn't use opal_console_write_buffer_space
> > and it does handle partial writes by returning written bytes (although
> > callers generally tend to loop at the moment, we might do something
> > smarter with them later).
> >   
> > > I was hoping (but that isn't the case) that by nonatomic you actually
> > > meant calls that could be done in a non-atomic context, where we can do
> > > msleep instead of mdelay. That would be handy for the console coming
> > > from the hvc thread (the tty one).  
> > 
> > Ah right, no. However we no longer loop until everything is written, so
> > the hvc console driver (or the console layer) should be able to deal with
> > that with sleeping. I don't think we need to put it at this level of the
> > driver, but I don't know much about the console code.  
> 
> Ok, so hopefully we shouldn't be hitting the delay..

I *think* so. It may actually hit it once on the way out, but I don't
know if it's worth adding a new API to avoid it. Probably warrants
somebody to take a look and measure things though.

Actually I did just take a look. The bigger problem actually is because
we do the console flush here and even the "good" hvc path holds an irq
lock in this case:

   dmesg-8334   12d...    0us : _raw_spin_lock_irqsave
   dmesg-8334   12d...    0us : hvc_push <-hvc_write
   dmesg-8334   12d...    1us : opal_put_chars_nonatomic <-hvc_push
   dmesg-8334   12d...    1us : __opal_put_chars <-hvc_push
   dmesg-8334   12d...    2us : opal_flush_console <-__opal_put_chars
   dmesg-8334   12d...    4us!: udelay <-opal_flush_console
   dmesg-8334   12d...  787us : soft_nmi_interrupt <-soft_nmi_common
   dmesg-8334   12d...  787us : printk_nmi_enter <-soft_nmi_interrupt
   dmesg-8334   12d.Z.  788us : rcu_nmi_enter <-soft_nmi_interrupt
   dmesg-8334   12d.Z.  788us : rcu_nmi_exit <-soft_nmi_interrupt
   dmesg-8334   12d...  788us#: printk_nmi_exit <-soft_nmi_interrupt
   dmesg-8334   12d... 10005us*: udelay <-opal_flush_console
   dmesg-8334   12d... 20007us*: udelay <-opal_flush_console
   dmesg-8334   12d... 30020us*: udelay <-opal_flush_console
   dmesg-8334   12d... 40022us*: udelay <-opal_flush_console
   dmesg-8334   12d... 50023us*: udelay <-opal_flush_console
   dmesg-8334   12d... 60024us : opal_error_code <-opal_flush_console
   dmesg-8334   12d... 60025us : _raw_spin_unlock_irqrestore <-hvc_write
   dmesg-8334   12d... 60025us : _raw_spin_unlock_irqrestore
   dmesg-8334   12d... 60025us : trace_hardirqs_on <-_raw_spin_unlock_irqrestore
   dmesg-8334   12d... 60027us : <stack trace>

60ms interrupt off latency waiting for the console to flush (just
running `dmesg` from OpenBMC console).

That requires some reworking of the hvc code, we can't fix it in
the OPAL driver alone.

Thanks,
Nick

Patch

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index bbff49fab0e5..66954d671831 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -303,6 +303,7 @@  extern void opal_configure_cores(void);
 
 extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
 extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
+extern int opal_put_chars_nonatomic(uint32_t vtermno, const char *buf, int total_len);
 extern int opal_flush_console(uint32_t vtermno);
 
 extern void hvc_opal_init_early(void);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index b05500a70f58..dc77fc57d1e9 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -344,9 +344,9 @@  int opal_get_chars(uint32_t vtermno, char *buf, int count)
 	return 0;
 }
 
-int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic)
 {
-	unsigned long flags;
+	unsigned long flags = 0 /* shut up gcc */;
 	int written;
 	__be64 olen;
 	s64 rc;
@@ -354,11 +354,8 @@  int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 	if (!opal.entry)
 		return -ENODEV;
 
-	/* We want put_chars to be atomic to avoid mangling of hvsi
-	 * packets. To do that, we first test for room and return
-	 * -EAGAIN if there isn't enough.
-	 */
-	spin_lock_irqsave(&opal_write_lock, flags);
+	if (atomic)
+		spin_lock_irqsave(&opal_write_lock, flags);
 	rc = opal_console_write_buffer_space(vtermno, &olen);
 	if (rc || be64_to_cpu(olen) < total_len) {
 		/* Closed -> drop characters */
@@ -391,14 +388,18 @@  int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 
 	written = be64_to_cpu(olen);
 	if (written < total_len) {
-		/* Should not happen */
-		pr_warn("atomic console write returned partial len=%d written=%d\n", total_len, written);
+		if (atomic) {
+			/* Should not happen */
+			pr_warn("atomic console write returned partial "
+				"len=%d written=%d\n", total_len, written);
+		}
 		if (!written)
 			written = -EAGAIN;
 	}
 
 out:
-	spin_unlock_irqrestore(&opal_write_lock, flags);
+	if (atomic)
+		spin_unlock_irqrestore(&opal_write_lock, flags);
 
 	/* In the -EAGAIN case, callers loop, so we have to flush the console
 	 * here in case they have interrupts off (and we don't want to wait
@@ -412,6 +413,20 @@  int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 	return written;
 }
 
+int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+{
+	/* We want put_chars to be atomic to avoid mangling of hvsi
+	 * packets. To do that, we first test for room and return
+	 * -EAGAIN if there isn't enough.
+	 */
+	return __opal_put_chars(vtermno, data, total_len, true);
+}
+
+int opal_put_chars_nonatomic(uint32_t vtermno, const char *data, int total_len)
+{
+	return __opal_put_chars(vtermno, data, total_len, false);
+}
+
 int opal_flush_console(uint32_t vtermno)
 {
 	s64 rc;
diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c
index af122ad7f06d..e151cfacf2a7 100644
--- a/drivers/tty/hvc/hvc_opal.c
+++ b/drivers/tty/hvc/hvc_opal.c
@@ -51,7 +51,7 @@  static u32 hvc_opal_boot_termno;
 
 static const struct hv_ops hvc_opal_raw_ops = {
 	.get_chars = opal_get_chars,
-	.put_chars = opal_put_chars,
+	.put_chars = opal_put_chars_nonatomic,
 	.notifier_add = notifier_add_irq,
 	.notifier_del = notifier_del_irq,
 	.notifier_hangup = notifier_hangup_irq,
@@ -269,7 +269,7 @@  static void udbg_opal_putc(char c)
 	do {
 		switch(hvc_opal_boot_priv.proto) {
 		case HV_PROTOCOL_RAW:
-			count = opal_put_chars(termno, &c, 1);
+			count = opal_put_chars_nonatomic(termno, &c, 1);
 			break;
 		case HV_PROTOCOL_HVSI:
 			count = hvc_opal_hvsi_put_chars(termno, &c, 1);