diff mbox series

[1/2] i2c: gpio: fault-injector: add 'lose_arbitration' injector

Message ID 20190217124126.7257-2-wsa+renesas@sang-engineering.com
State Superseded
Headers show
Series i2c: gpio: fault-injector: add two new injectors | expand

Commit Message

Wolfram Sang Feb. 17, 2019, 12:41 p.m. UTC
Add a fault injector simulating 'arbitration lost' from multi-master
setups. Read the docs for its usage.

A helper function for future fault injectors using SCL interrupts is
created to achieve this.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---

Change since RFC:

* user supplied value is now duration of interference instead of number of
  interferences
* refactored code to suppy a resusable helper function to install SCL interrupt
  handlers
* added error checks to interrupt number
* slightly renamed the SCL interrupt when registering

Thanks again, Geert, for the review!

 Documentation/i2c/gpio-fault-injection | 25 ++++++++++++
 drivers/i2c/busses/i2c-gpio.c          | 71 ++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

Comments

Geert Uytterhoeven Feb. 18, 2019, 9:17 a.m. UTC | #1
Hi Wolfram,

On Sun, Feb 17, 2019 at 1:41 PM Wolfram Sang
<wsa+renesas@sang-engineering.com> wrote:
> Add a fault injector simulating 'arbitration lost' from multi-master
> setups. Read the docs for its usage.
>
> A helper function for future fault injectors using SCL interrupts is
> created to achieve this.
>
> Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
> ---
>
> Change since RFC:
>
> * user supplied value is now duration of interference instead of number of
>   interferences
> * refactored code to suppy a resusable helper function to install SCL interrupt
>   handlers
> * added error checks to interrupt number
> * slightly renamed the SCL interrupt when registering

Thanks for the update!

> --- a/Documentation/i2c/gpio-fault-injection
> +++ b/Documentation/i2c/gpio-fault-injection
> @@ -83,3 +83,28 @@ This is why bus recovery (up to 9 clock pulses) must either check SDA or send
>  additional STOP conditions to ensure the bus has been released. Otherwise
>  random data will be written to a device!
>
> +Lost arbitration
> +================
> +
> +Here, we want to simulate the condition where the master under tests loses the

test

> +bus arbitration against another master in a multi-master setup.
> +
> +"lose_arbitration"
> +------------------
> +
> +This file is write only and you need to write the duration of the arbitration
> +interference (in us). The calling process will then sleep and wait for the

µs

We do UTF-8 in documentation, don't we?

> --- a/drivers/i2c/busses/i2c-gpio.c
> +++ b/drivers/i2c/busses/i2c-gpio.c

> @@ -162,6 +167,67 @@ static int fops_incomplete_write_byte_set(void *data, u64 addr)
>  }
>  DEFINE_DEBUGFS_ATTRIBUTE(fops_incomplete_write_byte, NULL, fops_incomplete_write_byte_set, "%llu\n");
>
> +static int i2c_gpio_fi_act_on_scl_irq(struct i2c_gpio_private_data *priv,
> +                                      irqreturn_t handler(int, void*))
> +{
> +       int ret, irq = gpiod_to_irq(priv->scl);
> +
> +       if (irq < 0)
> +               return irq;
> +
> +       i2c_lock_bus(&priv->adap, I2C_LOCK_ROOT_ADAPTER);
> +
> +       ret = gpiod_direction_input(priv->scl);
> +       if (ret)
> +               goto unlock;
> +
> +       reinit_completion(&priv->scl_irq_completion);
> +
> +       ret = request_irq(irq, handler, IRQF_TRIGGER_FALLING,
> +                         "i2c_gpio_fault_injector_scl_irq", priv);
> +       if (ret)
> +               goto output;
> +
> +       wait_for_completion_interruptible(&priv->scl_irq_completion);

Error checking/propagation (-ERESTARTSYS)?

> +
> +       free_irq(irq, priv);
> + output:
> +       ret = gpiod_direction_output(priv->scl, 1);

This may overwrite the error code returned by request_irq().

> + unlock:
> +       i2c_unlock_bus(&priv->adap, I2C_LOCK_ROOT_ADAPTER);
> +
> +       return ret;
> +}
> +
> +static irqreturn_t lose_arbitration_irq(int irq, void *dev_id)
> +{
> +       struct i2c_gpio_private_data *priv = dev_id;
> +
> +       setsda(&priv->bit_data, 0);
> +       udelay(priv->scl_irq_data);

On 32-bit, u64 scl_irq_data is truncated...

> +       setsda(&priv->bit_data, 1);
> +
> +       complete(&priv->scl_irq_completion);
> +
> +       return IRQ_HANDLED;
> +}
> +
> +static int fops_lose_arbitration_set(void *data, u64 duration)
> +{
> +       struct i2c_gpio_private_data *priv = data;
> +
> +       priv->scl_irq_data = duration;

... since calling udelay() with large numbers can be dangerous, perhaps you
want to limit it to say 100 ms max anyway?

> +       /*
> +        * Interrupt on falling SCL. This ensures that the master under test has
> +        * really started the transfer. Interrupt on falling SDA did only
> +        * exercise 'bus busy' detection on some HW but not 'arbitration lost'.
> +        * Note that the interrupt latency may cause the first bits to be
> +        * transmitted correctly.
> +        */
> +       return i2c_gpio_fi_act_on_scl_irq(priv, lose_arbitration_irq);
> +}

Gr{oetje,eeting}s,

                        Geert
Wolfram Sang Feb. 18, 2019, 8:41 p.m. UTC | #2
Hi Geert,

thanks for this review, too!

> >
> > +Lost arbitration
> > +================
> > +
> > +Here, we want to simulate the condition where the master under tests loses the
> 
> test

Ack.

> > +This file is write only and you need to write the duration of the arbitration
> > +interference (in us). The calling process will then sleep and wait for the
> 
> µs
> 
> We do UTF-8 in documentation, don't we?

Dunno, I can change. Only 4 occcurences of 'µs' in Documentation/ so far.

> > +       wait_for_completion_interruptible(&priv->scl_irq_completion);
> 
> Error checking/propagation (-ERESTARTSYS)?

Are you sure? ERESTARTSYS belongs to the "These should never be seen by
user programs." group.

> 
> > +
> > +       free_irq(irq, priv);
> > + output:
> > +       ret = gpiod_direction_output(priv->scl, 1);
> 
> This may overwrite the error code returned by request_irq().

Yeah. What do you think about this, is this too dense?

	ret = gpiod_direction_output(priv->scl, 1) ?: ret;

> > +       priv->scl_irq_data = duration;
> 
> ... since calling udelay() with large numbers can be dangerous, perhaps you
> want to limit it to say 100 ms max anyway?

Yeah, good idea. Will apply it for both injectors.

Happy hacking,

   Wolfram
Peter Rosin Feb. 18, 2019, 11:48 p.m. UTC | #3
On 2019-02-18 21:41, Wolfram Sang wrote:
> Hi Geert,
>>> +
>>> +       free_irq(irq, priv);
>>> + output:
>>> +       ret = gpiod_direction_output(priv->scl, 1);
>>
>> This may overwrite the error code returned by request_irq().
> 
> Yeah. What do you think about this, is this too dense?
> 
> 	ret = gpiod_direction_output(priv->scl, 1) ?: ret;

That may also overwrite the error code, of course. Isn't it
usually best to return the first error? I have no clue if that
guideline does not apply here, though...

Cheers,
Peter
Geert Uytterhoeven Feb. 19, 2019, 7:53 a.m. UTC | #4
Hi Wolfram,

On Mon, Feb 18, 2019 at 9:41 PM Wolfram Sang <wsa@the-dreams.de> wrote:
> > > +       wait_for_completion_interruptible(&priv->scl_irq_completion);
> >
> > Error checking/propagation (-ERESTARTSYS)?
>
> Are you sure? ERESTARTSYS belongs to the "These should never be seen by
> user programs." group.

How else can you inform the user the operation has been interrupted?

Gr{oetje,eeting}s,

                        Geert
Wolfram Sang Feb. 19, 2019, 1:13 p.m. UTC | #5
> >>> +       ret = gpiod_direction_output(priv->scl, 1);
> >>
> >> This may overwrite the error code returned by request_irq().
> > 
> > Yeah. What do you think about this, is this too dense?
> > 
> > 	ret = gpiod_direction_output(priv->scl, 1) ?: ret;
> 
> That may also overwrite the error code, of course. Isn't it
> usually best to return the first error? I have no clue if that
> guideline does not apply here, though...

I am neither entirely sure here. My take was that the above was the more
severe error. Because if setting to output fails, the GPIO I2C bus will
be broken. If the former stuff fails, well, the injection didn't work or
was interrupted.

However, the GPIO was set to output before the injector. So, if setting
it back fails, then the system likely has more severe problems anyhow?

I am open to any better solution. However, let's not forget, this is
debug code aimed to be used by devs.
Wolfram Sang Feb. 19, 2019, 1:18 p.m. UTC | #6
> > > > +       wait_for_completion_interruptible(&priv->scl_irq_completion);
> > >
> > > Error checking/propagation (-ERESTARTSYS)?
> >
> > Are you sure? ERESTARTSYS belongs to the "These should never be seen by
> > user programs." group.
> 
> How else can you inform the user the operation has been interrupted?

Definately not by using something which is marked "should never be seen
by user programs" :)

In the worst case, I'll add:
	if (ret) ret = -EINTR;

I tested the current code with CTRL+C, there we get EOWNERDEAD back to
userspace, even with my code not propagating anything. With sending
SIGKILL, I got 143 which is not defined. I want to double check the
latter first, might be my tests were flaky...
Geert Uytterhoeven Feb. 19, 2019, 1:33 p.m. UTC | #7
Hi Wolfram,

On Tue, Feb 19, 2019 at 2:18 PM Wolfram Sang <wsa@the-dreams.de> wrote:
> > > > > +       wait_for_completion_interruptible(&priv->scl_irq_completion);
> > > >
> > > > Error checking/propagation (-ERESTARTSYS)?
> > >
> > > Are you sure? ERESTARTSYS belongs to the "These should never be seen by
> > > user programs." group.
> >
> > How else can you inform the user the operation has been interrupted?
>
> Definately not by using something which is marked "should never be seen
> by user programs" :)
>
> In the worst case, I'll add:
>         if (ret) ret = -EINTR;
>
> I tested the current code with CTRL+C, there we get EOWNERDEAD back to
> userspace, even with my code not propagating anything. With sending
> SIGKILL, I got 143 which is not defined. I want to double check the
> latter first, might be my tests were flaky...

Where's the kernel code that returns EOWNERDEAD?
Must be hidden in a complex preprocessor macro, as git grep cannot find it :-(

Gr{oetje,eeting}s,

                        Geert
Wolfram Sang Feb. 19, 2019, 1:37 p.m. UTC | #8
> > > > > > +       wait_for_completion_interruptible(&priv->scl_irq_completion);
> > > > >
> > > > > Error checking/propagation (-ERESTARTSYS)?
> > > >
> > > > Are you sure? ERESTARTSYS belongs to the "These should never be seen by
> > > > user programs." group.
> > >
> > > How else can you inform the user the operation has been interrupted?
> >
> > Definately not by using something which is marked "should never be seen
> > by user programs" :)
> >
> > In the worst case, I'll add:
> >         if (ret) ret = -EINTR;
> >
> > I tested the current code with CTRL+C, there we get EOWNERDEAD back to
> > userspace, even with my code not propagating anything. With sending
> > SIGKILL, I got 143 which is not defined. I want to double check the
> > latter first, might be my tests were flaky...
> 
> Where's the kernel code that returns EOWNERDEAD?
> Must be hidden in a complex preprocessor macro, as git grep cannot find it :-(

I rather think it is glibc returning it when it discovered that the
owner of a robust mutex was gone:

http://man7.org/linux/man-pages/man3/pthread_mutexattr_setrobust.3.html
Peter Rosin Feb. 19, 2019, 2:07 p.m. UTC | #9
On 2019-02-19 14:13, Wolfram Sang wrote:
> 
>>>>> +       ret = gpiod_direction_output(priv->scl, 1);
>>>>
>>>> This may overwrite the error code returned by request_irq().
>>>
>>> Yeah. What do you think about this, is this too dense?
>>>
>>> 	ret = gpiod_direction_output(priv->scl, 1) ?: ret;
>>
>> That may also overwrite the error code, of course. Isn't it
>> usually best to return the first error? I have no clue if that
>> guideline does not apply here, though...
> 
> I am neither entirely sure here. My take was that the above was the more
> severe error. Because if setting to output fails, the GPIO I2C bus will
> be broken. If the former stuff fails, well, the injection didn't work or
> was interrupted.
> 
> However, the GPIO was set to output before the injector. So, if setting
> it back fails, then the system likely has more severe problems anyhow?

One way to end up with that is if there is an irq attached to the gpio
pin. If you disable the irq, you are (sometimes) allowed to change the
gpio to output, but not if the irq is active. So, if some other part of
the driver "steals" the gpio by activating an irq while the injector is
doing its thing, it is possible to end up with this.

But that seems like a gigantic corner case.

> I am open to any better solution. However, let's not forget, this is
> debug code aimed to be used by devs.
> 

You are right, please ignore me.

Cheers,
Peter
Wolfram Sang Feb. 19, 2019, 4:26 p.m. UTC | #10
> I tested the current code with CTRL+C, there we get EOWNERDEAD back to
> userspace, even with my code not propagating anything. With sending
> SIGKILL, I got 143 which is not defined. I want to double check the
> latter first, might be my tests were flaky...

Yeah, I mixed it up. That was SIGTERM, of course. SIGKILL gives 137. A
signal gives you errorcode 128 + signal_number.

So, I don't think we need to propagate the wait_for_completion result
value.
diff mbox series

Patch

diff --git a/Documentation/i2c/gpio-fault-injection b/Documentation/i2c/gpio-fault-injection
index 1a44e3edc0c4..5262801438e6 100644
--- a/Documentation/i2c/gpio-fault-injection
+++ b/Documentation/i2c/gpio-fault-injection
@@ -83,3 +83,28 @@  This is why bus recovery (up to 9 clock pulses) must either check SDA or send
 additional STOP conditions to ensure the bus has been released. Otherwise
 random data will be written to a device!
 
+Lost arbitration
+================
+
+Here, we want to simulate the condition where the master under tests loses the
+bus arbitration against another master in a multi-master setup.
+
+"lose_arbitration"
+------------------
+
+This file is write only and you need to write the duration of the arbitration
+interference (in us). The calling process will then sleep and wait for the
+next bus clock. The process is interruptible, though.
+
+Arbitration lost is achieved by waiting for SCL going down by the master under
+test and then pulling SDA low for some time. So, the I2C address sent out
+should be corrupted and that should be detected properly. That means that the
+address sent out should have a lot of '1' bits to be able to detect corruption.
+There doesn't need to be a device at this address because arbitration lost
+should be detected beforehand. Also note, that SCL going down is monitored
+using interrupts, so the interrupt latency might cause the first bits to be not
+corrupted. A good starting point for using this fault injector on an otherwise
+idle bus is:
+
+# echo 200 > lose_arbitration &
+# i2cget -y <bus_to_test> 0x3f
diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c
index 2d532cc042f5..f3645148d120 100644
--- a/drivers/i2c/busses/i2c-gpio.c
+++ b/drivers/i2c/busses/i2c-gpio.c
@@ -7,12 +7,14 @@ 
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <linux/completion.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/gpio/consumer.h>
 #include <linux/i2c-algo-bit.h>
 #include <linux/i2c.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/platform_data/i2c-gpio.h>
@@ -27,6 +29,9 @@  struct i2c_gpio_private_data {
 	struct i2c_gpio_platform_data pdata;
 #ifdef CONFIG_I2C_GPIO_FAULT_INJECTOR
 	struct dentry *debug_dir;
+	/* these must be protected by bus lock */
+	struct completion scl_irq_completion;
+	u64 scl_irq_data;
 #endif
 };
 
@@ -162,6 +167,67 @@  static int fops_incomplete_write_byte_set(void *data, u64 addr)
 }
 DEFINE_DEBUGFS_ATTRIBUTE(fops_incomplete_write_byte, NULL, fops_incomplete_write_byte_set, "%llu\n");
 
+static int i2c_gpio_fi_act_on_scl_irq(struct i2c_gpio_private_data *priv,
+				       irqreturn_t handler(int, void*))
+{
+	int ret, irq = gpiod_to_irq(priv->scl);
+
+	if (irq < 0)
+		return irq;
+
+	i2c_lock_bus(&priv->adap, I2C_LOCK_ROOT_ADAPTER);
+
+	ret = gpiod_direction_input(priv->scl);
+	if (ret)
+		goto unlock;
+
+	reinit_completion(&priv->scl_irq_completion);
+
+	ret = request_irq(irq, handler, IRQF_TRIGGER_FALLING,
+			  "i2c_gpio_fault_injector_scl_irq", priv);
+	if (ret)
+		goto output;
+
+	wait_for_completion_interruptible(&priv->scl_irq_completion);
+
+	free_irq(irq, priv);
+ output:
+	ret = gpiod_direction_output(priv->scl, 1);
+ unlock:
+	i2c_unlock_bus(&priv->adap, I2C_LOCK_ROOT_ADAPTER);
+
+	return ret;
+}
+
+static irqreturn_t lose_arbitration_irq(int irq, void *dev_id)
+{
+	struct i2c_gpio_private_data *priv = dev_id;
+
+	setsda(&priv->bit_data, 0);
+	udelay(priv->scl_irq_data);
+	setsda(&priv->bit_data, 1);
+
+	complete(&priv->scl_irq_completion);
+
+	return IRQ_HANDLED;
+}
+
+static int fops_lose_arbitration_set(void *data, u64 duration)
+{
+	struct i2c_gpio_private_data *priv = data;
+
+	priv->scl_irq_data = duration;
+	/*
+	 * Interrupt on falling SCL. This ensures that the master under test has
+	 * really started the transfer. Interrupt on falling SDA did only
+	 * exercise 'bus busy' detection on some HW but not 'arbitration lost'.
+	 * Note that the interrupt latency may cause the first bits to be
+	 * transmitted correctly.
+	 */
+	return i2c_gpio_fi_act_on_scl_irq(priv, lose_arbitration_irq);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fops_lose_arbitration, NULL, fops_lose_arbitration_set, "%llu\n");
+
 static void i2c_gpio_fault_injector_init(struct platform_device *pdev)
 {
 	struct i2c_gpio_private_data *priv = platform_get_drvdata(pdev);
@@ -181,10 +247,15 @@  static void i2c_gpio_fault_injector_init(struct platform_device *pdev)
 	if (!priv->debug_dir)
 		return;
 
+	init_completion(&priv->scl_irq_completion);
+
 	debugfs_create_file_unsafe("incomplete_address_phase", 0200, priv->debug_dir,
 				   priv, &fops_incomplete_addr_phase);
 	debugfs_create_file_unsafe("incomplete_write_byte", 0200, priv->debug_dir,
 				   priv, &fops_incomplete_write_byte);
+	if (priv->bit_data.getscl)
+		debugfs_create_file_unsafe("lose_arbitration", 0200, priv->debug_dir,
+					   priv, &fops_lose_arbitration);
 	debugfs_create_file_unsafe("scl", 0600, priv->debug_dir, priv, &fops_scl);
 	debugfs_create_file_unsafe("sda", 0600, priv->debug_dir, priv, &fops_sda);
 }