[1/4] gpio: Remove VLA from gpiolib

Message ID 20180310001021.6437-2-labbott@redhat.com
State New
Headers show
Series
  • VLA removal from the GPIO subsystem
Related show

Commit Message

Laura Abbott March 10, 2018, 12:10 a.m.
The new challenge is to remove VLAs from the kernel
(see https://lkml.org/lkml/2018/3/7/621)

This patch replaces several VLAs with an appropriate call to
kmalloc_array.

Signed-off-by: Laura Abbott <labbott@redhat.com>
---
 drivers/gpio/gpiolib.c | 55 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 7 deletions(-)

Comments

Rasmus Villemoes March 12, 2018, 3 p.m. | #1
On 2018-03-10 01:10, Laura Abbott wrote:
>  		/* collect all inputs belonging to the same chip */
>  		first = i;
> -		memset(mask, 0, sizeof(mask));
> +		memset(mask, 0, sizeof(*mask));

see below

> @@ -2887,14 +2909,30 @@ void gpiod_set_array_value_complex(bool raw, bool can_sleep,
>  
>  	while (i < array_size) {
>  		struct gpio_chip *chip = desc_array[i]->gdev->chip;
> -		unsigned long mask[BITS_TO_LONGS(chip->ngpio)];
> -		unsigned long bits[BITS_TO_LONGS(chip->ngpio)];
> +		unsigned long *mask;
> +		unsigned long *bits;
>  		int count = 0;
>  
> +		mask = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
> +				sizeof(*mask),
> +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> +
> +		if (!mask)
> +			return;
> +
> +		bits = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
> +				sizeof(*bits),
> +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> +
> +		if (!bits) {
> +			kfree(mask);
> +			return;
> +		}
> +
>  		if (!can_sleep)
>  			WARN_ON(chip->can_sleep);
>  
> -		memset(mask, 0, sizeof(mask));
> +		memset(mask, 0, sizeof(*mask));

Hm, it seems you're now only clearing the first word of mask, not the
entire allocation. Why not just use kcalloc() instead of kmalloc_array
to have it automatically cleared?

Other random thoughts: maybe two allocations for each loop iteration is
a bit much. Maybe do a first pass over the array and collect the maximal
chip->ngpio, do the memory allocation and freeing outside the loop (then
you'd of course need to preserve the memset() with appropriate length
computed). And maybe even just do one allocation, making bits point at
the second half.

Does the set function need to be updated to return an int to be able to
inform the caller that memory allocation failed?

Rasmus
--
To unsubscribe from this list: send the line "unsubscribe linux-gpio" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Laura Abbott March 12, 2018, 11:40 p.m. | #2
On 03/12/2018 08:00 AM, Rasmus Villemoes wrote:
> On 2018-03-10 01:10, Laura Abbott wrote:
>>   		/* collect all inputs belonging to the same chip */
>>   		first = i;
>> -		memset(mask, 0, sizeof(mask));
>> +		memset(mask, 0, sizeof(*mask));
> 
> see below
> 
>> @@ -2887,14 +2909,30 @@ void gpiod_set_array_value_complex(bool raw, bool can_sleep,
>>   
>>   	while (i < array_size) {
>>   		struct gpio_chip *chip = desc_array[i]->gdev->chip;
>> -		unsigned long mask[BITS_TO_LONGS(chip->ngpio)];
>> -		unsigned long bits[BITS_TO_LONGS(chip->ngpio)];
>> +		unsigned long *mask;
>> +		unsigned long *bits;
>>   		int count = 0;
>>   
>> +		mask = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
>> +				sizeof(*mask),
>> +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
>> +
>> +		if (!mask)
>> +			return;
>> +
>> +		bits = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
>> +				sizeof(*bits),
>> +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
>> +
>> +		if (!bits) {
>> +			kfree(mask);
>> +			return;
>> +		}
>> +
>>   		if (!can_sleep)
>>   			WARN_ON(chip->can_sleep);
>>   
>> -		memset(mask, 0, sizeof(mask));
>> +		memset(mask, 0, sizeof(*mask));
> 
> Hm, it seems you're now only clearing the first word of mask, not the
> entire allocation. Why not just use kcalloc() instead of kmalloc_array
> to have it automatically cleared?
> 

Bleh, I didn't think through that carefully. I'll just switch
to kcalloc, especially since it calls kmalloc_array.

> Other random thoughts: maybe two allocations for each loop iteration is
> a bit much. Maybe do a first pass over the array and collect the maximal
> chip->ngpio, do the memory allocation and freeing outside the loop (then
> you'd of course need to preserve the memset() with appropriate length
> computed). And maybe even just do one allocation, making bits point at
> the second half.
> 

I was trying to make minimal changes and match the existing code. Is this
likely to be an actual hot path to optimize?

> Does the set function need to be updated to return an int to be able to
> inform the caller that memory allocation failed?
> 

That would involve changing the public API. I don't have a problem
doing so if that's what you want.

> Rasmus
> 

Thanks,
Laura
--
To unsubscribe from this list: send the line "unsubscribe linux-gpio" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rasmus Villemoes March 13, 2018, 7:23 a.m. | #3
On 2018-03-13 00:40, Laura Abbott wrote:
> On 03/12/2018 08:00 AM, Rasmus Villemoes wrote:
>>
>> Hm, it seems you're now only clearing the first word of mask, not the
>> entire allocation. Why not just use kcalloc() instead of kmalloc_array
>> to have it automatically cleared?
> 
> Bleh, I didn't think through that carefully. I'll just switch
> to kcalloc, especially since it calls kmalloc_array.
> 
>> Other random thoughts: maybe two allocations for each loop iteration is
>> a bit much. Maybe do a first pass over the array and collect the maximal
>> chip->ngpio, do the memory allocation and freeing outside the loop (then
>> you'd of course need to preserve the memset() with appropriate length
>> computed). And maybe even just do one allocation, making bits point at
>> the second half.
>>
> 
> I was trying to make minimal changes and match the existing code.

Well, textually you may be making the minimal changes (though the error
handling adds some "boilerplate" kfree()s etc.), but semantically adding
two kmallocs in a loop could be a big deal. Dunno, maybe in practice all
the gpios (almost always) belong to the same chip, so there's only one
iteration through the loop anyway.

> Is this likely to be an actual hot path to optimize?

No idea, it was just a drive-by comment, so also...

>> Does the set function need to be updated to return an int to be able to
>> inform the caller that memory allocation failed?
> 
> That would involve changing the public API. I don't have a problem
> doing so if that's what you want.

... I don't "want" anything, that'll be for the gpio maintainers to decide.

Rasmus
--
To unsubscribe from this list: send the line "unsubscribe linux-gpio" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Wunner March 17, 2018, 8:25 a.m. | #4
On Mon, Mar 12, 2018 at 04:00:36PM +0100, Rasmus Villemoes wrote:
> On 2018-03-10 01:10, Laura Abbott wrote:
> > @@ -2887,14 +2909,30 @@ void gpiod_set_array_value_complex(bool raw, bool can_sleep,
> >  
> >  	while (i < array_size) {
> >  		struct gpio_chip *chip = desc_array[i]->gdev->chip;
> > -		unsigned long mask[BITS_TO_LONGS(chip->ngpio)];
> > -		unsigned long bits[BITS_TO_LONGS(chip->ngpio)];
> > +		unsigned long *mask;
> > +		unsigned long *bits;
> >  		int count = 0;
> >  
> > +		mask = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
> > +				sizeof(*mask),
> > +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> > +
> > +		if (!mask)
> > +			return;
> > +
> > +		bits = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
> > +				sizeof(*bits),
> > +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> > +
> > +		if (!bits) {
> > +			kfree(mask);
> > +			return;
> > +		}
> > +
> >  		if (!can_sleep)
> >  			WARN_ON(chip->can_sleep);
> >  
> > -		memset(mask, 0, sizeof(mask));
> > +		memset(mask, 0, sizeof(*mask));
> 
> Other random thoughts: maybe two allocations for each loop iteration is
> a bit much. Maybe do a first pass over the array and collect the maximal
> chip->ngpio, do the memory allocation and freeing outside the loop (then
> you'd of course need to preserve the memset() with appropriate length
> computed). And maybe even just do one allocation, making bits point at
> the second half.

I think those are great ideas because the function is kind of a hotpath
and usage of VLAs was motivated by the desire to make it fast.

I'd go one step further and store the maximum ngpio of all registered
chips in a global variable (and update it in gpiochip_add_data_with_key()),
then allocate 2 * max_ngpio once before entering the loop (as you've
suggested).  That would avoid the first pass to determine the maximum
chip->ngpio.  In most systems max_ngpio will be < 64, so one or two
unsigned longs depending on the arch's bitness.

FWIW, to achieve a stack overflow the platform or a driver need to specify
a huge number of GPIOs for a chip.  So the exploitability is limited,
but of course it's still better to get rid of the VLAs.

Running v2 of this patch through checkpatch --strict results in a few
"Alignment should match open parenthesis" and one "Please don't use
multiple blank lines" complaint, granted those are nits but it may
be worth fixing them up front lest the usual suspects come along and
submit bikeshedding patches.

Thanks,

Lukas
--
To unsubscribe from this list: send the line "unsubscribe linux-gpio" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Wunner March 18, 2018, 2:23 p.m. | #5
On Sat, Mar 17, 2018 at 09:25:09AM +0100, Lukas Wunner wrote:
> On Mon, Mar 12, 2018 at 04:00:36PM +0100, Rasmus Villemoes wrote:
> > On 2018-03-10 01:10, Laura Abbott wrote:
> > > @@ -2887,14 +2909,30 @@ void gpiod_set_array_value_complex(bool raw, bool can_sleep,
> > >  
> > >  	while (i < array_size) {
> > >  		struct gpio_chip *chip = desc_array[i]->gdev->chip;
> > > -		unsigned long mask[BITS_TO_LONGS(chip->ngpio)];
> > > -		unsigned long bits[BITS_TO_LONGS(chip->ngpio)];
> > > +		unsigned long *mask;
> > > +		unsigned long *bits;
> > >  		int count = 0;
> > >  
> > > +		mask = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
> > > +				sizeof(*mask),
> > > +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> > > +
> > > +		if (!mask)
> > > +			return;
> > > +
> > > +		bits = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
> > > +				sizeof(*bits),
> > > +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> > > +
> > > +		if (!bits) {
> > > +			kfree(mask);
> > > +			return;
> > > +		}
> > > +
> > >  		if (!can_sleep)
> > >  			WARN_ON(chip->can_sleep);
> > >  
> > > -		memset(mask, 0, sizeof(mask));
> > > +		memset(mask, 0, sizeof(*mask));
> > 
> > Other random thoughts: maybe two allocations for each loop iteration is
> > a bit much. Maybe do a first pass over the array and collect the maximal
> > chip->ngpio, do the memory allocation and freeing outside the loop (then
> > you'd of course need to preserve the memset() with appropriate length
> > computed). And maybe even just do one allocation, making bits point at
> > the second half.
> 
> I think those are great ideas because the function is kind of a hotpath
> and usage of VLAs was motivated by the desire to make it fast.
> 
> I'd go one step further and store the maximum ngpio of all registered
> chips in a global variable (and update it in gpiochip_add_data_with_key()),
> then allocate 2 * max_ngpio once before entering the loop (as you've
> suggested).  That would avoid the first pass to determine the maximum
> chip->ngpio.  In most systems max_ngpio will be < 64, so one or two
> unsigned longs depending on the arch's bitness.

Actually, scratch that.  If ngpio is usually smallish, we can just
allocate reasonably sized space for mask and bits on the stack,
and fall back to the kcalloc slowpath only if chip->ngpio exceeds
that limit.  Basically the below (likewise compile-tested only),
this is on top of Laura's patch, could be squashed together.
Let me know what you think, thanks.

-- >8 --
Subject: [PATCH] gpio: Add fastpath to gpiod_get/set_array_value_complex()

Signed-off-by: Lukas Wunner <lukas@wunner.de>
---
 drivers/gpio/gpiolib.c | 76 ++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 429bc251392b..ffc67b0b866c 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2432,6 +2432,8 @@ static int gpio_chip_get_multiple(struct gpio_chip *chip,
 	return -EIO;
 }
 
+#define FASTPATH_NGPIO 256
+
 int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 				  unsigned int array_size,
 				  struct gpio_desc **desc_array,
@@ -2441,27 +2443,24 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 
 	while (i < array_size) {
 		struct gpio_chip *chip = desc_array[i]->gdev->chip;
-		unsigned long *mask;
-		unsigned long *bits;
+		unsigned long fastpath[2 * BITS_TO_LONGS(FASTPATH_NGPIO)];
+		unsigned long *slowpath = NULL, *mask, *bits;
 		int first, j, ret;
 
-		mask = kcalloc(BITS_TO_LONGS(chip->ngpio),
-				sizeof(*mask),
-				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
-
-		if (!mask)
-			return -ENOMEM;
-
-		bits = kcalloc(BITS_TO_LONGS(chip->ngpio),
-				sizeof(*bits),
-				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
-
-		if (!bits) {
-			kfree(mask);
-			return -ENOMEM;
+		if (likely(chip->ngpio <= FASTPATH_NGPIO)) {
+			memset(fastpath, 0, sizeof(fastpath));
+			mask = fastpath;
+			bits = fastpath + BITS_TO_LONGS(FASTPATH_NGPIO);
+		} else {
+			slowpath = kcalloc(2 * BITS_TO_LONGS(chip->ngpio),
+					   sizeof(*slowpath),
+					   can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+			if (!slowpath)
+				return -ENOMEM;
+			mask = slowpath;
+			bits = slowpath + BITS_TO_LONGS(chip->ngpio);
 		}
 
-
 		if (!can_sleep)
 			WARN_ON(chip->can_sleep);
 
@@ -2478,8 +2477,8 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 
 		ret = gpio_chip_get_multiple(chip, mask, bits);
 		if (ret) {
-			kfree(bits);
-			kfree(mask);
+			if (slowpath)
+				kfree(slowpath);
 			return ret;
 		}
 
@@ -2493,8 +2492,9 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 			value_array[j] = value;
 			trace_gpio_value(desc_to_gpio(desc), 1, value);
 		}
-		kfree(bits);
-		kfree(mask);
+
+		if (slowpath)
+			kfree(slowpath);
 	}
 	return 0;
 }
@@ -2699,24 +2699,22 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep,
 
 	while (i < array_size) {
 		struct gpio_chip *chip = desc_array[i]->gdev->chip;
-		unsigned long *mask;
-		unsigned long *bits;
+		unsigned long fastpath[2 * BITS_TO_LONGS(FASTPATH_NGPIO)];
+		unsigned long *slowpath = NULL, *mask, *bits;
 		int count = 0;
 
-		mask = kcalloc(BITS_TO_LONGS(chip->ngpio),
-				sizeof(*mask),
-				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
-
-		if (!mask)
-			return -ENOMEM;
-
-		bits = kcalloc(BITS_TO_LONGS(chip->ngpio),
-				sizeof(*bits),
-				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
-
-		if (!bits) {
-			kfree(mask);
-			return -ENOMEM;
+		if (likely(chip->ngpio <= FASTPATH_NGPIO)) {
+			memset(fastpath, 0, sizeof(fastpath));
+			mask = fastpath;
+			bits = fastpath + BITS_TO_LONGS(FASTPATH_NGPIO);
+		} else {
+			slowpath = kcalloc(2 * BITS_TO_LONGS(chip->ngpio),
+					   sizeof(*slowpath),
+					   can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+			if (!slowpath)
+				return -ENOMEM;
+			mask = slowpath;
+			bits = slowpath + BITS_TO_LONGS(chip->ngpio);
 		}
 
 		if (!can_sleep)
@@ -2753,8 +2751,8 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep,
 		if (count != 0)
 			gpio_chip_set_multiple(chip, mask, bits);
 
-		kfree(mask);
-		kfree(bits);
+		if (slowpath)
+			kfree(slowpath);
 	}
 	return 0;
 }
Rasmus Villemoes March 18, 2018, 8:34 p.m. | #6
On 2018-03-18 15:23, Lukas Wunner wrote:
>>>
>>> Other random thoughts: maybe two allocations for each loop iteration is
>>> a bit much. Maybe do a first pass over the array and collect the maximal
>>> chip->ngpio, do the memory allocation and freeing outside the loop (then
>>> you'd of course need to preserve the memset() with appropriate length
>>> computed). And maybe even just do one allocation, making bits point at
>>> the second half.
>>
>> I think those are great ideas because the function is kind of a hotpath
>> and usage of VLAs was motivated by the desire to make it fast.
>>
>> I'd go one step further and store the maximum ngpio of all registered
>> chips in a global variable (and update it in gpiochip_add_data_with_key()),
>> then allocate 2 * max_ngpio once before entering the loop (as you've
>> suggested).  That would avoid the first pass to determine the maximum
>> chip->ngpio.  In most systems max_ngpio will be < 64, so one or two
>> unsigned longs depending on the arch's bitness.
> 
> Actually, scratch that.  If ngpio is usually smallish, we can just
> allocate reasonably sized space for mask and bits on the stack,

Yes.

> and fall back to the kcalloc slowpath only if chip->ngpio exceeds
> that limit.

Well, I'd suggest not adding that fallback code now, but simply add a
check in gpiochip_add_data_with_key to ensure ngpio is sane (and refuse
to register the chip otherwise), at least if we know that every
currently supported/known chip is covered by the 256 (?). That keeps the
code simple and fast, and then if somebody has a chip with 40000 gpio
lines, we can add a fallback path. Or we could consider alternative
solutions, to avoid a 10000 byte GFP_ATOMIC allocation (maybe hang a
pre-allocation off the gpio_chip; that's only two more bits per
descriptor, and there's already a whole gpio_desc for each - but not
sure about the locking in that case).

Rasmus
--
To unsubscribe from this list: send the line "unsubscribe linux-gpio" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Wunner March 19, 2018, 7 a.m. | #7
On Sun, Mar 18, 2018 at 09:34:12PM +0100, Rasmus Villemoes wrote:
> On 2018-03-18 15:23, Lukas Wunner wrote:
> >>> Other random thoughts: maybe two allocations for each loop iteration is
> >>> a bit much. Maybe do a first pass over the array and collect the maximal
> >>> chip->ngpio, do the memory allocation and freeing outside the loop (then
> >>> you'd of course need to preserve the memset() with appropriate length
> >>> computed). And maybe even just do one allocation, making bits point at
> >>> the second half.
> >>
> >> I think those are great ideas because the function is kind of a hotpath
> >> and usage of VLAs was motivated by the desire to make it fast.
> >>
> >> I'd go one step further and store the maximum ngpio of all registered
> >> chips in a global variable (and update it in gpiochip_add_data_with_key()),
> >> then allocate 2 * max_ngpio once before entering the loop (as you've
> >> suggested).  That would avoid the first pass to determine the maximum
> >> chip->ngpio.  In most systems max_ngpio will be < 64, so one or two
> >> unsigned longs depending on the arch's bitness.
> > 
> > Actually, scratch that.  If ngpio is usually smallish, we can just
> > allocate reasonably sized space for mask and bits on the stack,
> 
> Yes.
> 
> > and fall back to the kcalloc slowpath only if chip->ngpio exceeds
> > that limit.
> 
> Well, I'd suggest not adding that fallback code now, but simply add a
> check in gpiochip_add_data_with_key to ensure ngpio is sane (and refuse
> to register the chip otherwise), at least if we know that every
> currently supported/known chip is covered by the 256 (?).

The number 256 was arbitrarily chosen.  I really wouldn't be surprised
if gpiochips with more pins exist, but they're probably rare, so using
the slowpath seems fine, but dropping support for them completely would
be a regression.

E.g. many serially attached chips such as MAX3191X are daisy-chainable
and the driver deliberately doesn't impose an upper limit on the number
of chips because the spec doesn't contain one either.  To the OS a
daisy-chain of such chips appears as a single gpiochip with many pins.

Thanks,

Lukas
--
To unsubscribe from this list: send the line "unsubscribe linux-gpio" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Shevchenko March 19, 2018, 3:09 p.m. | #8
On Mon, Mar 19, 2018 at 9:00 AM, Lukas Wunner <lukas@wunner.de> wrote:
> On Sun, Mar 18, 2018 at 09:34:12PM +0100, Rasmus Villemoes wrote:
>> On 2018-03-18 15:23, Lukas Wunner wrote:

>> > Actually, scratch that.  If ngpio is usually smallish, we can just
>> > allocate reasonably sized space for mask and bits on the stack,
>> > and fall back to the kcalloc slowpath only if chip->ngpio exceeds
>> > that limit.

>> Well, I'd suggest not adding that fallback code now, but simply add a
>> check in gpiochip_add_data_with_key to ensure ngpio is sane (and refuse
>> to register the chip otherwise), at least if we know that every
>> currently supported/known chip is covered by the 256 (?).
>
> The number 256 was arbitrarily chosen.  I really wouldn't be surprised
> if gpiochips with more pins exist, but they're probably rare, so using
> the slowpath seems fine, but dropping support for them completely would
> be a regression.

All modern Intel SoCs have GPIO count in between of ~230-380.
Though, few of them are split to communities by (much) less than 256
pin in each when there is a 1:1 mapping between community and
gpiochip.

OTOH, the function you are fixing is most likely is not used together
with the drivers for x86.
Laura Abbott March 28, 2018, 12:37 a.m. | #9
On 03/18/2018 07:23 AM, Lukas Wunner wrote:
> On Sat, Mar 17, 2018 at 09:25:09AM +0100, Lukas Wunner wrote:
>> On Mon, Mar 12, 2018 at 04:00:36PM +0100, Rasmus Villemoes wrote:
>>> On 2018-03-10 01:10, Laura Abbott wrote:
>>>> @@ -2887,14 +2909,30 @@ void gpiod_set_array_value_complex(bool raw, bool can_sleep,
>>>>   
>>>>   	while (i < array_size) {
>>>>   		struct gpio_chip *chip = desc_array[i]->gdev->chip;
>>>> -		unsigned long mask[BITS_TO_LONGS(chip->ngpio)];
>>>> -		unsigned long bits[BITS_TO_LONGS(chip->ngpio)];
>>>> +		unsigned long *mask;
>>>> +		unsigned long *bits;
>>>>   		int count = 0;
>>>>   
>>>> +		mask = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
>>>> +				sizeof(*mask),
>>>> +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
>>>> +
>>>> +		if (!mask)
>>>> +			return;
>>>> +
>>>> +		bits = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
>>>> +				sizeof(*bits),
>>>> +				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
>>>> +
>>>> +		if (!bits) {
>>>> +			kfree(mask);
>>>> +			return;
>>>> +		}
>>>> +
>>>>   		if (!can_sleep)
>>>>   			WARN_ON(chip->can_sleep);
>>>>   
>>>> -		memset(mask, 0, sizeof(mask));
>>>> +		memset(mask, 0, sizeof(*mask));
>>>
>>> Other random thoughts: maybe two allocations for each loop iteration is
>>> a bit much. Maybe do a first pass over the array and collect the maximal
>>> chip->ngpio, do the memory allocation and freeing outside the loop (then
>>> you'd of course need to preserve the memset() with appropriate length
>>> computed). And maybe even just do one allocation, making bits point at
>>> the second half.
>>
>> I think those are great ideas because the function is kind of a hotpath
>> and usage of VLAs was motivated by the desire to make it fast.
>>
>> I'd go one step further and store the maximum ngpio of all registered
>> chips in a global variable (and update it in gpiochip_add_data_with_key()),
>> then allocate 2 * max_ngpio once before entering the loop (as you've
>> suggested).  That would avoid the first pass to determine the maximum
>> chip->ngpio.  In most systems max_ngpio will be < 64, so one or two
>> unsigned longs depending on the arch's bitness.
> 
> Actually, scratch that.  If ngpio is usually smallish, we can just
> allocate reasonably sized space for mask and bits on the stack,
> and fall back to the kcalloc slowpath only if chip->ngpio exceeds
> that limit.  Basically the below (likewise compile-tested only),
> this is on top of Laura's patch, could be squashed together.
> Let me know what you think, thanks.
> 

It seems like there's general consensus this is okay so I'm going
to fold it into the next version. If not, we can discuss again.

> -- >8 --
> Subject: [PATCH] gpio: Add fastpath to gpiod_get/set_array_value_complex()
> 
> Signed-off-by: Lukas Wunner <lukas@wunner.de>
> ---
>   drivers/gpio/gpiolib.c | 76 ++++++++++++++++++++++++--------------------------
>   1 file changed, 37 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
> index 429bc251392b..ffc67b0b866c 100644
> --- a/drivers/gpio/gpiolib.c
> +++ b/drivers/gpio/gpiolib.c
> @@ -2432,6 +2432,8 @@ static int gpio_chip_get_multiple(struct gpio_chip *chip,
>   	return -EIO;
>   }
>   
> +#define FASTPATH_NGPIO 256
> +
>   int gpiod_get_array_value_complex(bool raw, bool can_sleep,
>   				  unsigned int array_size,
>   				  struct gpio_desc **desc_array,
> @@ -2441,27 +2443,24 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep,
>   
>   	while (i < array_size) {
>   		struct gpio_chip *chip = desc_array[i]->gdev->chip;
> -		unsigned long *mask;
> -		unsigned long *bits;
> +		unsigned long fastpath[2 * BITS_TO_LONGS(FASTPATH_NGPIO)];
> +		unsigned long *slowpath = NULL, *mask, *bits;
>   		int first, j, ret;
>   
> -		mask = kcalloc(BITS_TO_LONGS(chip->ngpio),
> -				sizeof(*mask),
> -				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> -
> -		if (!mask)
> -			return -ENOMEM;
> -
> -		bits = kcalloc(BITS_TO_LONGS(chip->ngpio),
> -				sizeof(*bits),
> -				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> -
> -		if (!bits) {
> -			kfree(mask);
> -			return -ENOMEM;
> +		if (likely(chip->ngpio <= FASTPATH_NGPIO)) {
> +			memset(fastpath, 0, sizeof(fastpath));
> +			mask = fastpath;
> +			bits = fastpath + BITS_TO_LONGS(FASTPATH_NGPIO);
> +		} else {
> +			slowpath = kcalloc(2 * BITS_TO_LONGS(chip->ngpio),
> +					   sizeof(*slowpath),
> +					   can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> +			if (!slowpath)
> +				return -ENOMEM;
> +			mask = slowpath;
> +			bits = slowpath + BITS_TO_LONGS(chip->ngpio);
>   		}
>   
> -
>   		if (!can_sleep)
>   			WARN_ON(chip->can_sleep);
>   
> @@ -2478,8 +2477,8 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep,
>   
>   		ret = gpio_chip_get_multiple(chip, mask, bits);
>   		if (ret) {
> -			kfree(bits);
> -			kfree(mask);
> +			if (slowpath)
> +				kfree(slowpath);
>   			return ret;
>   		}
>   
> @@ -2493,8 +2492,9 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep,
>   			value_array[j] = value;
>   			trace_gpio_value(desc_to_gpio(desc), 1, value);
>   		}
> -		kfree(bits);
> -		kfree(mask);
> +
> +		if (slowpath)
> +			kfree(slowpath);
>   	}
>   	return 0;
>   }
> @@ -2699,24 +2699,22 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep,
>   
>   	while (i < array_size) {
>   		struct gpio_chip *chip = desc_array[i]->gdev->chip;
> -		unsigned long *mask;
> -		unsigned long *bits;
> +		unsigned long fastpath[2 * BITS_TO_LONGS(FASTPATH_NGPIO)];
> +		unsigned long *slowpath = NULL, *mask, *bits;
>   		int count = 0;
>   
> -		mask = kcalloc(BITS_TO_LONGS(chip->ngpio),
> -				sizeof(*mask),
> -				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> -
> -		if (!mask)
> -			return -ENOMEM;
> -
> -		bits = kcalloc(BITS_TO_LONGS(chip->ngpio),
> -				sizeof(*bits),
> -				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> -
> -		if (!bits) {
> -			kfree(mask);
> -			return -ENOMEM;
> +		if (likely(chip->ngpio <= FASTPATH_NGPIO)) {
> +			memset(fastpath, 0, sizeof(fastpath));
> +			mask = fastpath;
> +			bits = fastpath + BITS_TO_LONGS(FASTPATH_NGPIO);
> +		} else {
> +			slowpath = kcalloc(2 * BITS_TO_LONGS(chip->ngpio),
> +					   sizeof(*slowpath),
> +					   can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> +			if (!slowpath)
> +				return -ENOMEM;
> +			mask = slowpath;
> +			bits = slowpath + BITS_TO_LONGS(chip->ngpio);
>   		}
>   
>   		if (!can_sleep)
> @@ -2753,8 +2751,8 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep,
>   		if (count != 0)
>   			gpio_chip_set_multiple(chip, mask, bits);
>   
> -		kfree(mask);
> -		kfree(bits);
> +		if (slowpath)
> +			kfree(slowpath);
>   	}
>   	return 0;
>   }
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-gpio" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukas Wunner March 28, 2018, 3:54 a.m. | #10
On Tue, Mar 27, 2018 at 05:37:18PM -0700, Laura Abbott wrote:
> On 03/18/2018 07:23 AM, Lukas Wunner wrote:
> > Actually, scratch that.  If ngpio is usually smallish, we can just
> > allocate reasonably sized space for mask and bits on the stack,
> > and fall back to the kcalloc slowpath only if chip->ngpio exceeds
> > that limit.  Basically the below (likewise compile-tested only),
> > this is on top of Laura's patch, could be squashed together.
> > Let me know what you think, thanks.
> >
> 
> It seems like there's general consensus this is okay so I'm going
> to fold it into the next version. If not, we can discuss again.

Yes, feel free to squash into your original patch with my S-o-b,
keep your authorship.

You may want to raise FASTPATH_NGPIO to something like 384, 448 or 512
to accommodate for the Intel chips Andy mentioned.  It's kind of a
"640k should be enough for everyone" thing but I'd expect the performance
impact of the extra bytes on the stack / memsetting them to zero
to be absolutely negligible.

Thanks!

Lukas
--
To unsubscribe from this list: send the line "unsubscribe linux-gpio" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index d66de67ef307..124727c74931 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2662,16 +2662,33 @@  int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 
 	while (i < array_size) {
 		struct gpio_chip *chip = desc_array[i]->gdev->chip;
-		unsigned long mask[BITS_TO_LONGS(chip->ngpio)];
-		unsigned long bits[BITS_TO_LONGS(chip->ngpio)];
+		unsigned long *mask;
+		unsigned long *bits;
 		int first, j, ret;
 
+		mask = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
+				sizeof(*mask),
+				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+
+		if (!mask)
+			return -ENOMEM;
+
+		bits = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
+				sizeof(*bits),
+				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+
+		if (!bits) {
+			kfree(mask);
+			return -ENOMEM;
+		}
+
+
 		if (!can_sleep)
 			WARN_ON(chip->can_sleep);
 
 		/* collect all inputs belonging to the same chip */
 		first = i;
-		memset(mask, 0, sizeof(mask));
+		memset(mask, 0, sizeof(*mask));
 		do {
 			const struct gpio_desc *desc = desc_array[i];
 			int hwgpio = gpio_chip_hwgpio(desc);
@@ -2682,8 +2699,11 @@  int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 			 (desc_array[i]->gdev->chip == chip));
 
 		ret = gpio_chip_get_multiple(chip, mask, bits);
-		if (ret)
+		if (ret) {
+			kfree(bits);
+			kfree(mask);
 			return ret;
+		}
 
 		for (j = first; j < i; j++) {
 			const struct gpio_desc *desc = desc_array[j];
@@ -2695,6 +2715,8 @@  int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 			value_array[j] = value;
 			trace_gpio_value(desc_to_gpio(desc), 1, value);
 		}
+		kfree(bits);
+		kfree(mask);
 	}
 	return 0;
 }
@@ -2887,14 +2909,30 @@  void gpiod_set_array_value_complex(bool raw, bool can_sleep,
 
 	while (i < array_size) {
 		struct gpio_chip *chip = desc_array[i]->gdev->chip;
-		unsigned long mask[BITS_TO_LONGS(chip->ngpio)];
-		unsigned long bits[BITS_TO_LONGS(chip->ngpio)];
+		unsigned long *mask;
+		unsigned long *bits;
 		int count = 0;
 
+		mask = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
+				sizeof(*mask),
+				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+
+		if (!mask)
+			return;
+
+		bits = kmalloc_array(BITS_TO_LONGS(chip->ngpio),
+				sizeof(*bits),
+				can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+
+		if (!bits) {
+			kfree(mask);
+			return;
+		}
+
 		if (!can_sleep)
 			WARN_ON(chip->can_sleep);
 
-		memset(mask, 0, sizeof(mask));
+		memset(mask, 0, sizeof(*mask));
 		do {
 			struct gpio_desc *desc = desc_array[i];
 			int hwgpio = gpio_chip_hwgpio(desc);
@@ -2925,6 +2963,9 @@  void gpiod_set_array_value_complex(bool raw, bool can_sleep,
 		/* push collected bits to outputs */
 		if (count != 0)
 			gpio_chip_set_multiple(chip, mask, bits);
+
+		kfree(mask);
+		kfree(bits);
 	}
 }