diff mbox series

[5/6] pwm: renesas-tpu: Improve maths to compute register settings

Message ID 20220413085050.61144-5-u.kleine-koenig@pengutronix.de
State Superseded
Headers show
Series [1/6] pwm: renesas-tpu: Make use of dev_err_probe() | expand

Commit Message

Uwe Kleine-König April 13, 2022, 8:50 a.m. UTC
The newly computed register values are intended to exactly match the
previously computed values. The main improvement is that the prescaler
is computed directly instead of with a loop. This uses the fact that
prescalers[i] = 1 << (2 * i).

Assuming a moderately smart compiler, the number of divisions needed in
the case where the requested period is too big is reduced from 5 to 2.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/pwm-renesas-tpu.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

Comments

Geert Uytterhoeven April 14, 2022, 10:10 a.m. UTC | #1
Hi Uwe,

Thanks for your patch!

On Wed, Apr 13, 2022 at 10:51 AM Uwe Kleine-König
<u.kleine-koenig@pengutronix.de> wrote:
> The newly computed register values are intended to exactly match the
> previously computed values. The main improvement is that the prescaler
> is computed directly instead of with a loop. This uses the fact, that
> prescalers[i] = 1 << (2 * i).
>
> Assuming a moderately smart compiler, the needed number of divisions for
> the case where the requested period is too big, is reduced from 5 to 2.

I'm not worried about the divisions, but about the ilog2(), which
uses fls().  The TPU block also exists on SuperH SoCs (although
currently no SH Linux code has it enabled), and SH uses the fls()
implementation from asm-generic.

> --- a/drivers/pwm/pwm-renesas-tpu.c
> +++ b/drivers/pwm/pwm-renesas-tpu.c
> @@ -244,7 +244,6 @@ static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
>  static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
>                           int duty_ns, int period_ns, bool enabled)
>  {
> -       static const unsigned int prescalers[] = { 1, 4, 16, 64 };
>         struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm);
>         struct tpu_device *tpu = to_tpu_device(chip);
>         unsigned int prescaler;
> @@ -254,26 +253,21 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
>         u32 duty;
>         int ret;
>
> -       /*
> -        * Pick a prescaler to avoid overflowing the counter.
> -        * TODO: Pick the highest acceptable prescaler.
> -        */
>         clk_rate = clk_get_rate(tpu->clk);
>
> -       for (prescaler = 0; prescaler < ARRAY_SIZE(prescalers); ++prescaler) {
> -               period = clk_rate / prescalers[prescaler]
> -                      / (NSEC_PER_SEC / period_ns);
> -               if (period <= 0xffff)
> -                       break;
> -       }
> +       period = clk_rate / (NSEC_PER_SEC / period_ns);
> +       if (period >= 64 * 0x10000 || period == 0)
> +               return -EINVAL;
>
> -       if (prescaler == ARRAY_SIZE(prescalers) || period == 0) {
> -               dev_err(&tpu->pdev->dev, "clock rate mismatch\n");
> -               return -ENOTSUPP;
> -       }
> +       if (period < 0x10000)
> +               prescaler = 0;
> +       else
> +               prescaler = ilog2(period / 0x10000) / 2 + 1;
> +
> +       period >>= 2 * prescaler;

Although the above is correct, I find it hard to read.
Hence I'd keep a loop, like:

    unsigned int prescaler = 0;
    ...
    while (period > 0x10000) {
            period >>= 2;
            prescaler++;
    }

This would even save 2 lines of code ;-)

>
>         if (duty_ns) {
> -               duty = clk_rate / prescalers[prescaler]
> +               duty = (clk_rate >> 2 * prescaler)
>                      / (NSEC_PER_SEC / duty_ns);
>                 if (duty > period)
>                         return -EINVAL;

Anyway:
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>

The display backlight still works fine on r8a7740/armadillo, so
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
Uwe Kleine-König April 20, 2022, 10:27 a.m. UTC | #2
Hello Geert,

On Thu, Apr 14, 2022 at 12:10:02PM +0200, Geert Uytterhoeven wrote:
> On Wed, Apr 13, 2022 at 10:51 AM Uwe Kleine-König
> <u.kleine-koenig@pengutronix.de> wrote:
> > The newly computed register values are intended to exactly match the
> > previously computed values. The main improvement is that the prescaler
> > is computed directly instead of with a loop. This uses the fact, that
> > prescalers[i] = 1 << (2 * i).
> >
> > Assuming a moderately smart compiler, the needed number of divisions for
> > the case where the requested period is too big, is reduced from 5 to 2.
> 
> I'm not worried about the divisions, but about the ilog2(), which
> uses fls().  The TPU block also exists on SuperH SoCs (although
> currently no SH Linux code has it enabled), and SH uses the fls()
> implementation from asm-generic.
> 
> > --- a/drivers/pwm/pwm-renesas-tpu.c
> > +++ b/drivers/pwm/pwm-renesas-tpu.c
> > @@ -244,7 +244,6 @@ static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
> >  static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
> >                           int duty_ns, int period_ns, bool enabled)
> >  {
> > -       static const unsigned int prescalers[] = { 1, 4, 16, 64 };
> >         struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm);
> >         struct tpu_device *tpu = to_tpu_device(chip);
> >         unsigned int prescaler;
> > @@ -254,26 +253,21 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
> >         u32 duty;
> >         int ret;
> >
> > -       /*
> > -        * Pick a prescaler to avoid overflowing the counter.
> > -        * TODO: Pick the highest acceptable prescaler.
> > -        */
> >         clk_rate = clk_get_rate(tpu->clk);
> >
> > -       for (prescaler = 0; prescaler < ARRAY_SIZE(prescalers); ++prescaler) {
> > -               period = clk_rate / prescalers[prescaler]
> > -                      / (NSEC_PER_SEC / period_ns);
> > -               if (period <= 0xffff)
> > -                       break;
> > -       }
> > +       period = clk_rate / (NSEC_PER_SEC / period_ns);
> > +       if (period >= 64 * 0x10000 || period == 0)
> > +               return -EINVAL;
> >
> > -       if (prescaler == ARRAY_SIZE(prescalers) || period == 0) {
> > -               dev_err(&tpu->pdev->dev, "clock rate mismatch\n");
> > -               return -ENOTSUPP;
> > -       }
> > +       if (period < 0x10000)
> > +               prescaler = 0;
> > +       else
> > +               prescaler = ilog2(period / 0x10000) / 2 + 1;
> > +
> > +       period >>= 2 * prescaler;
> 
> Although the above is correct, I find it hard to read.
> Hence I'd keep a loop, like:
> 
>     unsigned int prescaler = 0;
>     ...
>     while (period > 0x10000) {
>             period >>= 2;
>             prescalar++;
>     }
> 
> This would even save 2 lines of code ;-)

The "hard to read" part is subjective; I understand it just fine. (But I
admit I wouldn't be surprised if I'm the exception here, as I do a lot of
math.) I suggest judging this by looking at the generated code. I'm not
an expert here (no sh toolchain here, no sh asm foo), but my expectation
is that the compiler notices that 1 <= period / 0x10000 < 64 and then
the inlined fls code should be simplified such that

	ilog2(period / 0x10000) / 2 + 1

simplifies to something like:

	x = period >> 16
	prescaler = 4
	if (!(x & 0xf0u)) {
		x <<= 4;
		prescaler -= 2;
	}
	if (!(x & 0xc0u)) {
		x <<= 2;
		prescaler -= 1;
	}

which I expect to be more efficient than the loop you suggested.

Best regards
Uwe
diff mbox series

Patch

diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c
index 671f1f824da8..fce7df418d62 100644
--- a/drivers/pwm/pwm-renesas-tpu.c
+++ b/drivers/pwm/pwm-renesas-tpu.c
@@ -244,7 +244,6 @@  static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
 static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 			  int duty_ns, int period_ns, bool enabled)
 {
-	static const unsigned int prescalers[] = { 1, 4, 16, 64 };
 	struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm);
 	struct tpu_device *tpu = to_tpu_device(chip);
 	unsigned int prescaler;
@@ -254,26 +253,21 @@  static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	u32 duty;
 	int ret;
 
-	/*
-	 * Pick a prescaler to avoid overflowing the counter.
-	 * TODO: Pick the highest acceptable prescaler.
-	 */
 	clk_rate = clk_get_rate(tpu->clk);
 
-	for (prescaler = 0; prescaler < ARRAY_SIZE(prescalers); ++prescaler) {
-		period = clk_rate / prescalers[prescaler]
-		       / (NSEC_PER_SEC / period_ns);
-		if (period <= 0xffff)
-			break;
-	}
+	period = clk_rate / (NSEC_PER_SEC / period_ns);
+	if (period >= 64 * 0x10000 || period == 0)
+		return -EINVAL;
 
-	if (prescaler == ARRAY_SIZE(prescalers) || period == 0) {
-		dev_err(&tpu->pdev->dev, "clock rate mismatch\n");
-		return -ENOTSUPP;
-	}
+	if (period < 0x10000)
+		prescaler = 0;
+	else
+		prescaler = ilog2(period / 0x10000) / 2 + 1;
+
+	period >>= 2 * prescaler;
 
 	if (duty_ns) {
-		duty = clk_rate / prescalers[prescaler]
+		duty = (clk_rate >> 2 * prescaler)
 		     / (NSEC_PER_SEC / duty_ns);
 		if (duty > period)
 			return -EINVAL;
@@ -283,7 +277,7 @@  static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 
 	dev_dbg(&tpu->pdev->dev,
 		"rate %u, prescaler %u, period %u, duty %u\n",
-		clk_rate, prescalers[prescaler], period, duty);
+		clk_rate, 1 << (2 * prescaler), period, duty);
 
 	if (tpd->prescaler == prescaler && tpd->period == period)
 		duty_only = true;