diff mbox

[v3,2/5] powerpc/lib/sstep: Add popcnt instruction emulation

Message ID 20170725033320.17893-2-matthew.brown.dev@gmail.com (mailing list archive)
State Changes Requested
Headers show

Commit Message

Matt Brown July 25, 2017, 3:33 a.m. UTC
This adds emulations for the popcntb, popcntw, and popcntd instructions.
Tested for correctness against the popcnt{b,w,d} instructions on ppc64le.

Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>
---
v3:
	- optimised using the Gillies-Miller method of sideways addition
v2:
	- fixed opcodes
	- fixed typecasting
	- fixed bitshifting error for both 32 and 64bit arch
---
 arch/powerpc/lib/sstep.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

Comments

Balbir Singh July 25, 2017, 7:12 a.m. UTC | #1
On Tue, 2017-07-25 at 13:33 +1000, Matt Brown wrote:
> This adds emulations for the popcntb, popcntw, and popcntd instructions.
> Tested for correctness against the popcnt{b,w,d} instructions on ppc64le.
> 
> Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>
> ---
> v3:
> 	- optimised using the Gillies-Miller method of sideways addition
> v2:
> 	- fixed opcodes
> 	- fixed typecasting
> 	- fixed bitshifting error for both 32 and 64bit arch
> ---
>  arch/powerpc/lib/sstep.c | 40 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 87d277f..c1f9cdb 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -612,6 +612,32 @@ static nokprobe_inline void do_cmpb(struct pt_regs *regs, unsigned long v1,
>  	regs->gpr[rd] = out_val;
>  }
>  
> +/*
> + * The size parameter is used to adjust the equivalent popcnt instruction.
> + * popcntb = 8, popcntw = 32, popcntd = 64
> + */
> +static nokprobe_inline void do_popcnt(struct pt_regs *regs, unsigned long v1,
> +				int size, int ra)
> +{
> +	unsigned long long out = v1;
> +
> +	out = (0x5555555555555555 & out) + (0x5555555555555555 & (out >> 1));
> +	out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2));
> +	out = (0x0f0f0f0f0f0f0f0f & out) + (0x0f0f0f0f0f0f0f0f & (out >> 4));
> +	if (size == 8) {	/* popcntb */
> +		regs->gpr[ra] = out;
> +		return;
> +	}
> +	out = (0x001f001f001f001f & out) + (0x001f001f001f001f & (out >> 8));

Why are we using 0x001f001f here? Now that each byte holds its count with
the upper bits already zero, we can directly use

out = out + (out >> 8)

> +	out = (0x0000003f0000003f & out) + (0x0000003f0000003f & (out >> 16));

Same as above

> +	if (size == 32) {	/* popcntw */
> +		regs->gpr[ra] = out;
> +		return;
> +	}
> +	out = (0x000000000000007f & out) + (0x000000000000007f & (out >> 32));
> +	regs->gpr[ra] = out;	/* popcntd */

Ditto

Otherwise looks good!

Balbir Singh.
David Laight July 25, 2017, 10:24 a.m. UTC | #2
From: Linuxppc-dev [mailto:linuxppc-dev-bounces+david.laight=aculab.com@lists.ozlabs.org] On Behalf Of
> Matt Brown
> Sent: 25 July 2017 04:33
> To: linuxppc-dev@lists.ozlabs.org
> Subject: [PATCH v3 2/5] powerpc/lib/sstep: Add popcnt instruction emulation
> 
> This adds emulations for the popcntb, popcntw, and popcntd instructions.
> Tested for correctness against the popcnt{b,w,d} instructions on ppc64le.
> 
> Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>
> ---
> v3:
> 	- optimised using the Gillies-Miller method of sideways addition
> v2:
> 	- fixed opcodes
> 	- fixed typecasting
> 	- fixed bitshifting error for both 32 and 64bit arch
> ---
>  arch/powerpc/lib/sstep.c | 40 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 87d277f..c1f9cdb 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -612,6 +612,32 @@ static nokprobe_inline void do_cmpb(struct pt_regs *regs, unsigned long v1,
>  	regs->gpr[rd] = out_val;
>  }
> 
> +/*
> + * The size parameter is used to adjust the equivalent popcnt instruction.
> + * popcntb = 8, popcntw = 32, popcntd = 64
> + */
> +static nokprobe_inline void do_popcnt(struct pt_regs *regs, unsigned long v1,
> +				int size, int ra)
> +{
> +	unsigned long long out = v1;
> +
> +	out = (0x5555555555555555 & out) + (0x5555555555555555 & (out >> 1));
> +	out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2));
> +	out = (0x0f0f0f0f0f0f0f0f & out) + (0x0f0f0f0f0f0f0f0f & (out >> 4));
> +	if (size == 8) {	/* popcntb */
> +		regs->gpr[ra] = out;

I'm pretty sure you need to mask the result with 7.

	David
Balbir Singh July 25, 2017, 1:32 p.m. UTC | #3
On Tue, Jul 25, 2017 at 8:24 PM, David Laight <David.Laight@aculab.com> wrote:
> From: Linuxppc-dev [mailto:linuxppc-dev-bounces+david.laight=aculab.com@lists.ozlabs.org] On Behalf Of
>> Matt Brown
>> Sent: 25 July 2017 04:33
>> To: linuxppc-dev@lists.ozlabs.org
>> Subject: [PATCH v3 2/5] powerpc/lib/sstep: Add popcnt instruction emulation
>>
>> This adds emulations for the popcntb, popcntw, and popcntd instructions.
>> Tested for correctness against the popcnt{b,w,d} instructions on ppc64le.
>>
>> Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>
>> ---
>> v3:
>>       - optimised using the Gillies-Miller method of sideways addition
>> v2:
>>       - fixed opcodes
>>       - fixed typecasting
>>       - fixed bitshifting error for both 32 and 64bit arch
>> ---
>>  arch/powerpc/lib/sstep.c | 40 +++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 39 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
>> index 87d277f..c1f9cdb 100644
>> --- a/arch/powerpc/lib/sstep.c
>> +++ b/arch/powerpc/lib/sstep.c
>> @@ -612,6 +612,32 @@ static nokprobe_inline void do_cmpb(struct pt_regs *regs, unsigned long v1,
>>       regs->gpr[rd] = out_val;
>>  }
>>
>> +/*
>> + * The size parameter is used to adjust the equivalent popcnt instruction.
>> + * popcntb = 8, popcntw = 32, popcntd = 64
>> + */
>> +static nokprobe_inline void do_popcnt(struct pt_regs *regs, unsigned long v1,
>> +                             int size, int ra)
>> +{
>> +     unsigned long long out = v1;
>> +
>> +     out = (0x5555555555555555 & out) + (0x5555555555555555 & (out >> 1));
>> +     out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2));
>> +     out = (0x0f0f0f0f0f0f0f0f & out) + (0x0f0f0f0f0f0f0f0f & (out >> 4));
>> +     if (size == 8) {        /* popcntb */
>> +             regs->gpr[ra] = out;
>
> I'm pretty sure you need to mask the result with 7.
>
Absolutely! Good catch!

Balbir Singh.
Gabriel Paubert July 26, 2017, 7:29 a.m. UTC | #4
On Tue, Jul 25, 2017 at 01:33:17PM +1000, Matt Brown wrote:
> This adds emulations for the popcntb, popcntw, and popcntd instructions.
> Tested for correctness against the popcnt{b,w,d} instructions on ppc64le.
> 
> Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com>
> ---
> v3:
> 	- optimised using the Gillies-Miller method of sideways addition
> v2:
> 	- fixed opcodes
> 	- fixed typecasting
> 	- fixed bitshifting error for both 32 and 64bit arch
> ---
>  arch/powerpc/lib/sstep.c | 40 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 87d277f..c1f9cdb 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -612,6 +612,32 @@ static nokprobe_inline void do_cmpb(struct pt_regs *regs, unsigned long v1,
>  	regs->gpr[rd] = out_val;
>  }
>  
> +/*
> + * The size parameter is used to adjust the equivalent popcnt instruction.
> + * popcntb = 8, popcntw = 32, popcntd = 64
> + */
> +static nokprobe_inline void do_popcnt(struct pt_regs *regs, unsigned long v1,
> +				int size, int ra)
> +{
> +	unsigned long long out = v1;
> +
> +	out = (0x5555555555555555 & out) + (0x5555555555555555 & (out >> 1));

This can be simplified in a less obvious way as:
	out -= (out >> 1) & 0x5555555555555555;

which maps each pair of bits according to the following:
00 -> 00
01 -> 01
10 -> 01
11 -> 10

This should save one instruction.

> +	out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2));

Ok, but now each nibble is between 0 and 4, so the addition of two
nibbles can't overflow or generate carry into the higher one.

> +	out = (0x0f0f0f0f0f0f0f0f & out) + (0x0f0f0f0f0f0f0f0f & (out >> 4));

	out += out >> 4;
	out &= 0x0f0f0f0f0f0f0f0f;

which should also save one instruction

> +	if (size == 8) {	/* popcntb */
> +		regs->gpr[ra] = out;
> +		return;
> +	}

At this point each count occupies at least one byte and can no longer
overflow, so masking is only needed before returning.

> +	out = (0x001f001f001f001f & out) + (0x001f001f001f001f & (out >> 8));
	out += out >> 8;

> +	out = (0x0000003f0000003f & out) + (0x0000003f0000003f & (out >> 16));

	out += out >> 16;

> +	if (size == 32) {	/* popcntw */
> +		regs->gpr[ra] = out;
		regs->gpr[ra] = out & 0x0000003f0000003f;

> +		return;
> +	}
> +	out = (0x000000000000007f & out) + (0x000000000000007f & (out >> 32));
	out = (out + (out >> 32)) & 0x7f;


	Gabriel

> +	regs->gpr[ra] = out;	/* popcntd */
> +}
> +
>  static nokprobe_inline int trap_compare(long v1, long v2)
>  {
>  	int ret = 0;
> @@ -1194,6 +1220,10 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
>  			regs->gpr[ra] = regs->gpr[rd] & ~regs->gpr[rb];
>  			goto logical_done;
>  
> +		case 122:	/* popcntb */
> +			do_popcnt(regs, regs->gpr[rd], 8, ra);
> +			goto logical_done;
> +
>  		case 124:	/* nor */
>  			regs->gpr[ra] = ~(regs->gpr[rd] | regs->gpr[rb]);
>  			goto logical_done;
> @@ -1206,6 +1236,10 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
>  			regs->gpr[ra] = regs->gpr[rd] ^ regs->gpr[rb];
>  			goto logical_done;
>  
> +		case 378:	/* popcntw */
> +			do_popcnt(regs, regs->gpr[rd], 32, ra);
> +			goto logical_done;
> +
>  		case 412:	/* orc */
>  			regs->gpr[ra] = regs->gpr[rd] | ~regs->gpr[rb];
>  			goto logical_done;
> @@ -1217,7 +1251,11 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
>  		case 476:	/* nand */
>  			regs->gpr[ra] = ~(regs->gpr[rd] & regs->gpr[rb]);
>  			goto logical_done;
> -
> +#ifdef __powerpc64__
> +		case 506:	/* popcntd */
> +			do_popcnt(regs, regs->gpr[rd], 64, ra);
> +			goto logical_done;
> +#endif
>  		case 922:	/* extsh */
>  			regs->gpr[ra] = (signed short) regs->gpr[rd];
>  			goto logical_done;
> -- 
> 2.9.3
diff mbox

Patch

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 87d277f..c1f9cdb 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -612,6 +612,32 @@  static nokprobe_inline void do_cmpb(struct pt_regs *regs, unsigned long v1,
 	regs->gpr[rd] = out_val;
 }
 
+/*
+ * The size parameter is used to adjust the equivalent popcnt instruction.
+ * popcntb = 8, popcntw = 32, popcntd = 64
+ */
+static nokprobe_inline void do_popcnt(struct pt_regs *regs, unsigned long v1,
+				int size, int ra)
+{
+	unsigned long long out = v1;
+
+	out = (0x5555555555555555 & out) + (0x5555555555555555 & (out >> 1));
+	out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2));
+	out = (0x0f0f0f0f0f0f0f0f & out) + (0x0f0f0f0f0f0f0f0f & (out >> 4));
+	if (size == 8) {	/* popcntb */
+		regs->gpr[ra] = out;
+		return;
+	}
+	out = (0x001f001f001f001f & out) + (0x001f001f001f001f & (out >> 8));
+	out = (0x0000003f0000003f & out) + (0x0000003f0000003f & (out >> 16));
+	if (size == 32) {	/* popcntw */
+		regs->gpr[ra] = out;
+		return;
+	}
+	out = (0x000000000000007f & out) + (0x000000000000007f & (out >> 32));
+	regs->gpr[ra] = out;	/* popcntd */
+}
+
 static nokprobe_inline int trap_compare(long v1, long v2)
 {
 	int ret = 0;
@@ -1194,6 +1220,10 @@  int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 			regs->gpr[ra] = regs->gpr[rd] & ~regs->gpr[rb];
 			goto logical_done;
 
+		case 122:	/* popcntb */
+			do_popcnt(regs, regs->gpr[rd], 8, ra);
+			goto logical_done;
+
 		case 124:	/* nor */
 			regs->gpr[ra] = ~(regs->gpr[rd] | regs->gpr[rb]);
 			goto logical_done;
@@ -1206,6 +1236,10 @@  int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 			regs->gpr[ra] = regs->gpr[rd] ^ regs->gpr[rb];
 			goto logical_done;
 
+		case 378:	/* popcntw */
+			do_popcnt(regs, regs->gpr[rd], 32, ra);
+			goto logical_done;
+
 		case 412:	/* orc */
 			regs->gpr[ra] = regs->gpr[rd] | ~regs->gpr[rb];
 			goto logical_done;
@@ -1217,7 +1251,11 @@  int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 		case 476:	/* nand */
 			regs->gpr[ra] = ~(regs->gpr[rd] & regs->gpr[rb]);
 			goto logical_done;
-
+#ifdef __powerpc64__
+		case 506:	/* popcntd */
+			do_popcnt(regs, regs->gpr[rd], 64, ra);
+			goto logical_done;
+#endif
 		case 922:	/* extsh */
 			regs->gpr[ra] = (signed short) regs->gpr[rd];
 			goto logical_done;