[v2,5/8] crypto: convert xts_mult_x to use xts_uint128 type

Message ID 20181016100918.21030-6-berrange@redhat.com
State New
Series crypto: improve performance of XTS cipher mode

Commit Message

Daniel P. Berrangé Oct. 16, 2018, 10:09 a.m. UTC
Using 64-bit arithmetic increases the performance for xts-aes-128
when built with gcrypt:

  Encrypt: 355 MB/s -> 545 MB/s
  Decrypt: 362 MB/s -> 568 MB/s

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
 crypto/xts.c | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)
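
What the conversion does: xts_mult_x multiplies the 128-bit tweak by x
in GF(2^128), i.e. it shifts the whole 16-byte block left by one bit
and, when the carry falls out of bit 127, folds in 0x87, the low byte
of the reduction polynomial x^128 + x^7 + x^2 + x + 1. The patch
replaces the old byte-at-a-time shift loop with two 64-bit shifts.
Below is a minimal standalone sketch comparing the two versions; it
assumes a little-endian host (so the cpu_to_le64s/le64_to_cpus
conversions used in the real patch are no-ops and are omitted here),
and the function names are made up for the test:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef union {
    uint8_t b[16];
    uint64_t u[2];
} xts_uint128;

/* The old byte-wise implementation, as removed by the patch */
static void xts_mult_x_bytewise(uint8_t *I)
{
    int x;
    uint8_t t, tt;

    for (x = t = 0; x < 16; x++) {
        tt = I[x] >> 7;
        I[x] = ((I[x] << 1) | t) & 0xFF;
        t = tt;
    }
    if (tt) {
        I[0] ^= 0x87;
    }
}

/* The new 64-bit implementation, minus the byte-order conversions */
static void xts_mult_x_u64(xts_uint128 *I)
{
    uint64_t tt = I->u[0] >> 63;    /* carry from low word into high */

    I->u[0] <<= 1;
    if (I->u[1] >> 63) {            /* bit 127 set: reduce mod poly */
        I->u[0] ^= 0x87;
    }
    I->u[1] = (I->u[1] << 1) | tt;
}

int main(void)
{
    xts_uint128 a, b;
    int i;

    for (i = 0; i < 16; i++) {
        a.b[i] = (uint8_t)(0x80 + 7 * i);   /* arbitrary, bit 127 set */
    }
    memcpy(&b, &a, sizeof(a));

    xts_mult_x_bytewise(a.b);
    xts_mult_x_u64(&b);

    printf("%s\n", memcmp(&a, &b, sizeof(a)) == 0 ? "match" : "MISMATCH");
    return 0;
}

On a little-endian machine this prints "match"; the real patch keeps
the conversion calls so that the same 64-bit code is also correct on
big-endian hosts.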

Comments

Alberto Garcia Oct. 16, 2018, 1:35 p.m. UTC | #1
On Tue 16 Oct 2018 12:09:15 PM CEST, Daniel P. Berrangé wrote:
> Using 64-bit arithmetic increases the performance for xts-aes-128
> when built with gcrypt:
>
>   Encrypt: 355 MB/s -> 545 MB/s
>   Decrypt: 362 MB/s -> 568 MB/s
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>

This patch is also fine, but I have a couple of minor comments:

> +static void xts_mult_x(xts_uint128 *I)
> +{
> +    uint64_t tt;
> +
> +    xts_uint128_cpu_to_les(I);
> +
> +    tt = I->u[0] >> 63;
> +    I->u[0] = I->u[0] << 1;

Perhaps I->u[0] <<= 1, for clarity and consistency with the following
line (I->u[0] ^= 0x87)? But I don't mind if you prefer to keep it as
it is now.

> +    if (I->u[1] >> 63) {
> +        I->u[0] ^= 0x87;
>      }
> +    I->u[1] = (I->u[1] << 1) | tt;
> +
> +    xts_uint128_le_to_cpus(I);

I think both endianness conversion calls should be flipped. First you
convert from the buffer byte order (LE) to the CPU byte order so you can
do the bit shifts, then back to the original byte order (LE).

Changing this doesn't have any practical effect because both calls
perform the exact same operation, but it documents better what's going
on.
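
For illustration, the flipped ordering would look like this (just a
sketch against the helpers defined in the patch; both helpers perform
the identical byte swap, so only the names and the documented intent
change):

static void xts_mult_x(xts_uint128 *I)
{
    uint64_t tt;

    xts_uint128_le_to_cpus(I);   /* buffer byte order (LE) -> CPU */

    tt = I->u[0] >> 63;
    I->u[0] = I->u[0] << 1;

    if (I->u[1] >> 63) {
        I->u[0] ^= 0x87;
    }
    I->u[1] = (I->u[1] << 1) | tt;

    xts_uint128_cpu_to_les(I);   /* CPU byte order -> buffer (LE) */
}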

With this changed,

Reviewed-by: Alberto Garcia <berto@igalia.com>

Berto
Daniel P. Berrangé Oct. 16, 2018, 1:59 p.m. UTC | #2
On Tue, Oct 16, 2018 at 03:35:01PM +0200, Alberto Garcia wrote:
> On Tue 16 Oct 2018 12:09:15 PM CEST, Daniel P. Berrangé wrote:
> > Using 64-bit arithmetic increases the performance for xts-aes-128
> > when built with gcrypt:
> >
> >   Encrypt: 355 MB/s -> 545 MB/s
> >   Decrypt: 362 MB/s -> 568 MB/s
> >
> > Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> 
> This patch is also fine, but I have a couple of minor comments:
> 
> > +static void xts_mult_x(xts_uint128 *I)
> > +{
> > +    uint64_t tt;
> > +
> > +    xts_uint128_cpu_to_les(I);
> > +
> > +    tt = I->u[0] >> 63;
> > +    I->u[0] = I->u[0] << 1;
> 
> Perhaps I->u[0] <<= 1, for clarity and consistency with the following
> line (I->u[0] ^= 0x87)? But I don't mind if you prefer to keep it as
> it is now.

In fact I could do the following:

@@ -59,12 +59,13 @@ static void xts_mult_x(xts_uint128 *I)
     xts_uint128_cpu_to_les(I);
 
     tt = I->u[0] >> 63;
-    I->u[0] = I->u[0] << 1;
+    I->u[0] <<= 1;
 
     if (I->u[1] >> 63) {
         I->u[0] ^= 0x87;
     }
-    I->u[1] = (I->u[1] << 1) | tt;
+    I->u[1] <<= 1;
+    I->u[1] |= tt;
 
     xts_uint128_le_to_cpus(I);
 }

Either way it generates the exact same asm code.

> 
> > +    if (I->u[1] >> 63) {
> > +        I->u[0] ^= 0x87;
> >      }
> > +    I->u[1] = (I->u[1] << 1) | tt;
> > +
> > +    xts_uint128_le_to_cpus(I);
> 
> I think both endianness conversion calls should be flipped. First you
> convert from the buffer byte order (LE) to the CPU byte order so you can
> do the bit shifts, then back to the original byte order (LE).
> 
> Changing this doesn't have any practical effect because both calls
> perform the exact same operation, but it documents better what's going
> on.

Yep, ok

> With this changed,
> 
> Reviewed-by: Alberto Garcia <berto@igalia.com>

Regards,
Daniel
Alberto Garcia Oct. 16, 2018, 2:22 p.m. UTC | #3
On Tue 16 Oct 2018 03:59:27 PM CEST, Daniel P. Berrangé wrote:
> In fact I could do the following:
>
> @@ -59,12 +59,13 @@ static void xts_mult_x(xts_uint128 *I)
>      xts_uint128_cpu_to_les(I);
>  
>      tt = I->u[0] >> 63;
> -    I->u[0] = I->u[0] << 1;
> +    I->u[0] <<= 1;
>  
>      if (I->u[1] >> 63) {
>          I->u[0] ^= 0x87;
>      }
> -    I->u[1] = (I->u[1] << 1) | tt;
> +    I->u[1] <<= 1;
> +    I->u[1] |= tt;
>  
>      xts_uint128_le_to_cpus(I);
>  }

Yep, that looks good.

Berto

Patch

diff --git a/crypto/xts.c b/crypto/xts.c
index 2e3430672c..56c0e4e6ed 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -24,6 +24,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/bswap.h"
 #include "crypto/xts.h"
 
 typedef union {
@@ -39,19 +40,33 @@  static inline void xts_uint128_xor(xts_uint128 *D,
     D->u[1] = S1->u[1] ^ S2->u[1];
 }
 
-static void xts_mult_x(uint8_t *I)
+static inline void xts_uint128_cpu_to_les(xts_uint128 *v)
 {
-    int x;
-    uint8_t t, tt;
+    cpu_to_le64s(&v->u[0]);
+    cpu_to_le64s(&v->u[1]);
+}
 
-    for (x = t = 0; x < 16; x++) {
-        tt = I[x] >> 7;
-        I[x] = ((I[x] << 1) | t) & 0xFF;
-        t = tt;
-    }
-    if (tt) {
-        I[0] ^= 0x87;
+static inline void xts_uint128_le_to_cpus(xts_uint128 *v)
+{
+    le64_to_cpus(&v->u[0]);
+    le64_to_cpus(&v->u[1]);
+}
+
+static void xts_mult_x(xts_uint128 *I)
+{
+    uint64_t tt;
+
+    xts_uint128_cpu_to_les(I);
+
+    tt = I->u[0] >> 63;
+    I->u[0] = I->u[0] << 1;
+
+    if (I->u[1] >> 63) {
+        I->u[0] ^= 0x87;
     }
+    I->u[1] = (I->u[1] << 1) | tt;
+
+    xts_uint128_le_to_cpus(I);
 }
 
 
@@ -79,7 +94,7 @@  static void xts_tweak_encdec(const void *ctx,
     xts_uint128_xor(dst, dst, iv);
 
     /* LFSR the tweak */
-    xts_mult_x(iv->b);
+    xts_mult_x(iv);
 }
 
 
@@ -134,7 +149,7 @@  void xts_decrypt(const void *datactx,
     if (mo > 0) {
         xts_uint128 S, D;
         memcpy(&CC, &T, XTS_BLOCK_SIZE);
-        xts_mult_x(CC.b);
+        xts_mult_x(&CC);
 
         /* PP = tweak decrypt block m-1 */
         memcpy(&S, src, XTS_BLOCK_SIZE);