diff mbox series

[u-boot,v2019.04-aspeed-openbmc,10/11] crypto: Add driver for Aspeed HACE

Message ID 20210413080755.73572-11-joel@jms.id.au
State New
Headers show
Series Use HACE to | expand

Commit Message

Joel Stanley April 13, 2021, 8:07 a.m. UTC
The HACE supports MD5, SHA1 and SHA2 family hash functions. This driver
uses it in a polling mode to perform hash calculations over buffers
placed in DRAM.

Co-developed-by: Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 drivers/crypto/Kconfig       |  16 +++
 drivers/crypto/Makefile      |   1 +
 drivers/crypto/aspeed_hace.c | 250 +++++++++++++++++++++++++++++++++++
 3 files changed, 267 insertions(+)
 create mode 100644 drivers/crypto/aspeed_hace.c

Comments

Klaus Heinrich Kiwi April 13, 2021, 8:41 p.m. UTC | #1
Hi Joel,

On 4/13/2021 5:07 AM, Joel Stanley wrote:
> The HACE supports MD5, SHA1 and SHA2 family hash functions. This driver
> uses it in a polling mode to perform hash calculations over buffers
> placed in DRAM.
> 
> Co-developed-by: Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
> Signed-off-by: Joel Stanley <joel@jms.id.au>
> ---
>   drivers/crypto/Kconfig       |  16 +++
>   drivers/crypto/Makefile      |   1 +
>   drivers/crypto/aspeed_hace.c | 250 +++++++++++++++++++++++++++++++++++
>   3 files changed, 267 insertions(+)
>   create mode 100644 drivers/crypto/aspeed_hace.c
> 
> diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> index 1ea116be7503..f78e41e0e9e7 100644
> --- a/drivers/crypto/Kconfig
> +++ b/drivers/crypto/Kconfig
> @@ -2,4 +2,20 @@ menu "Hardware crypto devices"
> 
>   source drivers/crypto/fsl/Kconfig
> 
> +config ASPEED_HACE
> +	bool "ASPEED Hash and Crypto Engine"
> +	select SHA_HW_ACCEL
> +	select SHA_PROG_HW_ACCEL
> +	depends on ASPEED_AST2600
> +	help
> +	 Select this option to enable a driver for using the SHA engine in
> +	 the ASPEED BMC SoCs.
> +
> +	 Enabling this allows the use of SHA operations in hardware without requiring the
> +	 SHA software implementations, saving code size.
> +
> +	 Due to hardware limitations it cannot be used with a FIT placed in SPI
> +	 FLASH. Data can only be hashed if it is in SDRAM, making this relevant
> +	 for MMC and network boot only.
> +
>   endmenu
> diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
> index efbd1d3fca05..ac93b1295954 100644
> --- a/drivers/crypto/Makefile
> +++ b/drivers/crypto/Makefile
> @@ -4,5 +4,6 @@
>   # 	http://www.samsung.com
> 
>   obj-$(CONFIG_EXYNOS_ACE_SHA)	+= ace_sha.o
> +obj-$(CONFIG_ASPEED_HACE)	+= aspeed_hace.o
>   obj-y += rsa_mod_exp/
>   obj-y += fsl/
> diff --git a/drivers/crypto/aspeed_hace.c b/drivers/crypto/aspeed_hace.c
> new file mode 100644
> index 000000000000..473d4d7391b7
> --- /dev/null
> +++ b/drivers/crypto/aspeed_hace.c
> @@ -0,0 +1,250 @@
> +/*
> + * (C) Copyright ASPEED Technology Inc.
> + * Copyright 2021 IBM Corp.
> + *
> + * SPDX-License-Identifier:	GPL-2.0-or-later
> + */
> +
> +#include <common.h>
> +#include <clk.h>
> +
> +#include <log.h>
> +#include <asm/io.h>
> +#include <malloc.h>
> +#include <hash.h>
> +
> +#include <dm/device.h>
> +#include <dm/fdtaddr.h>
> +
> +#include <linux/bitops.h>
> +#include <linux/delay.h>
> +#include <linux/kernel.h>
> +#include <linux/iopoll.h>
> +
> +#define ASPEED_HACE_STS			0x1C
> +#define  HACE_RSA_ISR			BIT(13)
> +#define  HACE_CRYPTO_ISR		BIT(12)
> +#define  HACE_HASH_ISR			BIT(9)
> +#define  HACE_RSA_BUSY			BIT(2)
> +#define  HACE_CRYPTO_BUSY		BIT(1)
> +#define  HACE_HASH_BUSY			BIT(0)
> +#define ASPEED_HACE_HASH_SRC		0x20
> +#define ASPEED_HACE_HASH_DIGEST_BUFF	0x24
> +#define ASPEED_HACE_HASH_KEY_BUFF	0x28
> +#define ASPEED_HACE_HASH_DATA_LEN	0x2C
> +#define  HACE_SG_LAST			BIT(31)
> +#define ASPEED_HACE_HASH_CMD		0x30
> +#define  HACE_SHA_BE_EN			BIT(3)
> +#define  HACE_MD5_LE_EN			BIT(2)
> +#define  HACE_ALGO_MD5			0
> +#define  HACE_ALGO_SHA1			BIT(5)
> +#define  HACE_ALGO_SHA224		BIT(6)
> +#define  HACE_ALGO_SHA256		(BIT(4) | BIT(6))
> +#define  HACE_ALGO_SHA512		(BIT(5) | BIT(6))
> +#define  HACE_ALGO_SHA384		(BIT(5) | BIT(6) | BIT(10))
> +#define  HACE_SG_EN			BIT(18)
> +
> +#define ASPEED_MAX_SG			32
> +
> +struct aspeed_sg {
> +	u32 len;
> +	u32 addr;
> +};
> +
> +struct aspeed_hash_ctx {
> +	u32 method;
> +	u32 digest_size;
> +	u32 len;
> +	u32 count;
> +	struct aspeed_sg list[ASPEED_MAX_SG] __attribute__((aligned(8)));
> +};
> +
> +struct aspeed_hace {
> +	struct clk clk;
> +};
> +
> +static phys_addr_t base;
> +
> +static int aspeed_hace_wait_completion(u32 reg, u32 flag, int timeout_us)
> +{
> +	u32 val;
> +
> +	return readl_poll_timeout(reg, val, (val & flag) == flag, timeout_us);
> +}
> +
> +static int digest_object(const void *src, unsigned int length, void *digest,
> +		  u32 method)
> +{
> +	if (!((u32)src & BIT(31))) {
> +		debug("HACE src out of bounds: can only copy from SDRAM\n");
> +		return -EINVAL;
> +	}
> +
> +	if ((u32)digest & 0x7) {
> +		debug("HACE dest alignment incorrect: %p\n", digest);
> +		return -EINVAL;
> +	}
> +
> +	writel((u32)src, base + ASPEED_HACE_HASH_SRC);
> +	writel((u32)digest, base + ASPEED_HACE_HASH_DIGEST_BUFF);
> +	writel(length, base + ASPEED_HACE_HASH_DATA_LEN);
> +	writel(HACE_SHA_BE_EN | method, base + ASPEED_HACE_HASH_CMD);
> +
> +	/* SHA512 hashing appears to have a througput of about 12MB/s */
> +	return aspeed_hace_wait_completion(base + ASPEED_HACE_STS,
> +			HACE_HASH_ISR,
> +			1000 + (length >> 3));

In some of my previous testing with the un-cleaned patchset
(https://github.com/klauskiwi/u-boot/tree/hace_sg_work), the
Qemu implementation (Cedric's Aspeed-6.0 branch) worked fine,
but on hardware I was getting errors until I explicitly
cleared the HACE_HASH_ISR before attemptinga new command..
It makes sense since the readl_poll_timeout() would return
immediately, without completing the command, if the HASH_ISR bit
is set.

> +}
> +
> +void hw_sha1(const unsigned char *pbuf, unsigned int buf_len,
> +	       unsigned char *pout, unsigned int chunk_size)
> +{
> +	int rc;
> +
> +	rc = digest_object(pbuf, buf_len, pout, HACE_ALGO_SHA1);
> +	if (rc)
> +		debug("HACE failure: %d\n", rc);
> +}
> +
> +void hw_sha256(const unsigned char *pbuf, unsigned int buf_len,
> +	       unsigned char *pout, unsigned int chunk_size)
> +{
> +	int rc;
> +
> +	rc = digest_object(pbuf, buf_len, pout, HACE_ALGO_SHA256);
> +	if (rc)
> +		debug("HACE failure: %d\n", rc);
> +}
> +
> +void hw_sha512(const unsigned char *pbuf, unsigned int buf_len,
> +	       unsigned char *pout, unsigned int chunk_size)
> +{
> +	int rc;
> +
> +	rc = digest_object(pbuf, buf_len, pout, HACE_ALGO_SHA512);
> +	if (rc)
> +		debug("HACE failure: %d\n", rc);
> +}
> +
> +#if IS_ENABLED(CONFIG_SHA_PROG_HW_ACCEL)
> +int hw_sha_init(struct hash_algo *algo, void **ctxp)
> +{
> +	struct aspeed_hash_ctx *ctx;
> +	u32 method;
> +
> +	if (!strcmp(algo->name, "sha1")) {
> +		method = HACE_ALGO_SHA1;
> +	}
> +	else if (!strcmp(algo->name, "sha256")) {
> +		method = HACE_ALGO_SHA256;
> +	}
> +	else if (!strcmp(algo->name, "sha512")) {
> +		method = HACE_ALGO_SHA512;
> +	}
> +	else  {
> +		return -ENOTSUPP;
> +	}
> +
> +	ctx = calloc(1, sizeof(*ctx));
> +
> +	if (ctx == NULL) {
> +		debug("Cannot allocate memory for context\n");
> +		return -ENOMEM;
> +	}
> +	ctx->method = method | HACE_SG_EN;
> +	ctx->digest_size = algo->digest_size;
> +	*ctxp = ctx;
> +
> +	return 0;
> +}
> +
> +int hw_sha_update(struct hash_algo *algo, void *hash_ctx, const void *buf,
> +		  unsigned int size, int is_last)
> +{
> +	struct aspeed_hash_ctx *ctx = hash_ctx;
> +	struct aspeed_sg *sg = &ctx->list[ctx->count];
> +
> +	if (ctx->count >= ARRAY_SIZE(ctx->list)) {
> +		debug("HACE error: Reached maximum number of hash segments\n");
> +		free(ctx);
> +		return -EINVAL;
> +	}
> +
> +	sg->addr = (u32)buf;
> +	sg->len = size;
> +	if (is_last)
> +		sg->len |= HACE_SG_LAST;
> +
> +	ctx->count++;
> +	ctx->len += size;
> +
> +	return 0;
> +}
> +
> +int hw_sha_finish(struct hash_algo *algo, void *hash_ctx, void *dest_buf, int size)
> +{
> +	struct aspeed_hash_ctx *ctx = hash_ctx;
> +	int rc;
> +
> +	if (size < ctx->digest_size) {
> +		debug("HACE error: insufficient size on destination buffer\n");
> +		free(ctx);
> +		return -EINVAL;
> +	}
> +
> +	rc = digest_object(ctx->list, ctx->len, dest_buf, ctx->method);
> +	if (rc)
> +		debug("HACE Scatter-Gather failure\n");
> +
> +	free(ctx);
> +
> +	return rc;
> +}
> +#endif
> +
> +static int aspeed_hace_probe(struct udevice *dev)
> +{
> +	struct aspeed_hace *hace = dev_get_priv(dev);
> +	int ret;
> +
> +	ret = clk_get_by_index(dev, 0, &hace->clk);
> +	if (ret < 0) {
> +		debug("Can't get clock for %s: %d\n", dev->name, ret);
> +		return ret;
> +	}
> +
> +	ret = clk_enable(&hace->clk);
> +	if (ret) {
> +		debug("Failed to enable fsi clock (%d)\n", ret);
> +		return ret;
> +	}
> +
> +	/* As the crypto code does not pass us any driver state */
> +	base = devfdt_get_addr(dev);
> +
> +	return ret;
> +}
> +
> +static int aspeed_hace_remove(struct udevice *dev)
> +{
> +	struct aspeed_hace *hace = dev_get_priv(dev);
> +
> +	clk_disable(&hace->clk);
> +
> +	return 0;
> +}
> +
> +static const struct udevice_id aspeed_hace_ids[] = {
> +	{ .compatible = "aspeed,ast2600-hace" },
> +	{ }
> +};
> +
> +U_BOOT_DRIVER(aspeed_hace) = {
> +	.name		= "aspeed_hace",
> +	.id		= UCLASS_MISC,
> +	.of_match	= aspeed_hace_ids,
> +	.probe		= aspeed_hace_probe,
> +	.remove 	= aspeed_hace_remove,
> +	.priv_auto_alloc_size = sizeof(struct aspeed_hace),
> +	.flags  = DM_FLAG_PRE_RELOC,
> +};
> 

I've tested your patchset with Cedric's Aspeed-6.0 but looks
like the probe function is never called. Reading through the code
a bit more, looks like you need to explicitly probe this device
somewhere in board_init_r (that is, after sdram was initialized),
since functions like dm_scan_fdt() and dm_extended_scan_fdt() will
only scan subnodes of the top level, and the clocks node.

This is what I get (with some added printfs of mine):

qemu-system-arm: warning: Aspeed iBT has no chardev backend
qemu-system-arm: warning: nic ftgmac100.0 has no peer
qemu-system-arm: warning: nic ftgmac100.1 has no peer
qemu-system-arm: warning: nic ftgmac100.2 has no peer
qemu-system-arm: warning: nic ftgmac100.3 has no peer
aspeed_timer_ctrl_pulse_enable: Timer does not support pulse mode
aspeed_timer_ctrl_pulse_enable: Timer does not support pulse mode
aspeed_timer_ctrl_pulse_enable: Timer does not support pulse mode
aspeed_timer_ctrl_pulse_enable: Timer does not support pulse mode
aspeed_ast2600_scu_write: SCU is locked!
aspeed_ast2600_scu_write: SCU is locked!
aspeed_smc_write: not implemented: 0x18

U-Boot SPL 2019.04 (Apr 13 2021 - 17:57:20 +0000)
aspeed_soc.io: unimplemented device read  (size 4, offset 0x0f500c)
aspeed_soc.io: unimplemented device write (size 4, offset 0x0f500c, value 0x00000000)
Trying to boot from MMC1
SD: CMD8 in a wrong state
## Checking hash(es) for Image uboot ... sha512,rsa4096:autogenerated-uboot-4096-key
digest_object: ASPEED_HACE_STS='0xe59ff03c'

digest_object: SCU080h='0xf7ff7f8a'

digest_object: writing '0x90200104' to ASPEED_HACE_HASH_SRC

digest_object: writing '0x902ffa50' to ASPEED_HACE_HASH_DIGEST_BUFF

digest_object: writing '0x0005ccd4' to ASPEED_HACE_HASH_DATA_LEN

digest_object: writing '0x00040068' to ASPEED_HACE_HASH_CMD
HACE Scatter-Gather failure
rsa_verify: Error in checksum calculation
- Failed to verify required signature 'key-autogenerated-uboot-4096-key'
  error!
Unable to verify required signature for '' hash node in 'uboot' image node
mmc_load_image_raw_sector: mmc block read error
Trying to boot from UART
CCQEMU: Terminated
Klaus Heinrich Kiwi April 14, 2021, 8:28 p.m. UTC | #2
On 4/13/2021 5:41 PM, Klaus Heinrich Kiwi wrote:
> I've tested your patchset with Cedric's Aspeed-6.0 but looks
> like the probe function is never called. Reading through the code
> a bit more, looks like you need to explicitly probe this device
> somewhere in board_init_r (that is, after sdram was initialized),
> since functions like dm_scan_fdt() and dm_extended_scan_fdt() will
> only scan subnodes of the top level, and the clocks node.
> 
> This is what I get (with some added printfs of mine):

I've played around a bit more, and got it to work on Qemu with the following changes:

  * Added a board-specific spl_board_init() initializing the HACE driver at
    the SPL's board_init_r() timeframe. Enabled that on the defconfig file.
  * Because the driver model is using some pre-sdram malloc pool space,
    the changes above were causing the probing of the sdram itself to fail.
    Corrected by increasing the pre-sdram malloc pool to 0x1000.

 From 4ddc9fe2727a54f2d8c088d1c7046f4a3ab56314 Mon Sep 17 00:00:00 2001
From: Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
Date: Wed, 14 Apr 2021 16:29:15 -0300
Subject: [PATCH] fixup: Enable HACE probing on SPL, expand malloc pool

Signed-off-by: Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
---
  arch/arm/mach-aspeed/ast2600/spl.c         | 15 +++++++++++++++
  configs/ast2600_openbmc_spl_emmc_defconfig |  3 ++-
  2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/arm/mach-aspeed/ast2600/spl.c b/arch/arm/mach-aspeed/ast2600/spl.c
index 527e14f8e3..2b1e8477eb 100644
--- a/arch/arm/mach-aspeed/ast2600/spl.c
+++ b/arch/arm/mach-aspeed/ast2600/spl.c
@@ -86,3 +86,18 @@ struct image_header *spl_get_load_buffer(ssize_t offset, size_t size)
  {
  	return (struct image_header *)(CONFIG_SYS_LOAD_ADDR);
  }
+
+#if CONFIG_IS_ENABLED(BOARD_INIT)
+void spl_board_init(void)
+{
+#ifdef CONFIG_ASPEED_HACE
+	struct udevice *dev;
+
+	if (uclass_get_device_by_driver(UCLASS_MISC, DM_GET_DRIVER(aspeed_hace),
+									&dev))
+		debug("Warning: HACE initialization failure\n");
+#endif
+
+	return;
+}
+#endif /* CONFIG_IS_ENABLED(BOARD_INIT) */
\ No newline at end of file
diff --git a/configs/ast2600_openbmc_spl_emmc_defconfig b/configs/ast2600_openbmc_spl_emmc_defconfig
index 05069084cb..e856df2398 100644
--- a/configs/ast2600_openbmc_spl_emmc_defconfig
+++ b/configs/ast2600_openbmc_spl_emmc_defconfig
@@ -24,7 +24,7 @@ CONFIG_ASPEED_KERNEL_FIT_DRAM_BASE=0x83000000
  CONFIG_TARGET_EVB_AST2600A1=y
  CONFIG_SPL_LIBCOMMON_SUPPORT=y
  CONFIG_SPL_LIBGENERIC_SUPPORT=y
-CONFIG_SYS_MALLOC_F_LEN=0x800
+CONFIG_SYS_MALLOC_F_LEN=0x1000
  CONFIG_SPL_MMC_SUPPORT=y
  CONFIG_SPL_SERIAL_SUPPORT=y
  CONFIG_SPL_DRIVERS_MISC_SUPPORT=y
@@ -48,6 +48,7 @@ CONFIG_SYS_CONSOLE_ENV_OVERWRITE=y
  CONFIG_DISPLAY_BOARDINFO_LATE=y
  CONFIG_ARCH_EARLY_INIT_R=y
  CONFIG_BOARD_EARLY_INIT_F=y
+CONFIG_SPL_BOARD_INIT=y
  # CONFIG_SPL_LEGACY_IMAGE_SUPPORT is not set
  CONFIG_SPL_SYS_MALLOC_SIMPLE=y
  CONFIG_SPL_STACK_R=y
Joel Stanley April 15, 2021, 2:32 a.m. UTC | #3
On Wed, 14 Apr 2021 at 20:28, Klaus Heinrich Kiwi
<klaus@linux.vnet.ibm.com> wrote:
>
>
>
> On 4/13/2021 5:41 PM, Klaus Heinrich Kiwi wrote:
> > I've tested your patchset with Cedric's Aspeed-6.0 but looks
> > like the probe function is never called. Reading through the code
> > a bit more, looks like you need to explicitly probe this device
> > somewhere in board_init_r (that is, after sdram was initialized),
> > since functions like dm_scan_fdt() and dm_extended_scan_fdt() will
> > only scan subnodes of the top level, and the clocks node.
> >
> > This is what I get (with some added printfs of mine):
>
> I've played around a bit more, and got it to work on Qemu with the following changes:
>
>   * Added a board-specific spl_board_init() initializing the HACE driver at
>     the SPL's board_init_r() timeframe. Enabled that on the defconfig file.
>   * Because the driver model is using some pre-sdram malloc pool space,
>     the changes above were causing the probing of the sdram itself to fail.
>     Corrected by increasing the pre-sdram malloc pool to 0x1000.

Thanks. I had added something similar when debugging this yesterday.

I had tested the changes, but must have mixed up which images were
being loaded into qemu.

> However, when I tried to test it on a Rainier, it failed:
> U-Boot SPL 2019.04 (Apr 14 2021 - 19:31:59 +0000)
> already initialized, Trying to boot from MMC1
> ## Checking hash(es) for Image uboot ... sha512,rsa4096:autogenerated-uboot-4096-key- Failed to verify required signature 'key-autogenera'
>   error!
> Unable to verify required signature for '' hash node in 'uboot' image node
> mmc_load_image_raw_sector: mmc block read error
> Trying to boot from UART
> CCCCC �P
>
> (and yes, I had since disabled my debugging printf's). I wonder if the HASH_ISR
> may need to be explicitly cleared, although I'd expect it to work for the first
> command at least.
>
> Another interesting thing is that the SPL tries to boot from UART, but neither
> my fitImages, Legacy images or even RAW images are working.. Not sure if we need
> some special handling of those images before feeding them to the spl ymodem loader?

I wasn't able to get the SPL to load any images - raw binaries or FIT
- from eMMC either. Something is going wrong, but I am unsure what it
is. I will continue to debug.

Cheers,

Joel
Klaus Heinrich Kiwi April 15, 2021, 9:37 p.m. UTC | #4
On 4/14/2021 11:32 PM, Joel Stanley wrote:
>> Another interesting thing is that the SPL tries to boot from UART, but neither
>> my fitImages, Legacy images or even RAW images are working.. Not sure if we need
>> some special handling of those images before feeding them to the spl ymodem loader?
> I wasn't able to get the SPL to load any images - raw binaries or FIT
> - from eMMC either. Something is going wrong, but I am unsure what it
> is. I will continue to debug.

I was able to make it work on real hardware (rainier100) with the following changes
(in addition to the other fixups already mentioned in this thread):

 From a2a2819455ec5de689fd0702cce20bfb18ec11b1 Mon Sep 17 00:00:00 2001
From: Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
Date: Thu, 15 Apr 2021 15:16:37 -0300
Subject: [PATCH] HACE fixups:

  * The AST2600 workbook mentions that the list structure passed to
    ASPEED_HACE_HASH_SRC needs to be 8-byte aligned. Normally, glibc's
    malloc() would always align memory to (at least) 8-bytes, but that's
    the case with u-boot's pre-sdram malloc() implementation. So we need
    to explicitly align the context to 8-bytes with malign().

  * The __atribute__ ((align 8)) doesn't have an effect in struct
    elements. Remove it.

  * Since the struct aspeed_hash_ctx->list element is what we need to
    make sure is aligned to 9-bytes, move that to the first element of
    the array, and call-out the fact that this needs to be aligned in
    the declaration.

   * Clear HACE_HASH_ISR before issuing new command

Signed-off-by: Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
---
  drivers/crypto/aspeed_hace.c | 15 +++++++++++++--
  1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/aspeed_hace.c b/drivers/crypto/aspeed_hace.c
index 473d4d7391..0551fe6c83 100644
--- a/drivers/crypto/aspeed_hace.c
+++ b/drivers/crypto/aspeed_hace.c
@@ -51,12 +51,19 @@ struct aspeed_sg {
  	u32 addr;
  };
  
+
+/*
+ * Note: element 'list' below needs to be 8-byte aligned,
+ *       keep it as the first element so that we can always
+ *       guarantee that when allocating the struct (that should
+ *       also be 8-byte aligned)
+ */
  struct aspeed_hash_ctx {
+	struct aspeed_sg list[ASPEED_MAX_SG];
  	u32 method;
  	u32 digest_size;
  	u32 len;
  	u32 count;
-	struct aspeed_sg list[ASPEED_MAX_SG] __attribute__((aligned(8)));
  };
  
  struct aspeed_hace {
@@ -85,6 +92,9 @@ static int digest_object(const void *src, unsigned int length, void *digest,
  		return -EINVAL;
  	}
  
+	/* clear any pending interrupt */
+	writel(HACE_HASH_ISR, base + ASPEED_HACE_STS);
+
  	writel((u32)src, base + ASPEED_HACE_HASH_SRC);
  	writel((u32)digest, base + ASPEED_HACE_HASH_DIGEST_BUFF);
  	writel(length, base + ASPEED_HACE_HASH_DATA_LEN);
@@ -145,12 +155,13 @@ int hw_sha_init(struct hash_algo *algo, void **ctxp)
  		return -ENOTSUPP;
  	}
  
-	ctx = calloc(1, sizeof(*ctx));
+	ctx = memalign(8, sizeof(struct aspeed_hash_ctx));
  
  	if (ctx == NULL) {
  		debug("Cannot allocate memory for context\n");
  		return -ENOMEM;
  	}
+	ctx->len = ctx->count = 0;
  	ctx->method = method | HACE_SG_EN;
  	ctx->digest_size = algo->digest_size;
  	*ctxp = ctx;
Joel Stanley April 19, 2021, 12:49 p.m. UTC | #5
On Thu, 15 Apr 2021 at 21:38, Klaus Heinrich Kiwi
<klaus@linux.vnet.ibm.com> wrote:
>
>
>
> On 4/14/2021 11:32 PM, Joel Stanley wrote:
> >> Another interesting thing is that the SPL tries to boot from UART, but neither
> >> my fitImages, Legacy images or even RAW images are working.. Not sure if we need
> >> some special handling of those images before feeding them to the spl ymodem loader?
> > I wasn't able to get the SPL to load any images - raw binaries or FIT
> > - from eMMC either. Something is going wrong, but I am unsure what it
> > is. I will continue to debug.
>
> I was able to make it work on real hardware (rainier100) with the following changes
> (in addition to the other fixups already mentioned in this thread):
>
>  From a2a2819455ec5de689fd0702cce20bfb18ec11b1 Mon Sep 17 00:00:00 2001
> From: Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
> Date: Thu, 15 Apr 2021 15:16:37 -0300
> Subject: [PATCH] HACE fixups:
>
>   * The AST2600 workbook mentions that the list structure passed to
>     ASPEED_HACE_HASH_SRC needs to be 8-byte aligned. Normally, glibc's
>     malloc() would always align memory to (at least) 8-bytes, but that's
>     the case with u-boot's pre-sdram malloc() implementation. So we need
>     to explicitly align the context to 8-bytes with malign().

We're not using the HACE engine pre-SDRAM, so we should be ok.

common/dlmalloc.c:

    8-byte alignment is guaranteed by normal malloc calls, so don't
    bother calling memalign with an argument of 8 or less.

Were you able to observe any allocations that were not aligned?

>
>   * The __atribute__ ((align 8)) doesn't have an effect in struct
>     elements. Remove it.

Agreed.

>
>   * Since the struct aspeed_hash_ctx->list element is what we need to
>     make sure is aligned to 9-bytes, move that to the first element of
>     the array, and call-out the fact that this needs to be aligned in
>     the declaration.

9 bytes? Did you mean 8?

Regardless, the array in the structure should be aligned as there will
be no padding earlier in the struct. I have added a runtime check for
this and have not hit that in my testing so far.

>
>    * Clear HACE_HASH_ISR before issuing new command

This makes sense.

I will send out a new series for review. I've tested it thoroughly in
qemu, and have tested the u-boot -> Linux path on hardware.

Cheers,

Joel

>
> Signed-off-by: Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
> ---
>   drivers/crypto/aspeed_hace.c | 15 +++++++++++++--
>   1 file changed, 13 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/crypto/aspeed_hace.c b/drivers/crypto/aspeed_hace.c
> index 473d4d7391..0551fe6c83 100644
> --- a/drivers/crypto/aspeed_hace.c
> +++ b/drivers/crypto/aspeed_hace.c
> @@ -51,12 +51,19 @@ struct aspeed_sg {
>         u32 addr;
>   };
>
> +
> +/*
> + * Note: element 'list' below needs to be 8-byte aligned,
> + *       keep it as the first element so that we can always
> + *       guarantee that when allocating the struct (that should
> + *       also be 8-byte aligned)
> + */
>   struct aspeed_hash_ctx {
> +       struct aspeed_sg list[ASPEED_MAX_SG];
>         u32 method;
>         u32 digest_size;
>         u32 len;
>         u32 count;
> -       struct aspeed_sg list[ASPEED_MAX_SG] __attribute__((aligned(8)));
>   };
>
>   struct aspeed_hace {
> @@ -85,6 +92,9 @@ static int digest_object(const void *src, unsigned int length, void *digest,
>                 return -EINVAL;
>         }
>
> +       /* clear any pending interrupt */
> +       writel(HACE_HASH_ISR, base + ASPEED_HACE_STS);
> +
>         writel((u32)src, base + ASPEED_HACE_HASH_SRC);
>         writel((u32)digest, base + ASPEED_HACE_HASH_DIGEST_BUFF);
>         writel(length, base + ASPEED_HACE_HASH_DATA_LEN);
> @@ -145,12 +155,13 @@ int hw_sha_init(struct hash_algo *algo, void **ctxp)
>                 return -ENOTSUPP;
>         }
>
> -       ctx = calloc(1, sizeof(*ctx));
> +       ctx = memalign(8, sizeof(struct aspeed_hash_ctx));
>
>         if (ctx == NULL) {
>                 debug("Cannot allocate memory for context\n");
>                 return -ENOMEM;
>         }
> +       ctx->len = ctx->count = 0;
>         ctx->method = method | HACE_SG_EN;
>         ctx->digest_size = algo->digest_size;
>         *ctxp = ctx;
> --
> 2.25.1
>
>
>
>
>
> --
> Klaus Heinrich Kiwi <klaus@linux.vnet.ibm.com>
Klaus Heinrich Kiwi April 19, 2021, 12:58 p.m. UTC | #6
Hi Joel,

On 4/19/2021 9:49 AM, Joel Stanley wrote:
> On Thu, 15 Apr 2021 at 21:38, Klaus Heinrich Kiwi

>>
>>    * The AST2600 workbook mentions that the list structure passed to
>>      ASPEED_HACE_HASH_SRC needs to be 8-byte aligned. Normally, glibc's
>>      malloc() would always align memory to (at least) 8-bytes, but that's
>>      the case with u-boot's pre-sdram malloc() implementation. So we need
>>      to explicitly align the context to 8-bytes with malign().
> 
> We're not using the HACE engine pre-SDRAM, so we should be ok.
> 
> common/dlmalloc.c:
> 
>      8-byte alignment is guaranteed by normal malloc calls, so don't
>      bother calling memalign with an argument of 8 or less.
> 
> Were you able to observe any allocations that were not aligned?

yes, I thought I had. And by pre-sdram I guess I was trying to say pre-relocation.
But that might have been an artifact of trying to probe/initialize hace in the
SPL, so I might have mixed things.

I was sure that Qemu and Hardware were behaving differently, since the same
image would work on qemu but fail on hardware. Perhaps we should put a warning
on Qemu for unaligned access instead of just masking.

  
>>
>>    * The __atribute__ ((align 8)) doesn't have an effect in struct
>>      elements. Remove it.
> 
> Agreed.
> 
>>
>>    * Since the struct aspeed_hash_ctx->list element is what we need to
>>      make sure is aligned to 9-bytes, move that to the first element of
>>      the array, and call-out the fact that this needs to be aligned in
>>      the declaration.
> 
> 9 bytes? Did you mean 8?
> I did. A typo.

> Regardless, the array in the structure should be aligned as there will
> be no padding earlier in the struct. I have added a runtime check for
> this and have not hit that in my testing so far.
> 

My change involved moving the array to the first element exactly to not
risk changing the u32 elements before it to something else and breaking
alignment.

>>
>>     * Clear HACE_HASH_ISR before issuing new command
> 
> This makes sense.
> 
> I will send out a new series for review. I've tested it thoroughly in
> qemu, and have tested the u-boot -> Linux path on hardware.
> 
> Cheers,
> 
> Joel

Sounds good. I can give it a spin on rain100 and check if it's all good.
diff mbox series

Patch

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 1ea116be7503..f78e41e0e9e7 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -2,4 +2,20 @@  menu "Hardware crypto devices"
 
 source drivers/crypto/fsl/Kconfig
 
+config ASPEED_HACE
+	bool "ASPEED Hash and Crypto Engine"
+	select SHA_HW_ACCEL
+	select SHA_PROG_HW_ACCEL
+	depends on ASPEED_AST2600
+	help
+	 Select this option to enable a driver for using the SHA engine in
+	 the ASPEED BMC SoCs.
+
+	 Enabling this allows the use of SHA operations in hardware without requiring the
+	 SHA software implementations, saving code size.
+
+	 Due to hardware limitations it cannot be used with a FIT placed in SPI
+	 FLASH. Data can only be hashed if it is in SDRAM, making this relevant
+	 for MMC and network boot only.
+
 endmenu
diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
index efbd1d3fca05..ac93b1295954 100644
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -4,5 +4,6 @@ 
 # 	http://www.samsung.com
 
 obj-$(CONFIG_EXYNOS_ACE_SHA)	+= ace_sha.o
+obj-$(CONFIG_ASPEED_HACE)	+= aspeed_hace.o
 obj-y += rsa_mod_exp/
 obj-y += fsl/
diff --git a/drivers/crypto/aspeed_hace.c b/drivers/crypto/aspeed_hace.c
new file mode 100644
index 000000000000..473d4d7391b7
--- /dev/null
+++ b/drivers/crypto/aspeed_hace.c
@@ -0,0 +1,250 @@ 
+/*
+ * (C) Copyright ASPEED Technology Inc.
+ * Copyright 2021 IBM Corp.
+ *
+ * SPDX-License-Identifier:	GPL-2.0-or-later
+ */
+
+#include <common.h>
+#include <clk.h>
+
+#include <log.h>
+#include <asm/io.h>
+#include <malloc.h>
+#include <hash.h>
+
+#include <dm/device.h>
+#include <dm/fdtaddr.h>
+
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/iopoll.h>
+
+#define ASPEED_HACE_STS			0x1C
+#define  HACE_RSA_ISR			BIT(13)
+#define  HACE_CRYPTO_ISR		BIT(12)
+#define  HACE_HASH_ISR			BIT(9)
+#define  HACE_RSA_BUSY			BIT(2)
+#define  HACE_CRYPTO_BUSY		BIT(1)
+#define  HACE_HASH_BUSY			BIT(0)
+#define ASPEED_HACE_HASH_SRC		0x20
+#define ASPEED_HACE_HASH_DIGEST_BUFF	0x24
+#define ASPEED_HACE_HASH_KEY_BUFF	0x28
+#define ASPEED_HACE_HASH_DATA_LEN	0x2C
+#define  HACE_SG_LAST			BIT(31)
+#define ASPEED_HACE_HASH_CMD		0x30
+#define  HACE_SHA_BE_EN			BIT(3)
+#define  HACE_MD5_LE_EN			BIT(2)
+#define  HACE_ALGO_MD5			0
+#define  HACE_ALGO_SHA1			BIT(5)
+#define  HACE_ALGO_SHA224		BIT(6)
+#define  HACE_ALGO_SHA256		(BIT(4) | BIT(6))
+#define  HACE_ALGO_SHA512		(BIT(5) | BIT(6))
+#define  HACE_ALGO_SHA384		(BIT(5) | BIT(6) | BIT(10))
+#define  HACE_SG_EN			BIT(18)
+
+#define ASPEED_MAX_SG			32
+
+struct aspeed_sg {
+	u32 len;
+	u32 addr;
+};
+
+struct aspeed_hash_ctx {
+	u32 method;
+	u32 digest_size;
+	u32 len;
+	u32 count;
+	struct aspeed_sg list[ASPEED_MAX_SG] __attribute__((aligned(8)));
+};
+
+struct aspeed_hace {
+	struct clk clk;
+};
+
+static phys_addr_t base;
+
+static int aspeed_hace_wait_completion(u32 reg, u32 flag, int timeout_us)
+{
+	u32 val;
+
+	return readl_poll_timeout(reg, val, (val & flag) == flag, timeout_us);
+}
+
+static int digest_object(const void *src, unsigned int length, void *digest,
+		  u32 method)
+{
+	if (!((u32)src & BIT(31))) {
+		debug("HACE src out of bounds: can only copy from SDRAM\n");
+		return -EINVAL;
+	}
+
+	if ((u32)digest & 0x7) {
+		debug("HACE dest alignment incorrect: %p\n", digest);
+		return -EINVAL;
+	}
+
+	writel((u32)src, base + ASPEED_HACE_HASH_SRC);
+	writel((u32)digest, base + ASPEED_HACE_HASH_DIGEST_BUFF);
+	writel(length, base + ASPEED_HACE_HASH_DATA_LEN);
+	writel(HACE_SHA_BE_EN | method, base + ASPEED_HACE_HASH_CMD);
+
+	/* SHA512 hashing appears to have a througput of about 12MB/s */
+	return aspeed_hace_wait_completion(base + ASPEED_HACE_STS,
+			HACE_HASH_ISR,
+			1000 + (length >> 3));
+}
+
+void hw_sha1(const unsigned char *pbuf, unsigned int buf_len,
+	       unsigned char *pout, unsigned int chunk_size)
+{
+	int rc;
+
+	rc = digest_object(pbuf, buf_len, pout, HACE_ALGO_SHA1);
+	if (rc)
+		debug("HACE failure: %d\n", rc);
+}
+
+void hw_sha256(const unsigned char *pbuf, unsigned int buf_len,
+	       unsigned char *pout, unsigned int chunk_size)
+{
+	int rc;
+
+	rc = digest_object(pbuf, buf_len, pout, HACE_ALGO_SHA256);
+	if (rc)
+		debug("HACE failure: %d\n", rc);
+}
+
+void hw_sha512(const unsigned char *pbuf, unsigned int buf_len,
+	       unsigned char *pout, unsigned int chunk_size)
+{
+	int rc;
+
+	rc = digest_object(pbuf, buf_len, pout, HACE_ALGO_SHA512);
+	if (rc)
+		debug("HACE failure: %d\n", rc);
+}
+
+#if IS_ENABLED(CONFIG_SHA_PROG_HW_ACCEL)
+int hw_sha_init(struct hash_algo *algo, void **ctxp)
+{
+	struct aspeed_hash_ctx *ctx;
+	u32 method;
+
+	if (!strcmp(algo->name, "sha1")) {
+		method = HACE_ALGO_SHA1;
+	}
+	else if (!strcmp(algo->name, "sha256")) {
+		method = HACE_ALGO_SHA256;
+	}
+	else if (!strcmp(algo->name, "sha512")) {
+		method = HACE_ALGO_SHA512;
+	}
+	else  {
+		return -ENOTSUPP;
+	}
+
+	ctx = calloc(1, sizeof(*ctx));
+
+	if (ctx == NULL) {
+		debug("Cannot allocate memory for context\n");
+		return -ENOMEM;
+	}
+	ctx->method = method | HACE_SG_EN;
+	ctx->digest_size = algo->digest_size;
+	*ctxp = ctx;
+
+	return 0;
+}
+
+int hw_sha_update(struct hash_algo *algo, void *hash_ctx, const void *buf,
+		  unsigned int size, int is_last)
+{
+	struct aspeed_hash_ctx *ctx = hash_ctx;
+	struct aspeed_sg *sg = &ctx->list[ctx->count];
+
+	if (ctx->count >= ARRAY_SIZE(ctx->list)) {
+		debug("HACE error: Reached maximum number of hash segments\n");
+		free(ctx);
+		return -EINVAL;
+	}
+
+	sg->addr = (u32)buf;
+	sg->len = size;
+	if (is_last)
+		sg->len |= HACE_SG_LAST;
+
+	ctx->count++;
+	ctx->len += size;
+
+	return 0;
+}
+
+int hw_sha_finish(struct hash_algo *algo, void *hash_ctx, void *dest_buf, int size)
+{
+	struct aspeed_hash_ctx *ctx = hash_ctx;
+	int rc;
+
+	if (size < ctx->digest_size) {
+		debug("HACE error: insufficient size on destination buffer\n");
+		free(ctx);
+		return -EINVAL;
+	}
+
+	rc = digest_object(ctx->list, ctx->len, dest_buf, ctx->method);
+	if (rc)
+		debug("HACE Scatter-Gather failure\n");
+
+	free(ctx);
+
+	return rc;
+}
+#endif
+
+static int aspeed_hace_probe(struct udevice *dev)
+{
+	struct aspeed_hace *hace = dev_get_priv(dev);
+	int ret;
+
+	ret = clk_get_by_index(dev, 0, &hace->clk);
+	if (ret < 0) {
+		debug("Can't get clock for %s: %d\n", dev->name, ret);
+		return ret;
+	}
+
+	ret = clk_enable(&hace->clk);
+	if (ret) {
+		debug("Failed to enable fsi clock (%d)\n", ret);
+		return ret;
+	}
+
+	/* As the crypto code does not pass us any driver state */
+	base = devfdt_get_addr(dev);
+
+	return ret;
+}
+
+static int aspeed_hace_remove(struct udevice *dev)
+{
+	struct aspeed_hace *hace = dev_get_priv(dev);
+
+	clk_disable(&hace->clk);
+
+	return 0;
+}
+
+static const struct udevice_id aspeed_hace_ids[] = {
+	{ .compatible = "aspeed,ast2600-hace" },
+	{ }
+};
+
+U_BOOT_DRIVER(aspeed_hace) = {
+	.name		= "aspeed_hace",
+	.id		= UCLASS_MISC,
+	.of_match	= aspeed_hace_ids,
+	.probe		= aspeed_hace_probe,
+	.remove 	= aspeed_hace_remove,
+	.priv_auto_alloc_size = sizeof(struct aspeed_hace),
+	.flags  = DM_FLAG_PRE_RELOC,
+};