diff mbox series

[v3,2/8] drivers/soc: Add Aspeed XDMA Engine Driver

Message ID 1559153408-31190-3-git-send-email-eajames@linux.ibm.com
State Superseded, archived
Headers show
Series drivers/soc: Add Aspeed XDMA Engine Driver | expand

Commit Message

Eddie James May 29, 2019, 6:10 p.m. UTC
The XDMA engine embedded in the AST2500 SOC performs PCI DMA operations
between the SOC (acting as a BMC) and a host processor in a server.

This commit adds a driver to control the XDMA engine and adds functions
to initialize the hardware and memory and start DMA operations.

Signed-off-by: Eddie James <eajames@linux.ibm.com>
---
 MAINTAINERS                      |  10 +
 drivers/soc/aspeed/Kconfig       |   8 +
 drivers/soc/aspeed/Makefile      |   1 +
 drivers/soc/aspeed/aspeed-xdma.c | 520 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/aspeed-xdma.h |  26 ++
 5 files changed, 565 insertions(+)
 create mode 100644 drivers/soc/aspeed/aspeed-xdma.c
 create mode 100644 include/uapi/linux/aspeed-xdma.h

Comments

Eduardo Valentin May 31, 2019, 3:31 a.m. UTC | #1
On Wed, May 29, 2019 at 01:10:02PM -0500, Eddie James wrote:
> The XDMA engine embedded in the AST2500 SOC performs PCI DMA operations
> between the SOC (acting as a BMC) and a host processor in a server.
> 
> This commit adds a driver to control the XDMA engine and adds functions
> to initialize the hardware and memory and start DMA operations.
> 
> Signed-off-by: Eddie James <eajames@linux.ibm.com>
> ---
>  MAINTAINERS                      |  10 +
>  drivers/soc/aspeed/Kconfig       |   8 +
>  drivers/soc/aspeed/Makefile      |   1 +
>  drivers/soc/aspeed/aspeed-xdma.c | 520 +++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/aspeed-xdma.h |  26 ++
>  5 files changed, 565 insertions(+)
>  create mode 100644 drivers/soc/aspeed/aspeed-xdma.c
>  create mode 100644 include/uapi/linux/aspeed-xdma.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7e09dda..84e2b62 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2584,6 +2584,16 @@ S:	Maintained
>  F:	drivers/media/platform/aspeed-video.c
>  F:	Documentation/devicetree/bindings/media/aspeed-video.txt
>  
> +ASPEED XDMA ENGINE DRIVER
> +M:	Eddie James <eajames@linux.ibm.com>
> +L:	linux-aspeed@lists.ozlabs.org (moderated for non-subscribers)
> +L:	linux-kernel@vger.kernel.org
> +S:	Maintained
> +F:	Documentation/devicetree/bindings/misc/aspeed,xdma.txt
> +F:	Documentation/ABI/testing/sysfs-devices-platform-aspeed-xdma
> +F:	drivers/soc/aspeed/aspeed-xdma.c
> +F:	include/uapi/linux/aspeed-xdma.h
> +
>  ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS
>  M:	Corentin Chary <corentin.chary@gmail.com>
>  L:	acpi4asus-user@lists.sourceforge.net
> diff --git a/drivers/soc/aspeed/Kconfig b/drivers/soc/aspeed/Kconfig
> index 323e177..8b08310 100644
> --- a/drivers/soc/aspeed/Kconfig
> +++ b/drivers/soc/aspeed/Kconfig
> @@ -29,4 +29,12 @@ config ASPEED_P2A_CTRL
>  	  ioctl()s, the driver also provides an interface for userspace mappings to
>  	  a pre-defined region.
>  
> +config ASPEED_XDMA
> +	tristate "Aspeed XDMA Engine Driver"
> +	depends on SOC_ASPEED && REGMAP && MFD_SYSCON && HAS_DMA
> +	help
> +	  Enable support for the Aspeed XDMA Engine found on the Aspeed AST2500
> +	  SOC. The XDMA engine can perform automatic PCI DMA operations between
> +	  the AST2500 (acting as a BMC) and a host processor.
> +
>  endmenu
> diff --git a/drivers/soc/aspeed/Makefile b/drivers/soc/aspeed/Makefile
> index b64be47..977b046 100644
> --- a/drivers/soc/aspeed/Makefile
> +++ b/drivers/soc/aspeed/Makefile
> @@ -2,3 +2,4 @@
>  obj-$(CONFIG_ASPEED_LPC_CTRL)	+= aspeed-lpc-ctrl.o
>  obj-$(CONFIG_ASPEED_LPC_SNOOP)	+= aspeed-lpc-snoop.o
>  obj-$(CONFIG_ASPEED_P2A_CTRL)	+= aspeed-p2a-ctrl.o
> +obj-$(CONFIG_ASPEED_XDMA)	+= aspeed-xdma.o
> diff --git a/drivers/soc/aspeed/aspeed-xdma.c b/drivers/soc/aspeed/aspeed-xdma.c
> new file mode 100644
> index 0000000..3dc0ce4
> --- /dev/null
> +++ b/drivers/soc/aspeed/aspeed-xdma.c
> @@ -0,0 +1,520 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +// Copyright IBM Corp 2019
> +
> +#include <linux/aspeed-xdma.h>
> +#include <linux/bitfield.h>
> +#include <linux/clk.h>
> +#include <linux/debugfs.h>
> +#include <linux/delay.h>
> +#include <linux/device.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/fs.h>
> +#include <linux/genalloc.h>
> +#include <linux/interrupt.h>
> +#include <linux/jiffies.h>
> +#include <linux/list.h>
> +#include <linux/mfd/syscon.h>
> +#include <linux/miscdevice.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/of.h>
> +#include <linux/of_irq.h>
> +#include <linux/of_reserved_mem.h>
> +#include <linux/platform_device.h>
> +#include <linux/poll.h>
> +#include <linux/regmap.h>
> +#include <linux/reset.h>
> +#include <linux/string.h>
> +#include <linux/uaccess.h>
> +#include <linux/wait.h>
> +
> +#define DEVICE_NAME			"aspeed-xdma"
> +
> +#define SCU_STRAP			0x070
> +#define  SCU_STRAP_VGA_MEM		GENMASK(3, 2)
> +
> +#define SCU_PCIE_CONF			0x180
> +#define  SCU_PCIE_CONF_VGA_EN		BIT(0)
> +#define  SCU_PCIE_CONF_VGA_EN_MMIO	BIT(1)
> +#define  SCU_PCIE_CONF_VGA_EN_LPC	BIT(2)
> +#define  SCU_PCIE_CONF_VGA_EN_MSI	BIT(3)
> +#define  SCU_PCIE_CONF_VGA_EN_MCTP	BIT(4)
> +#define  SCU_PCIE_CONF_VGA_EN_IRQ	BIT(5)
> +#define  SCU_PCIE_CONF_VGA_EN_DMA	BIT(6)
> +#define  SCU_PCIE_CONF_BMC_EN		BIT(8)
> +#define  SCU_PCIE_CONF_BMC_EN_MMIO	BIT(9)
> +#define  SCU_PCIE_CONF_BMC_EN_MSI	BIT(11)
> +#define  SCU_PCIE_CONF_BMC_EN_MCTP	BIT(12)
> +#define  SCU_PCIE_CONF_BMC_EN_IRQ	BIT(13)
> +#define  SCU_PCIE_CONF_BMC_EN_DMA	BIT(14)
> +#define  SCU_PCIE_CONF_RSVD		GENMASK(19, 18)
> +
> +#define SDMC_CONF			0x004
> +#define  SDMC_CONF_MEM			GENMASK(1, 0)
> +#define SDMC_REMAP			0x008
> +#define  SDMC_REMAP_MAGIC		GENMASK(17, 16)
> +
> +#define XDMA_CMD_SIZE			4
> +#define XDMA_CMDQ_SIZE			PAGE_SIZE
> +#define XDMA_BYTE_ALIGN			16
> +#define XDMA_MAX_LINE_SIZE		BIT(10)
> +#define XDMA_NUM_CMDS			\
> +	(XDMA_CMDQ_SIZE / sizeof(struct aspeed_xdma_cmd))
> +#define XDMA_NUM_DEBUGFS_REGS		6
> +
> +#define XDMA_CMD_BMC_CHECK		BIT(0)
> +#define XDMA_CMD_BMC_ADDR		GENMASK(29, 4)
> +#define XDMA_CMD_BMC_DIR_US		BIT(31)
> +
> +#define XDMA_CMD_COMM1_HI_HOST_PITCH	GENMASK(14, 3)
> +#define XDMA_CMD_COMM1_HI_BMC_PITCH	GENMASK(30, 19)
> +
> +#define XDMA_CMD_CONF_CHECK		BIT(1)
> +#define XDMA_CMD_CONF_LINE_SIZE		GENMASK(14, 4)
> +#define XDMA_CMD_CONF_IRQ_BMC		BIT(15)
> +#define XDMA_CMD_CONF_NUM_LINES		GENMASK(27, 16)
> +#define XDMA_CMD_CONF_IRQ		BIT(31)
> +
> +#define XDMA_CMD_ID_UPDIR		GENMASK(17, 16)
> +#define  XDMA_CMD_ID_UPDIR_BMC		0
> +#define  XDMA_CMD_ID_UPDIR_HOST		1
> +#define  XDMA_CMD_ID_UPDIR_VGA		2
> +
> +#define XDMA_DS_PCIE_REQ_SIZE_128	0
> +#define XDMA_DS_PCIE_REQ_SIZE_256	1
> +#define XDMA_DS_PCIE_REQ_SIZE_512	2
> +#define XDMA_DS_PCIE_REQ_SIZE_1K	3
> +#define XDMA_DS_PCIE_REQ_SIZE_2K	4
> +#define XDMA_DS_PCIE_REQ_SIZE_4K	5
> +
> +#define XDMA_BMC_CMD_QUEUE_ADDR		0x10
> +#define XDMA_BMC_CMD_QUEUE_ENDP		0x14
> +#define XDMA_BMC_CMD_QUEUE_WRITEP	0x18
> +#define XDMA_BMC_CMD_QUEUE_READP	0x1c
> +#define  XDMA_BMC_CMD_QUEUE_READP_MAGIC	0xee882266
> +#define XDMA_CTRL			0x20
> +#define  XDMA_CTRL_US_COMP		BIT(4)
> +#define  XDMA_CTRL_DS_COMP		BIT(5)
> +#define  XDMA_CTRL_DS_DIRTY		BIT(6)
> +#define  XDMA_CTRL_DS_PCIE_REQ_SIZE	GENMASK(19, 17)
> +#define  XDMA_CTRL_DS_DATA_TIMEOUT	BIT(28)
> +#define  XDMA_CTRL_DS_CHECK_ID		BIT(29)
> +#define XDMA_STATUS			0x24
> +#define  XDMA_STATUS_US_COMP		BIT(4)
> +#define  XDMA_STATUS_DS_COMP		BIT(5)
> +
> +enum {	/* bit numbers used with the 'flags' words via set_bit()/clear_bit()/test_bit() */
> +	XDMA_IN_PRG,	/* set by aspeed_xdma_start(), cleared by aspeed_xdma_done() */
> +	XDMA_UPSTREAM,	/* set or cleared by aspeed_xdma_start() according to op->upstream */
> +};
> +
> +struct aspeed_xdma_cmd {	/* one command queue entry: eight 32-bit words */
> +	u32 host_addr_lo;	/* low 32 bits of the host address */
> +	u32 host_addr_hi;	/* high 32 bits of the host address */
> +	u32 bmc_addr;		/* XDMA_CMD_BMC_* fields */
> +	u32 comm1_hi;		/* XDMA_CMD_COMM1_HI_* pitch fields */
> +	u32 conf;		/* XDMA_CMD_CONF_* fields */
> +	u32 id;			/* not written by the code in this patch */
> +	u32 resv0;
> +	u32 resv1;
> +};
> +
> +struct aspeed_xdma_client;
> +
> +struct aspeed_xdma {	/* per-device driver state, allocated in probe */
> +	struct device *dev;
> +	void __iomem *base;	/* XDMA register window */
> +	struct regmap *scu;	/* SCU syscon regmap */
> +	struct reset_control *reset;
> +
> +	unsigned long flags;	/* XDMA_IN_PRG / XDMA_UPSTREAM bit numbers; passed by address to set_bit() and friends */

interesting.. why do we need a long size flags field when we just toggle two bits?
From a quick glance, looks like we use this to check for XDMA_IN_PRG and XDMA_UPSTREAM only..

> +	unsigned int cmd_idx;	/* index of the next command slot to fill */
> +	wait_queue_head_t wait;	/* woken by aspeed_xdma_done() */
> +	struct aspeed_xdma_client *current_client;
> +
> +	u32 vga_phys;		/* computed in aspeed_xdma_init_mem() */
> +	u32 vga_size;		/* selected by the SCU strap register */
> +	dma_addr_t vga_dma;
> +	void *cmdq;		/* kzalloc'd copy of the queue, memcpy'd to cmdq_vga_virt on start */
> +	void *vga_virt;
> +	dma_addr_t cmdq_vga_phys;	/* written to XDMA_BMC_CMD_QUEUE_ADDR */
> +	void *cmdq_vga_virt;	/* queue page allocated from vga_pool */
> +	struct gen_pool *vga_pool;
> +};
> +
> +struct aspeed_xdma_client {
> +	struct aspeed_xdma *ctx;
> +
> +	unsigned long flags;	/* XDMA_IN_PRG is cleared here by aspeed_xdma_done() */

same

> +	void *virt;	/* NOTE(review): presumably the client's buffer; not used in this patch */
> +	dma_addr_t phys;
> +	u32 size;
> +};
> +
> +/* SCU PCIe configuration value built from the BMC enable bits. */
> +static const u32 aspeed_xdma_bmc_pcie_conf =
> +	SCU_PCIE_CONF_BMC_EN |
> +	SCU_PCIE_CONF_BMC_EN_MSI |
> +	SCU_PCIE_CONF_BMC_EN_MCTP |
> +	SCU_PCIE_CONF_BMC_EN_IRQ |
> +	SCU_PCIE_CONF_BMC_EN_DMA |
> +	SCU_PCIE_CONF_RSVD;
> +
> +/* SCU PCIe configuration value built from the VGA enable bits. */
> +static const u32 aspeed_xdma_vga_pcie_conf =
> +	SCU_PCIE_CONF_VGA_EN |
> +	SCU_PCIE_CONF_VGA_EN_MSI |
> +	SCU_PCIE_CONF_VGA_EN_MCTP |
> +	SCU_PCIE_CONF_VGA_EN_IRQ |
> +	SCU_PCIE_CONF_VGA_EN_DMA |
> +	SCU_PCIE_CONF_RSVD;
> +
> +/* Write @conf to the SCU PCIe configuration register and log the read-back. */
> +static void aspeed_scu_pcie_write(struct aspeed_xdma *ctx, u32 conf)
> +{
> +	u32 readback = 0;
> +
> +	regmap_write(ctx->scu, SCU_PCIE_CONF, conf);
> +	regmap_read(ctx->scu, SCU_PCIE_CONF, &readback);
> +
> +	dev_dbg(ctx->dev, "write scu pcie_conf[%08x]\n", readback);
> +}
> +
> +/* Read the XDMA register at offset @reg, log it and return its value. */
> +static u32 aspeed_xdma_reg_read(struct aspeed_xdma *ctx, u32 reg)
> +{
> +	u32 val;
> +
> +	val = readl(ctx->base + reg);
> +	dev_dbg(ctx->dev, "read %02x[%08x]\n", reg, val);
> +
> +	return val;
> +}
> +
> +/* Write @val to the XDMA register at offset @reg; the debug log shows the read-back. */
> +static void aspeed_xdma_reg_write(struct aspeed_xdma *ctx, u32 reg, u32 val)
> +{
> +	void __iomem *addr = ctx->base + reg;
> +
> +	writel(val, addr);
> +	dev_dbg(ctx->dev, "write %02x[%08x]\n", reg, readl(addr));
> +}
> +
> +/*
> + * Program the command queue registers and the control register, then clear
> + * the driver's queue index and flag word.
> + */
> +static void aspeed_xdma_init_eng(struct aspeed_xdma *ctx)
> +{
> +	u32 ctrl_val;
> +
> +	ctrl_val = FIELD_PREP(XDMA_CTRL_DS_PCIE_REQ_SIZE,
> +			      XDMA_DS_PCIE_REQ_SIZE_256);
> +	ctrl_val |= XDMA_CTRL_US_COMP | XDMA_CTRL_DS_COMP | XDMA_CTRL_DS_DIRTY;
> +	ctrl_val |= XDMA_CTRL_DS_DATA_TIMEOUT | XDMA_CTRL_DS_CHECK_ID;
> +
> +	/* queue geometry and pointers first, then control, then queue address */
> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_ENDP,
> +			      XDMA_CMD_SIZE * XDMA_NUM_CMDS);
> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_READP,
> +			      XDMA_BMC_CMD_QUEUE_READP_MAGIC);
> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_WRITEP, 0);
> +	aspeed_xdma_reg_write(ctx, XDMA_CTRL, ctrl_val);
> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_ADDR,
> +			      ctx->cmdq_vga_phys);
> +
> +	ctx->flags = 0;
> +	ctx->cmd_idx = 0;
> +}
> +
> +static void aspeed_xdma_reset(struct aspeed_xdma *ctx)	/* pulse the reset line, then reprogram the engine */
> +{
> +	reset_control_assert(ctx->reset);
> +
> +	msleep(10);	/* per the author, the 10 ms delay comes from the hardware specification */
> +
> +	reset_control_deassert(ctx->reset);
> +
> +	msleep(10);	/* same specification-mandated delay after releasing reset */

Why 10ms?

> +
> +	aspeed_xdma_init_eng(ctx);	/* also rewinds ctx->cmd_idx and clears ctx->flags */
> +}
> +
> +/*
> + * Queue the DMA operation described by @op and kick the engine.
> + *
> + * @ctx:      engine context
> + * @op:       host address, length in bytes and direction of the transfer
> + * @bmc_addr: BMC-side address of the transfer
> + *
> + * The length is converted to XDMA_BYTE_ALIGN-byte units. Up to
> + * XDMA_MAX_LINE_SIZE units are sent as a single one-line command. Longer
> + * transfers become one multi-line command plus, when the length is not a
> + * whole number of lines, a second one-line command for the remainder; only
> + * the last command queued requests the completion interrupt.
> + */
> +static void aspeed_xdma_start(struct aspeed_xdma *ctx,
> +			      struct aspeed_xdma_op *op, u32 bmc_addr)
> +{
> +	u32 conf = XDMA_CMD_CONF_CHECK | XDMA_CMD_CONF_IRQ_BMC |
> +		XDMA_CMD_CONF_IRQ;
> +	unsigned int line_size = op->len / XDMA_BYTE_ALIGN;
> +	unsigned int num_lines = 1;
> +	unsigned int pitch = 1;
> +	unsigned int nidx;
> +	struct aspeed_xdma_cmd *cmdq = ctx->cmdq;
> +	struct aspeed_xdma_cmd *cmd;
> +
> +	/*
> +	 * ctrl == 0 indicates engine hasn't started properly; restart it.
> +	 * This must happen before any queue slot is chosen: the reset rewinds
> +	 * ctx->cmd_idx and the hardware write pointer to 0, so indices taken
> +	 * earlier would point at stale slots.
> +	 */
> +	if (!aspeed_xdma_reg_read(ctx, XDMA_CTRL))
> +		aspeed_xdma_reset(ctx);
> +
> +	cmd = &cmdq[ctx->cmd_idx];
> +	nidx = (ctx->cmd_idx + 1) % XDMA_NUM_CMDS;
> +
> +	if (line_size > XDMA_MAX_LINE_SIZE) {
> +		unsigned int rem;
> +		unsigned int total;
> +
> +		num_lines = line_size / XDMA_MAX_LINE_SIZE;
> +		total = XDMA_MAX_LINE_SIZE * num_lines;
> +		rem = line_size - total;
> +		line_size = XDMA_MAX_LINE_SIZE;
> +		pitch = line_size;
> +
> +		if (rem) {
> +			/* second command covers what is left after the full lines */
> +			unsigned int offs = total * XDMA_BYTE_ALIGN;
> +			u32 r_bmc_addr = bmc_addr + offs;
> +			u64 r_host_addr = op->host_addr + (u64)offs;
> +			struct aspeed_xdma_cmd *r_cmd = &cmdq[nidx];
> +
> +			r_cmd->host_addr_lo =
> +				(u32)(r_host_addr & 0xFFFFFFFFULL);
> +			r_cmd->host_addr_hi = (u32)(r_host_addr >> 32ULL);
> +			r_cmd->bmc_addr = (r_bmc_addr & XDMA_CMD_BMC_ADDR) |
> +				XDMA_CMD_BMC_CHECK |
> +				(op->upstream ? XDMA_CMD_BMC_DIR_US : 0);
> +			r_cmd->conf = conf |
> +				FIELD_PREP(XDMA_CMD_CONF_LINE_SIZE, rem) |
> +				FIELD_PREP(XDMA_CMD_CONF_NUM_LINES, 1);
> +			r_cmd->comm1_hi =
> +				FIELD_PREP(XDMA_CMD_COMM1_HI_HOST_PITCH, 1) |
> +				FIELD_PREP(XDMA_CMD_COMM1_HI_BMC_PITCH, 1);
> +
> +			/* do not trigger IRQ for first command */
> +			conf = XDMA_CMD_CONF_CHECK;
> +
> +			nidx = (nidx + 1) % XDMA_NUM_CMDS;
> +		}
> +
> +		/* undocumented formula to get required number of lines */
> +		num_lines = (num_lines * 2) - 1;
> +	}
> +
> +	cmd->host_addr_lo = (u32)(op->host_addr & 0xFFFFFFFFULL);
> +	cmd->host_addr_hi = (u32)(op->host_addr >> 32ULL);
> +	cmd->bmc_addr = (bmc_addr & XDMA_CMD_BMC_ADDR) | XDMA_CMD_BMC_CHECK |
> +		(op->upstream ? XDMA_CMD_BMC_DIR_US : 0);
> +	cmd->conf = conf |
> +		FIELD_PREP(XDMA_CMD_CONF_LINE_SIZE, line_size) |
> +		FIELD_PREP(XDMA_CMD_CONF_NUM_LINES, num_lines);
> +	cmd->comm1_hi = FIELD_PREP(XDMA_CMD_COMM1_HI_HOST_PITCH, pitch) |
> +			FIELD_PREP(XDMA_CMD_COMM1_HI_BMC_PITCH, pitch);
> +
> +	/* publish the whole queue copy to the page the engine reads */
> +	memcpy(ctx->cmdq_vga_virt, ctx->cmdq, XDMA_CMDQ_SIZE);
> +
> +	if (op->upstream)
> +		set_bit(XDMA_UPSTREAM, &ctx->flags);
> +	else
> +		clear_bit(XDMA_UPSTREAM, &ctx->flags);
> +
> +	set_bit(XDMA_IN_PRG, &ctx->flags);
> +
> +	/* moving the write pointer hands the new command(s) to the engine */
> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_WRITEP,
> +			      nidx * XDMA_CMD_SIZE);
> +	ctx->cmd_idx = nidx;
> +}
> +
> +/*
> + * Mark the current transfer finished: clear the in-progress bit on the
> + * current client (if any) and on the engine, then wake all waiters.
> + */
> +static void aspeed_xdma_done(struct aspeed_xdma *ctx)
> +{
> +	struct aspeed_xdma_client *client = ctx->current_client;
> +
> +	if (client) {
> +		clear_bit(XDMA_IN_PRG, &client->flags);
> +		ctx->current_client = NULL;
> +	}
> +
> +	clear_bit(XDMA_IN_PRG, &ctx->flags);
> +
> +	wake_up_interruptible_all(&ctx->wait);
> +}
> +
> +/*
> + * Interrupt handler.
> + *
> + * Completes the current transfer when the completion bit matching its
> + * direction is set, then writes the status value back to XDMA_STATUS.
> + *
> + * Returns IRQ_NONE when the status register is empty, because the line is
> + * requested with IRQF_SHARED and the interrupt then belongs to another
> + * device; returns IRQ_HANDLED otherwise.
> + */
> +static irqreturn_t aspeed_xdma_irq(int irq, void *arg)
> +{
> +	struct aspeed_xdma *ctx = arg;
> +	u32 status = aspeed_xdma_reg_read(ctx, XDMA_STATUS);
> +
> +	if (!status)
> +		return IRQ_NONE;
> +
> +	if (status & XDMA_STATUS_US_COMP) {
> +		if (test_bit(XDMA_UPSTREAM, &ctx->flags))
> +			aspeed_xdma_done(ctx);
> +	}
> +
> +	if (status & XDMA_STATUS_DS_COMP) {
> +		if (!test_bit(XDMA_UPSTREAM, &ctx->flags))
> +			aspeed_xdma_done(ctx);
> +	}
> +
> +	/* write the observed bits back to the status register */
> +	aspeed_xdma_reg_write(ctx, XDMA_STATUS, status);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +/*
> + * Locate the VGA memory region and set up the memory used by the engine.
> + *
> + * The VGA size comes from the SCU strap register and the total memory size
> + * from the SDMC configuration register (a default is kept if the SDMC
> + * cannot be mapped). The region is declared as the device's coherent
> + * memory, allocated in full, handed to the genalloc pool, and one
> + * XDMA_CMDQ_SIZE chunk is carved out of the pool for the command queue.
> + *
> + * Returns 0 on success or a negative errno. On failure, whatever was set up
> + * is undone in reverse order: the coherent buffer is freed only if it was
> + * allocated, and the declared region is always released.
> + */
> +static int aspeed_xdma_init_mem(struct aspeed_xdma *ctx)
> +{
> +	int rc;
> +	u32 scu_conf = 0;
> +	u32 mem_size = 0x20000000;
> +	const u32 mem_sizes[4] = { 0x8000000, 0x10000000, 0x20000000,
> +				   0x40000000 };
> +	const u32 vga_sizes[4] = { 0x800000, 0x1000000, 0x2000000, 0x4000000 };
> +	void __iomem *sdmc_base = ioremap(0x1e6e0000, 0x100);
> +

Should these come from fw specification? Say device tree mem reserved nodes?

> +	aspeed_scu_pcie_write(ctx, aspeed_xdma_vga_pcie_conf);
> +
> +	regmap_read(ctx->scu, SCU_STRAP, &scu_conf);
> +	ctx->vga_size = vga_sizes[FIELD_GET(SCU_STRAP_VGA_MEM, scu_conf)];
> +
> +	if (sdmc_base) {
> +		u32 sdmc = readl(sdmc_base + SDMC_CONF);
> +		u32 remap = readl(sdmc_base + SDMC_REMAP);
> +
> +		remap |= SDMC_REMAP_MAGIC;
> +		writel(remap, sdmc_base + SDMC_REMAP);
> +		remap = readl(sdmc_base + SDMC_REMAP);
> +
> +		mem_size = mem_sizes[sdmc & SDMC_CONF_MEM];
> +		iounmap(sdmc_base);
> +	}
> +
> +	/* VGA memory occupies the top vga_size bytes of RAM */
> +	ctx->vga_phys = (mem_size - ctx->vga_size) + 0x80000000;
> +
> +	ctx->cmdq = devm_kzalloc(ctx->dev, XDMA_CMDQ_SIZE, GFP_KERNEL);
> +	if (!ctx->cmdq) {
> +		dev_err(ctx->dev, "Failed to allocate command queue.\n");
> +		return -ENOMEM;
> +	}
> +
> +	rc = dma_set_mask_and_coherent(ctx->dev, DMA_BIT_MASK(32));
> +	if (rc) {
> +		dev_err(ctx->dev, "Failed to set DMA mask: %d.\n", rc);
> +		return rc;
> +	}
> +
> +	rc = dma_declare_coherent_memory(ctx->dev, ctx->vga_phys,
> +					 ctx->vga_phys, ctx->vga_size);
> +	if (rc) {
> +		dev_err(ctx->dev, "Failed to declare coherent memory: %d.\n",
> +			rc);
> +		return rc;
> +	}
> +
> +	ctx->vga_virt = dma_alloc_coherent(ctx->dev, ctx->vga_size,
> +					   &ctx->vga_dma, GFP_KERNEL);
> +	if (!ctx->vga_virt) {
> +		dev_err(ctx->dev, "Failed to allocate DMA.\n");
> +		rc = -ENOMEM;
> +		/* nothing was allocated; only the declaration needs undoing */
> +		goto err_release;
> +	}
> +
> +	rc = gen_pool_add_virt(ctx->vga_pool, (unsigned long)ctx->vga_virt,
> +			       ctx->vga_phys, ctx->vga_size, -1);
> +	if (rc) {
> +		dev_err(ctx->dev, "Failed to add memory to genalloc pool.\n");
> +		goto err_free;
> +	}
> +
> +	ctx->cmdq_vga_virt = gen_pool_dma_alloc(ctx->vga_pool, XDMA_CMDQ_SIZE,
> +						&ctx->cmdq_vga_phys);
> +	if (!ctx->cmdq_vga_virt) {
> +		dev_err(ctx->dev, "Failed to genalloc cmdq.\n");
> +		rc = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	dev_dbg(ctx->dev, "VGA mapped at phys[%08x], size[%08x].\n",
> +		ctx->vga_phys, ctx->vga_size);
> +
> +	return 0;
> +
> +err_free:
> +	dma_free_coherent(ctx->dev, ctx->vga_size, ctx->vga_virt,
> +			  ctx->vga_dma);
> +
> +err_release:
> +	dma_release_declared_memory(ctx->dev);
> +	return rc;
> +}
> +
> +/*
> + * Bind the driver to an XDMA platform device.
> + *
> + * Maps the registers, requests the interrupt, looks up the SCU regmap and
> + * the reset control, creates the genalloc pool, releases the engine from
> + * reset, sets up memory and programs the engine.
> + *
> + * Returns 0 on success or a negative errno. The interrupt is looked up with
> + * platform_get_irq() so that lookup errors, including probe deferral, are
> + * passed back to the driver core.
> + */
> +static int aspeed_xdma_probe(struct platform_device *pdev)
> +{
> +	int irq;
> +	int rc;
> +	struct resource *res;
> +	struct device *dev = &pdev->dev;
> +	struct aspeed_xdma *ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
> +
> +	if (!ctx)
> +		return -ENOMEM;
> +
> +	ctx->dev = dev;
> +	platform_set_drvdata(pdev, ctx);
> +	init_waitqueue_head(&ctx->wait);
> +
> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	ctx->base = devm_ioremap_resource(dev, res);
> +	if (IS_ERR(ctx->base)) {
> +		dev_err(dev, "Unable to ioremap registers.\n");
> +		return PTR_ERR(ctx->base);
> +	}
> +
> +	irq = platform_get_irq(pdev, 0);
> +	if (irq < 0) {
> +		dev_err(dev, "Unable to find IRQ.\n");
> +		return irq;
> +	}
> +
> +	rc = devm_request_irq(dev, irq, aspeed_xdma_irq, IRQF_SHARED,
> +			      DEVICE_NAME, ctx);
> +	if (rc < 0) {
> +		dev_err(dev, "Unable to request IRQ %d.\n", irq);
> +		return rc;
> +	}
> +
> +	ctx->scu = syscon_regmap_lookup_by_compatible("aspeed,ast2500-scu");
> +	if (IS_ERR(ctx->scu)) {
> +		dev_err(ctx->dev, "Unable to grab SCU regs.\n");
> +		return PTR_ERR(ctx->scu);
> +	}
> +
> +	ctx->reset = devm_reset_control_get_exclusive(dev, NULL);
> +	if (IS_ERR(ctx->reset)) {
> +		dev_err(dev, "Unable to request reset control.\n");
> +		return PTR_ERR(ctx->reset);
> +	}
> +
> +	ctx->vga_pool = devm_gen_pool_create(dev, ilog2(PAGE_SIZE), -1, NULL);
> +	if (!ctx->vga_pool) {
> +		dev_err(dev, "Unable to setup genalloc pool.\n");
> +		return -ENOMEM;
> +	}
> +
> +	reset_control_deassert(ctx->reset);
> +
> +	msleep(10);

Why 10ms again? :-)

> +
> +	rc = aspeed_xdma_init_mem(ctx);
> +	if (rc) {
> +		/* put the engine back into reset before giving up */
> +		reset_control_assert(ctx->reset);
> +		return rc;
> +	}
> +
> +	aspeed_xdma_init_eng(ctx);
> +
> +	return 0;
> +}
> +
> +/*
> + * Unbind: return the command queue chunk to the pool, free the coherent
> + * buffer, release the declared region and put the engine into reset.
> + */
> +static int aspeed_xdma_remove(struct platform_device *pdev)
> +{
> +	struct aspeed_xdma *ctx = platform_get_drvdata(pdev);
> +	unsigned long cmdq_addr = (unsigned long)ctx->cmdq_vga_virt;
> +
> +	gen_pool_free(ctx->vga_pool, cmdq_addr, XDMA_CMDQ_SIZE);
> +
> +	dma_free_coherent(ctx->dev, ctx->vga_size, ctx->vga_virt,
> +			  ctx->vga_dma);
> +	dma_release_declared_memory(ctx->dev);
> +
> +	reset_control_assert(ctx->reset);
> +
> +	return 0;
> +}
> +
> +/* Device tree compatibles handled by this driver. */
> +static const struct of_device_id aspeed_xdma_match[] = {
> +	{ .compatible = "aspeed,ast2500-xdma" },
> +	{ },
> +};
> +/* export the table so the module can be autoloaded when built as a module */
> +MODULE_DEVICE_TABLE(of, aspeed_xdma_match);
> +
> +static struct platform_driver aspeed_xdma_driver = {
> +	.probe = aspeed_xdma_probe,
> +	.remove = aspeed_xdma_remove,
> +	.driver = {
> +		.name = DEVICE_NAME,
> +		.of_match_table = aspeed_xdma_match,
> +	},
> +};
> +
> +module_platform_driver(aspeed_xdma_driver);
> +
> +MODULE_AUTHOR("Eddie James");
> +MODULE_DESCRIPTION("Aspeed XDMA Engine Driver");
> +MODULE_LICENSE("GPL v2");
> diff --git a/include/uapi/linux/aspeed-xdma.h b/include/uapi/linux/aspeed-xdma.h
> new file mode 100644
> index 0000000..998459e
> --- /dev/null
> +++ b/include/uapi/linux/aspeed-xdma.h
> @@ -0,0 +1,26 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +/* Copyright IBM Corp 2019 */
> +
> +#ifndef _UAPI_LINUX_ASPEED_XDMA_H_
> +#define _UAPI_LINUX_ASPEED_XDMA_H_
> +
> +#include <linux/types.h>
> +
> +/*
> + * aspeed_xdma_op
> + *
> + * host_addr: the DMA address on the host side, typically configured by PCI
> + *            subsystem
> + *
> + * len: the size of the transfer in bytes; it should be a multiple of 16 bytes
> + *
> + * upstream: boolean indicating the direction of the DMA operation; upstream
> + *           means a transfer from the BMC to the host
> + */
> +struct aspeed_xdma_op {
> +	__u64 host_addr;	/* DMA address on the host side */
> +	__u32 len;		/* transfer size in bytes; should be a multiple of 16 */
> +	__u32 upstream;		/* boolean: non-zero means BMC to host */
> +};
> +
> +#endif /* _UAPI_LINUX_ASPEED_XDMA_H_ */
> -- 
> 1.8.3.1
>
Eddie James June 28, 2019, 3:43 p.m. UTC | #2
On 5/30/19 10:31 PM, Eduardo Valentin wrote:
> On Wed, May 29, 2019 at 01:10:02PM -0500, Eddie James wrote:
>> The XDMA engine embedded in the AST2500 SOC performs PCI DMA operations
>> between the SOC (acting as a BMC) and a host processor in a server.
>>
>> This commit adds a driver to control the XDMA engine and adds functions
>> to initialize the hardware and memory and start DMA operations.
>>
>> Signed-off-by: Eddie James <eajames@linux.ibm.com>
>> ---
>>   MAINTAINERS                      |  10 +
>>   drivers/soc/aspeed/Kconfig       |   8 +
>>   drivers/soc/aspeed/Makefile      |   1 +
>>   drivers/soc/aspeed/aspeed-xdma.c | 520 +++++++++++++++++++++++++++++++++++++++
>>   include/uapi/linux/aspeed-xdma.h |  26 ++
>>   5 files changed, 565 insertions(+)
>>   create mode 100644 drivers/soc/aspeed/aspeed-xdma.c
>>   create mode 100644 include/uapi/linux/aspeed-xdma.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 7e09dda..84e2b62 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -2584,6 +2584,16 @@ S:	Maintained
>>   F:	drivers/media/platform/aspeed-video.c
>>   F:	Documentation/devicetree/bindings/media/aspeed-video.txt
>>   
>> +ASPEED XDMA ENGINE DRIVER
>> +M:	Eddie James <eajames@linux.ibm.com>
>> +L:	linux-aspeed@lists.ozlabs.org (moderated for non-subscribers)
>> +L:	linux-kernel@vger.kernel.org
>> +S:	Maintained
>> +F:	Documentation/devicetree/bindings/misc/aspeed,xdma.txt
>> +F:	Documentation/ABI/testing/sysfs-devices-platform-aspeed-xdma
>> +F:	drivers/soc/aspeed/aspeed-xdma.c
>> +F:	include/uapi/linux/aspeed-xdma.h
>> +
>>   ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS
>>   M:	Corentin Chary <corentin.chary@gmail.com>
>>   L:	acpi4asus-user@lists.sourceforge.net
>> diff --git a/drivers/soc/aspeed/Kconfig b/drivers/soc/aspeed/Kconfig
>> index 323e177..8b08310 100644
>> --- a/drivers/soc/aspeed/Kconfig
>> +++ b/drivers/soc/aspeed/Kconfig
>> @@ -29,4 +29,12 @@ config ASPEED_P2A_CTRL
>>   	  ioctl()s, the driver also provides an interface for userspace mappings to
>>   	  a pre-defined region.
>>   
>> +config ASPEED_XDMA
>> +	tristate "Aspeed XDMA Engine Driver"
>> +	depends on SOC_ASPEED && REGMAP && MFD_SYSCON && HAS_DMA
>> +	help
>> +	  Enable support for the Aspeed XDMA Engine found on the Aspeed AST2500
>> +	  SOC. The XDMA engine can perform automatic PCI DMA operations between
>> +	  the AST2500 (acting as a BMC) and a host processor.
>> +
>>   endmenu
>> diff --git a/drivers/soc/aspeed/Makefile b/drivers/soc/aspeed/Makefile
>> index b64be47..977b046 100644
>> --- a/drivers/soc/aspeed/Makefile
>> +++ b/drivers/soc/aspeed/Makefile
>> @@ -2,3 +2,4 @@
>>   obj-$(CONFIG_ASPEED_LPC_CTRL)	+= aspeed-lpc-ctrl.o
>>   obj-$(CONFIG_ASPEED_LPC_SNOOP)	+= aspeed-lpc-snoop.o
>>   obj-$(CONFIG_ASPEED_P2A_CTRL)	+= aspeed-p2a-ctrl.o
>> +obj-$(CONFIG_ASPEED_XDMA)	+= aspeed-xdma.o
>> diff --git a/drivers/soc/aspeed/aspeed-xdma.c b/drivers/soc/aspeed/aspeed-xdma.c
>> new file mode 100644
>> index 0000000..3dc0ce4
>> --- /dev/null
>> +++ b/drivers/soc/aspeed/aspeed-xdma.c
>> @@ -0,0 +1,520 @@
>> +// SPDX-License-Identifier: GPL-2.0+
>> +// Copyright IBM Corp 2019
>> +
>> +#include <linux/aspeed-xdma.h>
>> +#include <linux/bitfield.h>
>> +#include <linux/clk.h>
>> +#include <linux/debugfs.h>
>> +#include <linux/delay.h>
>> +#include <linux/device.h>
>> +#include <linux/dma-mapping.h>
>> +#include <linux/fs.h>
>> +#include <linux/genalloc.h>
>> +#include <linux/interrupt.h>
>> +#include <linux/jiffies.h>
>> +#include <linux/list.h>
>> +#include <linux/mfd/syscon.h>
>> +#include <linux/miscdevice.h>
>> +#include <linux/module.h>
>> +#include <linux/mutex.h>
>> +#include <linux/of.h>
>> +#include <linux/of_irq.h>
>> +#include <linux/of_reserved_mem.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/poll.h>
>> +#include <linux/regmap.h>
>> +#include <linux/reset.h>
>> +#include <linux/string.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/wait.h>
>> +
>> +#define DEVICE_NAME			"aspeed-xdma"
>> +
>> +#define SCU_STRAP			0x070
>> +#define  SCU_STRAP_VGA_MEM		GENMASK(3, 2)
>> +
>> +#define SCU_PCIE_CONF			0x180
>> +#define  SCU_PCIE_CONF_VGA_EN		BIT(0)
>> +#define  SCU_PCIE_CONF_VGA_EN_MMIO	BIT(1)
>> +#define  SCU_PCIE_CONF_VGA_EN_LPC	BIT(2)
>> +#define  SCU_PCIE_CONF_VGA_EN_MSI	BIT(3)
>> +#define  SCU_PCIE_CONF_VGA_EN_MCTP	BIT(4)
>> +#define  SCU_PCIE_CONF_VGA_EN_IRQ	BIT(5)
>> +#define  SCU_PCIE_CONF_VGA_EN_DMA	BIT(6)
>> +#define  SCU_PCIE_CONF_BMC_EN		BIT(8)
>> +#define  SCU_PCIE_CONF_BMC_EN_MMIO	BIT(9)
>> +#define  SCU_PCIE_CONF_BMC_EN_MSI	BIT(11)
>> +#define  SCU_PCIE_CONF_BMC_EN_MCTP	BIT(12)
>> +#define  SCU_PCIE_CONF_BMC_EN_IRQ	BIT(13)
>> +#define  SCU_PCIE_CONF_BMC_EN_DMA	BIT(14)
>> +#define  SCU_PCIE_CONF_RSVD		GENMASK(19, 18)
>> +
>> +#define SDMC_CONF			0x004
>> +#define  SDMC_CONF_MEM			GENMASK(1, 0)
>> +#define SDMC_REMAP			0x008
>> +#define  SDMC_REMAP_MAGIC		GENMASK(17, 16)
>> +
>> +#define XDMA_CMD_SIZE			4
>> +#define XDMA_CMDQ_SIZE			PAGE_SIZE
>> +#define XDMA_BYTE_ALIGN			16
>> +#define XDMA_MAX_LINE_SIZE		BIT(10)
>> +#define XDMA_NUM_CMDS			\
>> +	(XDMA_CMDQ_SIZE / sizeof(struct aspeed_xdma_cmd))
>> +#define XDMA_NUM_DEBUGFS_REGS		6
>> +
>> +#define XDMA_CMD_BMC_CHECK		BIT(0)
>> +#define XDMA_CMD_BMC_ADDR		GENMASK(29, 4)
>> +#define XDMA_CMD_BMC_DIR_US		BIT(31)
>> +
>> +#define XDMA_CMD_COMM1_HI_HOST_PITCH	GENMASK(14, 3)
>> +#define XDMA_CMD_COMM1_HI_BMC_PITCH	GENMASK(30, 19)
>> +
>> +#define XDMA_CMD_CONF_CHECK		BIT(1)
>> +#define XDMA_CMD_CONF_LINE_SIZE		GENMASK(14, 4)
>> +#define XDMA_CMD_CONF_IRQ_BMC		BIT(15)
>> +#define XDMA_CMD_CONF_NUM_LINES		GENMASK(27, 16)
>> +#define XDMA_CMD_CONF_IRQ		BIT(31)
>> +
>> +#define XDMA_CMD_ID_UPDIR		GENMASK(17, 16)
>> +#define  XDMA_CMD_ID_UPDIR_BMC		0
>> +#define  XDMA_CMD_ID_UPDIR_HOST		1
>> +#define  XDMA_CMD_ID_UPDIR_VGA		2
>> +
>> +#define XDMA_DS_PCIE_REQ_SIZE_128	0
>> +#define XDMA_DS_PCIE_REQ_SIZE_256	1
>> +#define XDMA_DS_PCIE_REQ_SIZE_512	2
>> +#define XDMA_DS_PCIE_REQ_SIZE_1K	3
>> +#define XDMA_DS_PCIE_REQ_SIZE_2K	4
>> +#define XDMA_DS_PCIE_REQ_SIZE_4K	5
>> +
>> +#define XDMA_BMC_CMD_QUEUE_ADDR		0x10
>> +#define XDMA_BMC_CMD_QUEUE_ENDP		0x14
>> +#define XDMA_BMC_CMD_QUEUE_WRITEP	0x18
>> +#define XDMA_BMC_CMD_QUEUE_READP	0x1c
>> +#define  XDMA_BMC_CMD_QUEUE_READP_MAGIC	0xee882266
>> +#define XDMA_CTRL			0x20
>> +#define  XDMA_CTRL_US_COMP		BIT(4)
>> +#define  XDMA_CTRL_DS_COMP		BIT(5)
>> +#define  XDMA_CTRL_DS_DIRTY		BIT(6)
>> +#define  XDMA_CTRL_DS_PCIE_REQ_SIZE	GENMASK(19, 17)
>> +#define  XDMA_CTRL_DS_DATA_TIMEOUT	BIT(28)
>> +#define  XDMA_CTRL_DS_CHECK_ID		BIT(29)
>> +#define XDMA_STATUS			0x24
>> +#define  XDMA_STATUS_US_COMP		BIT(4)
>> +#define  XDMA_STATUS_DS_COMP		BIT(5)
>> +
>> +enum {	/* bit numbers used with the 'flags' words via set_bit()/clear_bit() */
>> +	XDMA_IN_PRG,	/* set by aspeed_xdma_start(), cleared by aspeed_xdma_done() */
>> +	XDMA_UPSTREAM,	/* set or cleared by aspeed_xdma_start() according to op->upstream */
>> +};
>> +
>> +struct aspeed_xdma_cmd {	/* one command queue entry: eight 32-bit words */
>> +	u32 host_addr_lo;	/* low 32 bits of the host address */
>> +	u32 host_addr_hi;	/* high 32 bits of the host address */
>> +	u32 bmc_addr;		/* XDMA_CMD_BMC_* fields */
>> +	u32 comm1_hi;		/* XDMA_CMD_COMM1_HI_* pitch fields */
>> +	u32 conf;		/* XDMA_CMD_CONF_* fields */
>> +	u32 id;
>> +	u32 resv0;
>> +	u32 resv1;
>> +};
>> +
>> +struct aspeed_xdma_client;
>> +
>> +struct aspeed_xdma {	/* per-device driver state */
>> +	struct device *dev;
>> +	void __iomem *base;	/* XDMA register window */
>> +	struct regmap *scu;	/* SCU syscon regmap */
>> +	struct reset_control *reset;
>> +
>> +	unsigned long flags;	/* XDMA_IN_PRG / XDMA_UPSTREAM bit numbers; passed by address to set_bit() and friends */
> interesting.. why do we need a long size flags field when we just toggle two bits?
>  From a quick glance, looks like we use this to check for XDMA_IN_PRG and XDMA_UPSTREAM only..


That's correct. More flags could be added in the future. Are you worried 
about memory space?


>
>> +	unsigned int cmd_idx;	/* index of the next command slot to fill */
>> +	wait_queue_head_t wait;	/* woken by aspeed_xdma_done() */
>> +	struct aspeed_xdma_client *current_client;
>> +
>> +	u32 vga_phys;
>> +	u32 vga_size;
>> +	dma_addr_t vga_dma;
>> +	void *cmdq;		/* queue copy that is memcpy'd to cmdq_vga_virt on start */
>> +	void *vga_virt;
>> +	dma_addr_t cmdq_vga_phys;	/* written to XDMA_BMC_CMD_QUEUE_ADDR */
>> +	void *cmdq_vga_virt;
>> +	struct gen_pool *vga_pool;
>> +};
>> +
>> +struct aspeed_xdma_client {
>> +	struct aspeed_xdma *ctx;
>> +
>> +	unsigned long flags;	/* XDMA_IN_PRG is cleared here by aspeed_xdma_done() */
> same
>
>> +	void *virt;	/* NOTE(review): presumably the client's buffer; not used in this patch */
>> +	dma_addr_t phys;
>> +	u32 size;
>> +};
>> +
>> +/* SCU PCIe configuration value built from the BMC enable bits. */
>> +static const u32 aspeed_xdma_bmc_pcie_conf =
>> +	SCU_PCIE_CONF_BMC_EN |
>> +	SCU_PCIE_CONF_BMC_EN_MSI |
>> +	SCU_PCIE_CONF_BMC_EN_MCTP |
>> +	SCU_PCIE_CONF_BMC_EN_IRQ |
>> +	SCU_PCIE_CONF_BMC_EN_DMA |
>> +	SCU_PCIE_CONF_RSVD;
>> +
>> +/* SCU PCIe configuration value built from the VGA enable bits. */
>> +static const u32 aspeed_xdma_vga_pcie_conf =
>> +	SCU_PCIE_CONF_VGA_EN |
>> +	SCU_PCIE_CONF_VGA_EN_MSI |
>> +	SCU_PCIE_CONF_VGA_EN_MCTP |
>> +	SCU_PCIE_CONF_VGA_EN_IRQ |
>> +	SCU_PCIE_CONF_VGA_EN_DMA |
>> +	SCU_PCIE_CONF_RSVD;
>> +
>> +/* Write @conf to the SCU PCIe configuration register and log the read-back. */
>> +static void aspeed_scu_pcie_write(struct aspeed_xdma *ctx, u32 conf)
>> +{
>> +	u32 readback = 0;
>> +
>> +	regmap_write(ctx->scu, SCU_PCIE_CONF, conf);
>> +	regmap_read(ctx->scu, SCU_PCIE_CONF, &readback);
>> +
>> +	dev_dbg(ctx->dev, "write scu pcie_conf[%08x]\n", readback);
>> +}
>> +
>> +/* Read the XDMA register at offset @reg, log it and return its value. */
>> +static u32 aspeed_xdma_reg_read(struct aspeed_xdma *ctx, u32 reg)
>> +{
>> +	u32 val;
>> +
>> +	val = readl(ctx->base + reg);
>> +	dev_dbg(ctx->dev, "read %02x[%08x]\n", reg, val);
>> +
>> +	return val;
>> +}
>> +
>> +/* Write @val to the XDMA register at offset @reg; the debug log shows the read-back. */
>> +static void aspeed_xdma_reg_write(struct aspeed_xdma *ctx, u32 reg, u32 val)
>> +{
>> +	void __iomem *addr = ctx->base + reg;
>> +
>> +	writel(val, addr);
>> +	dev_dbg(ctx->dev, "write %02x[%08x]\n", reg, readl(addr));
>> +}
>> +
>> +/*
>> + * Program the command queue registers and the control register, then clear
>> + * the driver's queue index and flag word.
>> + */
>> +static void aspeed_xdma_init_eng(struct aspeed_xdma *ctx)
>> +{
>> +	u32 ctrl_val;
>> +
>> +	ctrl_val = FIELD_PREP(XDMA_CTRL_DS_PCIE_REQ_SIZE,
>> +			      XDMA_DS_PCIE_REQ_SIZE_256);
>> +	ctrl_val |= XDMA_CTRL_US_COMP | XDMA_CTRL_DS_COMP | XDMA_CTRL_DS_DIRTY;
>> +	ctrl_val |= XDMA_CTRL_DS_DATA_TIMEOUT | XDMA_CTRL_DS_CHECK_ID;
>> +
>> +	/* queue geometry and pointers first, then control, then queue address */
>> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_ENDP,
>> +			      XDMA_CMD_SIZE * XDMA_NUM_CMDS);
>> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_READP,
>> +			      XDMA_BMC_CMD_QUEUE_READP_MAGIC);
>> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_WRITEP, 0);
>> +	aspeed_xdma_reg_write(ctx, XDMA_CTRL, ctrl_val);
>> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_ADDR,
>> +			      ctx->cmdq_vga_phys);
>> +
>> +	ctx->flags = 0;
>> +	ctx->cmd_idx = 0;
>> +}
>> +
>> +static void aspeed_xdma_reset(struct aspeed_xdma *ctx)	/* pulse the reset line, then reprogram the engine */
>> +{
>> +	reset_control_assert(ctx->reset);
>> +
>> +	msleep(10);	/* delay taken from the hardware specification, per the reply below */
>> +
>> +	reset_control_deassert(ctx->reset);
>> +
>> +	msleep(10);	/* same specification-mandated delay after releasing reset */
> Why 10ms?


I will add a comment. It's from the specification.


>
>> +
>> +	aspeed_xdma_init_eng(ctx);	/* also rewinds ctx->cmd_idx and clears ctx->flags */
>> +}
>> +
>> +/*
>> + * Queue the DMA operation described by @op and kick the engine.
>> + *
>> + * @ctx:      engine context
>> + * @op:       host address, length in bytes and direction of the transfer
>> + * @bmc_addr: BMC-side address of the transfer
>> + *
>> + * The length is converted to XDMA_BYTE_ALIGN-byte units. Up to
>> + * XDMA_MAX_LINE_SIZE units are sent as a single one-line command. Longer
>> + * transfers become one multi-line command plus, when the length is not a
>> + * whole number of lines, a second one-line command for the remainder; only
>> + * the last command queued requests the completion interrupt.
>> + */
>> +static void aspeed_xdma_start(struct aspeed_xdma *ctx,
>> +			      struct aspeed_xdma_op *op, u32 bmc_addr)
>> +{
>> +	u32 conf = XDMA_CMD_CONF_CHECK | XDMA_CMD_CONF_IRQ_BMC |
>> +		XDMA_CMD_CONF_IRQ;
>> +	unsigned int line_size = op->len / XDMA_BYTE_ALIGN;
>> +	unsigned int num_lines = 1;
>> +	unsigned int pitch = 1;
>> +	unsigned int nidx;
>> +	struct aspeed_xdma_cmd *cmdq = ctx->cmdq;
>> +	struct aspeed_xdma_cmd *cmd;
>> +
>> +	/*
>> +	 * ctrl == 0 indicates engine hasn't started properly; restart it.
>> +	 * This must happen before any queue slot is chosen: the reset rewinds
>> +	 * ctx->cmd_idx and the hardware write pointer to 0, so indices taken
>> +	 * earlier would point at stale slots.
>> +	 */
>> +	if (!aspeed_xdma_reg_read(ctx, XDMA_CTRL))
>> +		aspeed_xdma_reset(ctx);
>> +
>> +	cmd = &cmdq[ctx->cmd_idx];
>> +	nidx = (ctx->cmd_idx + 1) % XDMA_NUM_CMDS;
>> +
>> +	if (line_size > XDMA_MAX_LINE_SIZE) {
>> +		unsigned int rem;
>> +		unsigned int total;
>> +
>> +		num_lines = line_size / XDMA_MAX_LINE_SIZE;
>> +		total = XDMA_MAX_LINE_SIZE * num_lines;
>> +		rem = line_size - total;
>> +		line_size = XDMA_MAX_LINE_SIZE;
>> +		pitch = line_size;
>> +
>> +		if (rem) {
>> +			/* second command covers what is left after the full lines */
>> +			unsigned int offs = total * XDMA_BYTE_ALIGN;
>> +			u32 r_bmc_addr = bmc_addr + offs;
>> +			u64 r_host_addr = op->host_addr + (u64)offs;
>> +			struct aspeed_xdma_cmd *r_cmd = &cmdq[nidx];
>> +
>> +			r_cmd->host_addr_lo =
>> +				(u32)(r_host_addr & 0xFFFFFFFFULL);
>> +			r_cmd->host_addr_hi = (u32)(r_host_addr >> 32ULL);
>> +			r_cmd->bmc_addr = (r_bmc_addr & XDMA_CMD_BMC_ADDR) |
>> +				XDMA_CMD_BMC_CHECK |
>> +				(op->upstream ? XDMA_CMD_BMC_DIR_US : 0);
>> +			r_cmd->conf = conf |
>> +				FIELD_PREP(XDMA_CMD_CONF_LINE_SIZE, rem) |
>> +				FIELD_PREP(XDMA_CMD_CONF_NUM_LINES, 1);
>> +			r_cmd->comm1_hi =
>> +				FIELD_PREP(XDMA_CMD_COMM1_HI_HOST_PITCH, 1) |
>> +				FIELD_PREP(XDMA_CMD_COMM1_HI_BMC_PITCH, 1);
>> +
>> +			/* do not trigger IRQ for first command */
>> +			conf = XDMA_CMD_CONF_CHECK;
>> +
>> +			nidx = (nidx + 1) % XDMA_NUM_CMDS;
>> +		}
>> +
>> +		/* undocumented formula to get required number of lines */
>> +		num_lines = (num_lines * 2) - 1;
>> +	}
>> +
>> +	cmd->host_addr_lo = (u32)(op->host_addr & 0xFFFFFFFFULL);
>> +	cmd->host_addr_hi = (u32)(op->host_addr >> 32ULL);
>> +	cmd->bmc_addr = (bmc_addr & XDMA_CMD_BMC_ADDR) | XDMA_CMD_BMC_CHECK |
>> +		(op->upstream ? XDMA_CMD_BMC_DIR_US : 0);
>> +	cmd->conf = conf |
>> +		FIELD_PREP(XDMA_CMD_CONF_LINE_SIZE, line_size) |
>> +		FIELD_PREP(XDMA_CMD_CONF_NUM_LINES, num_lines);
>> +	cmd->comm1_hi = FIELD_PREP(XDMA_CMD_COMM1_HI_HOST_PITCH, pitch) |
>> +			FIELD_PREP(XDMA_CMD_COMM1_HI_BMC_PITCH, pitch);
>> +
>> +	/* publish the whole queue copy to the page the engine reads */
>> +	memcpy(ctx->cmdq_vga_virt, ctx->cmdq, XDMA_CMDQ_SIZE);
>> +
>> +	if (op->upstream)
>> +		set_bit(XDMA_UPSTREAM, &ctx->flags);
>> +	else
>> +		clear_bit(XDMA_UPSTREAM, &ctx->flags);
>> +
>> +	set_bit(XDMA_IN_PRG, &ctx->flags);
>> +
>> +	/* moving the write pointer hands the new command(s) to the engine */
>> +	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_WRITEP,
>> +			      nidx * XDMA_CMD_SIZE);
>> +	ctx->cmd_idx = nidx;
>> +}
>> +
>> +static void aspeed_xdma_done(struct aspeed_xdma *ctx)
>> +{
>> +	if (ctx->current_client) {
>> +		clear_bit(XDMA_IN_PRG, &ctx->current_client->flags);
>> +
>> +		ctx->current_client = NULL;
>> +	}
>> +
>> +	clear_bit(XDMA_IN_PRG, &ctx->flags);
>> +	wake_up_interruptible_all(&ctx->wait);
>> +}
>> +
>> +static irqreturn_t aspeed_xdma_irq(int irq, void *arg)
>> +{
>> +	struct aspeed_xdma *ctx = arg;
>> +	u32 status = aspeed_xdma_reg_read(ctx, XDMA_STATUS);
>> +
>> +	if (status & XDMA_STATUS_US_COMP) {
>> +		if (test_bit(XDMA_UPSTREAM, &ctx->flags))
>> +			aspeed_xdma_done(ctx);
>> +	}
>> +
>> +	if (status & XDMA_STATUS_DS_COMP) {
>> +		if (!test_bit(XDMA_UPSTREAM, &ctx->flags))
>> +			aspeed_xdma_done(ctx);
>> +	}
>> +
>> +	aspeed_xdma_reg_write(ctx, XDMA_STATUS, status);
>> +
>> +	return IRQ_HANDLED;
>> +}
>> +
>> +static int aspeed_xdma_init_mem(struct aspeed_xdma *ctx)
>> +{
>> +	int rc;
>> +	u32 scu_conf = 0;
>> +	u32 mem_size = 0x20000000;
>> +	const u32 mem_sizes[4] = { 0x8000000, 0x10000000, 0x20000000,
>> +				   0x40000000 };
>> +	const u32 vga_sizes[4] = { 0x800000, 0x1000000, 0x2000000, 0x4000000 };
>> +	void __iomem *sdmc_base = ioremap(0x1e6e0000, 0x100);
>> +
> Should these come from fw specification? Say device tree mem reserved nodes?


I'm not sure. I quite like finding it dynamically here rather than 
having to specify a new memory node in every system that will use the 
driver. In addition it's a little bit awkward to get the physical 
address from the reserved memory subsystem; we don't actually need to 
grab the reserved memory for this device, it's already reserved by the 
CPU/VGA. What do you think?


Thanks for the review!

Eddie


>
>> +	aspeed_scu_pcie_write(ctx, aspeed_xdma_vga_pcie_conf);
>> +
>> +	regmap_read(ctx->scu, SCU_STRAP, &scu_conf);
>> +	ctx->vga_size = vga_sizes[FIELD_GET(SCU_STRAP_VGA_MEM, scu_conf)];
>> +
>> +	if (sdmc_base) {
>> +		u32 sdmc = readl(sdmc_base + SDMC_CONF);
>> +		u32 remap = readl(sdmc_base + SDMC_REMAP);
>> +
>> +		remap |= SDMC_REMAP_MAGIC;
>> +		writel(remap, sdmc_base + SDMC_REMAP);
>> +		remap = readl(sdmc_base + SDMC_REMAP);
>> +
>> +		mem_size = mem_sizes[sdmc & SDMC_CONF_MEM];
>> +		iounmap(sdmc_base);
>> +	}
>> +
>> +	ctx->vga_phys = (mem_size - ctx->vga_size) + 0x80000000;
>> +
>> +	ctx->cmdq = devm_kzalloc(ctx->dev, XDMA_CMDQ_SIZE, GFP_KERNEL);
>> +	if (!ctx->cmdq) {
>> +		dev_err(ctx->dev, "Failed to allocate command queue.\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	rc = dma_set_mask_and_coherent(ctx->dev, DMA_BIT_MASK(32));
>> +	if (rc) {
>> +		dev_err(ctx->dev, "Failed to set DMA mask: %d.\n", rc);
>> +		return rc;
>> +	}
>> +
>> +	rc = dma_declare_coherent_memory(ctx->dev, ctx->vga_phys,
>> +					 ctx->vga_phys, ctx->vga_size);
>> +	if (rc) {
>> +		dev_err(ctx->dev, "Failed to declare coherent memory: %d.\n",
>> +			rc);
>> +		return rc;
>> +	}
>> +
>> +	ctx->vga_virt = dma_alloc_coherent(ctx->dev, ctx->vga_size,
>> +					   &ctx->vga_dma, GFP_KERNEL);
>> +	if (!ctx->vga_virt) {
>> +		dev_err(ctx->dev, "Failed to allocate DMA.\n");
>> +		rc = -ENOMEM;
>> +		goto err_dma;
>> +	}
>> +
>> +	rc = gen_pool_add_virt(ctx->vga_pool, (unsigned long)ctx->vga_virt,
>> +			       ctx->vga_phys, ctx->vga_size, -1);
>> +	if (rc) {
>> +		dev_err(ctx->dev, "Failed to add memory to genalloc pool.\n");
>> +		goto err_genalloc;
>> +	}
>> +
>> +	ctx->cmdq_vga_virt = gen_pool_dma_alloc(ctx->vga_pool, XDMA_CMDQ_SIZE,
>> +						&ctx->cmdq_vga_phys);
>> +	if (!ctx->cmdq_vga_virt) {
>> +		dev_err(ctx->dev, "Failed to genalloc cmdq.\n");
>> +		rc = -ENOMEM;
>> +		goto err_genalloc;
>> +	}
>> +
>> +	dev_dbg(ctx->dev, "VGA mapped at phys[%08x], size[%08x].\n",
>> +		ctx->vga_phys, ctx->vga_size);
>> +
>> +	return 0;
>> +
>> +err_dma:
>> +	dma_release_declared_memory(ctx->dev);
>> +
>> +err_genalloc:
>> +	dma_free_coherent(ctx->dev, ctx->vga_size, ctx->vga_virt,
>> +			  ctx->vga_dma);
>> +	return rc;
>> +}
>> +
>> +static int aspeed_xdma_probe(struct platform_device *pdev)
>> +{
>> +	int irq;
>> +	int rc;
>> +	struct resource *res;
>> +	struct device *dev = &pdev->dev;
>> +	struct aspeed_xdma *ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
>> +
>> +	if (!ctx)
>> +		return -ENOMEM;
>> +
>> +	ctx->dev = dev;
>> +	platform_set_drvdata(pdev, ctx);
>> +	init_waitqueue_head(&ctx->wait);
>> +
>> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
>> +	ctx->base = devm_ioremap_resource(dev, res);
>> +	if (IS_ERR(ctx->base)) {
>> +		dev_err(dev, "Unable to ioremap registers.\n");
>> +		return PTR_ERR(ctx->base);
>> +	}
>> +
>> +	irq = irq_of_parse_and_map(dev->of_node, 0);
>> +	if (!irq) {
>> +		dev_err(dev, "Unable to find IRQ.\n");
>> +		return -ENODEV;
>> +	}
>> +
>> +	rc = devm_request_irq(dev, irq, aspeed_xdma_irq, IRQF_SHARED,
>> +			      DEVICE_NAME, ctx);
>> +	if (rc < 0) {
>> +		dev_err(dev, "Unable to request IRQ %d.\n", irq);
>> +		return rc;
>> +	}
>> +
>> +	ctx->scu = syscon_regmap_lookup_by_compatible("aspeed,ast2500-scu");
>> +	if (IS_ERR(ctx->scu)) {
>> +		dev_err(ctx->dev, "Unable to grab SCU regs.\n");
>> +		return PTR_ERR(ctx->scu);
>> +	}
>> +
>> +	ctx->reset = devm_reset_control_get_exclusive(dev, NULL);
>> +	if (IS_ERR(ctx->reset)) {
>> +		dev_err(dev, "Unable to request reset control.\n");
>> +		return PTR_ERR(ctx->reset);
>> +	}
>> +
>> +	ctx->vga_pool = devm_gen_pool_create(dev, ilog2(PAGE_SIZE), -1, NULL);
>> +	if (!ctx->vga_pool) {
>> +		dev_err(dev, "Unable to setup genalloc pool.\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	reset_control_deassert(ctx->reset);
>> +
>> +	msleep(10);
> Why 10ms again? :-)
>
>> +
>> +	rc = aspeed_xdma_init_mem(ctx);
>> +	if (rc) {
>> +		reset_control_assert(ctx->reset);
>> +		return rc;
>> +	}
>> +
>> +	aspeed_xdma_init_eng(ctx);
>> +
>> +	return 0;
>> +}
>> +
>> +static int aspeed_xdma_remove(struct platform_device *pdev)
>> +{
>> +	struct aspeed_xdma *ctx = platform_get_drvdata(pdev);
>> +
>> +	gen_pool_free(ctx->vga_pool, (unsigned long)ctx->cmdq_vga_virt,
>> +		      XDMA_CMDQ_SIZE);
>> +	dma_free_coherent(ctx->dev, ctx->vga_size, ctx->vga_virt,
>> +			  ctx->vga_dma);
>> +	dma_release_declared_memory(ctx->dev);
>> +	reset_control_assert(ctx->reset);
>> +
>> +	return 0;
>> +}
>> +
>> +static const struct of_device_id aspeed_xdma_match[] = {
>> +	{ .compatible = "aspeed,ast2500-xdma" },
>> +	{ },
>> +};
>> +
>> +static struct platform_driver aspeed_xdma_driver = {
>> +	.probe = aspeed_xdma_probe,
>> +	.remove = aspeed_xdma_remove,
>> +	.driver = {
>> +		.name = DEVICE_NAME,
>> +		.of_match_table = aspeed_xdma_match,
>> +	},
>> +};
>> +
>> +module_platform_driver(aspeed_xdma_driver);
>> +
>> +MODULE_AUTHOR("Eddie James");
>> +MODULE_DESCRIPTION("Aspeed XDMA Engine Driver");
>> +MODULE_LICENSE("GPL v2");
>> diff --git a/include/uapi/linux/aspeed-xdma.h b/include/uapi/linux/aspeed-xdma.h
>> new file mode 100644
>> index 0000000..998459e
>> --- /dev/null
>> +++ b/include/uapi/linux/aspeed-xdma.h
>> @@ -0,0 +1,26 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ */
>> +/* Copyright IBM Corp 2019 */
>> +
>> +#ifndef _UAPI_LINUX_ASPEED_XDMA_H_
>> +#define _UAPI_LINUX_ASPEED_XDMA_H_
>> +
>> +#include <linux/types.h>
>> +
>> +/*
>> + * aspeed_xdma_op
>> + *
>> + * host_addr: the DMA address on the host side, typically configured by PCI
>> + *            subsystem
>> + *
>> + * len: the size of the transfer in bytes; it should be a multiple of 16 bytes
>> + *
>> + * upstream: boolean indicating the direction of the DMA operation; upstream
>> + *           means a transfer from the BMC to the host
>> + */
>> +struct aspeed_xdma_op {
>> +	__u64 host_addr;
>> +	__u32 len;
>> +	__u32 upstream;
>> +};
>> +
>> +#endif /* _UAPI_LINUX_ASPEED_XDMA_H_ */
>> -- 
>> 1.8.3.1
>>
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 7e09dda..84e2b62 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2584,6 +2584,16 @@  S:	Maintained
 F:	drivers/media/platform/aspeed-video.c
 F:	Documentation/devicetree/bindings/media/aspeed-video.txt
 
+ASPEED XDMA ENGINE DRIVER
+M:	Eddie James <eajames@linux.ibm.com>
+L:	linux-aspeed@lists.ozlabs.org (moderated for non-subscribers)
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/misc/aspeed,xdma.txt
+F:	Documentation/ABI/testing/sysfs-devices-platform-aspeed-xdma
+F:	drivers/soc/aspeed/aspeed-xdma.c
+F:	include/uapi/linux/aspeed-xdma.h
+
 ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS
 M:	Corentin Chary <corentin.chary@gmail.com>
 L:	acpi4asus-user@lists.sourceforge.net
diff --git a/drivers/soc/aspeed/Kconfig b/drivers/soc/aspeed/Kconfig
index 323e177..8b08310 100644
--- a/drivers/soc/aspeed/Kconfig
+++ b/drivers/soc/aspeed/Kconfig
@@ -29,4 +29,12 @@  config ASPEED_P2A_CTRL
 	  ioctl()s, the driver also provides an interface for userspace mappings to
 	  a pre-defined region.
 
+config ASPEED_XDMA
+	tristate "Aspeed XDMA Engine Driver"
+	depends on SOC_ASPEED && REGMAP && MFD_SYSCON && HAS_DMA
+	help
+	  Enable support for the Aspeed XDMA Engine found on the Aspeed AST2500
+	  SOC. The XDMA engine can perform automatic PCI DMA operations between
+	  the AST2500 (acting as a BMC) and a host processor.
+
 endmenu
diff --git a/drivers/soc/aspeed/Makefile b/drivers/soc/aspeed/Makefile
index b64be47..977b046 100644
--- a/drivers/soc/aspeed/Makefile
+++ b/drivers/soc/aspeed/Makefile
@@ -2,3 +2,4 @@ 
 obj-$(CONFIG_ASPEED_LPC_CTRL)	+= aspeed-lpc-ctrl.o
 obj-$(CONFIG_ASPEED_LPC_SNOOP)	+= aspeed-lpc-snoop.o
 obj-$(CONFIG_ASPEED_P2A_CTRL)	+= aspeed-p2a-ctrl.o
+obj-$(CONFIG_ASPEED_XDMA)	+= aspeed-xdma.o
diff --git a/drivers/soc/aspeed/aspeed-xdma.c b/drivers/soc/aspeed/aspeed-xdma.c
new file mode 100644
index 0000000..3dc0ce4
--- /dev/null
+++ b/drivers/soc/aspeed/aspeed-xdma.c
@@ -0,0 +1,520 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright IBM Corp 2019
+
+#include <linux/aspeed-xdma.h>
+#include <linux/bitfield.h>
+#include <linux/clk.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/fs.h>
+#include <linux/genalloc.h>
+#include <linux/interrupt.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/mfd/syscon.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/platform_device.h>
+#include <linux/poll.h>
+#include <linux/regmap.h>
+#include <linux/reset.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/wait.h>
+
+#define DEVICE_NAME			"aspeed-xdma"
+
+#define SCU_STRAP			0x070
+#define  SCU_STRAP_VGA_MEM		GENMASK(3, 2)
+
+#define SCU_PCIE_CONF			0x180
+#define  SCU_PCIE_CONF_VGA_EN		BIT(0)
+#define  SCU_PCIE_CONF_VGA_EN_MMIO	BIT(1)
+#define  SCU_PCIE_CONF_VGA_EN_LPC	BIT(2)
+#define  SCU_PCIE_CONF_VGA_EN_MSI	BIT(3)
+#define  SCU_PCIE_CONF_VGA_EN_MCTP	BIT(4)
+#define  SCU_PCIE_CONF_VGA_EN_IRQ	BIT(5)
+#define  SCU_PCIE_CONF_VGA_EN_DMA	BIT(6)
+#define  SCU_PCIE_CONF_BMC_EN		BIT(8)
+#define  SCU_PCIE_CONF_BMC_EN_MMIO	BIT(9)
+#define  SCU_PCIE_CONF_BMC_EN_MSI	BIT(11)
+#define  SCU_PCIE_CONF_BMC_EN_MCTP	BIT(12)
+#define  SCU_PCIE_CONF_BMC_EN_IRQ	BIT(13)
+#define  SCU_PCIE_CONF_BMC_EN_DMA	BIT(14)
+#define  SCU_PCIE_CONF_RSVD		GENMASK(19, 18)
+
+#define SDMC_CONF			0x004
+#define  SDMC_CONF_MEM			GENMASK(1, 0)
+#define SDMC_REMAP			0x008
+#define  SDMC_REMAP_MAGIC		GENMASK(17, 16)
+
+#define XDMA_CMD_SIZE			4
+#define XDMA_CMDQ_SIZE			PAGE_SIZE
+#define XDMA_BYTE_ALIGN			16
+#define XDMA_MAX_LINE_SIZE		BIT(10)
+#define XDMA_NUM_CMDS			\
+	(XDMA_CMDQ_SIZE / sizeof(struct aspeed_xdma_cmd))
+#define XDMA_NUM_DEBUGFS_REGS		6
+
+#define XDMA_CMD_BMC_CHECK		BIT(0)
+#define XDMA_CMD_BMC_ADDR		GENMASK(29, 4)
+#define XDMA_CMD_BMC_DIR_US		BIT(31)
+
+#define XDMA_CMD_COMM1_HI_HOST_PITCH	GENMASK(14, 3)
+#define XDMA_CMD_COMM1_HI_BMC_PITCH	GENMASK(30, 19)
+
+#define XDMA_CMD_CONF_CHECK		BIT(1)
+#define XDMA_CMD_CONF_LINE_SIZE		GENMASK(14, 4)
+#define XDMA_CMD_CONF_IRQ_BMC		BIT(15)
+#define XDMA_CMD_CONF_NUM_LINES		GENMASK(27, 16)
+#define XDMA_CMD_CONF_IRQ		BIT(31)
+
+#define XDMA_CMD_ID_UPDIR		GENMASK(17, 16)
+#define  XDMA_CMD_ID_UPDIR_BMC		0
+#define  XDMA_CMD_ID_UPDIR_HOST		1
+#define  XDMA_CMD_ID_UPDIR_VGA		2
+
+#define XDMA_DS_PCIE_REQ_SIZE_128	0
+#define XDMA_DS_PCIE_REQ_SIZE_256	1
+#define XDMA_DS_PCIE_REQ_SIZE_512	2
+#define XDMA_DS_PCIE_REQ_SIZE_1K	3
+#define XDMA_DS_PCIE_REQ_SIZE_2K	4
+#define XDMA_DS_PCIE_REQ_SIZE_4K	5
+
+#define XDMA_BMC_CMD_QUEUE_ADDR		0x10
+#define XDMA_BMC_CMD_QUEUE_ENDP		0x14
+#define XDMA_BMC_CMD_QUEUE_WRITEP	0x18
+#define XDMA_BMC_CMD_QUEUE_READP	0x1c
+#define  XDMA_BMC_CMD_QUEUE_READP_MAGIC	0xee882266
+#define XDMA_CTRL			0x20
+#define  XDMA_CTRL_US_COMP		BIT(4)
+#define  XDMA_CTRL_DS_COMP		BIT(5)
+#define  XDMA_CTRL_DS_DIRTY		BIT(6)
+#define  XDMA_CTRL_DS_PCIE_REQ_SIZE	GENMASK(19, 17)
+#define  XDMA_CTRL_DS_DATA_TIMEOUT	BIT(28)
+#define  XDMA_CTRL_DS_CHECK_ID		BIT(29)
+#define XDMA_STATUS			0x24
+#define  XDMA_STATUS_US_COMP		BIT(4)
+#define  XDMA_STATUS_DS_COMP		BIT(5)
+
+/* Bit numbers for the set_bit()/clear_bit()/test_bit() calls on flags words */
+enum {
+	/* set when a transfer is started, cleared again on completion */
+	XDMA_IN_PRG,
+	/* set when the most recently started transfer was upstream */
+	XDMA_UPSTREAM,
+};
+
+/*
+ * One command queue entry (eight 32-bit words). Entries are filled in by
+ * aspeed_xdma_start() and then copied to the command queue in VGA memory.
+ */
+struct aspeed_xdma_cmd {
+	u32 host_addr_lo;	/* low 32 bits of the host address */
+	u32 host_addr_hi;	/* high 32 bits of the host address */
+	u32 bmc_addr;		/* XDMA_CMD_BMC_* fields */
+	u32 comm1_hi;		/* XDMA_CMD_COMM1_HI_* pitch fields */
+	u32 conf;		/* XDMA_CMD_CONF_* fields */
+	u32 id;			/* not written by the visible code */
+	u32 resv0;
+	u32 resv1;
+};
+
+struct aspeed_xdma_client;
+
+/* Per-device driver state, allocated in aspeed_xdma_probe() */
+struct aspeed_xdma {
+	struct device *dev;
+	void __iomem *base;		/* mapped engine registers */
+	struct regmap *scu;		/* "aspeed,ast2500-scu" syscon regmap */
+	struct reset_control *reset;
+
+	unsigned long flags;		/* XDMA_IN_PRG / XDMA_UPSTREAM bits */
+	unsigned int cmd_idx;		/* next command queue slot to fill */
+	wait_queue_head_t wait;		/* woken by aspeed_xdma_done() */
+	/*
+	 * Cleared by aspeed_xdma_done(); presumably set by whoever starts a
+	 * transfer on behalf of a client - not visible in this file yet.
+	 */
+	struct aspeed_xdma_client *current_client;
+
+	u32 vga_phys;			/* computed start of the VGA memory */
+	u32 vga_size;			/* VGA memory size from the SCU strap */
+	dma_addr_t vga_dma;		/* handle from dma_alloc_coherent() */
+	void *cmdq;			/* kzalloc'ed copy of the command queue */
+	void *vga_virt;			/* CPU address of the VGA memory */
+	dma_addr_t cmdq_vga_phys;	/* written to XDMA_BMC_CMD_QUEUE_ADDR */
+	void *cmdq_vga_virt;		/* command queue inside the VGA memory */
+	struct gen_pool *vga_pool;	/* genalloc pool over the VGA memory */
+};
+
+/*
+ * Per-client state. Only @flags is touched by the visible code (its
+ * XDMA_IN_PRG bit is cleared in aspeed_xdma_done()).
+ */
+struct aspeed_xdma_client {
+	struct aspeed_xdma *ctx;
+
+	unsigned long flags;
+	/* NOTE(review): presumably the client's buffer - confirm with users */
+	void *virt;
+	dma_addr_t phys;
+	u32 size;
+};
+
+/* SCU_PCIE_CONF value built from the BMC_EN* bits (unused in the visible code) */
+static const u32 aspeed_xdma_bmc_pcie_conf = SCU_PCIE_CONF_BMC_EN |
+	SCU_PCIE_CONF_BMC_EN_MSI | SCU_PCIE_CONF_BMC_EN_MCTP |
+	SCU_PCIE_CONF_BMC_EN_IRQ | SCU_PCIE_CONF_BMC_EN_DMA |
+	SCU_PCIE_CONF_RSVD;
+
+/* SCU_PCIE_CONF value built from the VGA_EN* bits; written by aspeed_xdma_init_mem() */
+static const u32 aspeed_xdma_vga_pcie_conf = SCU_PCIE_CONF_VGA_EN |
+	SCU_PCIE_CONF_VGA_EN_MSI | SCU_PCIE_CONF_VGA_EN_MCTP |
+	SCU_PCIE_CONF_VGA_EN_IRQ | SCU_PCIE_CONF_VGA_EN_DMA |
+	SCU_PCIE_CONF_RSVD;
+
+/*
+ * Write @conf to the SCU PCIe configuration register, then read the register
+ * back and log the value that was read at debug level.
+ */
+static void aspeed_scu_pcie_write(struct aspeed_xdma *ctx, u32 conf)
+{
+	struct regmap *scu = ctx->scu;
+	u32 readback = 0;
+
+	regmap_write(scu, SCU_PCIE_CONF, conf);
+	regmap_read(scu, SCU_PCIE_CONF, &readback);
+
+	dev_dbg(ctx->dev, "write scu pcie_conf[%08x]\n", readback);
+}
+
+/* Read the engine register at offset @reg, log it at debug level, return it. */
+static u32 aspeed_xdma_reg_read(struct aspeed_xdma *ctx, u32 reg)
+{
+	void __iomem *addr = ctx->base + reg;
+	u32 val = readl(addr);
+
+	dev_dbg(ctx->dev, "read %02x[%08x]\n", reg, val);
+
+	return val;
+}
+
+/* Write @val to the engine register at offset @reg. */
+static void aspeed_xdma_reg_write(struct aspeed_xdma *ctx, u32 reg, u32 val)
+{
+	void __iomem *addr = ctx->base + reg;
+
+	writel(val, addr);
+
+	/* the register is read back only as an argument of the debug message */
+	dev_dbg(ctx->dev, "write %02x[%08x]\n", reg, readl(addr));
+}
+
+/*
+ * Program the engine's command queue registers and control register, then
+ * clear the driver's queue index and flags.
+ */
+static void aspeed_xdma_init_eng(struct aspeed_xdma *ctx)
+{
+	u32 ctrl;
+
+	ctrl = XDMA_CTRL_US_COMP | XDMA_CTRL_DS_COMP | XDMA_CTRL_DS_DIRTY;
+	ctrl |= FIELD_PREP(XDMA_CTRL_DS_PCIE_REQ_SIZE,
+			   XDMA_DS_PCIE_REQ_SIZE_256);
+	ctrl |= XDMA_CTRL_DS_DATA_TIMEOUT | XDMA_CTRL_DS_CHECK_ID;
+
+	/* queue end, read pointer (magic value) and write pointer */
+	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_ENDP,
+			      XDMA_CMD_SIZE * XDMA_NUM_CMDS);
+	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_READP,
+			      XDMA_BMC_CMD_QUEUE_READP_MAGIC);
+	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_WRITEP, 0);
+
+	aspeed_xdma_reg_write(ctx, XDMA_CTRL, ctrl);
+
+	/* the queue lives in VGA memory; see aspeed_xdma_init_mem() */
+	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_ADDR,
+			      ctx->cmdq_vga_phys);
+
+	/* software state starts over together with the hardware */
+	ctx->cmd_idx = 0;
+	ctx->flags = 0;
+}
+
+/*
+ * Pulse the engine's reset line and reprogram the engine. Sleeps, so it must
+ * not be called from atomic context.
+ */
+static void aspeed_xdma_reset(struct aspeed_xdma *ctx)
+{
+	struct reset_control *rst = ctx->reset;
+
+	/*
+	 * 10 ms with reset asserted and 10 ms after release; according to the
+	 * patch author the delays are taken from the hardware specification.
+	 */
+	reset_control_assert(rst);
+	msleep(10);
+
+	reset_control_deassert(rst);
+	msleep(10);
+
+	aspeed_xdma_init_eng(ctx);
+}
+
+/*
+ * Queue the command(s) describing @op and advance the hardware write pointer.
+ *
+ * @ctx: engine state
+ * @op: host address, length in bytes and direction of the transfer
+ * @bmc_addr: BMC-side address of the transfer
+ *
+ * The length is converted to units of XDMA_BYTE_ALIGN bytes. If it exceeds
+ * XDMA_MAX_LINE_SIZE units, the transfer is issued as lines of
+ * XDMA_MAX_LINE_SIZE units, and any remainder is issued as a second command
+ * in the following queue slot; in that case only the second command requests
+ * an interrupt.
+ *
+ * May sleep: if the control register reads as zero the engine is reset first,
+ * and aspeed_xdma_reset() calls msleep().
+ */
+static void aspeed_xdma_start(struct aspeed_xdma *ctx,
+			      struct aspeed_xdma_op *op, u32 bmc_addr)
+{
+	u32 conf = XDMA_CMD_CONF_CHECK | XDMA_CMD_CONF_IRQ_BMC |
+		XDMA_CMD_CONF_IRQ;
+	unsigned int line_size = op->len / XDMA_BYTE_ALIGN;
+	unsigned int num_lines = 1;
+	unsigned int nidx;
+	unsigned int pitch = 1;
+	struct aspeed_xdma_cmd *cmdq;
+	struct aspeed_xdma_cmd *cmd;
+
+	/*
+	 * ctrl == 0 indicates engine hasn't started properly; restart it.
+	 *
+	 * This has to happen before a queue slot is picked: the reset path
+	 * sets ctx->cmd_idx and the hardware write pointer back to 0, so
+	 * slots chosen from the old index would no longer match the engine.
+	 */
+	if (!aspeed_xdma_reg_read(ctx, XDMA_CTRL))
+		aspeed_xdma_reset(ctx);
+
+	cmdq = ctx->cmdq;
+	cmd = &cmdq[ctx->cmd_idx];
+	nidx = (ctx->cmd_idx + 1) % XDMA_NUM_CMDS;
+
+	if (line_size > XDMA_MAX_LINE_SIZE) {
+		unsigned int rem;
+		unsigned int total;
+
+		num_lines = line_size / XDMA_MAX_LINE_SIZE;
+		total = XDMA_MAX_LINE_SIZE * num_lines;
+		rem = line_size - total;
+		line_size = XDMA_MAX_LINE_SIZE;
+		pitch = line_size;
+
+		if (rem) {
+			/* remainder starts right after the full lines */
+			unsigned int offs = total * XDMA_BYTE_ALIGN;
+			u32 r_bmc_addr = bmc_addr + offs;
+			u64 r_host_addr = op->host_addr + (u64)offs;
+			struct aspeed_xdma_cmd *r_cmd = &cmdq[nidx];
+
+			r_cmd->host_addr_lo =
+				(u32)(r_host_addr & 0xFFFFFFFFULL);
+			r_cmd->host_addr_hi = (u32)(r_host_addr >> 32ULL);
+			r_cmd->bmc_addr = (r_bmc_addr & XDMA_CMD_BMC_ADDR) |
+				XDMA_CMD_BMC_CHECK |
+				(op->upstream ? XDMA_CMD_BMC_DIR_US : 0);
+			r_cmd->conf = conf |
+				FIELD_PREP(XDMA_CMD_CONF_LINE_SIZE, rem) |
+				FIELD_PREP(XDMA_CMD_CONF_NUM_LINES, 1);
+			r_cmd->comm1_hi =
+				FIELD_PREP(XDMA_CMD_COMM1_HI_HOST_PITCH, 1) |
+				FIELD_PREP(XDMA_CMD_COMM1_HI_BMC_PITCH, 1);
+
+			/* do not trigger IRQ for first command */
+			conf = XDMA_CMD_CONF_CHECK;
+
+			/* two slots are consumed by this transfer */
+			nidx = (nidx + 1) % XDMA_NUM_CMDS;
+		}
+
+		/* undocumented formula to get required number of lines */
+		num_lines = (num_lines * 2) - 1;
+	}
+
+	cmd->host_addr_lo = (u32)(op->host_addr & 0xFFFFFFFFULL);
+	cmd->host_addr_hi = (u32)(op->host_addr >> 32ULL);
+	cmd->bmc_addr = (bmc_addr & XDMA_CMD_BMC_ADDR) | XDMA_CMD_BMC_CHECK |
+		(op->upstream ? XDMA_CMD_BMC_DIR_US : 0);
+	cmd->conf = conf |
+		FIELD_PREP(XDMA_CMD_CONF_LINE_SIZE, line_size) |
+		FIELD_PREP(XDMA_CMD_CONF_NUM_LINES, num_lines);
+	cmd->comm1_hi = FIELD_PREP(XDMA_CMD_COMM1_HI_HOST_PITCH, pitch) |
+			FIELD_PREP(XDMA_CMD_COMM1_HI_BMC_PITCH, pitch);
+
+	/* publish the whole queue to the copy the engine reads */
+	memcpy(ctx->cmdq_vga_virt, ctx->cmdq, XDMA_CMDQ_SIZE);
+
+	/* the interrupt handler uses this to match the completion bit */
+	if (op->upstream)
+		set_bit(XDMA_UPSTREAM, &ctx->flags);
+	else
+		clear_bit(XDMA_UPSTREAM, &ctx->flags);
+
+	set_bit(XDMA_IN_PRG, &ctx->flags);
+
+	/* moving the write pointer hands the new command(s) to the engine */
+	aspeed_xdma_reg_write(ctx, XDMA_BMC_CMD_QUEUE_WRITEP,
+			      nidx * XDMA_CMD_SIZE);
+	ctx->cmd_idx = nidx;
+}
+
+/*
+ * Mark the current transfer as finished: clear the in-progress bit of the
+ * owning client (if any) and of the engine, then wake all waiters.
+ */
+static void aspeed_xdma_done(struct aspeed_xdma *ctx)
+{
+	struct aspeed_xdma_client *client = ctx->current_client;
+
+	if (client) {
+		clear_bit(XDMA_IN_PRG, &client->flags);
+		ctx->current_client = NULL;
+	}
+
+	clear_bit(XDMA_IN_PRG, &ctx->flags);
+	wake_up_interruptible_all(&ctx->wait);
+}
+
+/*
+ * Interrupt handler: complete the transfer when the completion bit matching
+ * its direction is set, then write the status value back to the status
+ * register. Always reports the interrupt as handled.
+ */
+static irqreturn_t aspeed_xdma_irq(int irq, void *arg)
+{
+	struct aspeed_xdma *ctx = arg;
+	const u32 status = aspeed_xdma_reg_read(ctx, XDMA_STATUS);
+
+	/* upstream completion only counts for an upstream transfer */
+	if ((status & XDMA_STATUS_US_COMP) &&
+	    test_bit(XDMA_UPSTREAM, &ctx->flags))
+		aspeed_xdma_done(ctx);
+
+	/* and downstream completion only for a downstream one */
+	if ((status & XDMA_STATUS_DS_COMP) &&
+	    !test_bit(XDMA_UPSTREAM, &ctx->flags))
+		aspeed_xdma_done(ctx);
+
+	aspeed_xdma_reg_write(ctx, XDMA_STATUS, status);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Locate the VGA memory and set up the command queue inside it.
+ *
+ * The VGA memory size is taken from the SCU strap register and the total
+ * memory size from the SDMC configuration register (0x20000000 is used if the
+ * SDMC registers cannot be mapped). The VGA memory is taken to be the last
+ * vga_size bytes of memory, with memory starting at 0x80000000. That range is
+ * declared as the device's coherent memory, allocated as a whole, added to
+ * the genalloc pool, and XDMA_CMDQ_SIZE bytes of it are reserved for the
+ * command queue.
+ *
+ * Returns 0 on success or a negative errno. On failure the coherent
+ * allocation (if it was made) and the declared memory are released again;
+ * the devm-managed allocations are left to the driver core.
+ */
+static int aspeed_xdma_init_mem(struct aspeed_xdma *ctx)
+{
+	int rc;
+	u32 scu_conf = 0;
+	u32 mem_size = 0x20000000;
+	const u32 mem_sizes[4] = { 0x8000000, 0x10000000, 0x20000000,
+				   0x40000000 };
+	const u32 vga_sizes[4] = { 0x800000, 0x1000000, 0x2000000, 0x4000000 };
+	void __iomem *sdmc_base = ioremap(0x1e6e0000, 0x100);
+
+	aspeed_scu_pcie_write(ctx, aspeed_xdma_vga_pcie_conf);
+
+	regmap_read(ctx->scu, SCU_STRAP, &scu_conf);
+	ctx->vga_size = vga_sizes[FIELD_GET(SCU_STRAP_VGA_MEM, scu_conf)];
+
+	if (sdmc_base) {
+		u32 sdmc = readl(sdmc_base + SDMC_CONF);
+		u32 remap = readl(sdmc_base + SDMC_REMAP);
+
+		remap |= SDMC_REMAP_MAGIC;
+		writel(remap, sdmc_base + SDMC_REMAP);
+		/* read the register back after the write; value is unused */
+		remap = readl(sdmc_base + SDMC_REMAP);
+
+		mem_size = mem_sizes[sdmc & SDMC_CONF_MEM];
+		iounmap(sdmc_base);
+	}
+
+	ctx->vga_phys = (mem_size - ctx->vga_size) + 0x80000000;
+
+	ctx->cmdq = devm_kzalloc(ctx->dev, XDMA_CMDQ_SIZE, GFP_KERNEL);
+	if (!ctx->cmdq) {
+		dev_err(ctx->dev, "Failed to allocate command queue.\n");
+		return -ENOMEM;
+	}
+
+	rc = dma_set_mask_and_coherent(ctx->dev, DMA_BIT_MASK(32));
+	if (rc) {
+		dev_err(ctx->dev, "Failed to set DMA mask: %d.\n", rc);
+		return rc;
+	}
+
+	rc = dma_declare_coherent_memory(ctx->dev, ctx->vga_phys,
+					 ctx->vga_phys, ctx->vga_size);
+	if (rc) {
+		dev_err(ctx->dev, "Failed to declare coherent memory: %d.\n",
+			rc);
+		return rc;
+	}
+
+	ctx->vga_virt = dma_alloc_coherent(ctx->dev, ctx->vga_size,
+					   &ctx->vga_dma, GFP_KERNEL);
+	if (!ctx->vga_virt) {
+		dev_err(ctx->dev, "Failed to allocate DMA.\n");
+		rc = -ENOMEM;
+		/* nothing was allocated; only undo the declaration */
+		goto err_release;
+	}
+
+	rc = gen_pool_add_virt(ctx->vga_pool, (unsigned long)ctx->vga_virt,
+			       ctx->vga_phys, ctx->vga_size, -1);
+	if (rc) {
+		dev_err(ctx->dev, "Failed to add memory to genalloc pool.\n");
+		goto err_free;
+	}
+
+	ctx->cmdq_vga_virt = gen_pool_dma_alloc(ctx->vga_pool, XDMA_CMDQ_SIZE,
+						&ctx->cmdq_vga_phys);
+	if (!ctx->cmdq_vga_virt) {
+		dev_err(ctx->dev, "Failed to genalloc cmdq.\n");
+		rc = -ENOMEM;
+		goto err_free;
+	}
+
+	dev_dbg(ctx->dev, "VGA mapped at phys[%08x], size[%08x].\n",
+		ctx->vga_phys, ctx->vga_size);
+
+	return 0;
+
+	/* unwind in reverse order of acquisition */
+err_free:
+	dma_free_coherent(ctx->dev, ctx->vga_size, ctx->vga_virt,
+			  ctx->vga_dma);
+
+err_release:
+	dma_release_declared_memory(ctx->dev);
+	return rc;
+}
+
+/*
+ * Bind to the XDMA platform device: map the registers, request the
+ * interrupt, look up the SCU regmap and the reset control, create the
+ * genalloc pool, release the engine from reset, set up the VGA memory and
+ * finally program the engine.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int aspeed_xdma_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct aspeed_xdma *xdma;
+	struct resource *regs;
+	int irq_num;
+	int ret;
+
+	xdma = devm_kzalloc(dev, sizeof(*xdma), GFP_KERNEL);
+	if (!xdma)
+		return -ENOMEM;
+
+	xdma->dev = dev;
+	platform_set_drvdata(pdev, xdma);
+	init_waitqueue_head(&xdma->wait);
+
+	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	xdma->base = devm_ioremap_resource(dev, regs);
+	if (IS_ERR(xdma->base)) {
+		dev_err(dev, "Unable to ioremap registers.\n");
+		return PTR_ERR(xdma->base);
+	}
+
+	irq_num = irq_of_parse_and_map(dev->of_node, 0);
+	if (!irq_num) {
+		dev_err(dev, "Unable to find IRQ.\n");
+		return -ENODEV;
+	}
+
+	ret = devm_request_irq(dev, irq_num, aspeed_xdma_irq, IRQF_SHARED,
+			       DEVICE_NAME, xdma);
+	if (ret < 0) {
+		dev_err(dev, "Unable to request IRQ %d.\n", irq_num);
+		return ret;
+	}
+
+	xdma->scu = syscon_regmap_lookup_by_compatible("aspeed,ast2500-scu");
+	if (IS_ERR(xdma->scu)) {
+		dev_err(dev, "Unable to grab SCU regs.\n");
+		return PTR_ERR(xdma->scu);
+	}
+
+	xdma->reset = devm_reset_control_get_exclusive(dev, NULL);
+	if (IS_ERR(xdma->reset)) {
+		dev_err(dev, "Unable to request reset control.\n");
+		return PTR_ERR(xdma->reset);
+	}
+
+	xdma->vga_pool = devm_gen_pool_create(dev, ilog2(PAGE_SIZE), -1, NULL);
+	if (!xdma->vga_pool) {
+		dev_err(dev, "Unable to setup genalloc pool.\n");
+		return -ENOMEM;
+	}
+
+	/* take the engine out of reset and give it 10 ms before using it */
+	reset_control_deassert(xdma->reset);
+	msleep(10);
+
+	ret = aspeed_xdma_init_mem(xdma);
+	if (ret) {
+		/* leave the engine in reset when probing fails */
+		reset_control_assert(xdma->reset);
+		return ret;
+	}
+
+	aspeed_xdma_init_eng(xdma);
+
+	return 0;
+}
+
+/*
+ * Undo aspeed_xdma_init_mem() and put the engine back into reset. The
+ * devm-managed resources from probe are not released here.
+ */
+static int aspeed_xdma_remove(struct platform_device *pdev)
+{
+	struct aspeed_xdma *xdma = platform_get_drvdata(pdev);
+
+	/* return the command queue to the pool before freeing its backing */
+	gen_pool_free(xdma->vga_pool, (unsigned long)xdma->cmdq_vga_virt,
+		      XDMA_CMDQ_SIZE);
+
+	dma_free_coherent(xdma->dev, xdma->vga_size, xdma->vga_virt,
+			  xdma->vga_dma);
+	dma_release_declared_memory(xdma->dev);
+
+	reset_control_assert(xdma->reset);
+
+	return 0;
+}
+
+/* Device tree compatible strings this driver binds to */
+static const struct of_device_id aspeed_xdma_match[] = {
+	{ .compatible = "aspeed,ast2500-xdma" },
+	{ },	/* sentinel */
+};
+
+/* Platform driver glue; registered by module_platform_driver() below */
+static struct platform_driver aspeed_xdma_driver = {
+	.probe = aspeed_xdma_probe,
+	.remove = aspeed_xdma_remove,
+	.driver = {
+		.name = DEVICE_NAME,
+		.of_match_table = aspeed_xdma_match,
+	},
+};
+
+module_platform_driver(aspeed_xdma_driver);
+
+MODULE_AUTHOR("Eddie James");
+MODULE_DESCRIPTION("Aspeed XDMA Engine Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/uapi/linux/aspeed-xdma.h b/include/uapi/linux/aspeed-xdma.h
new file mode 100644
index 0000000..998459e
--- /dev/null
+++ b/include/uapi/linux/aspeed-xdma.h
@@ -0,0 +1,26 @@ 
+/* SPDX-License-Identifier: GPL-2.0+ */
+/* Copyright IBM Corp 2019 */
+
+#ifndef _UAPI_LINUX_ASPEED_XDMA_H_
+#define _UAPI_LINUX_ASPEED_XDMA_H_
+
+#include <linux/types.h>
+
+/**
+ * struct aspeed_xdma_op - description of a single XDMA transfer
+ *
+ * @host_addr: the DMA address on the host side, typically configured by the
+ *             PCI subsystem
+ *
+ * @len: the size of the transfer in bytes; it should be a multiple of 16
+ *       bytes
+ *
+ * @upstream: boolean indicating the direction of the DMA operation; upstream
+ *            (non-zero) means a transfer from the BMC to the host
+ */
+struct aspeed_xdma_op {
+	__u64 host_addr;
+	__u32 len;
+	__u32 upstream;
+};
+
+#endif /* _UAPI_LINUX_ASPEED_XDMA_H_ */