diff mbox series

[RFC,v2,10/18] dmaengine: idxd: add emulation rw routines

Message ID 159534740139.28840.8782422568949418156.stgit@djiang5-desk3.ch.intel.com
State New
Headers show
Series Add VFIO mediated device support and DEV-MSI support for the idxd driver | expand

Commit Message

Dave Jiang July 21, 2020, 4:03 p.m. UTC
Add emulation routines for PCI config read/write and MMIO read/write, and
an interrupt handling routine for the emulated device. The rw routines are
called when PCI config reads/writes or BAR0 MMIO reads/writes are issued by
the guest kernel through KVM/QEMU.

Because we only support a read-only configuration, most of the MMIO
emulation is a simple memory copy, except for cases such as handling device
commands and interrupts.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
---
 drivers/dma/idxd/registers.h |    4 
 drivers/dma/idxd/vdev.c      |  428 +++++++++++++++++++++++++++++++++++++++++-
 drivers/dma/idxd/vdev.h      |    8 +
 include/uapi/linux/idxd.h    |    2 
 4 files changed, 434 insertions(+), 8 deletions(-)
diff mbox series

Patch

diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index ace7248ee195..f8e4dd10a738 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -268,6 +268,10 @@  union msix_perm {
 	u32 bits;
 } __packed;
 
+#define IDXD_MSIX_PERM_MASK	0xfffff00c
+#define IDXD_MSIX_PERM_IGNORE	0x3
+#define MSIX_ENTRY_MASK_INT	0x1
+
 union group_flags {
 	struct {
 		u32 tc_a:3;
diff --git a/drivers/dma/idxd/vdev.c b/drivers/dma/idxd/vdev.c
index af421852cc51..b4eace02199e 100644
--- a/drivers/dma/idxd/vdev.c
+++ b/drivers/dma/idxd/vdev.c
@@ -25,8 +25,23 @@ 
 
 int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx)
 {
-	/* PLACE HOLDER */
-	return 0;
+	int rc = -1;
+	struct device *dev = &vidxd->idxd->pdev->dev;
+
+	dev_dbg(dev, "%s interrput %d\n", __func__, msix_idx);
+
+	if (!vidxd->vdev.msix_trigger[msix_idx]) {
+		dev_warn(dev, "%s: intr evtfd not found %d\n", __func__, msix_idx);
+		return -EINVAL;
+	}
+
+	rc = eventfd_signal(vidxd->vdev.msix_trigger[msix_idx], 1);
+	if (rc != 1)
+		dev_err(dev, "eventfd signal failed (%d)\n", rc);
+	else
+		dev_dbg(dev, "vidxd interrupt triggered wq(%d) %d\n", vidxd->wq->id, msix_idx);
+
+	return rc;
 }
 
 int vidxd_disable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx)
@@ -41,31 +56,423 @@  int vidxd_enable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx)
 	return 0;
 }
 
-int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size)
+static void vidxd_report_error(struct vdcm_idxd *vidxd, unsigned int error)
 {
-	/* PLACEHOLDER */
-	return 0;
+	u8 *bar0 = vidxd->bar0;
+	union sw_err_reg *swerr = (union sw_err_reg *)(bar0 + IDXD_SWERR_OFFSET);
+	union genctrl_reg *genctrl;
+	bool send = false;
+
+	if (!swerr->valid) {
+		memset(swerr, 0, sizeof(*swerr));
+		swerr->valid = 1;
+		swerr->error = error;
+		send = true;
+	} else if (swerr->valid && !swerr->overflow) {
+		swerr->overflow = 1;
+	}
+
+	genctrl = (union genctrl_reg *)(bar0 + IDXD_GENCTRL_OFFSET);
+	if (send && genctrl->softerr_int_en) {
+		u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET);
+
+		*intcause |= IDXD_INTC_ERR;
+		vidxd_send_interrupt(vidxd, 0);
+	}
 }
 
 int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size)
 {
-	/* PLACEHOLDER */
+	u32 offset = pos & (vidxd->bar_size[0] - 1);
+	u8 *bar0 = vidxd->bar0;
+	struct device *dev = mdev_dev(vidxd->vdev.mdev);
+
+	dev_dbg(dev, "vidxd mmio W %d %x %x: %llx\n", vidxd->wq->id, size,
+		offset, get_reg_val(buf, size));
+
+	if (((size & (size - 1)) != 0) || (offset & (size - 1)) != 0)
+		return -EINVAL;
+
+	/* Without this limit, a write could potentially go out of bounds */
+	if (size > 4)
+		return -EINVAL;
+
+	switch (offset) {
+	case IDXD_GENCFG_OFFSET ... IDXD_GENCFG_OFFSET + 3:
+		/* Write only when device is disabled. */
+		if (vidxd_state(vidxd) == IDXD_DEVICE_STATE_DISABLED)
+			memcpy(bar0 + offset, buf, size);
+		break;
+
+	case IDXD_GENCTRL_OFFSET:
+		memcpy(bar0 + offset, buf, size);
+		break;
+
+	case IDXD_INTCAUSE_OFFSET:
+		bar0[offset] &= ~(get_reg_val(buf, 1) & 0x1f);
+		break;
+
+	case IDXD_CMD_OFFSET: {
+		u32 *cmdsts = (u32 *)(bar0 + IDXD_CMDSTS_OFFSET);
+		u32 val = get_reg_val(buf, size);
+
+		if (size != 4)
+			return -EINVAL;
+
+		/* Check and set command in progress */
+		if (test_and_set_bit(31, (unsigned long *)cmdsts) == 0)
+			vidxd_do_command(vidxd, val);
+		else
+			vidxd_report_error(vidxd, DSA_ERR_CMD_REG);
+		break;
+	}
+
+	case IDXD_SWERR_OFFSET:
+		/* W1C */
+		bar0[offset] &= ~(get_reg_val(buf, 1) & 3);
+		break;
+
+	case VIDXD_WQCFG_OFFSET ... VIDXD_WQCFG_OFFSET + VIDXD_WQ_CTRL_SZ - 1:
+	case VIDXD_GRPCFG_OFFSET ...  VIDXD_GRPCFG_OFFSET + VIDXD_GRP_CTRL_SZ - 1:
+		/* Nothing is written. Should be all RO */
+		break;
+
+	case VIDXD_MSIX_TABLE_OFFSET ...  VIDXD_MSIX_TABLE_OFFSET + VIDXD_MSIX_TBL_SZ - 1: {
+		int index = (offset - VIDXD_MSIX_TABLE_OFFSET) / 0x10;
+		u8 *msix_entry = &bar0[VIDXD_MSIX_TABLE_OFFSET + index * 0x10];
+		u64 *pba = (u64 *)(bar0 + VIDXD_MSIX_PBA_OFFSET);
+		u8 cvec_byte;
+
+		cvec_byte = msix_entry[12];
+		memcpy(bar0 + offset, buf, size);
+		/* Mask bit cleared (vector unmasked): deliver any pending interrupt */
+		if (!(msix_entry[12] & MSIX_ENTRY_MASK_INT) && cvec_byte & MSIX_ENTRY_MASK_INT)
+			if (test_and_clear_bit(index, (unsigned long *)pba))
+				vidxd_send_interrupt(vidxd, index);
+		break;
+	}
+
+	case VIDXD_MSIX_PERM_OFFSET ...  VIDXD_MSIX_PERM_OFFSET + VIDXD_MSIX_PERM_TBL_SZ - 1:
+		memcpy(bar0 + offset, buf, size);
+		break;
+	} /* offset */
+
+	return 0;
+}
+
+int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size)
+{
+	u32 offset = pos & (vidxd->bar_size[0] - 1);
+	struct device *dev = mdev_dev(vidxd->vdev.mdev);
+
+	memcpy(buf, vidxd->bar0 + offset, size);
+
+	dev_dbg(dev, "vidxd mmio R %d %x %x: %llx\n",
+		vidxd->wq->id, size, offset, get_reg_val(buf, size));
 	return 0;
 }
 
 int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count)
 {
-	/* PLACEHOLDER */
+	u32 offset = pos & 0xfff;
+	struct device *dev = mdev_dev(vidxd->vdev.mdev);
+
+	memcpy(buf, &vidxd->cfg[offset], count);
+
+	dev_dbg(dev, "vidxd pci R %d %x %x: %llx\n",
+		vidxd->wq->id, count, offset, get_reg_val(buf, count));
+
+	return 0;
+}
+
+/*
+ * Much of the emulation code has been borrowed from Intel i915 cfg space
+ * emulation code.
+ * drivers/gpu/drm/i915/gvt/cfg_space.c:
+ */
+
+/*
+ * Bitmap of writable bits (RW or RW1C; the two kinds never share a
+ * byte), one mask byte per byte of the standard PCI configuration
+ * space (not the full 256 bytes).
+ */
+static const u8 pci_cfg_space_rw_bmp[PCI_INTERRUPT_LINE + 4] = {
+	[PCI_COMMAND]		= 0xff, 0x07,
+	[PCI_STATUS]		= 0x00, 0xf9, /* the only one RW1C byte */
+	[PCI_CACHE_LINE_SIZE]	= 0xff,
+	[PCI_BASE_ADDRESS_0 ... PCI_CARDBUS_CIS - 1] = 0xff,
+	[PCI_ROM_ADDRESS]	= 0x01, 0xf8, 0xff, 0xff,
+	[PCI_INTERRUPT_LINE]	= 0xff,
+};
+
+static void _pci_cfg_mem_write(struct vdcm_idxd *vidxd, unsigned int off, u8 *src,
+			       unsigned int bytes)
+{
+	u8 *cfg_base = vidxd->cfg;
+	u8 mask, new, old;
+	int i = 0;
+
+	for (; i < bytes && (off + i < sizeof(pci_cfg_space_rw_bmp)); i++) {
+		mask = pci_cfg_space_rw_bmp[off + i];
+		old = cfg_base[off + i];
+		new = src[i] & mask;
+
+		/*
+		 * The PCI_STATUS high byte holds RW1C bits: emulate
+		 * write-1-to-clear for these bits here.
+		 * Writing 0 to an RW1C bit has no effect.
+		 */
+		if (off + i == PCI_STATUS + 1)
+			new = (~new & old) & mask;
+
+		cfg_base[off + i] = (old & ~mask) | new;
+	}
+
+	/* For other configuration space directly copy as it is. */
+	if (i < bytes)
+		memcpy(cfg_base + off + i, src + i, bytes - i);
+}
+
+static inline void _write_pci_bar(struct vdcm_idxd *vidxd, u32 offset, u32 val, bool low)
+{
+	u32 *pval;
+
+	/* BAR offset should be 32-bit aligned */
+	offset = rounddown(offset, 4);
+	pval = (u32 *)(vidxd->cfg + offset);
+
+	if (low) {
+		/*
+		 * only update bit 31 - bit 4,
+		 * leave the bit 3 - bit 0 unchanged.
+		 */
+		*pval = (val & GENMASK(31, 4)) | (*pval & GENMASK(3, 0));
+	} else {
+		*pval = val;
+	}
+}
+
+static int _pci_cfg_bar_write(struct vdcm_idxd *vidxd, unsigned int offset, void *p_data,
+			      unsigned int bytes)
+{
+	u32 new = *(u32 *)(p_data);
+	bool lo = IS_ALIGNED(offset, 8);
+	u64 size;
+	unsigned int bar_id;
+
+	/*
+	 * Power-up software can determine how much address
+	 * space the device requires by writing a value of
+	 * all 1's to the register and then reading the value
+	 * back. The device will return 0's in all don't-care
+	 * address bits.
+	 */
+	if (new == 0xffffffff) {
+		switch (offset) {
+		case PCI_BASE_ADDRESS_0:
+		case PCI_BASE_ADDRESS_1:
+		case PCI_BASE_ADDRESS_2:
+		case PCI_BASE_ADDRESS_3:
+			bar_id = (offset - PCI_BASE_ADDRESS_0) / 8;
+			size = vidxd->bar_size[bar_id];
+			_write_pci_bar(vidxd, offset, size >> (lo ? 0 : 32), lo);
+			break;
+		default:
+			/* Unimplemented BARs */
+			_write_pci_bar(vidxd, offset, 0x0, false);
+		}
+	} else {
+		switch (offset) {
+		case PCI_BASE_ADDRESS_0:
+		case PCI_BASE_ADDRESS_1:
+		case PCI_BASE_ADDRESS_2:
+		case PCI_BASE_ADDRESS_3:
+			_write_pci_bar(vidxd, offset, new, lo);
+			break;
+		default:
+			break;
+		}
+	}
 	return 0;
 }
 
 int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size)
 {
-	/* PLACEHOLDER */
+	struct device *dev = &vidxd->idxd->pdev->dev;
+
+	if (size > 4)
+		return -EINVAL;
+
+	if (pos + size > VIDXD_MAX_CFG_SPACE_SZ)
+		return -EINVAL;
+
+	dev_dbg(dev, "vidxd pci W %d %x %x: %llx\n", vidxd->wq->id, size, pos,
+		get_reg_val(buf, size));
+
+	/* First check if it's PCI_COMMAND */
+	if (IS_ALIGNED(pos, 2) && pos == PCI_COMMAND) {
+		bool new_bme;
+		bool bme;
+
+		if (size > 2)
+			return -EINVAL;
+
+		new_bme = !!(get_reg_val(buf, 2) & PCI_COMMAND_MASTER);
+		bme = !!(vidxd->cfg[pos] & PCI_COMMAND_MASTER);
+		_pci_cfg_mem_write(vidxd, pos, buf, size);
+
+		/* Flag error if turning off BME while device is enabled */
+		if ((bme && !new_bme) && vidxd_state(vidxd) == IDXD_DEVICE_STATE_ENABLED)
+			vidxd_report_error(vidxd, DSA_ERR_PCI_CFG);
+		return 0;
+	}
+
+	switch (rounddown(pos, 4)) {
+	case PCI_BASE_ADDRESS_0 ... PCI_BASE_ADDRESS_5:
+		if (!IS_ALIGNED(pos, 4))
+			return -EINVAL;
+		return _pci_cfg_bar_write(vidxd, pos, buf, size);
+
+	default:
+		_pci_cfg_mem_write(vidxd, pos, buf, size);
+	}
 	return 0;
 }
 
+static void vidxd_mmio_init_grpcap(struct vdcm_idxd *vidxd)
+{
+	u8 *bar0 = vidxd->bar0;
+	union group_cap_reg *grp_cap = (union group_cap_reg *)(bar0 + IDXD_GRPCAP_OFFSET);
+
+	/* single group for current implementation */
+	grp_cap->token_en = 0;
+	grp_cap->token_limit = 0;
+	grp_cap->num_groups = 1;
+}
+
+static void vidxd_mmio_init_grpcfg(struct vdcm_idxd *vidxd)
+{
+	u8 *bar0 = vidxd->bar0;
+	struct grpcfg *grpcfg = (struct grpcfg *)(bar0 + VIDXD_GRPCFG_OFFSET);
+	struct idxd_wq *wq = vidxd->wq;
+	struct idxd_group *group = wq->group;
+	int i;
+
+	/*
+	 * At this point, we are only exporting a single workqueue for
+	 * each mdev. So we need to just fake it as first workqueue
+	 * and also mark the available engines in this group.
+	 */
+
+	/* Set single workqueue and the first one */
+	grpcfg->wqs[0] = 0x1;
+	grpcfg->engines = 0;
+	for (i = 0; i < group->num_engines; i++)
+		grpcfg->engines |= BIT(i);
+	grpcfg->flags.bits = group->grpcfg.flags.bits;
+}
+
+static void vidxd_mmio_init_wqcap(struct vdcm_idxd *vidxd)
+{
+	u8 *bar0 = vidxd->bar0;
+	struct idxd_wq *wq = vidxd->wq;
+	union wq_cap_reg *wq_cap = (union wq_cap_reg *)(bar0 + IDXD_WQCAP_OFFSET);
+
+	wq_cap->occupancy_int = 0;
+	wq_cap->occupancy = 0;
+	wq_cap->priority = 0;
+	wq_cap->total_wq_size = wq->size;
+	wq_cap->num_wqs = VIDXD_MAX_WQS;
+	if (wq_dedicated(wq))
+		wq_cap->dedicated_mode = 1;
+}
+
+static void vidxd_mmio_init_wqcfg(struct vdcm_idxd *vidxd)
+{
+	struct idxd_device *idxd = vidxd->idxd;
+	struct idxd_wq *wq = vidxd->wq;
+	u8 *bar0 = vidxd->bar0;
+	union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET);
+
+	wqcfg->wq_size = wq->size;
+	wqcfg->wq_thresh = wq->threshold;
+
+	if (wq_dedicated(wq))
+		wqcfg->mode = 1;
+
+	if (idxd->hw.gen_cap.block_on_fault &&
+	    test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags))
+		wqcfg->bof = 1;
+
+	wqcfg->priority = wq->priority;
+	wqcfg->max_xfer_shift = idxd->hw.gen_cap.max_xfer_shift;
+	wqcfg->max_batch_shift = idxd->hw.gen_cap.max_batch_shift;
+	/* make mode change read-only */
+	wqcfg->mode_support = 0;
+}
+
+static void vidxd_mmio_init_engcap(struct vdcm_idxd *vidxd)
+{
+	u8 *bar0 = vidxd->bar0;
+	union engine_cap_reg *engcap = (union engine_cap_reg *)(bar0 + IDXD_ENGCAP_OFFSET);
+	struct idxd_wq *wq = vidxd->wq;
+	struct idxd_group *group = wq->group;
+
+	engcap->num_engines = group->num_engines;
+}
+
+static void vidxd_mmio_init_gencap(struct vdcm_idxd *vidxd)
+{
+	struct idxd_device *idxd = vidxd->idxd;
+	u8 *bar0 = vidxd->bar0;
+	union gen_cap_reg *gencap = (union gen_cap_reg *)(bar0 + IDXD_GENCAP_OFFSET);
+
+	gencap->bits = idxd->hw.gen_cap.bits;
+	gencap->config_en = 0;
+	gencap->max_ims_mult = 0;
+	gencap->cmd_cap = 1;
+}
+
+static void vidxd_mmio_init_cmdcap(struct vdcm_idxd *vidxd)
+{
+	struct idxd_device *idxd = vidxd->idxd;
+	u8 *bar0 = vidxd->bar0;
+	u32 *cmdcap = (u32 *)(bar0 + IDXD_CMDCAP_OFFSET);
+
+	if (idxd->hw.cmd_cap)
+		*cmdcap = idxd->hw.cmd_cap;
+	else
+		*cmdcap = 0x1ffe;
+
+	*cmdcap |= BIT(IDXD_CMD_REQUEST_INT_HANDLE);
+}
+
 void vidxd_mmio_init(struct vdcm_idxd *vidxd)
+{
+	struct idxd_device *idxd = vidxd->idxd;
+	u8 *bar0 = vidxd->bar0;
+	union offsets_reg *offsets;
+
+	/* Copy up to where table offset is */
+	memcpy_fromio(vidxd->bar0, idxd->reg_base, IDXD_TABLE_OFFSET);
+
+	vidxd_mmio_init_gencap(vidxd);
+	vidxd_mmio_init_cmdcap(vidxd);
+	vidxd_mmio_init_wqcap(vidxd);
+	vidxd_mmio_init_wqcfg(vidxd);
+	vidxd_mmio_init_grpcap(vidxd);
+	vidxd_mmio_init_grpcfg(vidxd);
+	vidxd_mmio_init_engcap(vidxd);
+
+	offsets = (union offsets_reg *)(bar0 + IDXD_TABLE_OFFSET);
+	offsets->grpcfg = VIDXD_GRPCFG_OFFSET / 0x100;
+	offsets->wqcfg = VIDXD_WQCFG_OFFSET / 0x100;
+	offsets->msix_perm = VIDXD_MSIX_PERM_OFFSET / 0x100;
+
+	memset(bar0 + VIDXD_MSIX_PERM_OFFSET, 0, VIDXD_MSIX_PERM_TBL_SZ);
+}
+
+static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val)
 {
 	/* PLACEHOLDER */
 }
@@ -74,3 +481,8 @@  void vidxd_reset(struct vdcm_idxd *vidxd)
 {
 	/* PLACEHOLDER */
 }
+
+void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val)
+{
+	/* PLACEHOLDER */
+}
diff --git a/drivers/dma/idxd/vdev.h b/drivers/dma/idxd/vdev.h
index 1a2fdda271e8..2dc8d22d3ea7 100644
--- a/drivers/dma/idxd/vdev.h
+++ b/drivers/dma/idxd/vdev.h
@@ -6,6 +6,13 @@ 
 
 #include "mdev.h"
 
+static inline u8 vidxd_state(struct vdcm_idxd *vidxd)
+{
+	union gensts_reg *gensts = (union gensts_reg *)(vidxd->bar0 + IDXD_GENSTATS_OFFSET);
+
+	return gensts->state;
+}
+
 int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size);
 int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size);
 int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count);
@@ -15,5 +22,6 @@  void vidxd_reset(struct vdcm_idxd *vidxd);
 int vidxd_disable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx);
 int vidxd_enable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx);
 int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx);
+void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val);
 
 #endif
diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index fdcdfe414223..a0c0475a4626 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -78,6 +78,8 @@  enum dsa_completion_status {
 	DSA_COMP_HW_ERR1,
 	DSA_COMP_HW_ERR_DRB,
 	DSA_COMP_TRANSLATION_FAIL,
+	DSA_ERR_PCI_CFG = 0x51,
+	DSA_ERR_CMD_REG,
 };
 
 #define DSA_COMP_STATUS_MASK		0x7f