[V2,2/3] amd-iommu: Sync IOVA-to-GPA translation during page invalidation

Message ID 20201002145907.1294353-3-wei.huang2@amd.com
State New
Series: Passthru device support under emulated amd-iommu

Commit Message

Wei Huang Oct. 2, 2020, 2:59 p.m. UTC
Add support for syncing the IOVA-to-GPA translation at the time of IOMMU
page invalidation. The new sync routine is invoked when the two IOMMU
commands AMDVI_CMD_INVAL_AMDVI_PAGES and AMDVI_CMD_INVAL_AMDVI_ALL are
intercepted, and the registered address space notifiers are called
accordingly.
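
For readers unfamiliar with the notifier path this relies on: each translation
found by the walk is packaged into an IOMMUTLBEntry and handed to
memory_region_notify_iommu(), which is what the new amdvi_sync_iova() below
does. A minimal illustrative sketch follows; the amdvi_report_map() helper and
the 4KiB example values are hypothetical, not part of the patch:

  /*
   * Illustrative only: report one freshly walked 4KiB IOVA->GPA mapping to
   * the notifiers attached to an AMDVIAddressSpace.  amdvi_report_map() is a
   * hypothetical helper; the patch's amdvi_sync_iova() plays this role.
   */
  static void amdvi_report_map(AMDVIAddressSpace *as, uint64_t iova,
                               uint64_t gpa, IOMMUAccessFlags perm)
  {
      IOMMUTLBEntry entry = {
          .target_as       = &address_space_memory,
          .iova            = iova,
          .translated_addr = gpa,
          .addr_mask       = 0xfff,            /* 4KiB page */
          .perm            = perm,
      };

      /* Index 0: the emulated AMD IOMMU exposes a single IOMMU index. */
      memory_region_notify_iommu(&as->iommu, 0, entry);
  }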

Co-developed-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
---
 hw/i386/amd_iommu.c | 177 ++++++++++++++++++++++++++++++++++++++++++++
 hw/i386/amd_iommu.h |  10 +++
 2 files changed, 187 insertions(+)

Comments

Peter Xu Oct. 2, 2020, 7:11 p.m. UTC | #1
On Fri, Oct 02, 2020 at 09:59:06AM -0500, Wei Huang wrote:
> +static void amdvi_sync_domain(AMDVIState *s, uint32_t domid,
> +                              uint64_t addr, uint16_t flags)
> +{

[...]

> +        /*
> +         * In case of syncing more than a page, we invalidate the entire
> +         * address range and re-walk the whole page table.
> +         */
> +        if (size == AMDVI_PGSZ_ENTIRE) {
> +            IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
> +                amdvi_address_space_unmap(as, n);
> +            }
> +        } else if (size > 0x1000) {
> +            IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
> +                if (n->start <= addr && addr + size < n->end) {
> +                    amdvi_address_space_unmap(as, n);
> +                }
> +            }
> +        }

So this may not work... DMA could happen concurrently with the page sync,
so the DMA could fail if the mapping is suddenly gone (even if only for a
very short period).

We encountered this problem on x86, where the failed DMAs are reported as
host DMAR errors (since the pages are missing); please feel free to have a look at:

  63b88968f1 ("intel-iommu: rework the page walk logic", 2018-05-23)

Mainly this paragraph:

  For each VTDAddressSpace, now we maintain what IOVA ranges we have
  mapped and what we have not.  With that information, now we only send
  MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we
  know we have already mapped the range, meanwhile we don't send UNMAP
  notifies if we know we never mapped the range at all.

So the simple idea is: as long as a page was mapped and has not been unmapped,
we can _never_ unmap it, because the device might be using it simultaneously.

What VT-d does right now is maintain a per-device iova tree, so that we know
exactly what has been mapped.  That is overkill for sure (especially because
the vfio kernel module maintains a similar iova tree, so it's at least partly
duplicated), but it does fix this problem.
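
A minimal sketch of the kind of per-address-space tracking Peter describes,
so that UNMAP notifications are only sent for ranges that were actually
mapped. Everything here uses hypothetical naming (AMDVIMappedRange and both
helpers), assumes power-of-two sized and aligned mappings, and stands in for
what intel-iommu does with its per-device IOVA tree:

  /* Assumes QEMU's qemu/queue.h, exec/memory.h and glib are in scope. */
  typedef struct AMDVIMappedRange {
      uint64_t start, end;                    /* inclusive IOVA range */
      QLIST_ENTRY(AMDVIMappedRange) next;
  } AMDVIMappedRange;

  /* In a real implementation this list would live inside AMDVIAddressSpace. */
  static QLIST_HEAD(, AMDVIMappedRange) mapped_ranges =
      QLIST_HEAD_INITIALIZER(mapped_ranges);

  /* Remember a range after sending a MAP notification for it. */
  static void amdvi_record_map(uint64_t start, uint64_t end)
  {
      AMDVIMappedRange *r = g_new0(AMDVIMappedRange, 1);

      r->start = start;
      r->end = end;
      QLIST_INSERT_HEAD(&mapped_ranges, r, next);
  }

  /* Send UNMAP only for tracked ranges falling inside [start, end]. */
  static void amdvi_unmap_tracked(IOMMUNotifier *n, uint64_t start,
                                  uint64_t end)
  {
      AMDVIMappedRange *r, *tmp;

      QLIST_FOREACH_SAFE(r, &mapped_ranges, next, tmp) {
          if (r->start >= start && r->end <= end) {
              IOMMUTLBEntry entry = {
                  .target_as = &address_space_memory,
                  .iova      = r->start,
                  .addr_mask = r->end - r->start,  /* power-of-two sized */
                  .perm      = IOMMU_NONE,         /* UNMAP */
              };

              memory_region_notify_one(n, &entry);
              QLIST_REMOVE(r, next);
              g_free(r);
          }
      }
  }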

Patch

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index c7d24a05484d..7ce68289d20c 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -76,6 +76,12 @@  typedef struct AMDVIIOTLBEntry {
     uint64_t page_mask;         /* physical page size  */
 } AMDVIIOTLBEntry;
 
+static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry);
+static void amdvi_sync_domain(AMDVIState *s, uint32_t domid,
+                              uint64_t addr, uint16_t flags);
+static void amdvi_walk_level(AMDVIAddressSpace *as, uint64_t pte,
+                             uint64_t iova, uint64_t partial);
+
 /* configure MMIO registers at startup/reset */
 static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
                            uint64_t romask, uint64_t w1cmask)
@@ -443,6 +449,78 @@  static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n)
     memory_region_notify_one(n, &entry);
 }
 
+/*
+ * Sync the IOVA-to-GPA translation at the time of IOMMU page invalidation.
+ * This function is called when the IOMMU commands AMDVI_CMD_INVAL_AMDVI_PAGES
+ * and AMDVI_CMD_INVAL_AMDVI_ALL are triggered.
+ *
+ * The invalidation address range is determined by addr and flags, using
+ * the following rules:
+ *   - All pages
+ *     In this case, we unmap the whole address space and then re-walk the
+ *     I/O page table to sync the mapping relationship.
+ *   - Single page
+ *     Re-walk the page based on the specified iova, and only sync the
+ *     newly mapped page.
+ */
+static void amdvi_sync_domain(AMDVIState *s, uint32_t domid,
+                              uint64_t addr, uint16_t flags)
+{
+    AMDVIAddressSpace *as;
+    bool sync_all_domains = false;
+    uint64_t mask, size = 0x1000;
+
+    if (domid == AMDVI_DOMAIN_ALL) {
+        sync_all_domains = true;
+    }
+
+    /* S=1 means the invalidation size comes from the addr field; otherwise 4KB */
+    if (flags & AMDVI_CMD_INVAL_IOMMU_PAGES_S_BIT) {
+        uint32_t zbit = cto64(addr | 0xFFF) + 1;
+
+        size = 1ULL << zbit;
+
+        if (size < 0x1000) {
+            addr = 0;
+            size = AMDVI_PGSZ_ENTIRE;
+        } else {
+            mask = ~(size - 1);
+            addr &= mask;
+        }
+    }
+
+    QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) {
+        uint64_t dte[4];
+        IOMMUNotifier *n;
+
+        if (!amdvi_get_dte(s, as->devfn, dte)) {
+            continue;
+        }
+
+        if (!sync_all_domains && (domid != (dte[1] & 0xFFFULL))) {
+            continue;
+        }
+
+        /*
+         * In case of syncing more than a page, we invalidate the entire
+         * address range and re-walk the whole page table.
+         */
+        if (size == AMDVI_PGSZ_ENTIRE) {
+            IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+                amdvi_address_space_unmap(as, n);
+            }
+        } else if (size > 0x1000) {
+            IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+                if (n->start <= addr && addr + size < n->end) {
+                    amdvi_address_space_unmap(as, n);
+                }
+            }
+        }
+
+        amdvi_walk_level(as, dte[0], addr, 0);
+    }
+}
+
 static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
                                             gpointer user_data)
 {
@@ -455,6 +533,8 @@  static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
 static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
 {
     uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16));
+    uint64_t addr  = cpu_to_le64(extract64(cmd[1], 12, 52)) << 12;
+    uint16_t flags = cpu_to_le16((uint16_t)extract64(cmd[1], 0, 12));
 
     if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) ||
         extract64(cmd[1], 3, 9)) {
@@ -465,6 +545,8 @@  static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
     g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid,
                                 &domid);
     trace_amdvi_pages_inval(domid);
+
+    amdvi_sync_domain(s, domid, addr, flags);
 }
 
 static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd)
@@ -910,6 +992,101 @@  static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
     return pte;
 }
 
+static inline uint64_t pte_get_page_size(uint64_t level)
+{
+    return 1UL << ((level * 9) + 3);
+}
+
+static void amdvi_sync_iova(AMDVIAddressSpace *as, uint64_t pte, uint64_t iova)
+{
+    IOMMUTLBEntry entry;
+    uint64_t addr = pte & AMDVI_DEV_PT_ROOT_MASK;
+    uint32_t level = get_pte_translation_mode(pte);
+    uint64_t size = pte_get_page_size(level + 1);
+    uint64_t perm = amdvi_get_perms(pte);
+
+    assert(level == 0 || level == 7);
+
+    entry.target_as = &address_space_memory;
+    entry.iova = iova;
+    entry.perm = perm;
+    if (level == 0) {
+        entry.addr_mask = size - 1;
+        entry.translated_addr = addr;
+    } else if (level == 7) {
+        entry.addr_mask = (1 << (cto64(addr | 0xFFF) + 1)) - 1;
+        entry.translated_addr = addr & ~entry.addr_mask;
+    }
+
+    memory_region_notify_iommu(&as->iommu, 0, entry);
+}
+
+/*
+ * Walk the I/O page table and notify mapping changes. Note that iova
+ * determines this function's behavior:
+ *   - iova == 0: re-walk the whole page table
+ *   - iova != 0: re-walk the address defined in iova
+ */
+static void amdvi_walk_level(AMDVIAddressSpace *as, uint64_t pte,
+                             uint64_t iova, uint64_t partial)
+{
+    uint64_t index = 0;
+    uint8_t level = get_pte_translation_mode(pte);
+    uint64_t cur_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
+    uint64_t end_addr = cur_addr + 4096;
+    uint64_t new_partial = 0;
+
+    if (!(pte & AMDVI_PTE_PRESENT)) {
+        return;
+    }
+
+    if (level == 7) {
+        amdvi_sync_iova(as, pte, iova);
+        return;
+    }
+
+    /* narrow the scope of table walk if iova != 0 */
+    if (iova) {
+        cur_addr += ((iova >> (3 + 9 * level)) & 0x1FF) << 3;
+        end_addr = cur_addr + 8;
+    }
+
+    while (cur_addr < end_addr) {
+        int cur_addr_inc = 8;
+        int index_inc = 1;
+
+        pte = amdvi_get_pte_entry(as->iommu_state, cur_addr, as->devfn);
+        /* validate the entry */
+        if (!(pte & AMDVI_PTE_PRESENT)) {
+            goto next;
+        }
+
+        if (level > 1) {
+            new_partial = (partial << 9) | index;
+            amdvi_walk_level(as, pte, iova, new_partial);
+        } else {
+            /* found a page, sync the mapping first */
+            if (iova) {
+                amdvi_sync_iova(as, pte, iova);
+            } else {
+                amdvi_sync_iova(as, pte, ((partial << 9) | index) << 12);
+            }
+
+            /* skip following entries when a large page is found */
+            if (get_pte_translation_mode(pte) == 7) {
+                int skipped = 1 << (cto64(pte >> 12) + 1);
+
+                cur_addr_inc = 8 * skipped;
+                index_inc = skipped;
+            }
+        }
+
+next:
+        cur_addr += cur_addr_inc;
+        index += index_inc;
+    }
+}
+
 static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
                             IOMMUTLBEntry *ret, unsigned perms,
                             hwaddr addr)
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index 29b7a35a51a5..6d80f4913984 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -123,6 +123,8 @@ 
 #define AMDVI_CMD_COMPLETE_PPR_REQUEST    0x07
 #define AMDVI_CMD_INVAL_AMDVI_ALL         0x08
 
+#define AMDVI_CMD_INVAL_IOMMU_PAGES_S_BIT (1ULL << 0)
+
 #define AMDVI_DEVTAB_ENTRY_SIZE           32
 
 /* Device table entry bits 0:63 */
@@ -148,6 +150,9 @@ 
 #define AMDVI_EVENT_ILLEGAL_COMMAND_ERROR (0x5U << 12)
 #define AMDVI_EVENT_COMMAND_HW_ERROR      (0x6U << 12)
 
+/* PTE bits */
+#define AMDVI_PTE_PRESENT                 (1ULL << 0)
+
 #define AMDVI_EVENT_LEN                  16
 #define AMDVI_PERM_READ             (1 << 0)
 #define AMDVI_PERM_WRITE            (1 << 1)
@@ -198,6 +203,11 @@ 
 #define AMDVI_MAX_PH_ADDR          (40UL << 8)
 #define AMDVI_MAX_GVA_ADDR         (48UL << 15)
 
+#define AMDVI_PGSZ_ENTIRE          (0x0007FFFFFFFFF000ULL)
+
+/* The domain id is 16-bit, so use 32-bit all 1's to represent all domains */
+#define AMDVI_DOMAIN_ALL           (UINT32_MAX)
+
 /* Completion Wait data size */
 #define AMDVI_COMPLETION_DATA_SIZE    8
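
Two pieces of arithmetic in the patch are easy to misread: the S-bit size
decoding in amdvi_sync_domain() and the per-level index extraction in
amdvi_walk_level(). A standalone illustration follows (plain C, no QEMU
dependencies; cto64() is re-implemented locally to mirror QEMU's
count-trailing-ones helper, and the example addresses are arbitrary):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Local stand-in for QEMU's cto64(): count trailing one bits. */
  static unsigned cto64(uint64_t val)
  {
      unsigned n = 0;

      while (val & 1) {
          val >>= 1;
          n++;
      }
      return n;
  }

  /*
   * INVALIDATE_IOMMU_PAGES with S=1: the position of the lowest zero bit of
   * (addr | 0xFFF) selects a power-of-two invalidation size, and the address
   * is aligned down to that size.  Mirrors amdvi_sync_domain() above.
   */
  static void decode_inval(uint64_t addr, uint64_t *base, uint64_t *size)
  {
      unsigned zbit = cto64(addr | 0xFFF) + 1;

      *size = 1ULL << zbit;
      *base = addr & ~(*size - 1);
  }

  /* Page-table index for a given level, as used by amdvi_walk_level(). */
  static unsigned pt_index(uint64_t iova, unsigned level)
  {
      return (iova >> (3 + 9 * level)) & 0x1FF;
  }

  int main(void)
  {
      uint64_t base, size;

      /* Bit 12 set, bit 13 clear: a 16KiB invalidation, aligned down. */
      decode_inval(0x123451000ULL, &base, &size);
      printf("base=0x%" PRIx64 " size=0x%" PRIx64 "\n", base, size);
      /* prints base=0x123450000 size=0x4000 */

      /* Level-1 index covers IOVA bits 12..20. */
      printf("index=%u\n", pt_index(0x123451000ULL, 1));
      /* prints index=81 (0x51) */
      return 0;
  }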