diff mbox series

[RFC,07/12] hw/arm/smmu-common: Support nested translation

Message ID 20240325101442.1306300-8-smostafa@google.com
State New
Headers show
Series SMMUv3 nested translation support | expand

Commit Message

Mostafa Saleh March 25, 2024, 10:14 a.m. UTC
When nested translation is requested, we need  to do:

- Translate stage-1 IPA using stage-2 to a physical address.
- Translate stage-1 PTW walks using stage-2.
- Combine both to create a single TLB entry

Signed-off-by: Mostafa Saleh <smostafa@google.com>
---
 hw/arm/smmu-common.c         | 135 ++++++++++++++++++++++++++++-------
 hw/arm/trace-events          |   2 +-
 include/hw/arm/smmu-common.h |   3 +-
 3 files changed, 113 insertions(+), 27 deletions(-)

Comments

Julien Grall March 25, 2024, 2:20 p.m. UTC | #1
Hi Mostafa,

On 25/03/2024 10:14, Mostafa Saleh wrote:
> @@ -524,7 +551,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
>           tlbe->entry.translated_addr = gpa;
>           tlbe->entry.iova = ipa & ~mask;
>           tlbe->entry.addr_mask = mask;
> -        tlbe->entry.perm = s2ap;
> +        tlbe->parent_perm = tlbe->entry.perm = s2ap;
>           tlbe->level = level;
>           tlbe->granule = granule_sz;
>           return 0;
> @@ -537,6 +564,35 @@ error:
>       return -EINVAL;
>   }
>   
> +/* Combine 2 TLB enteries and return in tlbe. */
> +static void combine_tlb(SMMUTLBEntry *tlbe, SMMUTLBEntry *tlbe_s2,
> +                        dma_addr_t iova, SMMUTransCfg *cfg)
> +{
> +        if (cfg->stage == SMMU_NESTED) {
> +
> +            /*
> +             * tg and level are used from stage-1, while the addr mask can be
With the current approach, I can't boot a guest if I create a dummy 
stage-1 using 512GB mapping and a stage-2 using 2MB mapping. It looks 
like this is because the level will be used during the TLB lookup.

I managed to solve the issue by using the max level of the two stages. I 
think we may need to do a minimum for the granule.


> +             * smaller in case stage-2 size(based on granule and level) was
> +             * smaller than stage-1.
> +             * That should have no impact on:
> +             * - lookup: as iova is properly aligned with the stage-1 level and
> +             *   granule.
> +             * - Invalidation: as it uses the entry mask.
> +             */
> +            tlbe->entry.addr_mask = MIN(tlbe->entry.addr_mask,
> +                                        tlbe_s2->entry.addr_mask);
> +            tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
> +                                          tlbe->entry.translated_addr);
> +
> +            /* parent_perm has s2 perm while perm has s1 perm. */
> +            tlbe->parent_perm = tlbe_s2->entry.perm;
> +            return;
> +        }
> +
> +        /* That was not nested, use the s2. */
> +        memcpy(tlbe, tlbe_s2, sizeof(*tlbe));
> +}

Cheers,
Mostafa Saleh March 25, 2024, 8:47 p.m. UTC | #2
Hi Julien,

On Mon, Mar 25, 2024 at 02:20:07PM +0000, Julien Grall wrote:
> Hi Mostafa,
> 
> On 25/03/2024 10:14, Mostafa Saleh wrote:
> > @@ -524,7 +551,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
> >           tlbe->entry.translated_addr = gpa;
> >           tlbe->entry.iova = ipa & ~mask;
> >           tlbe->entry.addr_mask = mask;
> > -        tlbe->entry.perm = s2ap;
> > +        tlbe->parent_perm = tlbe->entry.perm = s2ap;
> >           tlbe->level = level;
> >           tlbe->granule = granule_sz;
> >           return 0;
> > @@ -537,6 +564,35 @@ error:
> >       return -EINVAL;
> >   }
> > +/* Combine 2 TLB enteries and return in tlbe. */
> > +static void combine_tlb(SMMUTLBEntry *tlbe, SMMUTLBEntry *tlbe_s2,
> > +                        dma_addr_t iova, SMMUTransCfg *cfg)
> > +{
> > +        if (cfg->stage == SMMU_NESTED) {
> > +
> > +            /*
> > +             * tg and level are used from stage-1, while the addr mask can be
> With the current approach, I can't boot a guest if I create a dummy stage-1
> using 512GB mapping and a stage-2 using 2MB mapping. It looks like this is
> because the level will be used during the TLB lookup.

Agh, I guess that case is’t common with Linux.

I was able to reproduce it with a hacked Linux driver, and the issue
happens in smmu_iotlb_lookup() because it assumes the cached entry has
a mask matching level and granularity, which is not correct with
nesting and I missed it, and fixing the mask is not enough here.

Looking at the mask of the found entry, not good also, if there is
disparity between stage-1 and stage-2 levels we always miss in TLB
even for the same address.

> 
> I managed to solve the issue by using the max level of the two stages. I
> think we may need to do a minimum for the granule.
> 

Just fixing the granularity and level, will alway miss in TLB if they
are different as granularity is used in lookup, I guess one way is to
fall back for stage-2 granularity in lookup if stage-1 lookup fails,
I will have another look and see if there is a better solution for v2.

But for now as you mentioned (also we need update the IOVA to match
the mask), that just should at least work:

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index ef5edfe4dc..ac2dc3efeb 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -572,21 +572,13 @@ static void combine_tlb(SMMUTLBEntry *tlbe, SMMUTLBEntry *tlbe_s2,
                         dma_addr_t iova, SMMUTransCfg *cfg)
 {
         if (cfg->stage == SMMU_NESTED) {
-
-            /*
-             * tg and level are used from stage-1, while the addr mask can be
-             * smaller in case stage-2 size(based on granule and level) was
-             * smaller than stage-1.
-             * That should have no impact on:
-             * - lookup: as iova is properly aligned with the stage-1 level and
-             *   granule.
-             * - Invalidation: as it uses the entry mask.
-             */
             tlbe->entry.addr_mask = MIN(tlbe->entry.addr_mask,
                                         tlbe_s2->entry.addr_mask);
             tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
                                           tlbe->entry.translated_addr);
-
+            tlbe->granule = MIN(tlbe->granule, tlbe_s2->granule);
+            tlbe->level = MAX(tlbe->level, tlbe_s2->level);
+            tlbe->entry.iova = iova & ~tlbe->entry.addr_mask;
             /* parent_perm has s2 perm while perm has s1 perm. */
             tlbe->parent_perm = tlbe_s2->entry.perm;

> 
> > +             * smaller in case stage-2 size(based on granule and level) was
> > +             * smaller than stage-1.
> > +             * That should have no impact on:
> > +             * - lookup: as iova is properly aligned with the stage-1 level and
> > +             *   granule.
> > +             * - Invalidation: as it uses the entry mask.
> > +             */
> > +            tlbe->entry.addr_mask = MIN(tlbe->entry.addr_mask,
> > +                                        tlbe_s2->entry.addr_mask);
> > +            tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
> > +                                          tlbe->entry.translated_addr);
> > +
> > +            /* parent_perm has s2 perm while perm has s1 perm. */
> > +            tlbe->parent_perm = tlbe_s2->entry.perm;
> > +            return;
> > +        }
> > +
> > +        /* That was not nested, use the s2. */
> > +        memcpy(tlbe, tlbe_s2, sizeof(*tlbe));
> > +}
> 
> Cheers,
> 
> -- 
> Julien Grall

Thanks,
Mostafa
diff mbox series

Patch

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index f0905c28cf..da8776ecec 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -119,7 +119,7 @@  void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new)
     *key = smmu_get_iotlb_key(cfg->asid, cfg->s2cfg.vmid, new->entry.iova,
                               tg, new->level, stage_tag);
     trace_smmu_iotlb_insert(cfg->asid, cfg->s2cfg.vmid, new->entry.iova,
-                            tg, new->level, stage_tag);
+                            tg, new->level, new->entry.addr_mask, stage_tag);
     g_hash_table_insert(bs->iotlb, key, new);
 }
 
@@ -305,6 +305,27 @@  SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova)
     return NULL;
 }
 
+/* Return the correct table address based on configuration. */
+static inline int translate_table_s1(dma_addr_t *table_addr, SMMUTransCfg *cfg,
+                                     SMMUPTWEventInfo *info, SMMUState *bs)
+{
+    dma_addr_t addr = *table_addr;
+    SMMUTLBEntry *cached_entry;
+
+    if (cfg->stage != SMMU_NESTED) {
+        return 0;
+    }
+
+    CALL_FUNC_CFG_S2(cfg, cached_entry, smmu_translate,
+                     bs, cfg, addr, IOMMU_RO, info);
+
+    if (cached_entry) {
+        *table_addr = CACHED_ENTRY_TO_ADDR(cached_entry, addr);
+        return 0;
+    }
+    return -EINVAL;
+}
+
 /**
  * smmu_ptw_64_s1 - VMSAv8-64 Walk of the page tables for a given IOVA
  * @cfg: translation config
@@ -320,7 +341,8 @@  SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova)
  */
 static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
                           dma_addr_t iova, IOMMUAccessFlags perm,
-                          SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info)
+                          SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info,
+                          SMMUState *bs)
 {
     dma_addr_t baseaddr, indexmask;
     SMMUStage stage = cfg->stage;
@@ -368,6 +390,10 @@  static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
                 goto error;
             }
             baseaddr = get_table_pte_address(pte, granule_sz);
+            /* In case of failure, retain stage-2 fault. */
+            if (translate_table_s1(&baseaddr, cfg, info, bs)) {
+                goto error_no_stage;
+            }
             level++;
             continue;
         } else if (is_page_pte(pte, level)) {
@@ -403,7 +429,7 @@  static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
         tlbe->entry.translated_addr = gpa;
         tlbe->entry.iova = iova & ~mask;
         tlbe->entry.addr_mask = mask;
-        tlbe->entry.perm = PTE_AP_TO_PERM(ap);
+        tlbe->parent_perm = tlbe->entry.perm = PTE_AP_TO_PERM(ap);
         tlbe->level = level;
         tlbe->granule = granule_sz;
         return 0;
@@ -412,6 +438,7 @@  static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
 
 error:
     info->stage = SMMU_STAGE_1;
+error_no_stage:
     tlbe->entry.perm = IOMMU_NONE;
     return -EINVAL;
 }
@@ -524,7 +551,7 @@  static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
         tlbe->entry.translated_addr = gpa;
         tlbe->entry.iova = ipa & ~mask;
         tlbe->entry.addr_mask = mask;
-        tlbe->entry.perm = s2ap;
+        tlbe->parent_perm = tlbe->entry.perm = s2ap;
         tlbe->level = level;
         tlbe->granule = granule_sz;
         return 0;
@@ -537,6 +564,35 @@  error:
     return -EINVAL;
 }
 
+/* Combine 2 TLB enteries and return in tlbe. */
+static void combine_tlb(SMMUTLBEntry *tlbe, SMMUTLBEntry *tlbe_s2,
+                        dma_addr_t iova, SMMUTransCfg *cfg)
+{
+        if (cfg->stage == SMMU_NESTED) {
+
+            /*
+             * tg and level are used from stage-1, while the addr mask can be
+             * smaller in case stage-2 size(based on granule and level) was
+             * smaller than stage-1.
+             * That should have no impact on:
+             * - lookup: as iova is properly aligned with the stage-1 level and
+             *   granule.
+             * - Invalidation: as it uses the entry mask.
+             */
+            tlbe->entry.addr_mask = MIN(tlbe->entry.addr_mask,
+                                        tlbe_s2->entry.addr_mask);
+            tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
+                                          tlbe->entry.translated_addr);
+
+            /* parent_perm has s2 perm while perm has s1 perm. */
+            tlbe->parent_perm = tlbe_s2->entry.perm;
+            return;
+        }
+
+        /* That was not nested, use the s2. */
+        memcpy(tlbe, tlbe_s2, sizeof(*tlbe));
+}
+
 /**
  * smmu_ptw - Walk the page tables for an IOVA, according to @cfg
  *
@@ -549,28 +605,59 @@  error:
  * return 0 on success
  */
 int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm,
-             SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info)
+             SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info, SMMUState *bs)
 {
-    if (cfg->stage == SMMU_STAGE_1) {
-        return smmu_ptw_64_s1(cfg, iova, perm, tlbe, info);
-    } else if (cfg->stage == SMMU_STAGE_2) {
-        /*
-         * If bypassing stage 1(or unimplemented), the input address is passed
-         * directly to stage 2 as IPA. If the input address of a transaction
-         * exceeds the size of the IAS, a stage 1 Address Size fault occurs.
-         * For AA64, IAS = OAS according to (IHI 0070.E.a) "3.4 Address sizes"
-         */
-        if (iova >= (1ULL << cfg->oas)) {
-            info->type = SMMU_PTW_ERR_ADDR_SIZE;
-            info->stage = SMMU_STAGE_1;
-            tlbe->entry.perm = IOMMU_NONE;
-            return -EINVAL;
+    int ret = 0;
+    SMMUTLBEntry tlbe_s2;
+    dma_addr_t ipa = iova;
+
+    if (cfg->stage & SMMU_STAGE_1) {
+        ret = smmu_ptw_64_s1(cfg, iova, perm, tlbe, info, bs);
+        if (ret) {
+            return ret;
         }
+        /* This is the IPA for next stage.*/
+        ipa = CACHED_ENTRY_TO_ADDR(tlbe, iova);
+    }
+
+    /*
+     * The address output from the translation causes a stage 1 Address Size
+     * fault if it exceeds the range of the effective IPA size for the given CD.
+     * If bypassing stage 1(or unimplemented), the input address is passed
+     * directly to stage 2 as IPA. If the input address of a transaction
+     * exceeds the size of the IAS, a stage 1 Address Size fault occurs.
+     * For AA64, IAS = OAS according to (IHI 0070.E.a) "3.4 Address sizes"
+     */
+    if (ipa >= (1ULL << cfg->oas)) {
+        info->type = SMMU_PTW_ERR_ADDR_SIZE;
+        info->stage = SMMU_STAGE_1;
+        tlbe->entry.perm = IOMMU_NONE;
+        return -EINVAL;
+    }
 
-        return smmu_ptw_64_s2(cfg, iova, perm, tlbe, info);
+    if (cfg->stage & SMMU_STAGE_2) {
+        ret = smmu_ptw_64_s2(cfg, ipa, perm, &tlbe_s2, info);
+        if (ret) {
+            return ret;
+        }
+        combine_tlb(tlbe, &tlbe_s2, iova, cfg);
     }
 
-    g_assert_not_reached();
+    return ret;
+}
+
+static int validate_tlb_entry(SMMUTLBEntry *cached_entry, IOMMUAccessFlags flag,
+                              SMMUPTWEventInfo *info)
+{
+        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm &
+            cached_entry->parent_perm & IOMMU_WO)) {
+            info->type = SMMU_PTW_ERR_PERMISSION;
+            info->stage = !(cached_entry->entry.perm & IOMMU_WO) ?
+                          SMMU_STAGE_1 :
+                          SMMU_STAGE_2;
+            return -EINVAL;
+        }
+        return 0;
 }
 
 SMMUTLBEntry *smmu_translate(SMMUState *bs, SMMUTransCfg *cfg, dma_addr_t addr,
@@ -614,16 +701,14 @@  SMMUTLBEntry *smmu_translate(SMMUState *bs, SMMUTransCfg *cfg, dma_addr_t addr,
 
     cached_entry = smmu_iotlb_lookup(bs, cfg, &tt_combined, aligned_addr);
     if (cached_entry) {
-        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) {
-            info->type = SMMU_PTW_ERR_PERMISSION;
-            info->stage = SMMU_STAGE_TO_TLB_TAG(cfg->stage);
+        if (validate_tlb_entry(cached_entry, flag, info)) {
             return NULL;
         }
         return cached_entry;
     }
 
     cached_entry = g_new0(SMMUTLBEntry, 1);
-    status = smmu_ptw(cfg, aligned_addr, flag, cached_entry, info);
+    status = smmu_ptw(cfg, aligned_addr, flag, cached_entry, info, bs);
     if (status) {
             g_free(cached_entry);
             return NULL;
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
index 34b10af83f..215df91ea3 100644
--- a/hw/arm/trace-events
+++ b/hw/arm/trace-events
@@ -18,7 +18,7 @@  smmu_iotlb_inv_stage(int stage) "Stage invalidate stage=%d"
 smmu_inv_notifiers_mr(const char *name) "iommu mr=%s"
 smmu_iotlb_lookup_hit(uint16_t asid, uint16_t vmid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache HIT asid=%d vmid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d"
 smmu_iotlb_lookup_miss(uint16_t asid, uint16_t vmid, uint64_t addr, uint32_t hit, uint32_t miss, uint32_t p) "IOTLB cache MISS asid=%d vmid=%d addr=0x%"PRIx64" hit=%d miss=%d hit rate=%d"
-smmu_iotlb_insert(uint16_t asid, uint16_t vmid, uint64_t addr, uint8_t tg, uint8_t level, int stage) "IOTLB ++ asid=%d vmid=%d addr=0x%"PRIx64" tg=%d level=%d stage=%d"
+smmu_iotlb_insert(uint16_t asid, uint16_t vmid, uint64_t addr, uint8_t tg, uint8_t level, uint64_t mask, int stage) "IOTLB ++ asid=%d vmid=%d addr=0x%"PRIx64" tg=%d level=%d mask=0x%"PRIx64" stage=%d"
 
 # smmuv3.c
 smmuv3_read_mmio(uint64_t addr, uint64_t val, unsigned size, uint32_t r) "addr: 0x%"PRIx64" val:0x%"PRIx64" size: 0x%x(%d)"
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
index c0969e461d..4f9505d91c 100644
--- a/include/hw/arm/smmu-common.h
+++ b/include/hw/arm/smmu-common.h
@@ -91,6 +91,7 @@  typedef struct SMMUTLBEntry {
     IOMMUTLBEntry entry;
     uint8_t level;
     uint8_t granule;
+    IOMMUAccessFlags parent_perm;
 } SMMUTLBEntry;
 
 /* Stage-2 configuration. */
@@ -199,7 +200,7 @@  static inline uint16_t smmu_get_sid(SMMUDevice *sdev)
  * pair, according to @cfg translation config
  */
 int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm,
-             SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info);
+             SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info, SMMUState *bs);
 
 
 /* smmu_translate - Look for a translation in TLB, if not, do a PTW. */