diff mbox series

[RFC,v3,08/18] hw/arm/smmu-common: Add support for nested TLB

Message ID 20240429032403.74910-9-smostafa@google.com
State New
Headers show
Series SMMUv3 nested translation support | expand

Commit Message

Mostafa Saleh April 29, 2024, 3:23 a.m. UTC
This patch adds support for nested(combined) TLB entries.
The main function combine_tlb() is not used here but in the next
patches, but to simplify the patches it is introduced first.

Main changes:
1) New entry added in the TLB, parent_perm, for nested TLB, holds the
   stage-2 permission, this can be used to know the origin of a
   permission fault from a cached entry as caching the “and” of the
   permissions loses this information.

   SMMUPTWEventInfo is used to hold information about PTW faults so
   the event can be populated, the value of stage (which maps to S2
   in the event) used to be set based on the current stage for TLB
   permission faults, however with the parent_perm, it is now set
   based on which perm has the missing permission

   When nesting is not enabled it has the same value as perm which
   doesn't change the logic.

2) As combined TLB implementation is used, the combination logic
   chooses:
   - tg and level from the entry which has the smallest addr_mask.
   - Based on that the iova that would be cached is recalculated.
   - Translated_addr is chosen from stage-2.

Signed-off-by: Mostafa Saleh <smostafa@google.com>
---
 hw/arm/smmu-common.c         | 32 ++++++++++++++++++++++++++++----
 include/hw/arm/smmu-common.h |  1 +
 2 files changed, 29 insertions(+), 4 deletions(-)

Comments

Eric Auger May 15, 2024, 1:48 p.m. UTC | #1
Hi Mostafa,

On 4/29/24 05:23, Mostafa Saleh wrote:
> This patch adds support for nested(combined) TLB entries.
space between nested and (.
> The main function combine_tlb() is not used here but in the next
> patches, but to simplify the patches it is introduced first.
>
> Main changes:
> 1) New entry added in the TLB, parent_perm, for nested TLB, holds the
s/entry/field, s/TLB/SMMUTLBEntry struct
>    stage-2 permission, this can be used to know the origin of a
>    permission fault from a cached entry as caching the “and” of the
>    permissions loses this information.
>
>    SMMUPTWEventInfo is used to hold information about PTW faults so
>    the event can be populated, the value of stage (which maps to S2
>    in the event) used to be set based on the current stage for TLB
I don't understand "(which maps to S2 in the event)". What do you mean?
This could be S1 or S2 depending on the active stage, no?
>    permission faults, however with the parent_perm, it is now set
>    based on which perm has the missing permission
>
>    When nesting is not enabled it has the same value as perm which
>    doesn't change the logic.
>
> 2) As combined TLB implementation is used, the combination logic
>    chooses:
>    - tg and level from the entry which has the smallest addr_mask.
tbh I am scared bout swapping s1/s2 tg and level. In smmu_iotlb_lookup()
I see tt->granule_sz being used which is s1 data. I mean it is not
obvious to me this is correct. Could you maybe give more explanations
detailing why/how this is guaranted to work.

Can you give additional details about what s1+s2 combinations were tested?
>    - Based on that the iova that would be cached is recalculated.
>    - Translated_addr is chosen from stage-2.
>
> Signed-off-by: Mostafa Saleh <smostafa@google.com>
> ---
>  hw/arm/smmu-common.c         | 32 ++++++++++++++++++++++++++++----
>  include/hw/arm/smmu-common.h |  1 +
>  2 files changed, 29 insertions(+), 4 deletions(-)
>
> diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
> index 21982621c0..0d6945fa54 100644
> --- a/hw/arm/smmu-common.c
> +++ b/hw/arm/smmu-common.c
> @@ -394,7 +394,7 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
>          tlbe->entry.translated_addr = gpa;
>          tlbe->entry.iova = iova & ~mask;
>          tlbe->entry.addr_mask = mask;
> -        tlbe->entry.perm = PTE_AP_TO_PERM(ap);
> +        tlbe->parent_perm = tlbe->entry.perm = PTE_AP_TO_PERM(ap);
nit: I would prefer on separate lines.
>          tlbe->level = level;
>          tlbe->granule = granule_sz;
>          return 0;
> @@ -515,7 +515,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
>          tlbe->entry.translated_addr = gpa;
>          tlbe->entry.iova = ipa & ~mask;
>          tlbe->entry.addr_mask = mask;
> -        tlbe->entry.perm = s2ap;
> +        tlbe->parent_perm = tlbe->entry.perm = s2ap;
>          tlbe->level = level;
>          tlbe->granule = granule_sz;
>          return 0;
> @@ -530,6 +530,27 @@ error:
>      return -EINVAL;
>  }
>  
> +/* combine 2 TLB entries and return in tlbe in nested config. */
suggestion: combine S1 and S2 TLB entries into a single entry. As a
result the S1 entry is overriden with combined data.
> +static void __attribute__((unused)) combine_tlb(SMMUTLBEntry *tlbe,
> +                                                SMMUTLBEntry *tlbe_s2,
> +                                                dma_addr_t iova,
> +                                                SMMUTransCfg *cfg)
> +{
> +    if (tlbe_s2->entry.addr_mask < tlbe->entry.addr_mask) {
> +        tlbe->entry.addr_mask = tlbe_s2->entry.addr_mask;
> +        tlbe->granule = tlbe_s2->granule;
> +        tlbe->level = tlbe_s2->level;
> +    }
> +
> +    tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
> +                                    tlbe->entry.translated_addr);
> +
> +    tlbe->entry.iova = iova & ~tlbe->entry.addr_mask;
> +    /* parent_perm has s2 perm while perm has s1 perm. */

 suggestion: while perm keeps s1 perm.

> +    tlbe->parent_perm = tlbe_s2->entry.perm;
> +    return;
> +}
> +
>  /**
>   * smmu_ptw - Walk the page tables for an IOVA, according to @cfg
>   *
> @@ -607,9 +628,12 @@ SMMUTLBEntry *smmu_translate(SMMUState *bs, SMMUTransCfg *cfg, dma_addr_t addr,
>  
>      cached_entry = smmu_iotlb_lookup(bs, cfg, &tt_combined, aligned_addr);
>      if (cached_entry) {
> -        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) {
> +        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm &
> +            cached_entry->parent_perm & IOMMU_WO)) {
>              info->type = SMMU_PTW_ERR_PERMISSION;
> -            info->stage = cfg->stage;
> +            info->stage = !(cached_entry->entry.perm & IOMMU_WO) ?
> +                          SMMU_STAGE_1 :
> +                          SMMU_STAGE_2;
>              return NULL;
>          }
>          return cached_entry;
> diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
> index 09d3b9e734..1db566d451 100644
> --- a/include/hw/arm/smmu-common.h
> +++ b/include/hw/arm/smmu-common.h
> @@ -77,6 +77,7 @@ typedef struct SMMUTLBEntry {
>      IOMMUTLBEntry entry;
>      uint8_t level;
>      uint8_t granule;
> +    IOMMUAccessFlags parent_perm;
>  } SMMUTLBEntry;
>  
>  /* Stage-2 configuration. */
Thanks

Eric
Mostafa Saleh May 16, 2024, 3:20 p.m. UTC | #2
Hi Eric,

On Wed, May 15, 2024 at 03:48:05PM +0200, Eric Auger wrote:
> Hi Mostafa,
> 
> On 4/29/24 05:23, Mostafa Saleh wrote:
> > This patch adds support for nested(combined) TLB entries.
> space between nested and (.
Will do.
> > The main function combine_tlb() is not used here but in the next
> > patches, but to simplify the patches it is introduced first.
> >
> > Main changes:
> > 1) New entry added in the TLB, parent_perm, for nested TLB, holds the
> s/entry/field, s/TLB/SMMUTLBEntry struct
Will do.
> >    stage-2 permission, this can be used to know the origin of a
> >    permission fault from a cached entry as caching the “and” of the
> >    permissions loses this information.
> >
> >    SMMUPTWEventInfo is used to hold information about PTW faults so
> >    the event can be populated, the value of stage (which maps to S2
> >    in the event) used to be set based on the current stage for TLB
> I don't understand "(which maps to S2 in the event)". What do you mean?
> This could be S1 or S2 depending on the active stage, no?

Not really, if the IPA size is larger than S2 input size, this is
considered stage-1 fault.

For TLB permission fault, yes, that is how it is decided.
However, with nesting, a permission fault from a cached entry can be
from a stage-1 or stage-2, that’s why we now cache both and not just
the combined permission, and the logic to set fault stage is modified
accordingly.

> >    permission faults, however with the parent_perm, it is now set
> >    based on which perm has the missing permission
> >
> >    When nesting is not enabled it has the same value as perm which
> >    doesn't change the logic.
> >
> > 2) As combined TLB implementation is used, the combination logic
> >    chooses:
> >    - tg and level from the entry which has the smallest addr_mask.
> tbh I am scared bout swapping s1/s2 tg and level. In smmu_iotlb_lookup()
> I see tt->granule_sz being used which is s1 data. I mean it is not
> obvious to me this is correct. Could you maybe give more explanations
> detailing why/how this is guaranted to work.

As you mentioned the next patch reworks the lookup logic, I can reorder
the 2 patches if that is better, please let me know what you think?

> 
> Can you give additional details about what s1+s2 combinations were tested?

I tested with S1 and S2 4K pages
S1 level = 3 and S2 level = 3
S1 level = 2 and S2 level = 3
S1 level = 3 and S2 level = 2
S1 level = 1 and S2 level = 2

And also tested with with S1 64K granule and S2 4K.

> >    - Based on that the iova that would be cached is recalculated.
> >    - Translated_addr is chosen from stage-2.
> >
> > Signed-off-by: Mostafa Saleh <smostafa@google.com>
> > ---
> >  hw/arm/smmu-common.c         | 32 ++++++++++++++++++++++++++++----
> >  include/hw/arm/smmu-common.h |  1 +
> >  2 files changed, 29 insertions(+), 4 deletions(-)
> >
> > diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
> > index 21982621c0..0d6945fa54 100644
> > --- a/hw/arm/smmu-common.c
> > +++ b/hw/arm/smmu-common.c
> > @@ -394,7 +394,7 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
> >          tlbe->entry.translated_addr = gpa;
> >          tlbe->entry.iova = iova & ~mask;
> >          tlbe->entry.addr_mask = mask;
> > -        tlbe->entry.perm = PTE_AP_TO_PERM(ap);
> > +        tlbe->parent_perm = tlbe->entry.perm = PTE_AP_TO_PERM(ap);
> nit: I would prefer on separate lines.
Will do.

> >          tlbe->level = level;
> >          tlbe->granule = granule_sz;
> >          return 0;
> > @@ -515,7 +515,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
> >          tlbe->entry.translated_addr = gpa;
> >          tlbe->entry.iova = ipa & ~mask;
> >          tlbe->entry.addr_mask = mask;
> > -        tlbe->entry.perm = s2ap;
> > +        tlbe->parent_perm = tlbe->entry.perm = s2ap;
> >          tlbe->level = level;
> >          tlbe->granule = granule_sz;
> >          return 0;
> > @@ -530,6 +530,27 @@ error:
> >      return -EINVAL;
> >  }
> >  
> > +/* combine 2 TLB entries and return in tlbe in nested config. */
> suggestion: combine S1 and S2 TLB entries into a single entry. As a
> result the S1 entry is overriden with combined data.
Will do.

> > +static void __attribute__((unused)) combine_tlb(SMMUTLBEntry *tlbe,
> > +                                                SMMUTLBEntry *tlbe_s2,
> > +                                                dma_addr_t iova,
> > +                                                SMMUTransCfg *cfg)
> > +{
> > +    if (tlbe_s2->entry.addr_mask < tlbe->entry.addr_mask) {
> > +        tlbe->entry.addr_mask = tlbe_s2->entry.addr_mask;
> > +        tlbe->granule = tlbe_s2->granule;
> > +        tlbe->level = tlbe_s2->level;
> > +    }
> > +
> > +    tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
> > +                                    tlbe->entry.translated_addr);
> > +
> > +    tlbe->entry.iova = iova & ~tlbe->entry.addr_mask;
> > +    /* parent_perm has s2 perm while perm has s1 perm. */
> 
>  suggestion: while perm keeps s1 perm.
> 
Will do.

Thanks,
Mostafa
> > +    tlbe->parent_perm = tlbe_s2->entry.perm;
> > +    return;
> > +}
> > +
> >  /**
> >   * smmu_ptw - Walk the page tables for an IOVA, according to @cfg
> >   *
> > @@ -607,9 +628,12 @@ SMMUTLBEntry *smmu_translate(SMMUState *bs, SMMUTransCfg *cfg, dma_addr_t addr,
> >  
> >      cached_entry = smmu_iotlb_lookup(bs, cfg, &tt_combined, aligned_addr);
> >      if (cached_entry) {
> > -        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) {
> > +        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm &
> > +            cached_entry->parent_perm & IOMMU_WO)) {
> >              info->type = SMMU_PTW_ERR_PERMISSION;
> > -            info->stage = cfg->stage;
> > +            info->stage = !(cached_entry->entry.perm & IOMMU_WO) ?
> > +                          SMMU_STAGE_1 :
> > +                          SMMU_STAGE_2;
> >              return NULL;
> >          }
> >          return cached_entry;
> > diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
> > index 09d3b9e734..1db566d451 100644
> > --- a/include/hw/arm/smmu-common.h
> > +++ b/include/hw/arm/smmu-common.h
> > @@ -77,6 +77,7 @@ typedef struct SMMUTLBEntry {
> >      IOMMUTLBEntry entry;
> >      uint8_t level;
> >      uint8_t granule;
> > +    IOMMUAccessFlags parent_perm;
> >  } SMMUTLBEntry;
> >  
> >  /* Stage-2 configuration. */
> Thanks
> 
> Eric
>
Eric Auger May 20, 2024, 8:20 a.m. UTC | #3
Hi Mostafa,
On 5/16/24 17:20, Mostafa Saleh wrote:
> Hi Eric,
>
> On Wed, May 15, 2024 at 03:48:05PM +0200, Eric Auger wrote:
>> Hi Mostafa,
>>
>> On 4/29/24 05:23, Mostafa Saleh wrote:
>>> This patch adds support for nested(combined) TLB entries.
>> space between nested and (.
> Will do.
>>> The main function combine_tlb() is not used here but in the next
>>> patches, but to simplify the patches it is introduced first.
>>>
>>> Main changes:
>>> 1) New entry added in the TLB, parent_perm, for nested TLB, holds the
>> s/entry/field, s/TLB/SMMUTLBEntry struct
> Will do.
>>>    stage-2 permission, this can be used to know the origin of a
>>>    permission fault from a cached entry as caching the “and” of the
>>>    permissions loses this information.
>>>
>>>    SMMUPTWEventInfo is used to hold information about PTW faults so
>>>    the event can be populated, the value of stage (which maps to S2
>>>    in the event) used to be set based on the current stage for TLB
>> I don't understand "(which maps to S2 in the event)". What do you mean?
>> This could be S1 or S2 depending on the active stage, no?
> Not really, if the IPA size is larger than S2 input size, this is
> considered stage-1 fault.
>
> For TLB permission fault, yes, that is how it is decided.
> However, with nesting, a permission fault from a cached entry can be
> from a stage-1 or stage-2, that’s why we now cache both and not just
> the combined permission, and the logic to set fault stage is modified
> accordingly.
I meant in smmu_translate() we initially had for permission fault
info->stage = cfg->stage whcih can be S1 or S2. Hence the fact I do not
understand the sentence

the value of stage (which maps to S2 in the event)

I understand that with nested this computation needs to change because the permission can be linked to either the S1 or S2 stage.
Maybe that's just a matter or rephrasing?


>>>    permission faults, however with the parent_perm, it is now set
>>>    based on which perm has the missing permission
>>>
>>>    When nesting is not enabled it has the same value as perm which
>>>    doesn't change the logic.
>>>
>>> 2) As combined TLB implementation is used, the combination logic
>>>    chooses:
>>>    - tg and level from the entry which has the smallest addr_mask.
>> tbh I am scared bout swapping s1/s2 tg and level. In smmu_iotlb_lookup()
>> I see tt->granule_sz being used which is s1 data. I mean it is not
>> obvious to me this is correct. Could you maybe give more explanations
>> detailing why/how this is guaranted to work.
> As you mentioned the next patch reworks the lookup logic, I can reorder
> the 2 patches if that is better, please let me know what you think?
Yes if you manage to reorder that may be more logical because otherwise
it looks incorrect.
>
>> Can you give additional details about what s1+s2 combinations were tested?
> I tested with S1 and S2 4K pages
> S1 level = 3 and S2 level = 3
> S1 level = 2 and S2 level = 3
> S1 level = 3 and S2 level = 2
> S1 level = 1 and S2 level = 2
>
> And also tested with with S1 64K granule and S2 4K.
OK, I would suggest you mention that in the coverletter because it is
reassuring and the combination is not totally obvious - at least to me ;-) -

Eric
>
>>>    - Based on that the iova that would be cached is recalculated.
>>>    - Translated_addr is chosen from stage-2.
>>>
>>> Signed-off-by: Mostafa Saleh <smostafa@google.com>
>>> ---
>>>  hw/arm/smmu-common.c         | 32 ++++++++++++++++++++++++++++----
>>>  include/hw/arm/smmu-common.h |  1 +
>>>  2 files changed, 29 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
>>> index 21982621c0..0d6945fa54 100644
>>> --- a/hw/arm/smmu-common.c
>>> +++ b/hw/arm/smmu-common.c
>>> @@ -394,7 +394,7 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
>>>          tlbe->entry.translated_addr = gpa;
>>>          tlbe->entry.iova = iova & ~mask;
>>>          tlbe->entry.addr_mask = mask;
>>> -        tlbe->entry.perm = PTE_AP_TO_PERM(ap);
>>> +        tlbe->parent_perm = tlbe->entry.perm = PTE_AP_TO_PERM(ap);
>> nit: I would prefer on separate lines.
> Will do.
>
>>>          tlbe->level = level;
>>>          tlbe->granule = granule_sz;
>>>          return 0;
>>> @@ -515,7 +515,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
>>>          tlbe->entry.translated_addr = gpa;
>>>          tlbe->entry.iova = ipa & ~mask;
>>>          tlbe->entry.addr_mask = mask;
>>> -        tlbe->entry.perm = s2ap;
>>> +        tlbe->parent_perm = tlbe->entry.perm = s2ap;
>>>          tlbe->level = level;
>>>          tlbe->granule = granule_sz;
>>>          return 0;
>>> @@ -530,6 +530,27 @@ error:
>>>      return -EINVAL;
>>>  }
>>>  
>>> +/* combine 2 TLB entries and return in tlbe in nested config. */
>> suggestion: combine S1 and S2 TLB entries into a single entry. As a
>> result the S1 entry is overriden with combined data.
> Will do.
>
>>> +static void __attribute__((unused)) combine_tlb(SMMUTLBEntry *tlbe,
>>> +                                                SMMUTLBEntry *tlbe_s2,
>>> +                                                dma_addr_t iova,
>>> +                                                SMMUTransCfg *cfg)
>>> +{
>>> +    if (tlbe_s2->entry.addr_mask < tlbe->entry.addr_mask) {
>>> +        tlbe->entry.addr_mask = tlbe_s2->entry.addr_mask;
>>> +        tlbe->granule = tlbe_s2->granule;
>>> +        tlbe->level = tlbe_s2->level;
>>> +    }
>>> +
>>> +    tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
>>> +                                    tlbe->entry.translated_addr);
>>> +
>>> +    tlbe->entry.iova = iova & ~tlbe->entry.addr_mask;
>>> +    /* parent_perm has s2 perm while perm has s1 perm. */
>>  suggestion: while perm keeps s1 perm.
>>
> Will do.
>
> Thanks,
> Mostafa
>>> +    tlbe->parent_perm = tlbe_s2->entry.perm;
>>> +    return;
>>> +}
>>> +
>>>  /**
>>>   * smmu_ptw - Walk the page tables for an IOVA, according to @cfg
>>>   *
>>> @@ -607,9 +628,12 @@ SMMUTLBEntry *smmu_translate(SMMUState *bs, SMMUTransCfg *cfg, dma_addr_t addr,
>>>  
>>>      cached_entry = smmu_iotlb_lookup(bs, cfg, &tt_combined, aligned_addr);
>>>      if (cached_entry) {
>>> -        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) {
>>> +        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm &
>>> +            cached_entry->parent_perm & IOMMU_WO)) {
>>>              info->type = SMMU_PTW_ERR_PERMISSION;
>>> -            info->stage = cfg->stage;
>>> +            info->stage = !(cached_entry->entry.perm & IOMMU_WO) ?
>>> +                          SMMU_STAGE_1 :
>>> +                          SMMU_STAGE_2;
>>>              return NULL;
>>>          }
>>>          return cached_entry;
>>> diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
>>> index 09d3b9e734..1db566d451 100644
>>> --- a/include/hw/arm/smmu-common.h
>>> +++ b/include/hw/arm/smmu-common.h
>>> @@ -77,6 +77,7 @@ typedef struct SMMUTLBEntry {
>>>      IOMMUTLBEntry entry;
>>>      uint8_t level;
>>>      uint8_t granule;
>>> +    IOMMUAccessFlags parent_perm;
>>>  } SMMUTLBEntry;
>>>  
>>>  /* Stage-2 configuration. */
>> Thanks
>>
>> Eric
>>
diff mbox series

Patch

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 21982621c0..0d6945fa54 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -394,7 +394,7 @@  static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
         tlbe->entry.translated_addr = gpa;
         tlbe->entry.iova = iova & ~mask;
         tlbe->entry.addr_mask = mask;
-        tlbe->entry.perm = PTE_AP_TO_PERM(ap);
+        tlbe->parent_perm = tlbe->entry.perm = PTE_AP_TO_PERM(ap);
         tlbe->level = level;
         tlbe->granule = granule_sz;
         return 0;
@@ -515,7 +515,7 @@  static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
         tlbe->entry.translated_addr = gpa;
         tlbe->entry.iova = ipa & ~mask;
         tlbe->entry.addr_mask = mask;
-        tlbe->entry.perm = s2ap;
+        tlbe->parent_perm = tlbe->entry.perm = s2ap;
         tlbe->level = level;
         tlbe->granule = granule_sz;
         return 0;
@@ -530,6 +530,27 @@  error:
     return -EINVAL;
 }
 
+/* combine 2 TLB entries and return in tlbe in nested config. */
+static void __attribute__((unused)) combine_tlb(SMMUTLBEntry *tlbe,
+                                                SMMUTLBEntry *tlbe_s2,
+                                                dma_addr_t iova,
+                                                SMMUTransCfg *cfg)
+{
+    if (tlbe_s2->entry.addr_mask < tlbe->entry.addr_mask) {
+        tlbe->entry.addr_mask = tlbe_s2->entry.addr_mask;
+        tlbe->granule = tlbe_s2->granule;
+        tlbe->level = tlbe_s2->level;
+    }
+
+    tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
+                                    tlbe->entry.translated_addr);
+
+    tlbe->entry.iova = iova & ~tlbe->entry.addr_mask;
+    /* parent_perm has s2 perm while perm has s1 perm. */
+    tlbe->parent_perm = tlbe_s2->entry.perm;
+    return;
+}
+
 /**
  * smmu_ptw - Walk the page tables for an IOVA, according to @cfg
  *
@@ -607,9 +628,12 @@  SMMUTLBEntry *smmu_translate(SMMUState *bs, SMMUTransCfg *cfg, dma_addr_t addr,
 
     cached_entry = smmu_iotlb_lookup(bs, cfg, &tt_combined, aligned_addr);
     if (cached_entry) {
-        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) {
+        if ((flag & IOMMU_WO) && !(cached_entry->entry.perm &
+            cached_entry->parent_perm & IOMMU_WO)) {
             info->type = SMMU_PTW_ERR_PERMISSION;
-            info->stage = cfg->stage;
+            info->stage = !(cached_entry->entry.perm & IOMMU_WO) ?
+                          SMMU_STAGE_1 :
+                          SMMU_STAGE_2;
             return NULL;
         }
         return cached_entry;
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
index 09d3b9e734..1db566d451 100644
--- a/include/hw/arm/smmu-common.h
+++ b/include/hw/arm/smmu-common.h
@@ -77,6 +77,7 @@  typedef struct SMMUTLBEntry {
     IOMMUTLBEntry entry;
     uint8_t level;
     uint8_t granule;
+    IOMMUAccessFlags parent_perm;
 } SMMUTLBEntry;
 
 /* Stage-2 configuration. */