[v2] arm: implement cache/shareability attribute bits for PAR registers

Message ID 20171020214918.18944-1-Andrew.Baumann@microsoft.com
State New
Series [v2] arm: implement cache/shareability attribute bits for PAR registers

Commit Message

Andrew Baumann Oct. 20, 2017, 9:49 p.m. UTC
On a successful address translation instruction, PAR is supposed to
contain cacheability and shareability attributes determined by the
translation. We previously returned 0 for these bits (in line with the
general strategy of ignoring caches and memory attributes), but some
guest OSes may depend on them.

This patch collects the attribute bits in the page-table walk, and
updates PAR with the correct attributes for all LPAE
translations. Short descriptor formats still return 0 for these bits,
as in the prior implementation, but now log an unimplemented message.

Signed-off-by: Andrew Baumann <Andrew.Baumann@microsoft.com>
---
v2:
 * return attrs via out parameter from get_phys_addr, rather than MemTxAttrs
 * move MAIR lookup/index inline, since it turned out to be simple
 * implement attributes for stage 2 translations
 * combine attributes from stages 1 and 2 when required

Attributes for short PTE formats remain unimplemented; there's a LOG_UNIMP for
this case, but it's likely to be noisy for guests that trigger it -- do we need
a one-shot mechanism for the log statement?
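
For concreteness, one possible shape for such a guard -- a hypothetical
sketch, not part of the patch below, reusing the same qemu_log_mask()
call the patch adds (the helper name is made up):

    /* Hypothetical one-shot guard: warn only on the first short-PTE
     * translation, so a guest using the short descriptor format does
     * not flood the log.
     */
    static void log_short_pte_par_unimp(void)
    {
        static bool warned;

        if (!warned) {
            warned = true;
            qemu_log_mask(LOG_UNIMP, "ats_write: cache/shareability attributes"
                          " for short PTEs not implemented in PAR\n");
        }
    }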

Cheers,
Andrew

 target/arm/helper.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 139 insertions(+), 14 deletions(-)

Comments

Peter Maydell Oct. 30, 2017, 7:25 p.m. UTC | #1
On 20 October 2017 at 22:49, Andrew Baumann
<Andrew.Baumann@microsoft.com> wrote:
> On a successful address translation instruction, PAR is supposed to
> contain cacheability and shareability attributes determined by the
> translation. We previously returned 0 for these bits (in line with the
> general strategy of ignoring caches and memory attributes), but some
> guest OSes may depend on them.
>
> This patch collects the attribute bits in the page-table walk, and
> updates PAR with the correct attributes for all LPAE
> translations. Short descriptor formats still return 0 for these bits,
> as in the prior implementation, but now log an unimplemented message.
>
> Signed-off-by: Andrew Baumann <Andrew.Baumann@microsoft.com>
> ---
> v2:
>  * return attrs via out parameter from get_phys_addr, rather than MemTxAttrs
>  * move MAIR lookup/index inline, since it turned out to be simple
>  * implement attributes for stage 2 translations
>  * combine attributes from stages 1 and 2 when required

Hi. This is looking pretty good, but I have a few comments below,
and we're pretty much at the softfreeze date (KVM Forum last week
meant I didn't get much code review done, unfortunately). Would
you be too sad if this missed 2.11 ?

> Attributes for short PTE formats remain unimplemented; there's a LOG_UNIMP for
> this case, but it's likely to be noisy for guests that trigger it -- do we need
> a one-shot mechanism for the log statement?

I think we should just drop that LOG_UNIMP.

> @@ -8929,6 +8939,28 @@ static bool get_phys_addr_lpae(CPUARMState *env, target_ulong address,
>           */
>          txattrs->secure = false;
>      }
> +
> +    if (cacheattrs != NULL) {
> +        if (mmu_idx == ARMMMUIdx_S2NS) {
> +            /* Translate from the 4-bit stage 2 representation of
> +             * memory attributes (without cache-allocation hints) to
> +             * the 8-bit representation of the stage 1 MAIR registers
> +             * (which includes allocation hints).
> +             */
> +            uint8_t memattr = extract32(attrs, 0, 4);
> +            cacheattrs->attrs = (extract32(memattr, 2, 2) << 4)
> +                              | (extract32(memattr, 0, 2) << 2);

Pseudocode S2ConvertAttrsHints() specifies some hint bit defaults
(no-allocate for NC; RW-allocate for WT or WB) -- do we want to
follow that?
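
Something like the following, perhaps -- a sketch only, not tested; the
helper name is made up, and it writes out the MAIR encodings implied by
S2ConvertAttrsHints() (0x4 for NC with no hints, 0xb for WT RW-allocate,
0xf for WB RW-allocate):

    /* Sketch: convert one 2-bit stage 2 attribute field to its 4-bit
     * MAIR-format equivalent, applying the pseudocode's hint defaults.
     */
    static uint8_t s2_attr_to_mair_nibble(uint8_t s2attr)
    {
        switch (s2attr) {
        case 1:
            return 0x4; /* Normal Non-cacheable (no allocation hints) */
        case 2:
            return 0xb; /* Normal Write-Through, RW-allocate */
        case 3:
            return 0xf; /* Normal Write-Back, RW-allocate */
        default:
            return 0x0; /* 0 is Device at stage 2; handled separately */
        }
    }

with the caller combining (s2_attr_to_mair_nibble(extract32(memattr, 2, 2)) << 4)
with s2_attr_to_mair_nibble(extract32(memattr, 0, 2)) for the inner half.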

> +            cacheattrs->shareability = extract32(attrs, 4, 2);

Are you sure this is the right bit offset for the shareability bits?
I think 4,2 is the S2AP (access) bits, and the SH bits are in 6,2, same
as for stage 1 descriptors.
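
For reference, assuming 'attrs' here holds the descriptor's lower
attribute bits starting at PTE bit 2, the stage 2 layout works out as
below (my reading of the VMSAv8-64 descriptor format -- worth
double-checking against the ARM ARM):

    /* Stage 2 lower attributes as seen in 'attrs' (= PTE bits [11:2]):
     *   attrs[3:0]  MemAttr  (PTE [5:2])  -- stage 2 memory attributes
     *   attrs[5:4]  S2AP     (PTE [7:6])  -- stage 2 access permissions
     *   attrs[7:6]  SH       (PTE [9:8])  -- shareability, as at stage 1
     */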

> +        } else {
> +            /* Index into MAIR registers for cache attributes */
> +            uint8_t attrindx = extract32(attrs, 0, 3);
> +            uint64_t mair = env->cp15.mair_el[regime_el(env, mmu_idx)];
> +            assert(attrindx <= 7);
> +            cacheattrs->attrs = extract64(mair, attrindx * 8, 8);
> +            cacheattrs->shareability = extract32(attrs, 6, 2);
> +        }
> +    }
> +
>      *phys_ptr = descaddr;
>      *page_size_ptr = page_size;
>      return false;
> @@ -9490,6 +9522,89 @@ static bool get_phys_addr_pmsav5(CPUARMState *env, uint32_t address,
>      return false;
>  }
>
> +/* Combine either inner or outer cacheability attributes for normal
> + * memory, according to table D4-42 of ARM DDI 0487B.b (the ARMv8 ARM).
> + *
> + * NB: only stage 1 includes allocation hints (RW bits), leading to
> + * some asymmetry.
> + */
> +static uint8_t combine_cacheattr_nibble(uint8_t s1, uint8_t s2)
> +{
> +    if (s1 == 4 || s2 == 4) {
> +        /* non-cacheable has precedence */
> +        return 4;
> +    } else if (extract32(s1, 2, 2) == 0 || extract32(s1, 2, 2) == 2) {
> +        /* stage 1 write-through takes precedence */
> +        return s1;
> +    } else if (extract32(s2, 2, 2) == 2) {
> +        /* stage 2 write-through takes precedence */
> +        return s2;
> +    } else { /* write-back */
> +        return s1;
> +    }

The v8A ARM ARM pseudocode CombineS1S2AttrHints() says that the hint
bits always come from s1 regardless of whose attrs won.

(I was hoping you could write this function as something like a
MAX or MIN, but the complexities of the writethrough-transient
encoding and the hint bits mean it doesn't work out.)
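
If you do rework it to follow the pseudocode, the shape might be
something like this -- an untested sketch, which assumes s2 has already
been converted to the 8-bit form and so never uses a transient encoding:

    /* Sketch: classify a normal-memory MAIR nibble as write-back (0),
     * write-through (1) or non-cacheable (2); callers handle Device.
     */
    static int cacheattr_nibble_class(uint8_t a)
    {
        if (a == 4) {
            return 2; /* 0b0100: Non-cacheable */
        }
        if (extract32(a, 2, 2) == 0 || extract32(a, 2, 2) == 2) {
            return 1; /* 0b00RW or 0b10RW: Write-Through */
        }
        return 0;     /* 0b01RW or 0b11RW: Write-Back */
    }

    /* The stronger type wins (NC > WT > WB), but the allocation-hint
     * bits always come from s1, as CombineS1S2AttrHints() specifies.
     */
    static uint8_t combine_nibble_sketch(uint8_t s1, uint8_t s2)
    {
        int c1 = cacheattr_nibble_class(s1);
        int c2 = cacheattr_nibble_class(s2);

        if (c1 == 2 || c2 == 2) {
            return 4;               /* NC carries no hints */
        }
        if (c1 >= c2) {
            return s1;              /* s1 won (or tied): keep its hints */
        }
        /* s2's type won: take s2's type bits, s1's hint bits */
        return (s2 & 0xc) | (s1 & 0x3);
    }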

> +}
> +
> +/* Combine S1 and S2 cacheability/shareability attributes, per D4.5.4
> + *
> + * @s1:      Attributes from stage 1 walk
> + * @s2:      Attributes from stage 2 walk
> + */
> +static ARMCacheAttrs combine_cacheattrs(ARMCacheAttrs s1, ARMCacheAttrs s2)
> +{
> +    uint8_t s1lo = extract32(s1.attrs, 0, 4), s2lo = extract32(s2.attrs, 0, 4);
> +    uint8_t s1hi = extract32(s1.attrs, 4, 4), s2hi = extract32(s2.attrs, 4, 4);
> +    ARMCacheAttrs ret;
> +
> +    /* Combine shareability attributes (table D4-43) */
> +    if (s1.shareability == 2 || s2.shareability == 2) {
> +        /* if either are outer-shareable, the result is outer-shareable */
> +        ret.shareability = 2;
> +    } else if (s1.shareability == 3 || s2.shareability == 3) {
> +        /* if either are inner-shareable, the result is inner-shareable */
> +        ret.shareability = 3;
> +    } else {
> +        /* both non-shareable */
> +        ret.shareability = 0;
> +    }

You can play bit games with the format here, because
what we're effectively implementing is "whichever is last in
the order '0, 3, 2' wins", which is
   ret.shareability = MAX(s1.shareability ^ 1, s2.shareability ^ 1) ^ 1;
(since the xor with 1 transforms (0,3,2) to (1,2,3) and is self-inverse).
Is that better than the if ladder above? Not entirely sure :-)
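
A throwaway program to convince yourself, checking the trick against
the if-ladder over the three defined encodings (standalone C, not QEMU
code):

    #include <assert.h>
    #include <stdint.h>

    static uint8_t ladder(uint8_t a, uint8_t b)
    {
        if (a == 2 || b == 2) {
            return 2; /* outer-shareable wins */
        } else if (a == 3 || b == 3) {
            return 3; /* inner-shareable wins */
        }
        return 0;     /* both non-shareable */
    }

    int main(void)
    {
        static const uint8_t vals[] = { 0, 2, 3 }; /* NSh, OSh, ISh */

        for (int i = 0; i < 3; i++) {
            for (int j = 0; j < 3; j++) {
                uint8_t a = vals[i], b = vals[j];
                uint8_t x = a ^ 1, y = b ^ 1;
                uint8_t trick = (x > y ? x : y) ^ 1; /* MAX form */
                assert(trick == ladder(a, b));
            }
        }
        return 0;
    }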

> +    /* Combine memory type and cacheability attributes */
> +    if (s1hi == 0 || s2hi == 0) {
> +        /* Device has precedence over normal */
> +        if (s1lo == 0 || s2lo == 0) {
> +            /* nGnRnE has precedence over anything */
> +            ret.attrs = 0;
> +        } else if (s1lo == 4 || s2lo == 4) {
> +            /* non-Reordering has precedence over Reordering */
> +            ret.attrs = 4;  /* nGnRE */
> +        } else if (s1lo == 8 || s2lo == 8) {
> +            /* non-Gathering has precedence over Gathering */
> +            ret.attrs = 8;  /* nGRE */
> +        } else {
> +            ret.attrs = 0xc; /* GRE */
> +        }

Isn't this if-ladder equivalent to just "ret.attrs = MIN(s1lo, s2lo);" ?

> +
> +        /* Any location for which the resultant memory type is any
> +         * type of Device memory is always treated as Outer Shareable.
> +         */
> +        ret.shareability = 2;
> +    } else { /* Normal memory */
> +        /* Outer/inner cacheability combine independently */
> +        ret.attrs = combine_cacheattr_nibble(s1hi, s2hi) << 4
> +                  | combine_cacheattr_nibble(s1lo, s2lo);
> +
> +        if (ret.attrs == 0x44) {
> +            /* Any location for which the resultant memory type is Normal
> +             * Inner Non-cacheable, Outer Non-cacheable is always treated
> +             * as Outer Shareable.
> +             */
> +            ret.shareability = 2;
> +        }
> +    }
> +
> +    return ret;
> +}

thanks
-- PMM
Andrew Baumann Oct. 31, 2017, 1:23 a.m. UTC | #2
> From: Peter Maydell [mailto:peter.maydell@linaro.org]
> Sent: Tuesday, 31 October 2017 03:25
>
> On 20 October 2017 at 22:49, Andrew Baumann
> <Andrew.Baumann@microsoft.com> wrote:
> > On a successful address translation instruction, PAR is supposed to
> > contain cacheability and shareability attributes determined by the
> > translation. We previously returned 0 for these bits (in line with the
> > general strategy of ignoring caches and memory attributes), but some
> > guest OSes may depend on them.
> >
> > This patch collects the attribute bits in the page-table walk, and
> > updates PAR with the correct attributes for all LPAE
> > translations. Short descriptor formats still return 0 for these bits,
> > as in the prior implementation, but now log an unimplemented message.
> >
> > Signed-off-by: Andrew Baumann <Andrew.Baumann@microsoft.com>
> > ---
> > v2:
> >  * return attrs via out parameter from get_phys_addr, rather than MemTxAttrs
> >  * move MAIR lookup/index inline, since it turned out to be simple
> >  * implement attributes for stage 2 translations
> >  * combine attributes from stages 1 and 2 when required
>
> Hi. This is looking pretty good, but I have a few comments below,
> and we're pretty much at the softfreeze date (KVM Forum last week
> meant I didn't get much code review done, unfortunately). Would
> you be too sad if this missed 2.11 ?

No worries. It would be nice to have a stable release that we can tell
people supports arm64 Windows, but I recognise that this is a
non-trivial change, so if we have to wait for 2.12 to get that, then
fair enough.

> > Attributes for short PTE formats remain unimplemented; there's a LOG_UNIMP for
> > this case, but it's likely to be noisy for guests that trigger it -- do we need
> > a one-shot mechanism for the log statement?
>
> I think we should just drop that LOG_UNIMP.

Ok.

> > @@ -8929,6 +8939,28 @@ static bool get_phys_addr_lpae(CPUARMState *env, target_ulong address,
> >           */
> >          txattrs->secure = false;
> >      }
> > +
> > +    if (cacheattrs != NULL) {
> > +        if (mmu_idx == ARMMMUIdx_S2NS) {
> > +            /* Translate from the 4-bit stage 2 representation of
> > +             * memory attributes (without cache-allocation hints) to
> > +             * the 8-bit representation of the stage 1 MAIR registers
> > +             * (which includes allocation hints).
> > +             */
> > +            uint8_t memattr = extract32(attrs, 0, 4);
> > +            cacheattrs->attrs = (extract32(memattr, 2, 2) << 4)
> > +                              | (extract32(memattr, 0, 2) << 2);
>
> Pseudocode S2ConvertAttrsHints() specifies some hint bit defaults
> (no-allocate for NC; RW-allocate for WT or WB) -- do we want to
> follow that?

Thanks for the pointer. Yes, I think we do.

>
> > +            cacheattrs->shareability = extract32(attrs, 4, 2);
>
> Are you sure this is the right bit offset for the shareability bits?
> I think 4,2 is the S2AP (access) bits, and the SH bits are in 6,2, same
> as for stage 1 descriptors.

You're right. I was convinced it differed, but I don't recall why.
Thanks for catching this.

> > +        } else {
> > +            /* Index into MAIR registers for cache attributes */
> > +            uint8_t attrindx = extract32(attrs, 0, 3);
> > +            uint64_t mair = env->cp15.mair_el[regime_el(env, mmu_idx)];
> > +            assert(attrindx <= 7);
> > +            cacheattrs->attrs = extract64(mair, attrindx * 8, 8);
> > +            cacheattrs->shareability = extract32(attrs, 6, 2);
> > +        }
> > +    }
> > +
> >      *phys_ptr = descaddr;
> >      *page_size_ptr = page_size;
> >      return false;
> > @@ -9490,6 +9522,89 @@ static bool get_phys_addr_pmsav5(CPUARMState *env, uint32_t address,
> >      return false;
> >  }
> >
> > +/* Combine either inner or outer cacheability attributes for normal
> > + * memory, according to table D4-42 of ARM DDI 0487B.b (the ARMv8 ARM).
> > + *
> > + * NB: only stage 1 includes allocation hints (RW bits), leading to
> > + * some asymmetry.
> > + */
> > +static uint8_t combine_cacheattr_nibble(uint8_t s1, uint8_t s2)
> > +{
> > +    if (s1 == 4 || s2 == 4) {
> > +        /* non-cacheable has precedence */
> > +        return 4;
> > +    } else if (extract32(s1, 2, 2) == 0 || extract32(s1, 2, 2) == 2) {
> > +        /* stage 1 write-through takes precedence */
> > +        return s1;
> > +    } else if (extract32(s2, 2, 2) == 2) {
> > +        /* stage 2 write-through takes precedence */
> > +        return s2;
> > +    } else { /* write-back */
> > +        return s1;
> > +    }
>
> The v8A ARM ARM pseudocode CombineS1S2AttrHints() says that the hint
> bits always come from s1 regardless of whose attrs won.

Aha, I was wondering about this. Thanks for the pointer to the
pseudocode... it isn't referenced anywhere in the relevant section!
It's reassuring to see that, aside from the hints (where the English
was ambiguous IIRC), I seem to have got the rest of the translation
correct.

> (I was hoping you could write this function as something like a
> MAX or MIN, but the complexities of the writethrough-transient
> encoding and the hint bits mean it doesn't work out.)
>
> > +}
> > +
> > +/* Combine S1 and S2 cacheability/shareability attributes, per D4.5.4
> > + *
> > + * @s1:      Attributes from stage 1 walk
> > + * @s2:      Attributes from stage 2 walk
> > + */
> > +static ARMCacheAttrs combine_cacheattrs(ARMCacheAttrs s1, ARMCacheAttrs s2)
> > +{
> > +    uint8_t s1lo = extract32(s1.attrs, 0, 4), s2lo = extract32(s2.attrs, 0, 4);
> > +    uint8_t s1hi = extract32(s1.attrs, 4, 4), s2hi = extract32(s2.attrs, 4, 4);
> > +    ARMCacheAttrs ret;
> > +
> > +    /* Combine shareability attributes (table D4-43) */
> > +    if (s1.shareability == 2 || s2.shareability == 2) {
> > +        /* if either are outer-shareable, the result is outer-shareable */
> > +        ret.shareability = 2;
> > +    } else if (s1.shareability == 3 || s2.shareability == 3) {
> > +        /* if either are inner-shareable, the result is inner-shareable */
> > +        ret.shareability = 3;
> > +    } else {
> > +        /* both non-shareable */
> > +        ret.shareability = 0;
> > +    }
>
> You can play bit games with the format here, because
> what we're effectively implementing is "whichever is last in
> the order '0, 3, 2' wins", which is
>    ret.shareability = MAX(s1.shareability ^ 1, s2.shareability ^ 1) ^ 1;
> (since the xor with 1 transforms (0,3,2) to (1,2,3) and is self-inverse).
> Is that better than the if ladder above? Not entirely sure :-)

I've no confidence in my ability to get that bit-logic right, or even
to check that you did. I'd rather leave it to the compiler to figure
out how to play those optimisation games :)

> > +    /* Combine memory type and cacheability attributes */
> > +    if (s1hi == 0 || s2hi == 0) {
> > +        /* Device has precedence over normal */
> > +        if (s1lo == 0 || s2lo == 0) {
> > +            /* nGnRnE has precedence over anything */
> > +            ret.attrs = 0;
> > +        } else if (s1lo == 4 || s2lo == 4) {
> > +            /* non-Reordering has precedence over Reordering */
> > +            ret.attrs = 4;  /* nGnRE */
> > +        } else if (s1lo == 8 || s2lo == 8) {
> > +            /* non-Gathering has precedence over Gathering */
> > +            ret.attrs = 8;  /* nGRE */
> > +        } else {
> > +            ret.attrs = 0xc; /* GRE */
> > +        }
>
> Isn't this if-ladder equivalent to just "ret.attrs = MIN(s1lo, s2lo);" ?

Assuming both s1lo and s2lo avoid undefined encodings, yes it is. I
tend to prefer the if-ladder just because it's a more direct
translation of the spec (e.g. CombineS1S2Device() pseudocode). But in
this case if you prefer I could change it to the MIN version for
brevity, since it's fairly straightforward.

Thanks,
Andrew
Peter Maydell Oct. 31, 2017, 9:53 a.m. UTC | #3
On 31 October 2017 at 01:23, Andrew Baumann
<Andrew.Baumann@microsoft.com> wrote:
>> From: Peter Maydell [mailto:peter.maydell@linaro.org]
>> Hi. This is looking pretty good, but I have a few comments below,
>> and we're pretty much at the softfreeze date (KVM Forum last week
>> meant I didn't get much code review done, unfortunately). Would
>> you be too sad if this missed 2.11 ?
>
> No worries. It would be nice to have a stable release that we can tell people supports arm64 Windows, but I recognise that this is a non-trivial change, so if we have to wait for 2.12 to get that, then fair enough.

Is this the only missing piece, then? If we squint a bit we can
call it a bug fix as long as we can get it in early in the softfreeze
rather than later...


>> The v8A ARM ARM pseudocode CombineS1S2AttrHints() says that the hint
>> bits always come from s1 regardless of whose attrs won.
>
> Aha, I was wondering about this. Thanks for the pointer to the pseudocode... it isn't referenced anywhere in the relevant section! It's reassuring to see that, aside from the hints (where the English was ambiguous IIRC), I seem to have got the rest of the translation correct.

I didn't find anything in the English text about hints at all.
(I'll have to go and have another look.)

>> You can play bit games with the format here, because
>> what we're effectively implementing is "whichever is last in
>> the order '0, 3, 2' wins", which is
>>    ret.shareability = MAX(s1.shareability ^ 1, s2.shareability ^ 1) ^ 1;
>> (since the xor with 1 transforms (0,3,2) to (1,2,3) and is self-inverse).
>> Is that better than the if ladder above? Not entirely sure :-)
>
> I've no confidence in my ability to get that bit-logic right, or
> even to check that you did. I'd rather leave it to the compiler to
> figure out how to play those optimisation games :)

Mmm. This isn't on a fast path I think so we may as well write
it clearly.

>> > +    /* Combine memory type and cacheability attributes */
>> > +    if (s1hi == 0 || s2hi == 0) {
>> > +        /* Device has precedence over normal */
>> > +        if (s1lo == 0 || s2lo == 0) {
>> > +            /* nGnRnE has precedence over anything */
>> > +            ret.attrs = 0;
>> > +        } else if (s1lo == 4 || s2lo == 4) {
>> > +            /* non-Reordering has precedence over Reordering */
>> > +            ret.attrs = 4;  /* nGnRE */
>> > +        } else if (s1lo == 8 || s2lo == 8) {
>> > +            /* non-Gathering has precedence over Gathering */
>> > +            ret.attrs = 8;  /* nGRE */
>> > +        } else {
>> > +            ret.attrs = 0xc; /* GRE */
>> > +        }
>>
>> Isn't this if-ladder equivalent to just "ret.attrs = MIN(s1lo, s2lo);" ?
>
> Assuming both s1lo and s2lo avoid undefined encodings, yes it is. I tend to prefer the if-ladder just because it's a more direct translation of the spec (e.g. CombineS1S2Device() pseudocode). But in this case if you prefer I could change it to the MIN version for brevity, since it's fairly straightforward.

Your choice either way, then.

thanks
-- PMM
Andrew Baumann Oct. 31, 2017, 10:28 a.m. UTC | #4
> From: Peter Maydell [mailto:peter.maydell@linaro.org]
> Sent: Tuesday, 31 October 2017 17:53
>
> On 31 October 2017 at 01:23, Andrew Baumann
> <Andrew.Baumann@microsoft.com> wrote:
> >> From: Peter Maydell [mailto:peter.maydell@linaro.org]
> >> Hi. This is looking pretty good, but I have a few comments below,
> >> and we're pretty much at the softfreeze date (KVM Forum last week
> >> meant I didn't get much code review done, unfortunately). Would
> >> you be too sad if this missed 2.11 ?
> >
> > No worries. It would be nice to have a stable release that we can tell
> > people supports arm64 Windows, but I recognise that this is a
> > non-trivial change, so if we have to wait for 2.12 to get that, then
> > fair enough.
>
> Is this the only missing piece, then? If we squint a bit we can
> call it a bug fix as long as we can get it in early in the softfreeze
> rather than later...

It is the last piece, and I'll try to get you a v3 soon, but this week
it's my turn to be travelling, so it may take a day or two. I'll
understand if we don't get it in -- not a show-stopper.

> >> The v8A ARM ARM pseudocode CombineS1S2AttrHints() says that the hint
> >> bits always come from s1 regardless of whose attrs won.
> >
> > Aha, I was wondering about this. Thanks for the pointer to the
> > pseudocode... it isn't referenced anywhere in the relevant section!
> > It's reassuring to see that, aside from the hints (where the English
> > was ambiguous IIRC), I seem to have got the rest of the translation
> > correct.
>
> I didn't find anything in the English text about hints at all.
> (I'll have to go and have another look.)

I didn't find any mention of it. That's pretty ambiguous IMO :)

Andrew

Patch

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 96113fe989..60cdc7ce1f 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -19,17 +19,23 @@ 
 #define ARM_CPU_FREQ 1000000000 /* FIXME: 1 GHz, should be configurable */
 
 #ifndef CONFIG_USER_ONLY
+/* Cacheability and shareability attributes for a memory access */
+typedef struct ARMCacheAttrs {
+    unsigned int attrs:8; /* as in the MAIR register encoding */
+    unsigned int shareability:2; /* as in the SH field of the VMSAv8-64 PTEs */
+} ARMCacheAttrs;
+
 static bool get_phys_addr(CPUARMState *env, target_ulong address,
                           MMUAccessType access_type, ARMMMUIdx mmu_idx,
                           hwaddr *phys_ptr, MemTxAttrs *attrs, int *prot,
                           target_ulong *page_size, uint32_t *fsr,
-                          ARMMMUFaultInfo *fi);
+                          ARMMMUFaultInfo *fi, ARMCacheAttrs *cacheattrs);
 
 static bool get_phys_addr_lpae(CPUARMState *env, target_ulong address,
                                MMUAccessType access_type, ARMMMUIdx mmu_idx,
                                hwaddr *phys_ptr, MemTxAttrs *txattrs, int *prot,
                                target_ulong *page_size_ptr, uint32_t *fsr,
-                               ARMMMUFaultInfo *fi);
+                               ARMMMUFaultInfo *fi, ARMCacheAttrs *cacheattrs);
 
 /* Security attributes for an address, as returned by v8m_security_lookup. */
 typedef struct V8M_SAttributes {
@@ -2159,9 +2165,10 @@  static uint64_t do_ats_write(CPUARMState *env, uint64_t value,
     uint64_t par64;
     MemTxAttrs attrs = {};
     ARMMMUFaultInfo fi = {};
+    ARMCacheAttrs cacheattrs = {};
 
-    ret = get_phys_addr(env, value, access_type, mmu_idx,
-                        &phys_addr, &attrs, &prot, &page_size, &fsr, &fi);
+    ret = get_phys_addr(env, value, access_type, mmu_idx, &phys_addr, &attrs,
+                        &prot, &page_size, &fsr, &fi, &cacheattrs);
     if (extended_addresses_enabled(env)) {
         /* fsr is a DFSR/IFSR value for the long descriptor
          * translation table format, but with WnR always clear.
@@ -2173,7 +2180,8 @@  static uint64_t do_ats_write(CPUARMState *env, uint64_t value,
             if (!attrs.secure) {
                 par64 |= (1 << 9); /* NS */
             }
-            /* We don't set the ATTR or SH fields in the PAR. */
+            par64 |= (uint64_t)cacheattrs.attrs << 56; /* ATTR */
+            par64 |= cacheattrs.shareability << 7; /* SH */
         } else {
             par64 |= 1; /* F */
             par64 |= (fsr & 0x3f) << 1; /* FS */
@@ -2189,6 +2197,8 @@  static uint64_t do_ats_write(CPUARMState *env, uint64_t value,
          */
         if (!ret) {
             /* We do not set any attribute bits in the PAR */
+            qemu_log_mask(LOG_UNIMP, "ats_write: cache/shareability attributes"
+                          " for short PTEs not implemented in PAR\n");
             if (page_size == (1 << 24)
                 && arm_feature(env, ARM_FEATURE_V7)) {
                 par64 = (phys_addr & 0xff000000) | (1 << 1);
@@ -6925,7 +6935,7 @@  static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx,
         return false;
     }
     if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx,
-                      &physaddr, &attrs, &prot, &page_size, &fsr, &fi)) {
+                      &physaddr, &attrs, &prot, &page_size, &fsr, &fi, NULL)) {
         /* the MPU lookup failed */
         env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK;
         armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, env->v7m.secure);
@@ -8207,7 +8217,7 @@  static hwaddr S1_ptw_translate(CPUARMState *env, ARMMMUIdx mmu_idx,
         int ret;
 
         ret = get_phys_addr_lpae(env, addr, 0, ARMMMUIdx_S2NS, &s2pa,
-                                 &txattrs, &s2prot, &s2size, fsr, fi);
+                                 &txattrs, &s2prot, &s2size, fsr, fi, NULL);
         if (ret) {
             fi->s2addr = addr;
             fi->stage2 = true;
@@ -8612,7 +8622,7 @@  static bool get_phys_addr_lpae(CPUARMState *env, target_ulong address,
                                MMUAccessType access_type, ARMMMUIdx mmu_idx,
                                hwaddr *phys_ptr, MemTxAttrs *txattrs, int *prot,
                                target_ulong *page_size_ptr, uint32_t *fsr,
-                               ARMMMUFaultInfo *fi)
+                               ARMMMUFaultInfo *fi, ARMCacheAttrs *cacheattrs)
 {
     ARMCPU *cpu = arm_env_get_cpu(env);
     CPUState *cs = CPU(cpu);
@@ -8929,6 +8939,28 @@  static bool get_phys_addr_lpae(CPUARMState *env, target_ulong address,
          */
         txattrs->secure = false;
     }
+
+    if (cacheattrs != NULL) {
+        if (mmu_idx == ARMMMUIdx_S2NS) {
+            /* Translate from the 4-bit stage 2 representation of
+             * memory attributes (without cache-allocation hints) to
+             * the 8-bit representation of the stage 1 MAIR registers
+             * (which includes allocation hints).
+             */
+            uint8_t memattr = extract32(attrs, 0, 4);
+            cacheattrs->attrs = (extract32(memattr, 2, 2) << 4)
+                              | (extract32(memattr, 0, 2) << 2);
+            cacheattrs->shareability = extract32(attrs, 4, 2);
+        } else {
+            /* Index into MAIR registers for cache attributes */
+            uint8_t attrindx = extract32(attrs, 0, 3);
+            uint64_t mair = env->cp15.mair_el[regime_el(env, mmu_idx)];
+            assert(attrindx <= 7);
+            cacheattrs->attrs = extract64(mair, attrindx * 8, 8);
+            cacheattrs->shareability = extract32(attrs, 6, 2);
+        }
+    }
+
     *phys_ptr = descaddr;
     *page_size_ptr = page_size;
     return false;
@@ -9490,6 +9522,89 @@  static bool get_phys_addr_pmsav5(CPUARMState *env, uint32_t address,
     return false;
 }
 
+/* Combine either inner or outer cacheability attributes for normal
+ * memory, according to table D4-42 of ARM DDI 0487B.b (the ARMv8 ARM).
+ *
+ * NB: only stage 1 includes allocation hints (RW bits), leading to
+ * some asymmetry.
+ */
+static uint8_t combine_cacheattr_nibble(uint8_t s1, uint8_t s2)
+{
+    if (s1 == 4 || s2 == 4) {
+        /* non-cacheable has precedence */
+        return 4;
+    } else if (extract32(s1, 2, 2) == 0 || extract32(s1, 2, 2) == 2) {
+        /* stage 1 write-through takes precedence */
+        return s1;
+    } else if (extract32(s2, 2, 2) == 2) {
+        /* stage 2 write-through takes precedence */
+        return s2;
+    } else { /* write-back */
+        return s1;
+    }
+}
+
+/* Combine S1 and S2 cacheability/shareability attributes, per D4.5.4
+ *
+ * @s1:      Attributes from stage 1 walk
+ * @s2:      Attributes from stage 2 walk
+ */
+static ARMCacheAttrs combine_cacheattrs(ARMCacheAttrs s1, ARMCacheAttrs s2)
+{
+    uint8_t s1lo = extract32(s1.attrs, 0, 4), s2lo = extract32(s2.attrs, 0, 4);
+    uint8_t s1hi = extract32(s1.attrs, 4, 4), s2hi = extract32(s2.attrs, 4, 4);
+    ARMCacheAttrs ret;
+
+    /* Combine shareability attributes (table D4-43) */
+    if (s1.shareability == 2 || s2.shareability == 2) {
+        /* if either are outer-shareable, the result is outer-shareable */
+        ret.shareability = 2;
+    } else if (s1.shareability == 3 || s2.shareability == 3) {
+        /* if either are inner-shareable, the result is inner-shareable */
+        ret.shareability = 3;
+    } else {
+        /* both non-shareable */
+        ret.shareability = 0;
+    }
+
+    /* Combine memory type and cacheability attributes */
+    if (s1hi == 0 || s2hi == 0) {
+        /* Device has precedence over normal */
+        if (s1lo == 0 || s2lo == 0) {
+            /* nGnRnE has precedence over anything */
+            ret.attrs = 0;
+        } else if (s1lo == 4 || s2lo == 4) {
+            /* non-Reordering has precedence over Reordering */
+            ret.attrs = 4;  /* nGnRE */
+        } else if (s1lo == 8 || s2lo == 8) {
+            /* non-Gathering has precedence over Gathering */
+            ret.attrs = 8;  /* nGRE */
+        } else {
+            ret.attrs = 0xc; /* GRE */
+        }
+
+        /* Any location for which the resultant memory type is any
+         * type of Device memory is always treated as Outer Shareable.
+         */
+        ret.shareability = 2;
+    } else { /* Normal memory */
+        /* Outer/inner cacheability combine independently */
+        ret.attrs = combine_cacheattr_nibble(s1hi, s2hi) << 4
+                  | combine_cacheattr_nibble(s1lo, s2lo);
+
+        if (ret.attrs == 0x44) {
+            /* Any location for which the resultant memory type is Normal
+             * Inner Non-cacheable, Outer Non-cacheable is always treated
+             * as Outer Shareable.
+             */
+            ret.shareability = 2;
+        }
+    }
+
+    return ret;
+}
+
+
 /* get_phys_addr - get the physical address for this virtual address
  *
  * Find the physical address corresponding to the given virtual address,
@@ -9514,12 +9629,14 @@  static bool get_phys_addr_pmsav5(CPUARMState *env, uint32_t address,
  * @prot: set to the permissions for the page containing phys_ptr
  * @page_size: set to the size of the page containing phys_ptr
  * @fsr: set to the DFSR/IFSR value on failure
+ * @fi: set to fault info if the translation fails
+ * @cacheattrs: (if non-NULL) set to the cacheability/shareability attributes
  */
 static bool get_phys_addr(CPUARMState *env, target_ulong address,
                           MMUAccessType access_type, ARMMMUIdx mmu_idx,
                           hwaddr *phys_ptr, MemTxAttrs *attrs, int *prot,
                           target_ulong *page_size, uint32_t *fsr,
-                          ARMMMUFaultInfo *fi)
+                          ARMMMUFaultInfo *fi, ARMCacheAttrs *cacheattrs)
 {
     if (mmu_idx == ARMMMUIdx_S12NSE0 || mmu_idx == ARMMMUIdx_S12NSE1) {
         /* Call ourselves recursively to do the stage 1 and then stage 2
@@ -9529,10 +9646,11 @@  static bool get_phys_addr(CPUARMState *env, target_ulong address,
             hwaddr ipa;
             int s2_prot;
             int ret;
+            ARMCacheAttrs cacheattrs2 = {};
 
             ret = get_phys_addr(env, address, access_type,
                                 stage_1_mmu_idx(mmu_idx), &ipa, attrs,
-                                prot, page_size, fsr, fi);
+                                prot, page_size, fsr, fi, cacheattrs);
 
             /* If S1 fails or S2 is disabled, return early.  */
             if (ret || regime_translation_disabled(env, ARMMMUIdx_S2NS)) {
@@ -9543,10 +9661,17 @@  static bool get_phys_addr(CPUARMState *env, target_ulong address,
             /* S1 is done. Now do S2 translation.  */
             ret = get_phys_addr_lpae(env, ipa, access_type, ARMMMUIdx_S2NS,
                                      phys_ptr, attrs, &s2_prot,
-                                     page_size, fsr, fi);
+                                     page_size, fsr, fi,
+                                     cacheattrs != NULL ? &cacheattrs2 : NULL);
             fi->s2addr = ipa;
             /* Combine the S1 and S2 perms.  */
             *prot &= s2_prot;
+
+            /* Combine the S1 and S2 cache attributes, if needed */
+            if (!ret && cacheattrs != NULL) {
+                *cacheattrs = combine_cacheattrs(*cacheattrs, cacheattrs2);
+            }
+
             return ret;
         } else {
             /*
@@ -9617,7 +9742,7 @@  static bool get_phys_addr(CPUARMState *env, target_ulong address,
 
     if (regime_using_lpae_format(env, mmu_idx)) {
         return get_phys_addr_lpae(env, address, access_type, mmu_idx, phys_ptr,
-                                  attrs, prot, page_size, fsr, fi);
+                                  attrs, prot, page_size, fsr, fi, cacheattrs);
     } else if (regime_sctlr(env, mmu_idx) & SCTLR_XP) {
         return get_phys_addr_v6(env, address, access_type, mmu_idx, phys_ptr,
                                 attrs, prot, page_size, fsr, fi);
@@ -9645,7 +9770,7 @@  bool arm_tlb_fill(CPUState *cs, vaddr address,
 
     ret = get_phys_addr(env, address, access_type,
                         core_to_arm_mmu_idx(env, mmu_idx), &phys_addr,
-                        &attrs, &prot, &page_size, fsr, fi);
+                        &attrs, &prot, &page_size, fsr, fi, NULL);
     if (!ret) {
         /* Map a single [sub]page.  */
         phys_addr &= TARGET_PAGE_MASK;
@@ -9674,7 +9799,7 @@  hwaddr arm_cpu_get_phys_page_attrs_debug(CPUState *cs, vaddr addr,
     *attrs = (MemTxAttrs) {};
 
     ret = get_phys_addr(env, addr, 0, mmu_idx, &phys_addr,
-                        attrs, &prot, &page_size, &fsr, &fi);
+                        attrs, &prot, &page_size, &fsr, &fi, NULL);
 
     if (ret) {
         return -1;