diff mbox series

[v2] target/ppc: Generate storage interrupts for radix RC changes

Message ID 20230712161322.2729950-1-sanastasio@raptorengineering.com
State Accepted
Delegated to: Cédric Le Goater
Headers show
Series [v2] target/ppc: Generate storage interrupts for radix RC changes | expand

Commit Message

Shawn Anastasio July 12, 2023, 4:13 p.m. UTC
Change radix model to always generate a storage interrupt when the R/C
bits are not set appropriately in a PTE instead of setting the bits
itself.  According to the ISA both behaviors are valid, but in practice
this change more closely matches behavior observed on the POWER9 CPU.

From the POWER9 Processor User's Manual, Section 4.10.13.1: "When
performing Radix translation, the POWER9 hardware triggers the
appropriate interrupt ... for the mode and type of access whenever
Reference (R) and Change (C) bits require setting in either the guest or
host page-table entry (PTE)."

Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
---
Changes in v2:
  - Raise interrupt in ppc_radix64_process_scoped_xlate and
    ppc_radix64_partition_scoped_xlate instead of ppc_radix64_check_rc

 target/ppc/mmu-radix64.c | 74 ++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 25 deletions(-)

--
2.30.2

Comments

Cédric Le Goater July 12, 2023, 4:56 p.m. UTC | #1
Hello Shawn,

On 7/12/23 18:13, Shawn Anastasio wrote:
> Change radix model to always generate a storage interrupt when the R/C
> bits are not set appropriately in a PTE instead of setting the bits
> itself.  According to the ISA both behaviors are valid, but in practice
> this change more closely matches behavior observed on the POWER9 CPU.

How did you spotted this dark corner case in emulation ? Do you have
MMU unit tests ?
  
>  From the POWER9 Processor User's Manual, Section 4.10.13.1: "When
> performing Radix translation, the POWER9 hardware triggers the
> appropriate interrupt ... for the mode and type of access whenever
> Reference (R) and Change (C) bits require setting in either the guest or
> host page-table entry (PTE)."

Nick, could you please take a look also ? You know better that part
in Linux than I do.
  
> Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
Reviewed-by: Cédric Le Goater <clg@kaod.org>

Thanks,

C.

> ---
> Changes in v2:
>    - Raise interrupt in ppc_radix64_process_scoped_xlate and
>      ppc_radix64_partition_scoped_xlate instead of ppc_radix64_check_rc
> 
>   target/ppc/mmu-radix64.c | 74 ++++++++++++++++++++++++++--------------
>   1 file changed, 49 insertions(+), 25 deletions(-)
> 
> diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
> index 920084bd8f..5823e039e6 100644
> --- a/target/ppc/mmu-radix64.c
> +++ b/target/ppc/mmu-radix64.c
> @@ -219,27 +219,25 @@ static bool ppc_radix64_check_prot(PowerPCCPU *cpu, MMUAccessType access_type,
>       return false;
>   }
> 
> -static void ppc_radix64_set_rc(PowerPCCPU *cpu, MMUAccessType access_type,
> -                               uint64_t pte, hwaddr pte_addr, int *prot)
> +static int ppc_radix64_check_rc(MMUAccessType access_type, uint64_t pte)
>   {
> -    CPUState *cs = CPU(cpu);
> -    uint64_t npte;
> -
> -    npte = pte | R_PTE_R; /* Always set reference bit */
> +    switch (access_type) {
> +    case MMU_DATA_STORE:
> +        if (!(pte & R_PTE_C)) {
> +            break;
> +        }
> +        /* fall through */
> +    case MMU_INST_FETCH:
> +    case MMU_DATA_LOAD:
> +        if (!(pte & R_PTE_R)) {
> +            break;
> +        }
> 
> -    if (access_type == MMU_DATA_STORE) { /* Store/Write */
> -        npte |= R_PTE_C; /* Set change bit */
> -    } else {
> -        /*
> -         * Treat the page as read-only for now, so that a later write
> -         * will pass through this function again to set the C bit.
> -         */
> -        *prot &= ~PAGE_WRITE;
> +        /* R/C bits are already set appropriately for this access */
> +        return 0;
>       }
> 
> -    if (pte ^ npte) { /* If pte has changed then write it back */
> -        stq_phys(cs->as, pte_addr, npte);
> -    }
> +    return 1;
>   }
> 
>   static bool ppc_radix64_is_valid_level(int level, int psize, uint64_t nls)
> @@ -380,7 +378,8 @@ static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu,
>                                                 ppc_v3_pate_t pate,
>                                                 hwaddr *h_raddr, int *h_prot,
>                                                 int *h_page_size, bool pde_addr,
> -                                              int mmu_idx, bool guest_visible)
> +                                              int mmu_idx, uint64_t lpid,
> +                                              bool guest_visible)
>   {
>       MMUAccessType access_type = orig_access_type;
>       int fault_cause = 0;
> @@ -418,7 +417,24 @@ static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu,
>       }
> 
>       if (guest_visible) {
> -        ppc_radix64_set_rc(cpu, access_type, pte, pte_addr, h_prot);
> +        if (ppc_radix64_check_rc(access_type, pte)) {
> +            /*
> +             * Per ISA 3.1 Book III, 7.5.3 and 7.5.5, failure to set R/C during
> +             * partition-scoped translation when effLPID = 0 results in normal
> +             * (non-Hypervisor) Data and Instruction Storage Interrupts
> +             * respectively.
> +             *
> +             * ISA 3.0 is ambiguous about this, but tests on POWER9 hardware
> +             * seem to exhibit the same behavior.
> +             */
> +            if (lpid > 0) {
> +                ppc_radix64_raise_hsi(cpu, access_type, eaddr, g_raddr,
> +                                      DSISR_ATOMIC_RC);
> +            } else {
> +                ppc_radix64_raise_si(cpu, access_type, eaddr, DSISR_ATOMIC_RC);
> +            }
> +            return 1;
> +        }
>       }
> 
>       return 0;
> @@ -447,7 +463,8 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
>                                               vaddr eaddr, uint64_t pid,
>                                               ppc_v3_pate_t pate, hwaddr *g_raddr,
>                                               int *g_prot, int *g_page_size,
> -                                            int mmu_idx, bool guest_visible)
> +                                            int mmu_idx, uint64_t lpid,
> +                                            bool guest_visible)
>   {
>       CPUState *cs = CPU(cpu);
>       CPUPPCState *env = &cpu->env;
> @@ -497,7 +514,7 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
>           ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,
>                                                    prtbe_addr, pate, &h_raddr,
>                                                    &h_prot, &h_page_size, true,
> -                                                 5, guest_visible);
> +                                                 5, lpid, guest_visible);
>           if (ret) {
>               return ret;
>           }
> @@ -539,7 +556,8 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
>               ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,
>                                                        pte_addr, pate, &h_raddr,
>                                                        &h_prot, &h_page_size,
> -                                                     true, 5, guest_visible);
> +                                                     true, 5, lpid,
> +                                                     guest_visible);
>               if (ret) {
>                   return ret;
>               }
> @@ -580,7 +598,11 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
>       }
> 
>       if (guest_visible) {
> -        ppc_radix64_set_rc(cpu, access_type, pte, pte_addr, g_prot);
> +        /* R/C bits not appropriately set for access */
> +        if (ppc_radix64_check_rc(access_type, pte)) {
> +            ppc_radix64_raise_si(cpu, access_type, eaddr, DSISR_ATOMIC_RC);
> +            return 1;
> +        }
>       }
> 
>       return 0;
> @@ -695,7 +717,8 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
>       if (relocation) {
>           int ret = ppc_radix64_process_scoped_xlate(cpu, access_type, eaddr, pid,
>                                                      pate, &g_raddr, &prot,
> -                                                   &psize, mmu_idx, guest_visible);
> +                                                   &psize, mmu_idx, lpid,
> +                                                   guest_visible);
>           if (ret) {
>               return false;
>           }
> @@ -719,7 +742,8 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
>               ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,
>                                                        g_raddr, pate, raddr,
>                                                        &prot, &psize, false,
> -                                                     mmu_idx, guest_visible);
> +                                                     mmu_idx, lpid,
> +                                                     guest_visible);
>               if (ret) {
>                   return false;
>               }
> --
> 2.30.2
>
Shawn Anastasio July 12, 2023, 5:35 p.m. UTC | #2
On 7/12/23 11:56 AM, Cédric Le Goater wrote:
> Hello Shawn,
> 
> On 7/12/23 18:13, Shawn Anastasio wrote:
>> Change radix model to always generate a storage interrupt when the R/C
>> bits are not set appropriately in a PTE instead of setting the bits
>> itself.  According to the ISA both behaviors are valid, but in practice
>> this change more closely matches behavior observed on the POWER9 CPU.
> 
> How did you spotted this dark corner case in emulation ? Do you have
> MMU unit tests ?

I'm currently porting Xen to Power and have been using QEMU's powernv
model extensively for early bring up. I noticed the issue when my radix
implementation worked in QEMU but failed on actual hardware since I
didn't have a proper storage interrupt handler implemented.

>> Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
> Reviewed-by: Cédric Le Goater <clg@kaod.org>

Much appreciated.

> Thanks,
> 
> C.

Thanks,
Shawn
Nicholas Piggin July 21, 2023, 2:38 a.m. UTC | #3
On Thu Jul 13, 2023 at 3:35 AM AEST, Shawn Anastasio wrote:
> On 7/12/23 11:56 AM, Cédric Le Goater wrote:
> > Hello Shawn,
> > 
> > On 7/12/23 18:13, Shawn Anastasio wrote:
> >> Change radix model to always generate a storage interrupt when the R/C
> >> bits are not set appropriately in a PTE instead of setting the bits
> >> itself.  According to the ISA both behaviors are valid, but in practice
> >> this change more closely matches behavior observed on the POWER9 CPU.
> > 
> > How did you spotted this dark corner case in emulation ? Do you have
> > MMU unit tests ?
>
> I'm currently porting Xen to Power and have been using QEMU's powernv
> model extensively for early bring up. I noticed the issue when my radix
> implementation worked in QEMU but failed on actual hardware since I
> didn't have a proper storage interrupt handler implemented.

Cool. This was on my todo list because we rely on it for nested HV
KVM too.

I actually didn't know about that odd effLIPD=0 exception, but it
looks right. How did you test that, by running with MSR[HV]=0 and
LPIDR=0 ?

For the patch,

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>

Thanks,
Nick
Daniel Henrique Barboza July 21, 2023, 4:08 p.m. UTC | #4
On 7/20/23 23:38, Nicholas Piggin wrote:
> On Thu Jul 13, 2023 at 3:35 AM AEST, Shawn Anastasio wrote:
>> On 7/12/23 11:56 AM, Cédric Le Goater wrote:
>>> Hello Shawn,
>>>
>>> On 7/12/23 18:13, Shawn Anastasio wrote:
>>>> Change radix model to always generate a storage interrupt when the R/C
>>>> bits are not set appropriately in a PTE instead of setting the bits
>>>> itself.  According to the ISA both behaviors are valid, but in practice
>>>> this change more closely matches behavior observed on the POWER9 CPU.
>>>
>>> How did you spotted this dark corner case in emulation ? Do you have
>>> MMU unit tests ?
>>
>> I'm currently porting Xen to Power and have been using QEMU's powernv
>> model extensively for early bring up. I noticed the issue when my radix
>> implementation worked in QEMU but failed on actual hardware since I
>> didn't have a proper storage interrupt handler implemented.
> 
> Cool. This was on my todo list because we rely on it for nested HV
> KVM too.
> 
> I actually didn't know about that odd effLIPD=0 exception, but it
> looks right. How did you test that, by running with MSR[HV]=0 and
> LPIDR=0 ?
> 
> For the patch,
> 
> Reviewed-by: Nicholas Piggin <npiggin@gmail.com>

Cedric, do you think this is 8.1 material?


Daniel

> 
> Thanks,
> Nick
Cédric Le Goater July 21, 2023, 4:27 p.m. UTC | #5
On 7/21/23 18:08, Daniel Henrique Barboza wrote:
> 
> 
> On 7/20/23 23:38, Nicholas Piggin wrote:
>> On Thu Jul 13, 2023 at 3:35 AM AEST, Shawn Anastasio wrote:
>>> On 7/12/23 11:56 AM, Cédric Le Goater wrote:
>>>> Hello Shawn,
>>>>
>>>> On 7/12/23 18:13, Shawn Anastasio wrote:
>>>>> Change radix model to always generate a storage interrupt when the R/C
>>>>> bits are not set appropriately in a PTE instead of setting the bits
>>>>> itself.  According to the ISA both behaviors are valid, but in practice
>>>>> this change more closely matches behavior observed on the POWER9 CPU.
>>>>
>>>> How did you spotted this dark corner case in emulation ? Do you have
>>>> MMU unit tests ?
>>>
>>> I'm currently porting Xen to Power and have been using QEMU's powernv
>>> model extensively for early bring up. I noticed the issue when my radix
>>> implementation worked in QEMU but failed on actual hardware since I
>>> didn't have a proper storage interrupt handler implemented.
>>
>> Cool. This was on my todo list because we rely on it for nested HV
>> KVM too.
>>
>> I actually didn't know about that odd effLIPD=0 exception, but it
>> looks right. How did you test that, by running with MSR[HV]=0 and
>> LPIDR=0 ?
>>
>> For the patch,
>>
>> Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
> 
> Cedric, do you think this is 8.1 material?

No. It is improving the MMU model but it is not fixing something that
was "broken" in QEMU. I would leave it for 8.2.

However, these are 8.1 material :

  https://patchwork.ozlabs.org/project/qemu-ppc/list/?series=364971
  https://patchwork.ozlabs.org/project/qemu-ppc/list/?series=364984

but they lack a Fixes tag. I asked Nick.

C.
diff mbox series

Patch

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 920084bd8f..5823e039e6 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -219,27 +219,25 @@  static bool ppc_radix64_check_prot(PowerPCCPU *cpu, MMUAccessType access_type,
     return false;
 }

-static void ppc_radix64_set_rc(PowerPCCPU *cpu, MMUAccessType access_type,
-                               uint64_t pte, hwaddr pte_addr, int *prot)
+static int ppc_radix64_check_rc(MMUAccessType access_type, uint64_t pte)
 {
-    CPUState *cs = CPU(cpu);
-    uint64_t npte;
-
-    npte = pte | R_PTE_R; /* Always set reference bit */
+    switch (access_type) {
+    case MMU_DATA_STORE:
+        if (!(pte & R_PTE_C)) {
+            break;
+        }
+        /* fall through */
+    case MMU_INST_FETCH:
+    case MMU_DATA_LOAD:
+        if (!(pte & R_PTE_R)) {
+            break;
+        }

-    if (access_type == MMU_DATA_STORE) { /* Store/Write */
-        npte |= R_PTE_C; /* Set change bit */
-    } else {
-        /*
-         * Treat the page as read-only for now, so that a later write
-         * will pass through this function again to set the C bit.
-         */
-        *prot &= ~PAGE_WRITE;
+        /* R/C bits are already set appropriately for this access */
+        return 0;
     }

-    if (pte ^ npte) { /* If pte has changed then write it back */
-        stq_phys(cs->as, pte_addr, npte);
-    }
+    return 1;
 }

 static bool ppc_radix64_is_valid_level(int level, int psize, uint64_t nls)
@@ -380,7 +378,8 @@  static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu,
                                               ppc_v3_pate_t pate,
                                               hwaddr *h_raddr, int *h_prot,
                                               int *h_page_size, bool pde_addr,
-                                              int mmu_idx, bool guest_visible)
+                                              int mmu_idx, uint64_t lpid,
+                                              bool guest_visible)
 {
     MMUAccessType access_type = orig_access_type;
     int fault_cause = 0;
@@ -418,7 +417,24 @@  static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu,
     }

     if (guest_visible) {
-        ppc_radix64_set_rc(cpu, access_type, pte, pte_addr, h_prot);
+        if (ppc_radix64_check_rc(access_type, pte)) {
+            /*
+             * Per ISA 3.1 Book III, 7.5.3 and 7.5.5, failure to set R/C during
+             * partition-scoped translation when effLPID = 0 results in normal
+             * (non-Hypervisor) Data and Instruction Storage Interrupts
+             * respectively.
+             *
+             * ISA 3.0 is ambiguous about this, but tests on POWER9 hardware
+             * seem to exhibit the same behavior.
+             */
+            if (lpid > 0) {
+                ppc_radix64_raise_hsi(cpu, access_type, eaddr, g_raddr,
+                                      DSISR_ATOMIC_RC);
+            } else {
+                ppc_radix64_raise_si(cpu, access_type, eaddr, DSISR_ATOMIC_RC);
+            }
+            return 1;
+        }
     }

     return 0;
@@ -447,7 +463,8 @@  static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
                                             vaddr eaddr, uint64_t pid,
                                             ppc_v3_pate_t pate, hwaddr *g_raddr,
                                             int *g_prot, int *g_page_size,
-                                            int mmu_idx, bool guest_visible)
+                                            int mmu_idx, uint64_t lpid,
+                                            bool guest_visible)
 {
     CPUState *cs = CPU(cpu);
     CPUPPCState *env = &cpu->env;
@@ -497,7 +514,7 @@  static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
         ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,
                                                  prtbe_addr, pate, &h_raddr,
                                                  &h_prot, &h_page_size, true,
-                                                 5, guest_visible);
+                                                 5, lpid, guest_visible);
         if (ret) {
             return ret;
         }
@@ -539,7 +556,8 @@  static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
             ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,
                                                      pte_addr, pate, &h_raddr,
                                                      &h_prot, &h_page_size,
-                                                     true, 5, guest_visible);
+                                                     true, 5, lpid,
+                                                     guest_visible);
             if (ret) {
                 return ret;
             }
@@ -580,7 +598,11 @@  static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu,
     }

     if (guest_visible) {
-        ppc_radix64_set_rc(cpu, access_type, pte, pte_addr, g_prot);
+        /* R/C bits not appropriately set for access */
+        if (ppc_radix64_check_rc(access_type, pte)) {
+            ppc_radix64_raise_si(cpu, access_type, eaddr, DSISR_ATOMIC_RC);
+            return 1;
+        }
     }

     return 0;
@@ -695,7 +717,8 @@  static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
     if (relocation) {
         int ret = ppc_radix64_process_scoped_xlate(cpu, access_type, eaddr, pid,
                                                    pate, &g_raddr, &prot,
-                                                   &psize, mmu_idx, guest_visible);
+                                                   &psize, mmu_idx, lpid,
+                                                   guest_visible);
         if (ret) {
             return false;
         }
@@ -719,7 +742,8 @@  static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr eaddr,
             ret = ppc_radix64_partition_scoped_xlate(cpu, access_type, eaddr,
                                                      g_raddr, pate, raddr,
                                                      &prot, &psize, false,
-                                                     mmu_idx, guest_visible);
+                                                     mmu_idx, lpid,
+                                                     guest_visible);
             if (ret) {
                 return false;
             }