diff mbox

[v5,12/37] target-arm: A64: Implement DC ZVA

Message ID 1396023024-2262-13-git-send-email-peter.maydell@linaro.org
State New
Headers show

Commit Message

Peter Maydell March 28, 2014, 4:09 p.m. UTC
Implement the DC ZVA instruction, which clears a block of memory.
The fast path obtains a pointer to the underlying RAM via the TCG TLB
data structure so we can do a direct memset(), with fallback to a
simple byte-store loop in the slow path.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 include/exec/softmmu_exec.h |  52 +++++++++++++++++++
 target-arm/cpu-qom.h        |   2 +
 target-arm/cpu.h            |   3 +-
 target-arm/cpu64.c          |   1 +
 target-arm/helper.c         | 122 ++++++++++++++++++++++++++++++++++++++++++--
 target-arm/helper.h         |   1 +
 target-arm/translate-a64.c  |   5 ++
 7 files changed, 180 insertions(+), 6 deletions(-)

Comments

Richard Henderson March 28, 2014, 6:42 p.m. UTC | #1
On 03/28/2014 09:09 AM, Peter Maydell wrote:
> +            for (i = 0; i < maxidx; i++) {
> +                hostaddr[i] = tlb_vaddr_to_host(env,
> +                                                vaddr + TARGET_PAGE_SIZE * i,
> +                                                1, cpu_mmu_index(env));
> +                if (!hostaddr[i]) {
> +                    break;
> +                }
> +            }
> +            if (i == maxidx) {
> +                /* If it's all in the TLB it's fair game for just writing to;
> +                 * we know we don't need to update dirty status, etc.
> +                 */
> +                for (i = 0; i < maxidx - 1; i++) {
> +                    memset(hostaddr[i], 0, TARGET_PAGE_SIZE);
> +                }
> +                memset(hostaddr[i], 0, blocklen - (i * TARGET_PAGE_SIZE));
> +                return;
> +            }

Doesn't this fail if blocklen < TARGET_PAGE_SIZE?

Since blocklen must be a power of 4, it's either less than TARGET_PAGE_SIZE or
a multiple of TARGET_PAGE_SIZE, so that last memset looks suspect.

I think all this would be easier to follow as two cases:

    if (blocklen <= TARGET_PAGE_SIZE) {
        // One look up and no hostaddr array
    } else {
        // Multiple pages; much of what you have now, only no partial pages
    }


r~
Peter Maydell April 4, 2014, 2:12 p.m. UTC | #2
On 28 March 2014 18:42, Richard Henderson <rth@twiddle.net> wrote:
> On 03/28/2014 09:09 AM, Peter Maydell wrote:
>> +            for (i = 0; i < maxidx; i++) {
>> +                hostaddr[i] = tlb_vaddr_to_host(env,
>> +                                                vaddr + TARGET_PAGE_SIZE * i,
>> +                                                1, cpu_mmu_index(env));
>> +                if (!hostaddr[i]) {
>> +                    break;
>> +                }
>> +            }
>> +            if (i == maxidx) {
>> +                /* If it's all in the TLB it's fair game for just writing to;
>> +                 * we know we don't need to update dirty status, etc.
>> +                 */
>> +                for (i = 0; i < maxidx - 1; i++) {
>> +                    memset(hostaddr[i], 0, TARGET_PAGE_SIZE);
>> +                }
>> +                memset(hostaddr[i], 0, blocklen - (i * TARGET_PAGE_SIZE));
>> +                return;
>> +            }
>
> Doesn't this fail if blocklen < TARGET_PAGE_SIZE?

I can't see where it does -- in that case, maxidx is 1,
and so we do the tlb_vaddr_to_host loopup once, then
(assuming the TLB hit) i == 1 so we take this if, and
the for() loop does nothing (except set i to 0); then we
do all the work via the final memset, where i == 0 and
so we memset blocklen bytes exactly.

> Since blocklen must be a power of 4, it's either less than TARGET_PAGE_SIZE or
> a multiple of TARGET_PAGE_SIZE, so that last memset looks suspect.

If blocklen is a multiple of TARGET_PAGE_SIZE then
blocklen - (i * TARGET_PAGE_SIZE) will always be
TARGET_PAGE_SIZE. As it happens the code will work
for any blocksize, but multiples of TARGET_PAGE_SIZE
are just a special case of that :-)

> I think all this would be easier to follow as two cases:
>
>     if (blocklen <= TARGET_PAGE_SIZE) {
>         // One look up and no hostaddr array
>     } else {
>         // Multiple pages; much of what you have now, only no partial pages
>     }

I'm not convinced. You get a dozen extra lines of code for the
<= TARGET_PAGE_SIZE case, and the only change in the
other case is that you get to delete that one memset because
you can adjust the upper bound of the loop. So it doesn't
seem to me like it's any easier to review, because it's
basically the same code plus extra code that isn't needed.

thanks
-- PMM
diff mbox

Patch

diff --git a/include/exec/softmmu_exec.h b/include/exec/softmmu_exec.h
index 6fde154..470db20 100644
--- a/include/exec/softmmu_exec.h
+++ b/include/exec/softmmu_exec.h
@@ -162,3 +162,55 @@ 
 #define stw(p, v) stw_data(p, v)
 #define stl(p, v) stl_data(p, v)
 #define stq(p, v) stq_data(p, v)
+
+/**
+ * tlb_vaddr_to_host:
+ * @env: CPUArchState
+ * @addr: guest virtual address to look up
+ * @access_type: 0 for read, 1 for write, 2 for execute
+ * @mmu_idx: MMU index to use for lookup
+ *
+ * Look up the specified guest virtual index in the TCG softmmu TLB.
+ * If the TLB contains a host virtual address suitable for direct RAM
+ * access, then return it. Otherwise (TLB miss, TLB entry is for an
+ * I/O access, etc) return NULL.
+ *
+ * This is the equivalent of the initial fast-path code used by
+ * TCG backends for guest load and store accesses.
+ */
+static inline void *tlb_vaddr_to_host(CPUArchState *env, target_ulong addr,
+                                      int access_type, int mmu_idx)
+{
+    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    CPUTLBEntry *tlbentry = &env->tlb_table[mmu_idx][index];
+    target_ulong tlb_addr;
+    uintptr_t haddr;
+
+    switch (access_type) {
+    case 0:
+        tlb_addr = tlbentry->addr_read;
+        break;
+    case 1:
+        tlb_addr = tlbentry->addr_write;
+        break;
+    case 2:
+        tlb_addr = tlbentry->addr_code;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if ((addr & TARGET_PAGE_MASK)
+        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+        /* TLB entry is for a different page */
+        return NULL;
+    }
+
+    if (tlb_addr & ~TARGET_PAGE_MASK) {
+        /* IO access */
+        return NULL;
+    }
+
+    haddr = addr + env->tlb_table[mmu_idx][index].addend;
+    return (void *)haddr;
+}
diff --git a/target-arm/cpu-qom.h b/target-arm/cpu-qom.h
index 00234e1..41caa6c 100644
--- a/target-arm/cpu-qom.h
+++ b/target-arm/cpu-qom.h
@@ -150,6 +150,8 @@  typedef struct ARMCPU {
     uint32_t reset_cbar;
     uint32_t reset_auxcr;
     bool reset_hivecs;
+    /* DCZ blocksize, in log_2(words), ie low 4 bits of DCZID_EL0 */
+    uint32_t dcz_blocksize;
 } ARMCPU;
 
 #define TYPE_AARCH64_CPU "aarch64-cpu"
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index ff56519..a00ff73 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -758,7 +758,8 @@  static inline uint64_t cpreg_to_kvm_id(uint32_t cpregid)
 #define ARM_CP_WFI (ARM_CP_SPECIAL | (2 << 8))
 #define ARM_CP_NZCV (ARM_CP_SPECIAL | (3 << 8))
 #define ARM_CP_CURRENTEL (ARM_CP_SPECIAL | (4 << 8))
-#define ARM_LAST_SPECIAL ARM_CP_CURRENTEL
+#define ARM_CP_DC_ZVA (ARM_CP_SPECIAL | (5 << 8))
+#define ARM_LAST_SPECIAL ARM_CP_DC_ZVA
 /* Used only as a terminator for ARMCPRegInfo lists */
 #define ARM_CP_SENTINEL 0xffff
 /* Mask of only the flag bits in a type field */
diff --git a/target-arm/cpu64.c b/target-arm/cpu64.c
index 8426bf1..fccecc2 100644
--- a/target-arm/cpu64.c
+++ b/target-arm/cpu64.c
@@ -46,6 +46,7 @@  static void aarch64_any_initfn(Object *obj)
     set_feature(&cpu->env, ARM_FEATURE_V7MP);
     set_feature(&cpu->env, ARM_FEATURE_AARCH64);
     cpu->ctr = 0x80030003; /* 32 byte I and D cacheline size, VIPT icache */
+    cpu->dcz_blocksize = 7; /*  512 bytes */
 }
 #endif
 
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 62f7fd3..2ffc588 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -10,6 +10,8 @@ 
 #include <zlib.h> /* For crc32 */
 
 #ifndef CONFIG_USER_ONLY
+#include "exec/softmmu_exec.h"
+
 static inline int get_phys_addr(CPUARMState *env, target_ulong address,
                                 int access_type, int is_user,
                                 hwaddr *phys_ptr, int *prot,
@@ -1745,6 +1747,29 @@  static void tlbi_aa64_asid_write(CPUARMState *env, const ARMCPRegInfo *ri,
     tlb_flush(CPU(cpu), asid == 0);
 }
 
+static CPAccessResult aa64_zva_access(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+    /* We don't implement EL2, so the only control on DC ZVA is the
+     * bit in the SCTLR which can prohibit access for EL0.
+     */
+    if (arm_current_pl(env) == 0 && !(env->cp15.c1_sys & SCTLR_DZE)) {
+        return CP_ACCESS_TRAP;
+    }
+    return CP_ACCESS_OK;
+}
+
+static uint64_t aa64_dczid_read(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    int dzp_bit = 1 << 4;
+
+    /* DZP indicates whether DC ZVA access is allowed */
+    if (aa64_zva_access(env, NULL) != CP_ACCESS_OK) {
+        dzp_bit = 0;
+    }
+    return cpu->dcz_blocksize | dzp_bit;
+}
+
 static const ARMCPRegInfo v8_cp_reginfo[] = {
     /* Minimal set of EL0-visible registers. This will need to be expanded
      * significantly for system emulation of AArch64 CPUs.
@@ -1764,13 +1789,18 @@  static const ARMCPRegInfo v8_cp_reginfo[] = {
     { .name = "FPSR", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .opc2 = 1, .crn = 4, .crm = 4,
       .access = PL0_RW, .readfn = aa64_fpsr_read, .writefn = aa64_fpsr_write },
-    /* Prohibit use of DC ZVA. OPTME: implement DC ZVA and allow its use.
-     * For system mode the DZP bit here will need to be computed, not constant.
-     */
     { .name = "DCZID_EL0", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .opc2 = 7, .crn = 0, .crm = 0,
-      .access = PL0_R, .type = ARM_CP_CONST,
-      .resetvalue = 0x10 },
+      .access = PL0_R, .type = ARM_CP_NO_MIGRATE,
+      .readfn = aa64_dczid_read },
+    { .name = "DC_ZVA", .state = ARM_CP_STATE_AA64,
+      .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 4, .opc2 = 1,
+      .access = PL0_W, .type = ARM_CP_DC_ZVA,
+#ifndef CONFIG_USER_ONLY
+      /* Avoid overhead of an access check that always passes in user-mode */
+      .accessfn = aa64_zva_access,
+#endif
+    },
     { .name = "CURRENTEL", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 0, .opc2 = 2, .crn = 4, .crm = 2,
       .access = PL1_R, .type = ARM_CP_CURRENTEL },
@@ -3930,6 +3960,88 @@  void HELPER(v7m_msr)(CPUARMState *env, uint32_t reg, uint32_t val)
 
 #endif
 
+void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
+{
+    /* Implement DC ZVA, which zeroes a fixed-length block of memory.
+     * Note that we do not implement the (architecturally mandated)
+     * alignment fault for attempts to use this on Device memory
+     * (which matches the usual QEMU behaviour of not implementing either
+     * alignment faults or any memory attribute handling).
+     */
+
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    uint64_t blocklen = 4 << cpu->dcz_blocksize;
+    uint64_t vaddr = vaddr_in & ~(blocklen - 1);
+
+#ifndef CONFIG_USER_ONLY
+    {
+        /* Slightly awkwardly, QEMU's TARGET_PAGE_SIZE may be less than
+         * the block size so we might have to do more than one TLB lookup.
+         * We know that in fact for any v8 CPU the page size is at least 4K
+         * and the block size must be 2K or less, but TARGET_PAGE_SIZE is only
+         * 1K as an artefact of legacy v5 subpage support being present in the
+         * same QEMU executable.
+         */
+        int maxidx = DIV_ROUND_UP(blocklen, TARGET_PAGE_SIZE);
+        void *hostaddr[maxidx];
+        int try, i;
+
+        for (try = 0; try < 2; try++) {
+
+            for (i = 0; i < maxidx; i++) {
+                hostaddr[i] = tlb_vaddr_to_host(env,
+                                                vaddr + TARGET_PAGE_SIZE * i,
+                                                1, cpu_mmu_index(env));
+                if (!hostaddr[i]) {
+                    break;
+                }
+            }
+            if (i == maxidx) {
+                /* If it's all in the TLB it's fair game for just writing to;
+                 * we know we don't need to update dirty status, etc.
+                 */
+                for (i = 0; i < maxidx - 1; i++) {
+                    memset(hostaddr[i], 0, TARGET_PAGE_SIZE);
+                }
+                memset(hostaddr[i], 0, blocklen - (i * TARGET_PAGE_SIZE));
+                return;
+            }
+            /* OK, try a store and see if we can populate the tlb. This
+             * might cause an exception if the memory isn't writable,
+             * in which case we will longjmp out of here. We must for
+             * this purpose use the actual register value passed to us
+             * so that we get the fault address right.
+             */
+            helper_ret_stb_mmu(env, vaddr_in, 0, cpu_mmu_index(env), GETRA());
+            /* Now we can populate the other TLB entries, if any */
+            for (i = 0; i < maxidx; i++) {
+                uint64_t va = vaddr + TARGET_PAGE_SIZE * i;
+                if (va != (vaddr_in & TARGET_PAGE_MASK)) {
+                    helper_ret_stb_mmu(env, va, 0, cpu_mmu_index(env), GETRA());
+                }
+            }
+        }
+
+        /* Slow path (probably attempt to do this to an I/O device or
+         * similar, or clearing of a block of code we have translations
+         * cached for). Just do a series of byte writes as the architecture
+         * demands. It's not worth trying to use a cpu_physical_memory_map(),
+         * memset(), unmap() sequence here because:
+         *  + we'd need to account for the blocksize being larger than a page
+         *  + the direct-RAM access case is almost always going to be dealt
+         *    with in the fastpath code above, so there's no speed benefit
+         *  + we would have to deal with the map returning NULL because the
+         *    bounce buffer was in use
+         */
+        for (i = 0; i < blocklen; i++) {
+            helper_ret_stb_mmu(env, vaddr + i, 0, cpu_mmu_index(env), GETRA());
+        }
+    }
+#else
+    memset(g2h(vaddr), 0, blocklen);
+#endif
+}
+
 /* Note that signed overflow is undefined in C.  The following routines are
    careful to use unsigned types where modulo arithmetic is required.
    Failure to do so _will_ break on newer gcc.  */
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 0abdb0c..5977169 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -515,6 +515,7 @@  DEF_HELPER_4(crypto_aesmc, void, env, i32, i32, i32)
 
 DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)
 DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)
+DEF_HELPER_2(dc_zva, void, env, i64)
 
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index b7cf907..863a6db 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1334,6 +1334,11 @@  static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
         tcg_rt = cpu_reg(s, rt);
         tcg_gen_movi_i64(tcg_rt, s->current_pl << 2);
         return;
+    case ARM_CP_DC_ZVA:
+        /* Writes clear the aligned block of memory which rt points into. */
+        tcg_rt = cpu_reg(s, rt);
+        gen_helper_dc_zva(cpu_env, tcg_rt);
+        return;
     default:
         break;
     }