diff mbox series

[v2,09/30] tcg/loongarch64: Implement tcg_out_mov and tcg_out_movi

Message ID 20210921201915.601245-10-git@xen0n.name
State New
Headers show
Series LoongArch64 port of QEMU TCG | expand

Commit Message

WANG Xuerui Sept. 21, 2021, 8:18 p.m. UTC
Signed-off-by: WANG Xuerui <git@xen0n.name>
---
 tcg/loongarch64/tcg-target.c.inc | 89 ++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

Comments

Richard Henderson Sept. 22, 2021, 4:25 a.m. UTC | #1
On 9/21/21 1:18 PM, WANG Xuerui wrote:
> +    /* Test for PC-relative values that can be loaded faster.  */
> +    intptr_t pc_offset = val - (uintptr_t)s->code_ptr;

This isn't quite right for split r^x code buffer.
You should have seen this with --enable-debug-tcg...

You need pc_offset = tcg_pcrel_diff(s, (void *)val).

> +    if (pc_offset == (int32_t)pc_offset) {
> +        tcg_target_long lo = sextreg(pc_offset, 0, 12);
> +        tcg_target_long hi = pc_offset - lo;
> +        tcg_out_opc_pcaddu12i(s, rd, hi >> 12);

And... this doesn't quite work, right at the edges.  If lo is negative, hi can overflow 
out of range.  There are a number of ways to fix this.  One is to extract the pieces and 
re-assemble to see if it matches.  Another is to rearrange the arithmetic just a little 
and use PCALAU12I.

> +    tcg_target_long upper = (val >> 12) & 0xfffff;
> +    tcg_target_long higher = (val >> 32) & 0xfffff;

Better to use extract64(val, 12, 20) and extract64(val, 32, 20).


r~
WANG Xuerui Sept. 22, 2021, 3:16 p.m. UTC | #2
Hi Richard,

On 9/22/21 12:25, Richard Henderson wrote:
> On 9/21/21 1:18 PM, WANG Xuerui wrote:
>> +    /* Test for PC-relative values that can be loaded faster.  */
>> +    intptr_t pc_offset = val - (uintptr_t)s->code_ptr;
>
> This isn't quite right for split r^x code buffer.
> You should have seen this with --enable-debug-tcg...
>
> You need pc_offset = tcg_pcrel_diff(s, (void *)val).
Indeed; I just realized TCG debugging isn't fully enabled with 
--enable-debug only. Will fix in v3.
>
>> +    if (pc_offset == (int32_t)pc_offset) {
>> +        tcg_target_long lo = sextreg(pc_offset, 0, 12);
>> +        tcg_target_long hi = pc_offset - lo;
>> +        tcg_out_opc_pcaddu12i(s, rd, hi >> 12);
>
> And... this doesn't quite work, right at the edges.  If lo is 
> negative, hi can overflow out of range.  There are a number of ways to 
> fix this.  One is to extract the pieces and re-assemble to see if it 
> matches.  Another is to rearrange the arithmetic just a little and use 
> PCALAU12I.
I actually wrote a small test program to test for this, but found no
overflow issues here; rather, the tcg_out_opc_ori call below has a
signedness problem (the low variable, which is signed, needs to be masked
with 0xfff to avoid overwriting the opcode field). I think I'll add a
tcg_debug_assert here, but keep the logic intact.
>
>> +    tcg_target_long upper = (val >> 12) & 0xfffff;
>> +    tcg_target_long higher = (val >> 32) & 0xfffff;
>
> Better to use extract64(val, 12, 20) and extract64(val, 32, 20).
Sure; but since the instructions perform sign-extension and thus take
signed operands, sextract64 or the wrapped sextreg will do the job.
>
>
> r~
Richard Henderson Sept. 22, 2021, 3:17 p.m. UTC | #3
On 9/22/21 8:16 AM, WANG Xuerui wrote:
> Hi Richard,
> 
> On 9/22/21 12:25, Richard Henderson wrote:
>> On 9/21/21 1:18 PM, WANG Xuerui wrote:
>>> +    /* Test for PC-relative values that can be loaded faster.  */
>>> +    intptr_t pc_offset = val - (uintptr_t)s->code_ptr;
>>
>> This isn't quite right for split r^x code buffer.
>> You should have seen this with --enable-debug-tcg...
>>
>> You need pc_offset = tcg_pcrel_diff(s, (void *)val).
> Indeed; I just realized TCG debugging isn't fully enabled with --enable-debug only.

Um... it should be.
WANG Xuerui Sept. 22, 2021, 5:22 p.m. UTC | #4
Hi Richard,

On 9/22/21 23:17, Richard Henderson wrote:
> On 9/22/21 8:16 AM, WANG Xuerui wrote:
>> Hi Richard,
>>
>> On 9/22/21 12:25, Richard Henderson wrote:
>>> On 9/21/21 1:18 PM, WANG Xuerui wrote:
>>>> +    /* Test for PC-relative values that can be loaded faster.  */
>>>> +    intptr_t pc_offset = val - (uintptr_t)s->code_ptr;
>>>
>>> This isn't quite right for split r^x code buffer.
>>> You should have seen this with --enable-debug-tcg...
>>>
>>> You need pc_offset = tcg_pcrel_diff(s, (void *)val).
>> Indeed; I just realized TCG debugging isn't fully enabled with 
>> --enable-debug only.
>
> Um... it should be.
Hmm, maybe I had the wrong impression. I even grepped for
CONFIG_DEBUG_TCG and it showed 1, yet my assertions didn't fire during
one of my debugging sessions... Maybe I was just asserting at the wrong
place. Never mind though; the problems are all solved now.
diff mbox series

Patch

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index 338b772732..e4e7e5e903 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -247,6 +247,93 @@  static void tcg_out_mb(TCGContext *s, TCGArg a0)
     tcg_out_opc_dbar(s, 0);
 }
 
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+{
+    if (ret == arg) {
+        return true;
+    }
+    switch (type) {
+    case TCG_TYPE_I32:
+    case TCG_TYPE_I64:
+        /*
+         * Conventional register-register move used in LoongArch is
+         * `or dst, src, zero`.
+         */
+        tcg_out_opc_or(s, ret, arg, TCG_REG_ZERO);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return true;
+}
+
+static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
+                         tcg_target_long val)
+{
+    if (type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    /* Single-instruction cases.  */
+    tcg_target_long low = sextreg(val, 0, 12);
+    if (low == val) {
+        /* val fits in simm12: addi.w rd, zero, val */
+        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
+        return;
+    }
+    if (0x800 <= val && val <= 0xfff) {
+        /* val fits in uimm12: ori rd, zero, val */
+        tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
+        return;
+    }
+
+    /* Test for PC-relative values that can be loaded faster.  */
+    intptr_t pc_offset = val - (uintptr_t)s->code_ptr;
+    if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 0) {
+        tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2);
+        return;
+    }
+    if (pc_offset == (int32_t)pc_offset) {
+        tcg_target_long lo = sextreg(pc_offset, 0, 12);
+        tcg_target_long hi = pc_offset - lo;
+        tcg_out_opc_pcaddu12i(s, rd, hi >> 12);
+        tcg_out_opc_addi_d(s, rd, rd, lo);
+        return;
+    }
+
+    /*
+     * Slow path: at most lu12i.w + ori + cu32i.d + cu52i.d.
+     *
+     * Chop upper bits into 3 immediate-field-sized segments respectively.
+     */
+    tcg_target_long upper = (val >> 12) & 0xfffff;
+    tcg_target_long higher = (val >> 32) & 0xfffff;
+    tcg_target_long top = val >> 52;
+
+    tcg_out_opc_lu12i_w(s, rd, upper);
+    if (low != 0) {
+        tcg_out_opc_ori(s, rd, rd, low & 0xfff);
+    }
+
+    if (sextreg(val, 0, 32) == val) {
+        /*
+         * Fits in 32-bits, upper bits are already properly sign-extended by
+         * lu12i.w.
+         */
+        return;
+    }
+    tcg_out_opc_cu32i_d(s, rd, higher);
+
+    if (sextreg(val, 0, 52) == val) {
+        /*
+         * Fits in 52-bits, upper bits are already properly sign-extended by
+         * cu32i.d.
+         */
+        return;
+    }
+    tcg_out_opc_cu52i_d(s, rd, rd, top);
+}
+
 /*
  * Entry-points
  */
@@ -262,6 +349,8 @@  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_mb(s, a0);
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_mov_i64:
     default:
         g_assert_not_reached();
     }