Message ID | 1635698589-31849-29-git-send-email-tsimpson@quicinc.com |
---|---|
State | New |
Headers | show |
Series | [PULL,01/30] Hexagon HVX (target/hexagon) README | expand |
On 10/31/21 17:43, Taylor Simpson wrote: > Tests for > packet semantics > vector loads (aligned and unaligned) > vector stores (aligned and unaligned) > vector masked stores > vector new value store > maximum HVX temps in a packet > vector operations > > Acked-by: Richard Henderson <richard.henderson@linaro.org> > Signed-off-by: Taylor Simpson <tsimpson@quicinc.com> > --- > tests/tcg/hexagon/hvx_misc.c | 469 ++++++++++++++++++++++++++++++++++++++ > tests/tcg/hexagon/Makefile.target | 2 + > 2 files changed, 471 insertions(+) > create mode 100644 tests/tcg/hexagon/hvx_misc.c > +static void test_load_tmp(void) > +{ > + void *p0 = buffer0; > + void *p1 = buffer1; > + void *pout = output; > + > + for (int i = 0; i < BUFSIZE; i++) { > + /* > + * Load into v12 as .tmp, then use it in the next packet > + * Should get the new value within the same packet and > + * the old value in the next packet > + */ > + asm("v3 = vmem(%0 + #0)\n\t" > + "r1 = #1\n\t" > + "v12 = vsplat(r1)\n\t" > + "{\n\t" > + " v12.tmp = vmem(%1 + #0)\n\t" > + " v4.w = vadd(v12.w, v3.w)\n\t" > + "}\n\t" > + "v4.w = vadd(v4.w, v12.w)\n\t" > + "vmem(%2 + #0) = v4\n\t" > + : : "r"(p0), "r"(p1), "r"(pout) > + : "r1", "v12", "v3", "v4", "v6", "memory"); > + p0 += sizeof(MMVector); > + p1 += sizeof(MMVector); > + pout += sizeof(MMVector); > + > + for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { > + expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1; > + } > + } > + > + check_output_w(__LINE__, BUFSIZE); > +} This test fails guest-tests: tests/tcg/hexagon/hvx_misc.c:111:21: error: unknown register name 'v12' in asm : "r1", "v12", "v3", "v4", "v6", "memory"); ^ tests/tcg/hexagon/hvx_misc.c:362:9: note: expanded from macro 'TEST_VEC_OP2' VEC_OP2(ASM, EL, p0, p1, pout); \ ^
On 11/1/21 6:33 AM, Philippe Mathieu-Daudé wrote: > On 10/31/21 17:43, Taylor Simpson wrote: >> Tests for >> packet semantics >> vector loads (aligned and unaligned) >> vector stores (aligned and unaligned) >> vector masked stores >> vector new value store >> maximum HVX temps in a packet >> vector operations >> >> Acked-by: Richard Henderson <richard.henderson@linaro.org> >> Signed-off-by: Taylor Simpson <tsimpson@quicinc.com> >> --- >> tests/tcg/hexagon/hvx_misc.c | 469 ++++++++++++++++++++++++++++++++++++++ >> tests/tcg/hexagon/Makefile.target | 2 + >> 2 files changed, 471 insertions(+) >> create mode 100644 tests/tcg/hexagon/hvx_misc.c > >> +static void test_load_tmp(void) >> +{ >> + void *p0 = buffer0; >> + void *p1 = buffer1; >> + void *pout = output; >> + >> + for (int i = 0; i < BUFSIZE; i++) { >> + /* >> + * Load into v12 as .tmp, then use it in the next packet >> + * Should get the new value within the same packet and >> + * the old value in the next packet >> + */ >> + asm("v3 = vmem(%0 + #0)\n\t" >> + "r1 = #1\n\t" >> + "v12 = vsplat(r1)\n\t" >> + "{\n\t" >> + " v12.tmp = vmem(%1 + #0)\n\t" >> + " v4.w = vadd(v12.w, v3.w)\n\t" >> + "}\n\t" >> + "v4.w = vadd(v4.w, v12.w)\n\t" >> + "vmem(%2 + #0) = v4\n\t" >> + : : "r"(p0), "r"(p1), "r"(pout) >> + : "r1", "v12", "v3", "v4", "v6", "memory"); >> + p0 += sizeof(MMVector); >> + p1 += sizeof(MMVector); >> + pout += sizeof(MMVector); >> + >> + for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { >> + expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1; >> + } >> + } >> + >> + check_output_w(__LINE__, BUFSIZE); >> +} > > This test fails guest-tests: > > tests/tcg/hexagon/hvx_misc.c:111:21: error: unknown register name 'v12' > in asm > : "r1", "v12", "v3", "v4", "v6", "memory"); > ^ > tests/tcg/hexagon/hvx_misc.c:362:9: note: expanded from macro 'TEST_VEC_OP2' > VEC_OP2(ASM, EL, p0, p1, pout); \ > ^ > Yep, this PR depends on the toolchain update that's pending. I'll ask Alex to hurry up the docker patch queue. r~
Plan-A should be to update the container with the new toolchain. Plan-B would be to remove the vector registers from the inline asm. Thanks! Taylor > -----Original Message----- > From: Richard Henderson <richard.henderson@linaro.org> > Sent: Monday, November 1, 2021 8:44 AM > To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson > <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée > <alex.bennee@linaro.org> > Cc: peter.maydell@linaro.org > Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test > > WARNING: This email originated from outside of Qualcomm. Please be wary > of any links or attachments, and do not enable macros. > > On 11/1/21 6:33 AM, Philippe Mathieu-Daudé wrote: > > On 10/31/21 17:43, Taylor Simpson wrote: > >> Tests for > >> packet semantics > >> vector loads (aligned and unaligned) > >> vector stores (aligned and unaligned) > >> vector masked stores > >> vector new value store > >> maximum HVX temps in a packet > >> vector operations > >> > >> Acked-by: Richard Henderson <richard.henderson@linaro.org> > >> Signed-off-by: Taylor Simpson <tsimpson@quicinc.com> > >> --- > >> tests/tcg/hexagon/hvx_misc.c | 469 > ++++++++++++++++++++++++++++++++++++++ > >> tests/tcg/hexagon/Makefile.target | 2 + > >> 2 files changed, 471 insertions(+) > >> create mode 100644 tests/tcg/hexagon/hvx_misc.c > > > >> +static void test_load_tmp(void) > >> +{ > >> + void *p0 = buffer0; > >> + void *p1 = buffer1; > >> + void *pout = output; > >> + > >> + for (int i = 0; i < BUFSIZE; i++) { > >> + /* > >> + * Load into v12 as .tmp, then use it in the next packet > >> + * Should get the new value within the same packet and > >> + * the old value in the next packet > >> + */ > >> + asm("v3 = vmem(%0 + #0)\n\t" > >> + "r1 = #1\n\t" > >> + "v12 = vsplat(r1)\n\t" > >> + "{\n\t" > >> + " v12.tmp = vmem(%1 + #0)\n\t" > >> + " v4.w = vadd(v12.w, v3.w)\n\t" > >> + "}\n\t" > >> + "v4.w = vadd(v4.w, v12.w)\n\t" > >> + "vmem(%2 + #0) = v4\n\t" > >> + : : "r"(p0), "r"(p1), "r"(pout) > >> + : "r1", "v12", "v3", "v4", "v6", "memory"); > >> + p0 += sizeof(MMVector); > >> + p1 += sizeof(MMVector); > >> + pout += sizeof(MMVector); > >> + > >> + for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { > >> + expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1; > >> + } > >> + } > >> + > >> + check_output_w(__LINE__, BUFSIZE); } > > > > This test fails guest-tests: > > > > tests/tcg/hexagon/hvx_misc.c:111:21: error: unknown register name 'v12' > > in asm > > : "r1", "v12", "v3", "v4", "v6", "memory"); > > ^ > > tests/tcg/hexagon/hvx_misc.c:362:9: note: expanded from macro > 'TEST_VEC_OP2' > > VEC_OP2(ASM, EL, p0, p1, pout); \ > > ^ > > > > Yep, this PR depends on the toolchain update that's pending. > I'll ask Alex to hurry up the docker patch queue. > > > r~
On 11/1/21 15:09, Taylor Simpson wrote: > Plan-A should be to update the container with the new toolchain. IIUC Richard is going with Plan-A: wait Alex queue get merged, then retry merging this pull request. > Plan-B would be to remove the vector registers from the inline asm. > > > Thanks! > Taylor > > >> -----Original Message----- >> From: Richard Henderson <richard.henderson@linaro.org> >> Sent: Monday, November 1, 2021 8:44 AM >> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson >> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée >> <alex.bennee@linaro.org> >> Cc: peter.maydell@linaro.org >> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test >> >> WARNING: This email originated from outside of Qualcomm. Please be wary >> of any links or attachments, and do not enable macros. >> >> On 11/1/21 6:33 AM, Philippe Mathieu-Daudé wrote: >>> On 10/31/21 17:43, Taylor Simpson wrote: >>>> Tests for >>>> packet semantics >>>> vector loads (aligned and unaligned) >>>> vector stores (aligned and unaligned) >>>> vector masked stores >>>> vector new value store >>>> maximum HVX temps in a packet >>>> vector operations >>>> >>>> Acked-by: Richard Henderson <richard.henderson@linaro.org> >>>> Signed-off-by: Taylor Simpson <tsimpson@quicinc.com> >>>> --- >>>> tests/tcg/hexagon/hvx_misc.c | 469 >> ++++++++++++++++++++++++++++++++++++++ >>>> tests/tcg/hexagon/Makefile.target | 2 + >>>> 2 files changed, 471 insertions(+) >>>> create mode 100644 tests/tcg/hexagon/hvx_misc.c >>> >>>> +static void test_load_tmp(void) >>>> +{ >>>> + void *p0 = buffer0; >>>> + void *p1 = buffer1; >>>> + void *pout = output; >>>> + >>>> + for (int i = 0; i < BUFSIZE; i++) { >>>> + /* >>>> + * Load into v12 as .tmp, then use it in the next packet >>>> + * Should get the new value within the same packet and >>>> + * the old value in the next packet >>>> + */ >>>> + asm("v3 = vmem(%0 + #0)\n\t" >>>> + "r1 = #1\n\t" >>>> + "v12 = vsplat(r1)\n\t" >>>> + "{\n\t" >>>> + " v12.tmp = vmem(%1 + #0)\n\t" >>>> + " v4.w = vadd(v12.w, v3.w)\n\t" >>>> + "}\n\t" >>>> + "v4.w = vadd(v4.w, v12.w)\n\t" >>>> + "vmem(%2 + #0) = v4\n\t" >>>> + : : "r"(p0), "r"(p1), "r"(pout) >>>> + : "r1", "v12", "v3", "v4", "v6", "memory"); >>>> + p0 += sizeof(MMVector); >>>> + p1 += sizeof(MMVector); >>>> + pout += sizeof(MMVector); >>>> + >>>> + for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { >>>> + expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1; >>>> + } >>>> + } >>>> + >>>> + check_output_w(__LINE__, BUFSIZE); } >>> >>> This test fails guest-tests: >>> >>> tests/tcg/hexagon/hvx_misc.c:111:21: error: unknown register name 'v12' >>> in asm >>> : "r1", "v12", "v3", "v4", "v6", "memory"); >>> ^ >>> tests/tcg/hexagon/hvx_misc.c:362:9: note: expanded from macro >> 'TEST_VEC_OP2' >>> VEC_OP2(ASM, EL, p0, p1, pout); \ >>> ^ >>> >> >> Yep, this PR depends on the toolchain update that's pending. >> I'll ask Alex to hurry up the docker patch queue. >> >> >> r~
On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote: > On 11/1/21 15:09, Taylor Simpson wrote: >> Plan-A should be to update the container with the new toolchain. > > IIUC Richard is going with Plan-A: wait Alex queue get merged, > then retry merging this pull request. Correct. r~
> -----Original Message----- > From: Richard Henderson <richard.henderson@linaro.org> > Sent: Monday, November 1, 2021 10:03 AM > To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson > <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée > <alex.bennee@linaro.org> > Cc: peter.maydell@linaro.org > Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test > > On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote: > > On 11/1/21 15:09, Taylor Simpson wrote: > >> Plan-A should be to update the container with the new toolchain. > > > > IIUC Richard is going with Plan-A: wait Alex queue get merged, then > > retry merging this pull request. > > Correct. Agreed. Just let me know if Alex isn't going to get the new toolchain merged in time, and I can go to plan B. Thanks, Taylor
Taylor Simpson <tsimpson@quicinc.com> writes: >> -----Original Message----- >> From: Richard Henderson <richard.henderson@linaro.org> >> Sent: Monday, November 1, 2021 10:03 AM >> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson >> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée >> <alex.bennee@linaro.org> >> Cc: peter.maydell@linaro.org >> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test >> >> On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote: >> > On 11/1/21 15:09, Taylor Simpson wrote: >> >> Plan-A should be to update the container with the new toolchain. >> > >> > IIUC Richard is going with Plan-A: wait Alex queue get merged, then >> > retry merging this pull request. >> >> Correct. > > Agreed. Just let me know if Alex isn't going to get the new toolchain > merged in time, and I can go to plan B. The PR has been a pain to get working but it should be in soon. Just need to work out why the signals test has broken between master and my PR: https://gitlab.com/qemu-project/qemu/-/jobs/1739288510#L1318 vs https://gitlab.com/stsquad/qemu/-/jobs/1740048034#L1316 Could it be a toolchain thing? > > Thanks, > Taylor
> -----Original Message----- > From: Alex Bennée <alex.bennee@linaro.org> > Sent: Tuesday, November 2, 2021 11:42 AM > To: Taylor Simpson <tsimpson@quicinc.com> > Cc: Richard Henderson <richard.henderson@linaro.org>; Philippe Mathieu- > Daudé <f4bug@amsat.org>; qemu-devel@nongnu.org; > peter.maydell@linaro.org > Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test > > Taylor Simpson <tsimpson@quicinc.com> writes: > > >> -----Original Message----- > >> From: Richard Henderson <richard.henderson@linaro.org> > >> Sent: Monday, November 1, 2021 10:03 AM > >> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson > >> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée > >> <alex.bennee@linaro.org> > >> Cc: peter.maydell@linaro.org > >> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc > >> test > >> > >> On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote: > >> > On 11/1/21 15:09, Taylor Simpson wrote: > >> >> Plan-A should be to update the container with the new toolchain. > >> > > >> > IIUC Richard is going with Plan-A: wait Alex queue get merged, then > >> > retry merging this pull request. > >> > >> Correct. > > > > Agreed. Just let me know if Alex isn't going to get the new toolchain > > merged in time, and I can go to plan B. > > The PR has been a pain to get working but it should be in soon. Just need to > work out why the signals test has broken between master and my > PR: > > https://gitlab.com/qemu-project/qemu/-/jobs/1739288510#L1318 > > vs > > https://gitlab.com/stsquad/qemu/-/jobs/1740048034#L1316 > > Could it be a toolchain thing? Not likely a toolchain problem. If I can access both of the signals binaries, I can confirm. Richard was doing some changes in qemu related to signals. Are his changes available in both repos? Taylor
Taylor Simpson <tsimpson@quicinc.com> writes: >> -----Original Message----- >> From: Alex Bennée <alex.bennee@linaro.org> >> Sent: Tuesday, November 2, 2021 11:42 AM >> To: Taylor Simpson <tsimpson@quicinc.com> >> Cc: Richard Henderson <richard.henderson@linaro.org>; Philippe Mathieu- >> Daudé <f4bug@amsat.org>; qemu-devel@nongnu.org; >> peter.maydell@linaro.org >> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc test >> >> Taylor Simpson <tsimpson@quicinc.com> writes: >> >> >> -----Original Message----- >> >> From: Richard Henderson <richard.henderson@linaro.org> >> >> Sent: Monday, November 1, 2021 10:03 AM >> >> To: Philippe Mathieu-Daudé <f4bug@amsat.org>; Taylor Simpson >> >> <tsimpson@quicinc.com>; qemu-devel@nongnu.org; Alex Bennée >> >> <alex.bennee@linaro.org> >> >> Cc: peter.maydell@linaro.org >> >> Subject: Re: [PULL 28/30] Hexagon HVX (tests/tcg/hexagon) hvx_misc >> >> test >> >> >> >> On 11/1/21 10:17 AM, Philippe Mathieu-Daudé wrote: >> >> > On 11/1/21 15:09, Taylor Simpson wrote: >> >> >> Plan-A should be to update the container with the new toolchain. >> >> > >> >> > IIUC Richard is going with Plan-A: wait Alex queue get merged, then >> >> > retry merging this pull request. >> >> >> >> Correct. >> > >> > Agreed. Just let me know if Alex isn't going to get the new toolchain >> > merged in time, and I can go to plan B. >> >> The PR has been a pain to get working but it should be in soon. Just need to >> work out why the signals test has broken between master and my >> PR: >> >> https://gitlab.com/qemu-project/qemu/-/jobs/1739288510#L1318 >> >> vs >> >> https://gitlab.com/stsquad/qemu/-/jobs/1740048034#L1316 >> >> Could it be a toolchain thing? > > Not likely a toolchain problem. If I can access both of the signals > binaries, I can confirm. Testing against two signals binaries I see a 4-7% failure rate against the new binary versus the original pre-toolchain change one. That's not to say the binary is broken - it could be a subtle change that exacerbated our existing poor signals support. https://transfer.sh/xA2ejk/signals.old (pre-toolchain change) https://transfer.sh/vSsn5s/signals something in the CI ensures it fails much more reliably as U can't get it to pass on a retry. > > Richard was doing some changes in qemu related to signals. Are his > changes available in both repos? I've tested against master and rth/tgt-next (742f07628c0) > > > Taylor
On 11/3/21 9:31 AM, Alex Bennée wrote: >>> Could it be a toolchain thing? >> >> Not likely a toolchain problem. If I can access both of the signals >> binaries, I can confirm. > > Testing against two signals binaries I see a 4-7% failure rate against the > new binary versus the original pre-toolchain change one. That's not to > say the binary is broken - it could be a subtle change that exacerbated > our existing poor signals support. > > https://transfer.sh/xA2ejk/signals.old (pre-toolchain change) > https://transfer.sh/vSsn5s/signals > > something in the CI ensures it fails much more reliably as U can't get > it to pass on a retry. I've had a closer look at the signals failure, and it really could be a toolchain problem. The sigsegv is at #0 0x00005555557a387f in stb_p (ptr=0x10000, v=0 '\000') at /home/richard.henderson/qemu/src/include/qemu/bswap.h:326 #1 0x00005555557a4bc5 in cpu_stb_mmu (env=0x555555e4eb50, addr=0, val=0 '\000', oi=0, ra=93824992935986) at ../src/accel/tcg/user-exec.c:359 #2 0x00005555557a5396 in cpu_stb_mmuidx_ra (env=0x555555e4eb50, addr=0, val=0, mmu_idx=0, ra=93824992935986) at ../src/accel/tcg/ldst_common.c.inc:83 #3 0x00005555557a57e6 in cpu_stb_data_ra (env=0x555555e4eb50, addr=0, val=0, ra=93824992935986) at ../src/accel/tcg/ldst_common.c.inc:183 #4 0x00005555555ff6f0 in helper_commit_store (env=0x555555e4eb50, slot_num=1) at ../src/target/hexagon/op_helper.c:151 #5 0x0000555555600032 in check_noshuf (env=0x555555e4eb50, slot=0) at ../src/target/hexagon/op_helper.c:407 #6 0x00005555556000e4 in mem_load4 (env=0x555555e4eb50, slot=0, vaddr=305000) at ../src/target/hexagon/op_helper.c:431 #7 0x00005555556063c0 in helper_L2_loadri_io (env=0x555555e4eb50, RsV=305000, siV=0, slot=0) at target/hexagon/helper_funcs_generated.c.inc:1013 #8 0x00007fffe8034f5a in code_gen_buffer () which is a store to address 0, which obviously should fail. This comes from IN: nontrivial_free 0x000224c4: 0x78004003 { R3 = #0x0 0x000224c8: 0xf204d001 P1 = cmp.eq(R4,R16) } 0x000224cc: 0x5c00413e { if (P1) jump:nt PC+124 0x000224d0: 0x38034000 if (P0) memb(R3+#0x0) = #0x0 0x000224d4: 0x9180c002 R2 = memw(R0+#0x0) } which is part of the new toolchain's libc. This is quite obviously a store to address 0 if P0 is true. Which looks pretty questionable. Presumably P0 is not always set, which is why the program does not always crash. But there doesn't appear to be anything wrong with the qemu translation. I'm suspicious of the new compiler. This looks like some sort of code scheduling bug, where R3=0 got moved ahead of the final use of the previous value in R3. In the short term, I recommend dropping the hexagon toolchain update and that Taylor generate a new HVX pull request with the new tests present but disabled in the makefile. r~
diff --git a/tests/tcg/hexagon/hvx_misc.c b/tests/tcg/hexagon/hvx_misc.c new file mode 100644 index 0000000..312bb98 --- /dev/null +++ b/tests/tcg/hexagon/hvx_misc.c @@ -0,0 +1,469 @@ +/* + * Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <string.h> + +int err; + +static void __check(int line, int i, int j, uint64_t result, uint64_t expect) +{ + if (result != expect) { + printf("ERROR at line %d: [%d][%d] 0x%016llx != 0x%016llx\n", + line, i, j, result, expect); + err++; + } +} + +#define check(RES, EXP) __check(__LINE__, RES, EXP) + +#define MAX_VEC_SIZE_BYTES 128 + +typedef union { + uint64_t ud[MAX_VEC_SIZE_BYTES / 8]; + int64_t d[MAX_VEC_SIZE_BYTES / 8]; + uint32_t uw[MAX_VEC_SIZE_BYTES / 4]; + int32_t w[MAX_VEC_SIZE_BYTES / 4]; + uint16_t uh[MAX_VEC_SIZE_BYTES / 2]; + int16_t h[MAX_VEC_SIZE_BYTES / 2]; + uint8_t ub[MAX_VEC_SIZE_BYTES / 1]; + int8_t b[MAX_VEC_SIZE_BYTES / 1]; +} MMVector; + +#define BUFSIZE 16 +#define OUTSIZE 16 +#define MASKMOD 3 + +MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); +MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); +MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); +MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); +MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); + +#define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \ +static void check_output_##FIELD(int line, size_t num_vectors) \ +{ \ + for (int i = 0; i < num_vectors; i++) { \ + for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \ + __check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \ + } \ + } \ +} + +CHECK_OUTPUT_FUNC(d, 8) +CHECK_OUTPUT_FUNC(w, 4) +CHECK_OUTPUT_FUNC(h, 2) +CHECK_OUTPUT_FUNC(b, 1) + +static void init_buffers(void) +{ + int counter0 = 0; + int counter1 = 17; + for (int i = 0; i < BUFSIZE; i++) { + for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { + buffer0[i].b[j] = counter0++; + buffer1[i].b[j] = counter1++; + } + for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { + mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1; + } + } +} + +static void test_load_tmp(void) +{ + void *p0 = buffer0; + void *p1 = buffer1; + void *pout = output; + + for (int i = 0; i < BUFSIZE; i++) { + /* + * Load into v12 as .tmp, then use it in the next packet + * Should get the new value within the same packet and + * the old value in the next packet + */ + asm("v3 = vmem(%0 + #0)\n\t" + "r1 = #1\n\t" + "v12 = vsplat(r1)\n\t" + "{\n\t" + " v12.tmp = vmem(%1 + #0)\n\t" + " v4.w = vadd(v12.w, v3.w)\n\t" + "}\n\t" + "v4.w = vadd(v4.w, v12.w)\n\t" + "vmem(%2 + #0) = v4\n\t" + : : "r"(p0), "r"(p1), "r"(pout) + : "r1", "v12", "v3", "v4", "v6", "memory"); + p0 += sizeof(MMVector); + p1 += sizeof(MMVector); + pout += sizeof(MMVector); + + for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { + expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1; + } + } + + check_output_w(__LINE__, BUFSIZE); +} + +static void test_load_cur(void) +{ + void *p0 = buffer0; + void *pout = output; + + for (int i = 0; i < BUFSIZE; i++) { + asm("{\n\t" + " v2.cur = vmem(%0 + #0)\n\t" + " vmem(%1 + #0) = v2\n\t" + "}\n\t" + : : "r"(p0), "r"(pout) : "v2", "memory"); + p0 += sizeof(MMVector); + pout += sizeof(MMVector); + + for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { + expect[i].uw[j] = buffer0[i].uw[j]; + } + } + + check_output_w(__LINE__, BUFSIZE); +} + +static void test_load_aligned(void) +{ + /* Aligned loads ignore the low bits of the address */ + void *p0 = buffer0; + void *pout = output; + const size_t offset = 13; + + p0 += offset; /* Create an unaligned address */ + asm("v2 = vmem(%0 + #0)\n\t" + "vmem(%1 + #0) = v2\n\t" + : : "r"(p0), "r"(pout) : "v2", "memory"); + + expect[0] = buffer0[0]; + + check_output_w(__LINE__, 1); +} + +static void test_load_unaligned(void) +{ + void *p0 = buffer0; + void *pout = output; + const size_t offset = 12; + + p0 += offset; /* Create an unaligned address */ + asm("v2 = vmemu(%0 + #0)\n\t" + "vmem(%1 + #0) = v2\n\t" + : : "r"(p0), "r"(pout) : "v2", "memory"); + + memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector)); + + check_output_w(__LINE__, 1); +} + +static void test_store_aligned(void) +{ + /* Aligned stores ignore the low bits of the address */ + void *p0 = buffer0; + void *pout = output; + const size_t offset = 13; + + pout += offset; /* Create an unaligned address */ + asm("v2 = vmem(%0 + #0)\n\t" + "vmem(%1 + #0) = v2\n\t" + : : "r"(p0), "r"(pout) : "v2", "memory"); + + expect[0] = buffer0[0]; + + check_output_w(__LINE__, 1); +} + +static void test_store_unaligned(void) +{ + void *p0 = buffer0; + void *pout = output; + const size_t offset = 12; + + pout += offset; /* Create an unaligned address */ + asm("v2 = vmem(%0 + #0)\n\t" + "vmemu(%1 + #0) = v2\n\t" + : : "r"(p0), "r"(pout) : "v2", "memory"); + + memcpy(expect, buffer0, 2 * sizeof(MMVector)); + memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector)); + + check_output_w(__LINE__, 2); +} + +static void test_masked_store(bool invert) +{ + void *p0 = buffer0; + void *pmask = mask; + void *pout = output; + + memset(expect, 0xff, sizeof(expect)); + memset(output, 0xff, sizeof(expect)); + + for (int i = 0; i < BUFSIZE; i++) { + if (invert) { + asm("r4 = #0\n\t" + "v4 = vsplat(r4)\n\t" + "v5 = vmem(%0 + #0)\n\t" + "q0 = vcmp.eq(v4.w, v5.w)\n\t" + "v5 = vmem(%1)\n\t" + "if (!q0) vmem(%2) = v5\n\t" /* Inverted test */ + : : "r"(pmask), "r"(p0), "r"(pout) + : "r4", "v4", "v5", "q0", "memory"); + } else { + asm("r4 = #0\n\t" + "v4 = vsplat(r4)\n\t" + "v5 = vmem(%0 + #0)\n\t" + "q0 = vcmp.eq(v4.w, v5.w)\n\t" + "v5 = vmem(%1)\n\t" + "if (q0) vmem(%2) = v5\n\t" /* Non-inverted test */ + : : "r"(pmask), "r"(p0), "r"(pout) + : "r4", "v4", "v5", "q0", "memory"); + } + p0 += sizeof(MMVector); + pmask += sizeof(MMVector); + pout += sizeof(MMVector); + + for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { + if (invert) { + if (i + j % MASKMOD != 0) { + expect[i].w[j] = buffer0[i].w[j]; + } + } else { + if (i + j % MASKMOD == 0) { + expect[i].w[j] = buffer0[i].w[j]; + } + } + } + } + + check_output_w(__LINE__, BUFSIZE); +} + +static void test_new_value_store(void) +{ + void *p0 = buffer0; + void *pout = output; + + asm("{\n\t" + " v2 = vmem(%0 + #0)\n\t" + " vmem(%1 + #0) = v2.new\n\t" + "}\n\t" + : : "r"(p0), "r"(pout) : "v2", "memory"); + + expect[0] = buffer0[0]; + + check_output_w(__LINE__, 1); +} + +static void test_max_temps() +{ + void *p0 = buffer0; + void *pout = output; + + asm("v0 = vmem(%0 + #0)\n\t" + "v1 = vmem(%0 + #1)\n\t" + "v2 = vmem(%0 + #2)\n\t" + "v3 = vmem(%0 + #3)\n\t" + "v4 = vmem(%0 + #4)\n\t" + "{\n\t" + " v1:0.w = vadd(v3:2.w, v1:0.w)\n\t" + " v2.b = vshuffe(v3.b, v2.b)\n\t" + " v3.w = vadd(v1.w, v4.w)\n\t" + " v4.tmp = vmem(%0 + #5)\n\t" + "}\n\t" + "vmem(%1 + #0) = v0\n\t" + "vmem(%1 + #1) = v1\n\t" + "vmem(%1 + #2) = v2\n\t" + "vmem(%1 + #3) = v3\n\t" + "vmem(%1 + #4) = v4\n\t" + : : "r"(p0), "r"(pout) : "memory"); + + /* The first two vectors come from the vadd-pair instruction */ + for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { + expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i]; + expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i]; + } + /* The third vector comes from the vshuffe instruction */ + for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) { + expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) | + (buffer0[3].uh[i] & 0xff) << 8; + } + /* The fourth vector comes from the vadd-single instruction */ + for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { + expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i]; + } + /* + * The fifth vector comes from the load to v4 + * make sure the .tmp is dropped + */ + expect[4] = buffer0[4]; + + check_output_b(__LINE__, 5); +} + +#define VEC_OP1(ASM, EL, IN, OUT) \ + asm("v2 = vmem(%0 + #0)\n\t" \ + "v2" #EL " = " #ASM "(v2" #EL ")\n\t" \ + "vmem(%1 + #0) = v2\n\t" \ + : : "r"(IN), "r"(OUT) : "v2", "memory") + +#define VEC_OP2(ASM, EL, IN0, IN1, OUT) \ + asm("v2 = vmem(%0 + #0)\n\t" \ + "v3 = vmem(%1 + #0)\n\t" \ + "v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \ + "vmem(%2 + #0) = v2\n\t" \ + : : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory") + +#define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \ +static void test_##NAME(void) \ +{ \ + void *pin = buffer0; \ + void *pout = output; \ + for (int i = 0; i < BUFSIZE; i++) { \ + VEC_OP1(ASM, EL, pin, pout); \ + pin += sizeof(MMVector); \ + pout += sizeof(MMVector); \ + } \ + for (int i = 0; i < BUFSIZE; i++) { \ + for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \ + expect[i].FIELD[j] = OP buffer0[i].FIELD[j]; \ + } \ + } \ + check_output_##FIELD(__LINE__, BUFSIZE); \ +} + +#define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \ +static void test_##NAME(void) \ +{ \ + void *p0 = buffer0; \ + void *p1 = buffer1; \ + void *pout = output; \ + for (int i = 0; i < BUFSIZE; i++) { \ + VEC_OP2(ASM, EL, p0, p1, pout); \ + p0 += sizeof(MMVector); \ + p1 += sizeof(MMVector); \ + pout += sizeof(MMVector); \ + } \ + for (int i = 0; i < BUFSIZE; i++) { \ + for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \ + expect[i].FIELD[j] = buffer0[i].FIELD[j] OP buffer1[i].FIELD[j]; \ + } \ + } \ + check_output_##FIELD(__LINE__, BUFSIZE); \ +} + +#define THRESHOLD 31 + +#define PRED_OP2(ASM, IN0, IN1, OUT, INV) \ + asm("r4 = #%3\n\t" \ + "v1.b = vsplat(r4)\n\t" \ + "v2 = vmem(%0 + #0)\n\t" \ + "q0 = vcmp.gt(v2.b, v1.b)\n\t" \ + "v3 = vmem(%1 + #0)\n\t" \ + "q1 = vcmp.gt(v3.b, v1.b)\n\t" \ + "q2 = " #ASM "(q0, " INV "q1)\n\t" \ + "r4 = #0xff\n\t" \ + "v1.b = vsplat(r4)\n\t" \ + "if (q2) vmem(%2 + #0) = v1\n\t" \ + : : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \ + : "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory") + +#define TEST_PRED_OP2(NAME, ASM, OP, INV) \ +static void test_##NAME(bool invert) \ +{ \ + void *p0 = buffer0; \ + void *p1 = buffer1; \ + void *pout = output; \ + memset(output, 0, sizeof(expect)); \ + for (int i = 0; i < BUFSIZE; i++) { \ + PRED_OP2(ASM, p0, p1, pout, INV); \ + p0 += sizeof(MMVector); \ + p1 += sizeof(MMVector); \ + pout += sizeof(MMVector); \ + } \ + for (int i = 0; i < BUFSIZE; i++) { \ + for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \ + bool p0 = (buffer0[i].b[j] > THRESHOLD); \ + bool p1 = (buffer1[i].b[j] > THRESHOLD); \ + if (invert) { \ + expect[i].b[j] = (p0 OP !p1) ? 0xff : 0x00; \ + } else { \ + expect[i].b[j] = (p0 OP p1) ? 0xff : 0x00; \ + } \ + } \ + } \ + check_output_b(__LINE__, BUFSIZE); \ +} + +TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +) +TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +) +TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +) +TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -) +TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -) +TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -) +TEST_VEC_OP2(vxor, vxor, , d, 8, ^) +TEST_VEC_OP2(vand, vand, , d, 8, &) +TEST_VEC_OP2(vor, vor, , d, 8, |) +TEST_VEC_OP1(vnot, vnot, , d, 8, ~) + +TEST_PRED_OP2(pred_or, or, |, "") +TEST_PRED_OP2(pred_or_n, or, |, "!") +TEST_PRED_OP2(pred_and, and, &, "") +TEST_PRED_OP2(pred_and_n, and, &, "!") +TEST_PRED_OP2(pred_xor, xor, ^, "") + +int main() +{ + init_buffers(); + + test_load_tmp(); + test_load_cur(); + test_load_aligned(); + test_load_unaligned(); + test_store_aligned(); + test_store_unaligned(); + test_masked_store(false); + test_masked_store(true); + test_new_value_store(); + test_max_temps(); + + test_vadd_w(); + test_vadd_h(); + test_vadd_b(); + test_vsub_w(); + test_vsub_h(); + test_vsub_b(); + test_vxor(); + test_vand(); + test_vor(); + test_vnot(); + + test_pred_or(false); + test_pred_or_n(true); + test_pred_and(false); + test_pred_and_n(true); + test_pred_xor(false); + + puts(err ? "FAIL" : "PASS"); + return err ? 1 : 0; +} diff --git a/tests/tcg/hexagon/Makefile.target b/tests/tcg/hexagon/Makefile.target index c598ac0..59b5bb1 100644 --- a/tests/tcg/hexagon/Makefile.target +++ b/tests/tcg/hexagon/Makefile.target @@ -42,7 +42,9 @@ HEX_TESTS += vector_add_int HEX_TESTS += atomics HEX_TESTS += fpstuff HEX_TESTS += overflow +HEX_TESTS += hvx_misc TESTS += $(HEX_TESTS) vector_add_int: CFLAGS += -mhvx -fvectorize +hvx_misc: CFLAGS += -mhvx