@@ -451,6 +451,9 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
return val;
}
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator, which is really needed only when
+ * dealing with 32 bits input values. */
#define NEON_FN(dest, src1, src2) do { \
int8_t tmp; \
tmp = (int8_t)src2; \
@@ -459,11 +462,12 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
} else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
dest = src1 >> (sizeof(src1) * 8 - 1); \
} else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> (tmp - 1); \
+ dest = src1 >> (-tmp - 1); \
dest++; \
dest >>= 1; \
} else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
+ int64_t big_dest = ((int64_t)src1 + (1 << (-1 - tmp))); \
+ dest = big_dest >> -tmp; \
} else { \
dest = src1 << tmp; \
}} while (0)
@@ -472,6 +476,8 @@ NEON_VOP(rshl_s16, neon_s16, 2)
NEON_VOP(rshl_s32, neon_s32, 1)
#undef NEON_FN
+/* Handling addition overflow with 64 bits input values is more
+ * tricky than with 32 bits values. */
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
{
int8_t shift = (int8_t)shiftop;
@@ -480,18 +486,37 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
val = 0;
} else if (shift < -64) {
val >>= 63;
- } else if (shift == -63) {
+ } else if (shift == -64) {
val >>= 63;
val++;
val >>= 1;
} else if (shift < 0) {
- val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
+ int64_t round = (int64_t)1 << (-1 - shift);
+ /* Reduce the range as long as the addition overflows. It's
+ * sufficient to check if (val+round) is < 0 and val > 0
+ * because round is > 0. */
+ while ((val > 0) && ((val + round) < 0) && round > 1) {
+ shift++;
+ round >>= 1;
+ val >>= 1;
+ }
+ if ((val > 0) && (val + round) < 0) {
+ /* If addition still overflows at this point, it means
+ * that round==1, thus shift==-1, and also that
+ * val==0x7FFFFFFFFFFFFFFF. */
+ val = 0x4000000000000000LL;
+ } else {
+ val = (val + round) >> -shift;
+ }
} else {
val <<= shift;
}
return val;
}
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator, which is really needed only when
+ * dealing with 32 bits input values. */
#define NEON_FN(dest, src1, src2) do { \
int8_t tmp; \
tmp = (int8_t)src2; \
@@ -499,9 +524,10 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
tmp < -(ssize_t)sizeof(src1) * 8) { \
dest = 0; \
} else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> (tmp - 1); \
+ dest = src1 >> (-tmp - 1); \
} else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
+ uint64_t big_dest = ((uint64_t)src1 + (1 << (-1 - tmp))); \
+ dest = big_dest >> -tmp; \
} else { \
dest = src1 << tmp; \
}} while (0)
@@ -513,14 +539,29 @@ NEON_VOP(rshl_u32, neon_u32, 1)
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
{
int8_t shift = (uint8_t)shiftop;
- if (shift >= 64 || shift < 64) {
+ if (shift >= 64 || shift < -64) {
val = 0;
} else if (shift == -64) {
/* Rounding a 1-bit result just preserves that bit. */
val >>= 63;
- } if (shift < 0) {
- val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
- val >>= -shift;
+ } else if (shift < 0) {
+ uint64_t round = (uint64_t)1 << (-1 - shift);
+ /* Reduce the range as long as the addition overflows. It's
+ * sufficient to check if (val+round) is < val
+ * because val and round are > 0. */
+ while (((val + round) < val) && round > 1) {
+ shift++;
+ round >>= 1;
+ val >>= 1;
+ }
+ if ((val + round) < val) {
+ /* If addition still overflows at this point, it means
+ * that round==1, thus shift==-1, and also that
+                 * val==0xFFFFFFFFFFFFFFFF. */
+ val = 0x8000000000000000LL;
+ } else {
+ val = (val + round) >> -shift;
+ }
} else {
val <<= shift;
}
@@ -4877,10 +4877,18 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
if (size == 0) {
imm = (0xffu >> (8 - shift));
imm |= imm << 16;
- } else {
+ } else if (size == 1) {
imm = 0xffff >> (16 - shift);
+ } else {
+ /* size == 2 */
+ imm = 0xffffffff >> (32 - shift);
+ }
+ if (size < 2) {
+ imm64 = imm | (((uint64_t)imm) << 32);
+ } else {
+ imm64 = imm;
}
- imm64 = imm | (((uint64_t)imm) << 32);
+ imm64 = ~imm64;
tcg_gen_andi_i64(cpu_V0, cpu_V0, imm64);
}
}