@@ -41,6 +41,7 @@
#endif
#define LANE_WIDTH (SHIFT ? 16 : 8)
+#define PACK_WIDTH (LANE_WIDTH / 2) /* half of LANE_WIDTH: 8 when SHIFT is nonzero, otherwise 4 */
/*
* Copy the relevant parts of a Reg value around. In the case where
@@ -474,71 +475,81 @@ void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
}
#endif
+#define SHUFFLE4(F, a, b, offset) do { /* relies on d, order and r0..r3 being in the caller's scope */ \
+ r0 = a->F((order & 3) + offset); /* order bits 1:0 select an element of a */ \
+ r1 = a->F(((order >> 2) & 3) + offset); /* order bits 3:2 select an element of a */ \
+ r2 = b->F(((order >> 4) & 3) + offset); /* order bits 5:4 select an element of b */ \
+ r3 = b->F(((order >> 6) & 3) + offset); /* order bits 7:6 select an element of b */ \
+ d->F(offset) = r0; /* all four reads are done before d is written, so a or b may point at d */ \
+ d->F(offset + 1) = r1; \
+ d->F(offset + 2) = r2; \
+ d->F(offset + 3) = r3; \
+ } while (0)
+
#if SHIFT == 0
void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3; /* scratch values assigned and consumed inside SHUFFLE4 */
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- MOVE(*d, r);
+ SHUFFLE4(W, s, s, 0); /* d->W(0..3) each come from the s->W() picked by a 2-bit field of order */
}
#else
void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ Reg *v = d; /* the first source operand is the destination register itself */
+ uint32_t r0, r1, r2, r3; /* scratch values assigned and consumed inside SHUFFLE4 */
+ int i;
- r.L(0) = d->L(order & 3);
- r.L(1) = d->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- MOVE(*d, r);
+ for (i = 0; i < 2 << SHIFT; i += 4) { /* four L elements are produced per pass */
+ SHUFFLE4(L, v, s, i); /* results i and i+1 are picked from v, results i+2 and i+3 from s */
+ }
}
void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ Reg *v = d; /* the first source operand is the destination register itself */
+ uint64_t r0, r1; /* hold both selected values so d is only written after v has been read */
+ int i;
- r.Q(0) = d->Q(order & 1);
- r.Q(1) = s->Q((order >> 1) & 1);
- MOVE(*d, r);
+ for (i = 0; i < 1 << SHIFT; i += 2) { /* two Q elements are produced per pass */
+ r0 = v->Q(((order & 1) & 1) + i); /* bit 0 of order picks the Q from v; NOTE(review): the second "& 1" is redundant */
+ r1 = s->Q(((order >> 1) & 1) + i); /* bit 1 of order picks the Q from s */
+ d->Q(i) = r0;
+ d->Q(i + 1) = r1;
+ order >>= 2; /* each pass consumes two bits of order */
+ }
}
void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint32_t r0, r1, r2, r3; /* scratch values assigned and consumed inside SHUFFLE4 */
+ int i;
- r.L(0) = s->L(order & 3);
- r.L(1) = s->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- MOVE(*d, r);
+ for (i = 0; i < 2 << SHIFT; i += 4) { /* four L elements are produced per pass */
+ SHUFFLE4(L, s, s, i); /* all four results are picked from s; the same order is applied on every pass */
+ }
}
void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3; /* scratch values assigned and consumed inside SHUFFLE4 */
+ int i, j; /* i is a W index for the shuffled group, j is the Q index that is copied through */
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- r.Q(1) = s->Q(1);
- MOVE(*d, r);
+ for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) { /* W groups start at 0, 8, ...; copied Qs are 1, 3, ... */
+ SHUFFLE4(W, s, s, i); /* d->W(i..i+3) are picked from s->W(i..i+3) by order */
+ d->Q(j) = s->Q(j); /* this Q is copied from s without reordering */
+ }
}
void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint16_t r0, r1, r2, r3; /* scratch values assigned and consumed inside SHUFFLE4 */
+ int i, j; /* i is a W index for the shuffled group, j is the Q index that is copied through */
- r.Q(0) = s->Q(0);
- r.W(4) = s->W(4 + (order & 3));
- r.W(5) = s->W(4 + ((order >> 2) & 3));
- r.W(6) = s->W(4 + ((order >> 4) & 3));
- r.W(7) = s->W(4 + ((order >> 6) & 3));
- MOVE(*d, r);
+ for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) { /* W groups start at 4, 12, ...; copied Qs are 0, 2, ... */
+ d->Q(j) = s->Q(j); /* this Q is copied from s without reordering */
+ SHUFFLE4(W, s, s, i); /* d->W(i..i+3) are picked from s->W(i..i+3) by order */
+ }
}
#endif
@@ -1091,156 +1102,132 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
return val;
}
-void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.B(0) = satsb((int16_t)d->W(0));
- r.B(1) = satsb((int16_t)d->W(1));
- r.B(2) = satsb((int16_t)d->W(2));
- r.B(3) = satsb((int16_t)d->W(3));
-#if SHIFT == 1
- r.B(4) = satsb((int16_t)d->W(4));
- r.B(5) = satsb((int16_t)d->W(5));
- r.B(6) = satsb((int16_t)d->W(6));
- r.B(7) = satsb((int16_t)d->W(7));
-#endif
- r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
- r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
- r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
- r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
-#if SHIFT == 1
- r.B(12) = satsb((int16_t)s->W(4));
- r.B(13) = satsb((int16_t)s->W(5));
- r.B(14) = satsb((int16_t)s->W(6));
- r.B(15) = satsb((int16_t)s->W(7));
-#endif
- MOVE(*d, r);
+#define PACK_HELPER_B(name, F) /* defines helper_pack<name>; F is applied to every 16-bit word */ \
+void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
+ Reg *d, Reg *s) \
+{ \
+ Reg *v = d; /* the first source operand is the destination register itself */ \
+ uint8_t r[PACK_WIDTH * 2]; /* staging buffer: d is written only after v and s have been read */ \
+ int j, k; \
+ for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { /* j is a W index into v and s */ \
+ for (k = 0; k < PACK_WIDTH; k++) { \
+ r[k] = F((int16_t)v->W(j + k)); /* first half of the buffer comes from v */ \
+ } \
+ for (k = 0; k < PACK_WIDTH; k++) { \
+ r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); /* second half comes from s */ \
+ } \
+ for (k = 0; k < PACK_WIDTH * 2; k++) { \
+ d->B(2 * j + k) = r[k]; /* B index 2 * j is the byte offset of W index j */ \
+ } \
+ } \
}
-void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
- Reg r;
-
- r.B(0) = satub((int16_t)d->W(0));
- r.B(1) = satub((int16_t)d->W(1));
- r.B(2) = satub((int16_t)d->W(2));
- r.B(3) = satub((int16_t)d->W(3));
-#if SHIFT == 1
- r.B(4) = satub((int16_t)d->W(4));
- r.B(5) = satub((int16_t)d->W(5));
- r.B(6) = satub((int16_t)d->W(6));
- r.B(7) = satub((int16_t)d->W(7));
-#endif
- r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
- r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
- r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
- r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
-#if SHIFT == 1
- r.B(12) = satub((int16_t)s->W(4));
- r.B(13) = satub((int16_t)s->W(5));
- r.B(14) = satub((int16_t)s->W(6));
- r.B(15) = satub((int16_t)s->W(7));
-#endif
- MOVE(*d, r);
-}
+PACK_HELPER_B(sswb, satsb) /* generates the helper_packsswb variant, converting words with satsb */
+PACK_HELPER_B(uswb, satub) /* generates the helper_packuswb variant, converting words with satub */
void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
- Reg r;
+ Reg *v = d; /* the first source operand is the destination register itself */
+ uint16_t r[PACK_WIDTH]; /* staging buffer: d is written only after v and s have been read */
+ int j, k;
- r.W(0) = satsw(d->L(0));
- r.W(1) = satsw(d->L(1));
-#if SHIFT == 1
- r.W(2) = satsw(d->L(2));
- r.W(3) = satsw(d->L(3));
-#endif
- r.W((2 << SHIFT) + 0) = satsw(s->L(0));
- r.W((2 << SHIFT) + 1) = satsw(s->L(1));
-#if SHIFT == 1
- r.W(6) = satsw(s->L(2));
- r.W(7) = satsw(s->L(3));
-#endif
- MOVE(*d, r);
+ for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) { /* j is an L index into v and s */
+ for (k = 0; k < PACK_WIDTH / 2; k++) {
+ r[k] = satsw(v->L(j + k)); /* first half of the buffer comes from v */
+ }
+ for (k = 0; k < PACK_WIDTH / 2; k++) {
+ r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k)); /* second half comes from s */
+ }
+ for (k = 0; k < PACK_WIDTH; k++) {
+ d->W(2 * j + k) = r[k]; /* W index 2 * j corresponds to L index j */
+ }
+ }
}
#define UNPCK_OP(base_name, base) \
\
void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; /* the first source operand is the destination register itself */ \
+ uint8_t r[PACK_WIDTH * 2]; /* staging buffer: d is written only after v and s have been read */ \
+ int j, i; \
\
- r.B(0) = d->B((base << (SHIFT + 2)) + 0); \
- r.B(1) = s->B((base << (SHIFT + 2)) + 0); \
- r.B(2) = d->B((base << (SHIFT + 2)) + 1); \
- r.B(3) = s->B((base << (SHIFT + 2)) + 1); \
- r.B(4) = d->B((base << (SHIFT + 2)) + 2); \
- r.B(5) = s->B((base << (SHIFT + 2)) + 2); \
- r.B(6) = d->B((base << (SHIFT + 2)) + 3); \
- r.B(7) = s->B((base << (SHIFT + 2)) + 3); \
- XMM_ONLY( \
- r.B(8) = d->B((base << (SHIFT + 2)) + 4); \
- r.B(9) = s->B((base << (SHIFT + 2)) + 4); \
- r.B(10) = d->B((base << (SHIFT + 2)) + 5); \
- r.B(11) = s->B((base << (SHIFT + 2)) + 5); \
- r.B(12) = d->B((base << (SHIFT + 2)) + 6); \
- r.B(13) = s->B((base << (SHIFT + 2)) + 6); \
- r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
- r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 8 << SHIFT; ) { /* j is advanced by the write-back loop below */ \
+ int k = j + base * PACK_WIDTH; /* base shifts the read window by PACK_WIDTH bytes */ \
+ for (i = 0; i < PACK_WIDTH; i++) { \
+ r[2 * i] = v->B(k + i); /* even slots take bytes from v */ \
+ r[2 * i + 1] = s->B(k + i); /* odd slots take bytes from s */ \
+ } \
+ for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \
+ d->B(j) = r[i]; \
+ } \
+ } \
} \
\
void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; /* the first source operand is the destination register itself */ \
+ uint16_t r[PACK_WIDTH]; /* staging buffer: d is written only after v and s have been read */ \
+ int j, i; \
\
- r.W(0) = d->W((base << (SHIFT + 1)) + 0); \
- r.W(1) = s->W((base << (SHIFT + 1)) + 0); \
- r.W(2) = d->W((base << (SHIFT + 1)) + 1); \
- r.W(3) = s->W((base << (SHIFT + 1)) + 1); \
- XMM_ONLY( \
- r.W(4) = d->W((base << (SHIFT + 1)) + 2); \
- r.W(5) = s->W((base << (SHIFT + 1)) + 2); \
- r.W(6) = d->W((base << (SHIFT + 1)) + 3); \
- r.W(7) = s->W((base << (SHIFT + 1)) + 3); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 4 << SHIFT; ) { /* j is advanced by the write-back loop below */ \
+ int k = j + base * PACK_WIDTH / 2; /* base shifts the read window by PACK_WIDTH / 2 words */ \
+ for (i = 0; i < PACK_WIDTH / 2; i++) { \
+ r[2 * i] = v->W(k + i); /* even slots take words from v */ \
+ r[2 * i + 1] = s->W(k + i); /* odd slots take words from s */ \
+ } \
+ for (i = 0; i < PACK_WIDTH; i++, j++) { \
+ d->W(j) = r[i]; \
+ } \
+ } \
} \
\
void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; /* the first source operand is the destination register itself */ \
+ uint32_t r[PACK_WIDTH / 2]; /* staging buffer: d is written only after v and s have been read */ \
+ int j, i; \
\
- r.L(0) = d->L((base << SHIFT) + 0); \
- r.L(1) = s->L((base << SHIFT) + 0); \
- XMM_ONLY( \
- r.L(2) = d->L((base << SHIFT) + 1); \
- r.L(3) = s->L((base << SHIFT) + 1); \
- ) \
- MOVE(*d, r); \
+ for (j = 0; j < 2 << SHIFT; ) { /* j is advanced by the write-back loop below */ \
+ int k = j + base * PACK_WIDTH / 4; /* base shifts the read window by PACK_WIDTH / 4 L elements */ \
+ for (i = 0; i < PACK_WIDTH / 4; i++) { \
+ r[2 * i] = v->L(k + i); /* even slots take L elements from v */ \
+ r[2 * i + 1] = s->L(k + i); /* odd slots take L elements from s */ \
+ } \
+ for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \
+ d->L(j) = r[i]; \
+ } \
+ } \
} \
\
XMM_ONLY( \
- \
+ CPUX86State *env, Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; /* the first source operand is the destination register itself */ \
+ uint64_t r[2]; /* both Q values are read before d is written */ \
+ int i; \
\
- r.Q(0) = d->Q(base); \
- r.Q(1) = s->Q(base); \
- MOVE(*d, r); \
+ for (i = 0; i < 1 << SHIFT; i += 2) { /* two Q elements are produced per pass */ \
+ r[0] = v->Q(base + i); /* base picks which Q of the pair is read from v */ \
+ r[1] = s->Q(base + i); /* the same Q position is read from s */ \
+ d->Q(i) = r[0]; \
+ d->Q(i + 1) = r[1]; \
+ } \
} \
)
UNPCK_OP(l, 0)
UNPCK_OP(h, 1)
+#undef PACK_WIDTH /* the pack/unpack helper macros are dropped once the helpers above are defined */
+#undef PACK_HELPER_B
+#undef UNPCK_OP
+
+
/* 3DNow! float ops */
#if SHIFT == 0
void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
@@ -1393,122 +1380,86 @@ void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
/* SSSE3 op helpers */
void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)