Patch Detail

GET /api/patches/812958/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 812958,
    "url": "http://patchwork.ozlabs.org/api/patches/812958/?format=api",
    "web_url": "http://patchwork.ozlabs.org/project/qemu-devel/patch/20170912162513.21694-10-richard.henderson@linaro.org/",
    "project": {
        "id": 14,
        "url": "http://patchwork.ozlabs.org/api/projects/14/?format=api",
        "name": "QEMU Development",
        "link_name": "qemu-devel",
        "list_id": "qemu-devel.nongnu.org",
        "list_email": "qemu-devel@nongnu.org",
        "web_url": "",
        "scm_url": "",
        "webscm_url": "",
        "list_archive_url": "",
        "list_archive_url_format": "",
        "commit_url_format": ""
    },
    "msgid": "<20170912162513.21694-10-richard.henderson@linaro.org>",
    "list_archive_url": null,
    "date": "2017-09-12T16:25:06",
    "name": "[v2,09/16] tcg/i386: Add vector operations",
    "commit_ref": null,
    "pull_url": null,
    "state": "new",
    "archived": false,
    "hash": "df5cf86f7e9b3746f6617a9e1b7a7b674b987f5e",
    "submitter": {
        "id": 72104,
        "url": "http://patchwork.ozlabs.org/api/people/72104/?format=api",
        "name": "Richard Henderson",
        "email": "richard.henderson@linaro.org"
    },
    "delegate": null,
    "mbox": "http://patchwork.ozlabs.org/project/qemu-devel/patch/20170912162513.21694-10-richard.henderson@linaro.org/mbox/",
    "series": [
        {
            "id": 2737,
            "url": "http://patchwork.ozlabs.org/api/series/2737/?format=api",
            "web_url": "http://patchwork.ozlabs.org/project/qemu-devel/list/?series=2737",
            "date": "2017-09-12T16:24:59",
            "name": "TCG vectorization and example conversion",
            "version": 2,
            "mbox": "http://patchwork.ozlabs.org/series/2737/mbox/"
        }
    ],
    "comments": "http://patchwork.ozlabs.org/api/patches/812958/comments/",
    "check": "pending",
    "checks": "http://patchwork.ozlabs.org/api/patches/812958/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>",
        "X-Original-To": "incoming@patchwork.ozlabs.org",
        "Delivered-To": "patchwork-incoming@bilbo.ozlabs.org",
        "Authentication-Results": [
            "ozlabs.org;\n\tspf=pass (mailfrom) smtp.mailfrom=nongnu.org\n\t(client-ip=2001:4830:134:3::11; helo=lists.gnu.org;\n\tenvelope-from=qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org;\n\treceiver=<UNKNOWN>)",
            "ozlabs.org;\n\tdkim=fail reason=\"signature verification failed\" (1024-bit key;\n\tunprotected) header.d=linaro.org header.i=@linaro.org\n\theader.b=\"D47IKS5M\"; dkim-atps=neutral"
        ],
        "Received": [
            "from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11])\n\t(using TLSv1 with cipher AES256-SHA (256/256 bits))\n\t(No client certificate requested)\n\tby ozlabs.org (Postfix) with ESMTPS id 3xs9KZ1tYcz9s7g\n\tfor <incoming@patchwork.ozlabs.org>;\n\tWed, 13 Sep 2017 02:31:30 +1000 (AEST)",
            "from localhost ([::1]:36907 helo=lists.gnu.org)\n\tby lists.gnu.org with esmtp (Exim 4.71) (envelope-from\n\t<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>)\n\tid 1dro5w-0007GI-93\n\tfor incoming@patchwork.ozlabs.org; Tue, 12 Sep 2017 12:31:28 -0400",
            "from eggs.gnu.org ([2001:4830:134:3::10]:38111)\n\tby lists.gnu.org with esmtp (Exim 4.71)\n\t(envelope-from <richard.henderson@linaro.org>) id 1dro0E-0001pL-44\n\tfor qemu-devel@nongnu.org; Tue, 12 Sep 2017 12:25:37 -0400",
            "from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)\n\t(envelope-from <richard.henderson@linaro.org>) id 1dro0A-00075X-Is\n\tfor qemu-devel@nongnu.org; Tue, 12 Sep 2017 12:25:34 -0400",
            "from mail-pf0-x233.google.com ([2607:f8b0:400e:c00::233]:35556)\n\tby eggs.gnu.org with esmtps (TLS1.0:RSA_AES_128_CBC_SHA1:16)\n\t(Exim 4.71) (envelope-from <richard.henderson@linaro.org>)\n\tid 1dro0A-00074W-7q\n\tfor qemu-devel@nongnu.org; Tue, 12 Sep 2017 12:25:30 -0400",
            "by mail-pf0-x233.google.com with SMTP id q76so5835189pfq.2\n\tfor <qemu-devel@nongnu.org>; Tue, 12 Sep 2017 09:25:30 -0700 (PDT)",
            "from bigtime.twiddle.net (97-126-103-167.tukw.qwest.net.\n\t[97.126.103.167]) by smtp.gmail.com with ESMTPSA id\n\tb22sm20382140pfh.175.2017.09.12.09.25.26\n\t(version=TLS1_2 cipher=ECDHE-RSA-CHACHA20-POLY1305 bits=256/256);\n\tTue, 12 Sep 2017 09:25:27 -0700 (PDT)"
        ],
        "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=linaro.org; s=google;\n\th=from:to:cc:subject:date:message-id:in-reply-to:references;\n\tbh=qZd9rXD4x3sGgQqm73GKr9CNNq9c12Y54lVEupqGsLk=;\n\tb=D47IKS5MsEknDhmR6kMr+HsSCihZIf8CdiivHRESwcbXyJWuZyk72knlDE33K6W75T\n\tQ6yDIbZcfQuc+5dwwNqp1inAtI2kCCEkgCEq5uJylOnx+KMKeG+LuP9VefKUYGyN/2S5\n\tEiyDDk5IVpHj/3adzxVgzTEM1sElKFPIdA4pI=",
        "X-Google-DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed;\n\td=1e100.net; s=20161025;\n\th=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to\n\t:references;\n\tbh=qZd9rXD4x3sGgQqm73GKr9CNNq9c12Y54lVEupqGsLk=;\n\tb=KMpOOMmuS9uwaFNyHw9z2F6FzI/U7LYBnFEG/CvSwChp1r6laOJlc96VD4fK9NlTbK\n\tXItqxKud5YeGBop3CB1yMnz671dr1dNvOWgqBsMHL+EXELsU+o1503/aq9ImsQ/pMe9m\n\tDXgppB3RMhcBQnlzAUZtb9U3hPIzGNUnaUQERq10l+4go8LP7ZjC+7+eWUQydWlEBI16\n\tqsrT80X9rnUi1wZWJvrqI5igNNEJFxqe1UC4/ZxN7gWTrqBYAr6CfGuccPPnjJRpu1pD\n\tDwFMzsXvi1je90+b1MeCgHSMijuU1zC4hhgWeo8pqjpvYw6FIvfkPaUeJkiv8KjC5u1S\n\t1rDA==",
        "X-Gm-Message-State": "AHPjjUg/pNPNgsV6OawqcTU0hPuzINJ9LCpyDJoCDGlCiqhsv6QdrxDe\n\tKisZYvB3REucQ9dKNQOgtA==",
        "X-Google-Smtp-Source": "AOwi7QC22HxhlvbqDmssv1PH3sv/p78GZH6cRsVG48G3Guj6nVbpHICDlYANsk/gdu+rtyXTz2Dv8g==",
        "X-Received": "by 10.159.246.2 with SMTP id b2mr1431282pls.85.1505233528585;\n\tTue, 12 Sep 2017 09:25:28 -0700 (PDT)",
        "From": "Richard Henderson <richard.henderson@linaro.org>",
        "To": "qemu-devel@nongnu.org",
        "Date": "Tue, 12 Sep 2017 09:25:06 -0700",
        "Message-Id": "<20170912162513.21694-10-richard.henderson@linaro.org>",
        "X-Mailer": "git-send-email 2.13.5",
        "In-Reply-To": "<20170912162513.21694-1-richard.henderson@linaro.org>",
        "References": "<20170912162513.21694-1-richard.henderson@linaro.org>",
        "X-detected-operating-system": "by eggs.gnu.org: Genre and OS details not\n\trecognized.",
        "X-Received-From": "2607:f8b0:400e:c00::233",
        "Subject": "[Qemu-devel] [PATCH v2 09/16] tcg/i386: Add vector operations",
        "X-BeenThere": "qemu-devel@nongnu.org",
        "X-Mailman-Version": "2.1.21",
        "Precedence": "list",
        "List-Id": "<qemu-devel.nongnu.org>",
        "List-Unsubscribe": "<https://lists.nongnu.org/mailman/options/qemu-devel>,\n\t<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>",
        "List-Archive": "<http://lists.nongnu.org/archive/html/qemu-devel/>",
        "List-Post": "<mailto:qemu-devel@nongnu.org>",
        "List-Help": "<mailto:qemu-devel-request@nongnu.org?subject=help>",
        "List-Subscribe": "<https://lists.nongnu.org/mailman/listinfo/qemu-devel>,\n\t<mailto:qemu-devel-request@nongnu.org?subject=subscribe>",
        "Cc": "alex.bennee@linaro.org, f4bug@amsat.org",
        "Errors-To": "qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org",
        "Sender": "\"Qemu-devel\"\n\t<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>"
    },
    "content": "Signed-off-by: Richard Henderson <richard.henderson@linaro.org>\n---\n tcg/i386/tcg-target.h     |  46 ++++-\n tcg/i386/tcg-target.inc.c | 438 +++++++++++++++++++++++++++++++++++++++++-----\n 2 files changed, 438 insertions(+), 46 deletions(-)",
    "diff": "diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h\nindex b89dababf4..03f2506223 100644\n--- a/tcg/i386/tcg-target.h\n+++ b/tcg/i386/tcg-target.h\n@@ -30,11 +30,10 @@\n \n #ifdef __x86_64__\n # define TCG_TARGET_REG_BITS  64\n-# define TCG_TARGET_NB_REGS   16\n #else\n # define TCG_TARGET_REG_BITS  32\n-# define TCG_TARGET_NB_REGS    8\n #endif\n+# define TCG_TARGET_NB_REGS   24\n \n typedef enum {\n     TCG_REG_EAX = 0,\n@@ -56,6 +55,19 @@ typedef enum {\n     TCG_REG_R13,\n     TCG_REG_R14,\n     TCG_REG_R15,\n+\n+    /* SSE registers; 64-bit has access to 8 more, but we won't\n+       need more than a few and using only the first 8 minimizes\n+       the need for a rex prefix on the sse instructions.  */\n+    TCG_REG_XMM0,\n+    TCG_REG_XMM1,\n+    TCG_REG_XMM2,\n+    TCG_REG_XMM3,\n+    TCG_REG_XMM4,\n+    TCG_REG_XMM5,\n+    TCG_REG_XMM6,\n+    TCG_REG_XMM7,\n+\n     TCG_REG_RAX = TCG_REG_EAX,\n     TCG_REG_RCX = TCG_REG_ECX,\n     TCG_REG_RDX = TCG_REG_EDX,\n@@ -78,6 +90,17 @@ typedef enum {\n extern bool have_bmi1;\n extern bool have_popcnt;\n \n+#ifdef __SSE2__\n+#define have_sse2  true\n+#else\n+extern bool have_sse2;\n+#endif\n+#ifdef __AVX2__\n+#define have_avx2  true\n+#else\n+extern bool have_avx2;\n+#endif\n+\n /* optional instructions */\n #define TCG_TARGET_HAS_div2_i32         1\n #define TCG_TARGET_HAS_rot_i32          1\n@@ -146,6 +169,25 @@ extern bool have_popcnt;\n #define TCG_TARGET_HAS_mulsh_i64        0\n #endif\n \n+#define TCG_TARGET_HAS_v64              have_sse2\n+#define TCG_TARGET_HAS_v128             have_sse2\n+#define TCG_TARGET_HAS_v256             have_avx2\n+\n+#define TCG_TARGET_HAS_andc_v64         TCG_TARGET_HAS_v64\n+#define TCG_TARGET_HAS_orc_v64          0\n+#define TCG_TARGET_HAS_not_v64          0\n+#define TCG_TARGET_HAS_neg_v64          0\n+\n+#define TCG_TARGET_HAS_andc_v128        TCG_TARGET_HAS_v128\n+#define TCG_TARGET_HAS_orc_v128         0\n+#define TCG_TARGET_HAS_not_v128         0\n+#define TCG_TARGET_HAS_neg_v128         0\n+\n+#define TCG_TARGET_HAS_andc_v256        TCG_TARGET_HAS_v256\n+#define TCG_TARGET_HAS_orc_v256         0\n+#define TCG_TARGET_HAS_not_v256         0\n+#define TCG_TARGET_HAS_neg_v256         0\n+\n #define TCG_TARGET_deposit_i32_valid(ofs, len) \\\n     (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \\\n      ((ofs) == 0 && (len) == 16))\ndiff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c\nindex 5231056fd3..fbb41c3b7a 100644\n--- a/tcg/i386/tcg-target.inc.c\n+++ b/tcg/i386/tcg-target.inc.c\n@@ -28,10 +28,11 @@\n static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {\n #if TCG_TARGET_REG_BITS == 64\n     \"%rax\", \"%rcx\", \"%rdx\", \"%rbx\", \"%rsp\", \"%rbp\", \"%rsi\", \"%rdi\",\n-    \"%r8\",  \"%r9\",  \"%r10\", \"%r11\", \"%r12\", \"%r13\", \"%r14\", \"%r15\",\n #else\n     \"%eax\", \"%ecx\", \"%edx\", \"%ebx\", \"%esp\", \"%ebp\", \"%esi\", \"%edi\",\n #endif\n+    \"%r8\",  \"%r9\",  \"%r10\", \"%r11\", \"%r12\", \"%r13\", \"%r14\", \"%r15\",\n+    \"%xmm0\", \"%xmm1\", \"%xmm2\", \"%xmm3\", \"%xmm4\", \"%xmm5\", \"%xmm6\", \"%xmm7\",\n };\n #endif\n \n@@ -61,6 +62,14 @@ static const int tcg_target_reg_alloc_order[] = {\n     TCG_REG_EDX,\n     TCG_REG_EAX,\n #endif\n+    TCG_REG_XMM0,\n+    TCG_REG_XMM1,\n+    TCG_REG_XMM2,\n+    TCG_REG_XMM3,\n+    TCG_REG_XMM4,\n+    TCG_REG_XMM5,\n+    TCG_REG_XMM6,\n+    TCG_REG_XMM7,\n };\n \n static const int tcg_target_call_iarg_regs[] = {\n@@ -94,7 +103,7 @@ static const int tcg_target_call_oarg_regs[] = {\n #define TCG_CT_CONST_I32 0x400\n #define TCG_CT_CONST_WSZ 0x800\n \n-/* Registers used with L constraint, which are the first argument \n+/* Registers used with L constraint, which are the first argument\n    registers on x86_64, and two random call clobbered registers on\n    i386. */\n #if TCG_TARGET_REG_BITS == 64\n@@ -126,6 +135,16 @@ static bool have_cmov;\n bool have_bmi1;\n bool have_popcnt;\n \n+#ifndef have_sse2\n+bool have_sse2;\n+#endif\n+#ifdef have_avx2\n+#define have_avx1  have_avx2\n+#else\n+static bool have_avx1;\n+bool have_avx2;\n+#endif\n+\n #ifdef CONFIG_CPUID_H\n static bool have_movbe;\n static bool have_bmi2;\n@@ -192,6 +211,7 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,\n         tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);\n         break;\n     case 'q':\n+        /* A register that can be used as a byte operand.  */\n         ct->ct |= TCG_CT_REG;\n         if (TCG_TARGET_REG_BITS == 64) {\n             tcg_regset_set32(ct->u.regs, 0, 0xffff);\n@@ -200,10 +220,12 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,\n         }\n         break;\n     case 'Q':\n+        /* A register with an addressable second byte (e.g. %ah).  */\n         ct->ct |= TCG_CT_REG;\n         tcg_regset_set32(ct->u.regs, 0, 0xf);\n         break;\n     case 'r':\n+        /* A general register.  */\n         ct->ct |= TCG_CT_REG;\n         if (TCG_TARGET_REG_BITS == 64) {\n             tcg_regset_set32(ct->u.regs, 0, 0xffff);\n@@ -215,6 +237,11 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,\n         /* With TZCNT/LZCNT, we can have operand-size as an input.  */\n         ct->ct |= TCG_CT_CONST_WSZ;\n         break;\n+    case 'x':\n+        /* A vector register.  */\n+        ct->ct |= TCG_CT_REG;\n+        tcg_regset_set32(ct->u.regs, 0, 0xff0000);\n+        break;\n \n         /* qemu_ld/st address constraint */\n     case 'L':\n@@ -289,8 +316,9 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,\n # define P_REXB_RM\t0\n # define P_GS           0\n #endif\n-#define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */\n-#define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */\n+#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */\n+#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */\n+#define P_VEXL          0x80000         /* Set VEX.L = 1 */\n \n #define OPC_ARITH_EvIz\t(0x81)\n #define OPC_ARITH_EvIb\t(0x83)\n@@ -322,11 +350,29 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,\n #define OPC_MOVL_Iv     (0xb8)\n #define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)\n #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)\n+#define OPC_MOVDQA_GyMy (0x6f | P_EXT | P_DATA16)\n+#define OPC_MOVDQA_MyGy (0x7f | P_EXT | P_DATA16)\n+#define OPC_MOVDQU_GyMy (0x6f | P_EXT | P_SIMDF3)\n+#define OPC_MOVDQU_MyGy (0x7f | P_EXT | P_SIMDF3)\n+#define OPC_MOVQ_GyMy   (0x7e | P_EXT | P_SIMDF3)\n+#define OPC_MOVQ_MyGy   (0xd6 | P_EXT | P_DATA16)\n #define OPC_MOVSBL\t(0xbe | P_EXT)\n #define OPC_MOVSWL\t(0xbf | P_EXT)\n #define OPC_MOVSLQ\t(0x63 | P_REXW)\n #define OPC_MOVZBL\t(0xb6 | P_EXT)\n #define OPC_MOVZWL\t(0xb7 | P_EXT)\n+#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)\n+#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)\n+#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)\n+#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)\n+#define OPC_PAND        (0xdb | P_EXT | P_DATA16)\n+#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)\n+#define OPC_POR         (0xeb | P_EXT | P_DATA16)\n+#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)\n+#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)\n+#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)\n+#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)\n+#define OPC_PXOR        (0xef | P_EXT | P_DATA16)\n #define OPC_POP_r32\t(0x58)\n #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)\n #define OPC_PUSH_r32\t(0x50)\n@@ -342,6 +388,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,\n #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)\n #define OPC_TESTL\t(0x85)\n #define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)\n+#define OPC_VZEROUPPER  (0x77 | P_EXT)\n #define OPC_XCHG_ax_r32\t(0x90)\n \n #define OPC_GRP3_Ev\t(0xf7)\n@@ -491,11 +538,20 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)\n     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));\n }\n \n-static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)\n+static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,\n+                            int rm, int index)\n {\n     int tmp;\n \n-    if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {\n+    /* Use the two byte form if possible, which cannot encode\n+       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */\n+    if ((opc & (P_EXT | P_EXT38 | P_REXW)) == P_EXT\n+        && ((rm | index) & 8) == 0) {\n+        /* Two byte VEX prefix.  */\n+        tcg_out8(s, 0xc5);\n+\n+        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */\n+    } else {\n         /* Three byte VEX prefix.  */\n         tcg_out8(s, 0xc4);\n \n@@ -505,20 +561,17 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)\n         } else if (opc & P_EXT) {\n             tmp = 1;\n         } else {\n-            tcg_abort();\n+            g_assert_not_reached();\n         }\n-        tmp |= 0x40;                       /* VEX.X */\n-        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */\n-        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */\n+        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */\n+        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */\n+        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */\n         tcg_out8(s, tmp);\n \n-        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */\n-    } else {\n-        /* Two byte VEX prefix.  */\n-        tcg_out8(s, 0xc5);\n-\n-        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */\n+        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */\n     }\n+\n+    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */\n     /* VEX.pp */\n     if (opc & P_DATA16) {\n         tmp |= 1;                          /* 0x66 */\n@@ -530,6 +583,11 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)\n     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */\n     tcg_out8(s, tmp);\n     tcg_out8(s, opc);\n+}\n+\n+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)\n+{\n+    tcg_out_vex_opc(s, opc, r, v, rm, 0);\n     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));\n }\n \n@@ -538,8 +596,8 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)\n    mode for absolute addresses, ~RM is the size of the immediate operand\n    that will follow the instruction.  */\n \n-static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,\n-                                     int index, int shift, intptr_t offset)\n+static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,\n+                               int shift, intptr_t offset)\n {\n     int mod, len;\n \n@@ -550,7 +608,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,\n             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;\n             intptr_t disp = offset - pc;\n             if (disp == (int32_t)disp) {\n-                tcg_out_opc(s, opc, r, 0, 0);\n                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);\n                 tcg_out32(s, disp);\n                 return;\n@@ -560,7 +617,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,\n                use of the MODRM+SIB encoding and is therefore larger than\n                rip-relative addressing.  */\n             if (offset == (int32_t)offset) {\n-                tcg_out_opc(s, opc, r, 0, 0);\n                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);\n                 tcg_out8(s, (4 << 3) | 5);\n                 tcg_out32(s, offset);\n@@ -568,10 +624,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,\n             }\n \n             /* ??? The memory isn't directly addressable.  */\n-            tcg_abort();\n+            g_assert_not_reached();\n         } else {\n             /* Absolute address.  */\n-            tcg_out_opc(s, opc, r, 0, 0);\n             tcg_out8(s, (r << 3) | 5);\n             tcg_out32(s, offset);\n             return;\n@@ -594,7 +649,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,\n        that would be used for %esp is the escape to the two byte form.  */\n     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {\n         /* Single byte MODRM format.  */\n-        tcg_out_opc(s, opc, r, rm, 0);\n         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));\n     } else {\n         /* Two byte MODRM+SIB format.  */\n@@ -608,7 +662,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,\n             tcg_debug_assert(index != TCG_REG_ESP);\n         }\n \n-        tcg_out_opc(s, opc, r, rm, index);\n         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);\n         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));\n     }\n@@ -620,6 +673,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,\n     }\n }\n \n+static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,\n+                                     int index, int shift, intptr_t offset)\n+{\n+    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);\n+    tcg_out_sib_offset(s, r, rm, index, shift, offset);\n+}\n+\n+static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,\n+                                         int rm, int index, int shift,\n+                                         intptr_t offset)\n+{\n+    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);\n+    tcg_out_sib_offset(s, r, rm, index, shift, offset);\n+}\n+\n /* A simplification of the above with no index or shift.  */\n static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,\n                                         int rm, intptr_t offset)\n@@ -627,6 +695,31 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,\n     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);\n }\n \n+static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,\n+                                            int v, int rm, intptr_t offset)\n+{\n+    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);\n+}\n+\n+static void tcg_out_maybe_vex_modrm(TCGContext *s, int opc, int r, int rm)\n+{\n+    if (have_avx1) {\n+        tcg_out_vex_modrm(s, opc, r, 0, rm);\n+    } else {\n+        tcg_out_modrm(s, opc, r, rm);\n+    }\n+}\n+\n+static void tcg_out_maybe_vex_modrm_offset(TCGContext *s, int opc, int r,\n+                                           int rm, intptr_t offset)\n+{\n+    if (have_avx1) {\n+        tcg_out_vex_modrm_offset(s, opc, r, 0, rm, offset);\n+    } else {\n+        tcg_out_modrm_offset(s, opc, r, rm, offset);\n+    }\n+}\n+\n /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */\n static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)\n {\n@@ -637,12 +730,33 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)\n     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);\n }\n \n-static inline void tcg_out_mov(TCGContext *s, TCGType type,\n-                               TCGReg ret, TCGReg arg)\n+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)\n {\n-    if (arg != ret) {\n-        int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);\n-        tcg_out_modrm(s, opc, ret, arg);\n+    if (arg == ret) {\n+        return;\n+    }\n+    switch (type) {\n+    case TCG_TYPE_I32:\n+        tcg_debug_assert(ret < 16 && arg < 16);\n+        tcg_out_modrm(s, OPC_MOVL_GvEv, ret, arg);\n+        break;\n+    case TCG_TYPE_I64:\n+        tcg_debug_assert(ret < 16 && arg < 16);\n+        tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, ret, arg);\n+        break;\n+\n+    case TCG_TYPE_V256:\n+        tcg_debug_assert(ret >= 16 && arg >= 16);\n+        tcg_out_vex_modrm(s, OPC_MOVDQA_GyMy | P_VEXL, ret, 0, arg);\n+        break;\n+    case TCG_TYPE_V128:\n+    case TCG_TYPE_V64:\n+        tcg_debug_assert(ret >= 16 && arg >= 16);\n+        tcg_out_maybe_vex_modrm(s, OPC_MOVDQA_GyMy, ret, arg);\n+        break;\n+\n+    default:\n+        g_assert_not_reached();\n     }\n }\n \n@@ -651,6 +765,29 @@ static void tcg_out_movi(TCGContext *s, TCGType type,\n {\n     tcg_target_long diff;\n \n+    switch (type) {\n+    case TCG_TYPE_I32:\n+    case TCG_TYPE_I64:\n+        tcg_debug_assert(ret < 16);\n+        break;\n+\n+    case TCG_TYPE_V64:\n+    case TCG_TYPE_V128:\n+    case TCG_TYPE_V256:\n+        tcg_debug_assert(ret >= 16);\n+        /* ??? Revisit this as the implementation progresses.  */\n+        tcg_debug_assert(arg == 0);\n+        if (have_avx1) {\n+            tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);\n+        } else {\n+            tcg_out_modrm(s, OPC_PXOR, ret, ret);\n+        }\n+        return;\n+\n+    default:\n+        g_assert_not_reached();\n+    }\n+\n     if (arg == 0) {\n         tgen_arithr(s, ARITH_XOR, ret, ret);\n         return;\n@@ -714,18 +851,64 @@ static inline void tcg_out_pop(TCGContext *s, int reg)\n     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);\n }\n \n-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,\n-                              TCGReg arg1, intptr_t arg2)\n+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,\n+                       TCGReg arg1, intptr_t arg2)\n {\n-    int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);\n-    tcg_out_modrm_offset(s, opc, ret, arg1, arg2);\n+    switch (type) {\n+    case TCG_TYPE_I64:\n+        tcg_debug_assert(ret < 16);\n+        tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);\n+        break;\n+    case TCG_TYPE_I32:\n+        tcg_debug_assert(ret < 16);\n+        tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);\n+        break;\n+    case TCG_TYPE_V64:\n+        tcg_debug_assert(ret >= 16);\n+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_GyMy, ret, arg1, arg2);\n+        break;\n+    case TCG_TYPE_V128:\n+        tcg_debug_assert(ret >= 16);\n+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_GyMy, ret, arg1, arg2);\n+        break;\n+    case TCG_TYPE_V256:\n+        tcg_debug_assert(ret >= 16);\n+        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_GyMy | P_VEXL,\n+                                 ret, 0, arg1, arg2);\n+        break;\n+    default:\n+        g_assert_not_reached();\n+    }\n }\n \n-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,\n-                              TCGReg arg1, intptr_t arg2)\n+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,\n+                       TCGReg arg1, intptr_t arg2)\n {\n-    int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);\n-    tcg_out_modrm_offset(s, opc, arg, arg1, arg2);\n+    switch (type) {\n+    case TCG_TYPE_I64:\n+        tcg_debug_assert(arg < 16);\n+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);\n+        break;\n+    case TCG_TYPE_I32:\n+        tcg_debug_assert(arg < 16);\n+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);\n+        break;\n+    case TCG_TYPE_V64:\n+        tcg_debug_assert(arg >= 16);\n+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_MyGy, arg, arg1, arg2);\n+        break;\n+    case TCG_TYPE_V128:\n+        tcg_debug_assert(arg >= 16);\n+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_MyGy, arg, arg1, arg2);\n+        break;\n+    case TCG_TYPE_V256:\n+        tcg_debug_assert(arg >= 16);\n+        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_MyGy | P_VEXL,\n+                                 arg, 0, arg1, arg2);\n+        break;\n+    default:\n+        g_assert_not_reached();\n+    }\n }\n \n static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,\n@@ -737,6 +920,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,\n             return false;\n         }\n         rexw = P_REXW;\n+    } else if (type != TCG_TYPE_I32) {\n+        return false;\n     }\n     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);\n     tcg_out32(s, val);\n@@ -1871,6 +2056,15 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,\n         case glue(glue(INDEX_op_, x), _i32)\n #endif\n \n+#define OP_128_256(x) \\\n+        case glue(glue(INDEX_op_, x), _v256): \\\n+            rexw = P_VEXL; /* FALLTHRU */     \\\n+        case glue(glue(INDEX_op_, x), _v128)\n+\n+#define OP_64_128_256(x) \\\n+        OP_128_256(x):   \\\n+        case glue(glue(INDEX_op_, x), _v64)\n+\n     /* Hoist the loads of the most common arguments.  */\n     a0 = args[0];\n     a1 = args[1];\n@@ -2266,19 +2460,98 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,\n         }\n         break;\n \n+    OP_64_128_256(add8):\n+        c = OPC_PADDB;\n+        goto gen_simd;\n+    OP_64_128_256(add16):\n+        c = OPC_PADDW;\n+        goto gen_simd;\n+    OP_64_128_256(add32):\n+        c = OPC_PADDD;\n+        goto gen_simd;\n+    OP_128_256(add64):\n+        c = OPC_PADDQ;\n+        goto gen_simd;\n+    OP_64_128_256(sub8):\n+        c = OPC_PSUBB;\n+        goto gen_simd;\n+    OP_64_128_256(sub16):\n+        c = OPC_PSUBW;\n+        goto gen_simd;\n+    OP_64_128_256(sub32):\n+        c = OPC_PSUBD;\n+        goto gen_simd;\n+    OP_128_256(sub64):\n+        c = OPC_PSUBQ;\n+        goto gen_simd;\n+    OP_64_128_256(and):\n+        c = OPC_PAND;\n+        goto gen_simd;\n+    OP_64_128_256(or):\n+        c = OPC_POR;\n+        goto gen_simd;\n+    OP_64_128_256(xor):\n+        c = OPC_PXOR;\n+    gen_simd:\n+        if (have_avx1) {\n+            tcg_out_vex_modrm(s, c, a0, a1, a2);\n+        } else {\n+            tcg_out_modrm(s, c, a0, a2);\n+        }\n+        break;\n+    OP_64_128_256(andc):\n+        if (have_avx1) {\n+            tcg_out_vex_modrm(s, OPC_PANDN, a0, a2, a1);\n+        } else {\n+            tcg_out_modrm(s, c, a0, a1);\n+        }\n+        break;\n+\n+    case INDEX_op_ld_v64:\n+        c = TCG_TYPE_V64;\n+        goto gen_simd_ld;\n+    case INDEX_op_ld_v128:\n+        c = TCG_TYPE_V128;\n+        goto gen_simd_ld;\n+    case INDEX_op_ld_v256:\n+        c = TCG_TYPE_V256;\n+    gen_simd_ld:\n+        tcg_out_ld(s, c, a0, a1, a2);\n+        break;\n+\n+    case INDEX_op_st_v64:\n+        c = TCG_TYPE_V64;\n+        goto gen_simd_st;\n+    case INDEX_op_st_v128:\n+        c = TCG_TYPE_V128;\n+        goto gen_simd_st;\n+    case INDEX_op_st_v256:\n+        c = TCG_TYPE_V256;\n+    gen_simd_st:\n+        tcg_out_st(s, c, a0, a1, a2);\n+        break;\n+\n     case INDEX_op_mb:\n         tcg_out_mb(s, a0);\n         break;\n     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */\n     case INDEX_op_mov_i64:\n+    case INDEX_op_mov_v64:\n+    case INDEX_op_mov_v128:\n+    case INDEX_op_mov_v256:\n     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */\n     case INDEX_op_movi_i64:\n+    case INDEX_op_movi_v64:\n+    case INDEX_op_movi_v128:\n+    case INDEX_op_movi_v256:\n     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */\n     default:\n         tcg_abort();\n     }\n \n #undef OP_32_64\n+#undef OP_128_256\n+#undef OP_64_128_256\n }\n \n static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)\n@@ -2304,6 +2577,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)\n         = { .args_ct_str = { \"r\", \"r\", \"L\", \"L\" } };\n     static const TCGTargetOpDef L_L_L_L\n         = { .args_ct_str = { \"L\", \"L\", \"L\", \"L\" } };\n+    static const TCGTargetOpDef x_0_x = { .args_ct_str = { \"x\", \"0\", \"x\" } };\n+    static const TCGTargetOpDef x_x_0 = { .args_ct_str = { \"x\", \"x\", \"0\" } };\n+    static const TCGTargetOpDef x_x_x = { .args_ct_str = { \"x\", \"x\", \"x\" } };\n+    static const TCGTargetOpDef x_r = { .args_ct_str = { \"x\", \"r\" } };\n \n     switch (op) {\n     case INDEX_op_goto_ptr:\n@@ -2505,6 +2782,53 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)\n             return &s2;\n         }\n \n+    case INDEX_op_ld_v64:\n+    case INDEX_op_ld_v128:\n+    case INDEX_op_ld_v256:\n+    case INDEX_op_st_v64:\n+    case INDEX_op_st_v128:\n+    case INDEX_op_st_v256:\n+        return &x_r;\n+\n+    case INDEX_op_add8_v64:\n+    case INDEX_op_add8_v128:\n+    case INDEX_op_add16_v64:\n+    case INDEX_op_add16_v128:\n+    case INDEX_op_add32_v64:\n+    case INDEX_op_add32_v128:\n+    case INDEX_op_add64_v128:\n+    case INDEX_op_sub8_v64:\n+    case INDEX_op_sub8_v128:\n+    case INDEX_op_sub16_v64:\n+    case INDEX_op_sub16_v128:\n+    case INDEX_op_sub32_v64:\n+    case INDEX_op_sub32_v128:\n+    case INDEX_op_sub64_v128:\n+    case INDEX_op_and_v64:\n+    case INDEX_op_and_v128:\n+    case INDEX_op_or_v64:\n+    case INDEX_op_or_v128:\n+    case INDEX_op_xor_v64:\n+    case INDEX_op_xor_v128:\n+        return have_avx1 ? &x_x_x : &x_0_x;\n+    case INDEX_op_andc_v64:\n+    case INDEX_op_andc_v128:\n+        return have_avx1 ? &x_x_x : &x_x_0;\n+\n+    case INDEX_op_add8_v256:\n+    case INDEX_op_add16_v256:\n+    case INDEX_op_add32_v256:\n+    case INDEX_op_add64_v256:\n+    case INDEX_op_sub8_v256:\n+    case INDEX_op_sub16_v256:\n+    case INDEX_op_sub32_v256:\n+    case INDEX_op_sub64_v256:\n+    case INDEX_op_and_v256:\n+    case INDEX_op_andc_v256:\n+    case INDEX_op_or_v256:\n+    case INDEX_op_xor_v256:\n+        return &x_x_x;\n+\n     default:\n         break;\n     }\n@@ -2589,6 +2913,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)\n \n     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);\n \n+    if (have_avx2) {\n+        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);\n+    }\n     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {\n         tcg_out_pop(s, tcg_target_callee_save_regs[i]);\n     }\n@@ -2610,9 +2937,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)\n static void tcg_target_init(TCGContext *s)\n {\n #ifdef CONFIG_CPUID_H\n-    unsigned a, b, c, d;\n+    unsigned a, b, c, d, b7 = 0;\n     int max = __get_cpuid_max(0, 0);\n \n+    if (max >= 7) {\n+        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */\n+        __cpuid_count(7, 0, a, b7, c, d);\n+        have_bmi1 = (b7 & bit_BMI) != 0;\n+        have_bmi2 = (b7 & bit_BMI2) != 0;\n+    }\n+\n     if (max >= 1) {\n         __cpuid(1, a, b, c, d);\n #ifndef have_cmov\n@@ -2621,17 +2955,26 @@ static void tcg_target_init(TCGContext *s)\n            available, we'll use a small forward branch.  */\n         have_cmov = (d & bit_CMOV) != 0;\n #endif\n+#ifndef have_sse2\n+        have_sse2 = (d & bit_SSE2) != 0;\n+#endif\n         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we\n            need to probe for it.  */\n         have_movbe = (c & bit_MOVBE) != 0;\n         have_popcnt = (c & bit_POPCNT) != 0;\n-    }\n \n-    if (max >= 7) {\n-        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */\n-        __cpuid_count(7, 0, a, b, c, d);\n-        have_bmi1 = (b & bit_BMI) != 0;\n-        have_bmi2 = (b & bit_BMI2) != 0;\n+#ifndef have_avx2\n+        /* There are a number of things we must check before we can be\n+           sure of not hitting invalid opcode.  */\n+        if (c & bit_OSXSAVE) {\n+            unsigned xcrl, xcrh;\n+            asm (\"xgetbv\" : \"=a\" (xcrl), \"=d\" (xcrh) : \"c\" (0));\n+            if ((xcrl & 6) == 6) {\n+                have_avx1 = (c & bit_AVX) != 0;\n+                have_avx2 = (b7 & bit_AVX2) != 0;\n+            }\n+        }\n+#endif\n     }\n \n     max = __get_cpuid_max(0x8000000, 0);\n@@ -2648,6 +2991,13 @@ static void tcg_target_init(TCGContext *s)\n     } else {\n         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);\n     }\n+    if (have_sse2) {\n+        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V64], 0, 0xff0000);\n+        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V128], 0, 0xff0000);\n+    }\n+    if (have_avx2) {\n+        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V256], 0, 0xff0000);\n+    }\n \n     tcg_regset_clear(tcg_target_call_clobber_regs);\n     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);\n",
    "prefixes": [
        "v2",
        "09/16"
    ]
}