From patchwork Sat Dec 21 16:43:43 2013 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Aurelien Jarno X-Patchwork-Id: 304419 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11]) (using TLSv1 with cipher AES256-SHA (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 5A2E12C0096 for ; Sun, 22 Dec 2013 03:46:16 +1100 (EST) Received: from localhost ([::1]:54781 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1VuPgr-0001A0-F5 for incoming@patchwork.ozlabs.org; Sat, 21 Dec 2013 11:46:13 -0500 Received: from eggs.gnu.org ([2001:4830:134:3::10]:50578) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1VuPep-0006wl-Co for qemu-devel@nongnu.org; Sat, 21 Dec 2013 11:44:12 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1VuPef-0006Mk-Np for qemu-devel@nongnu.org; Sat, 21 Dec 2013 11:44:07 -0500 Received: from hall.aurel32.net ([2001:bc8:30d7:101::1]:46497) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1VuPef-0006LS-E0 for qemu-devel@nongnu.org; Sat, 21 Dec 2013 11:43:57 -0500 Received: from pc-97-206-86-200.cm.vtr.net ([200.86.206.97] helo=ohm.rr44.fr) by hall.aurel32.net with esmtpsa (TLS1.2:DHE_RSA_AES_128_CBC_SHA1:128) (Exim 4.80) (envelope-from ) id 1VuPec-0004oc-JM; Sat, 21 Dec 2013 17:43:55 +0100 Received: from aurel32 by ohm.rr44.fr with local (Exim 4.80) (envelope-from ) id 1VuPeT-0000e6-F1; Sat, 21 Dec 2013 17:43:45 +0100 From: Aurelien Jarno To: qemu-devel@nongnu.org Date: Sat, 21 Dec 2013 17:43:43 +0100 Message-Id: <1387644224-2404-5-git-send-email-aurelien@aurel32.net> X-Mailer: git-send-email 1.7.10.4 In-Reply-To: <1387644224-2404-1-git-send-email-aurelien@aurel32.net> References: 
<1387644224-2404-1-git-send-email-aurelien@aurel32.net> X-detected-operating-system: by eggs.gnu.org: Error: Malformed IPv6 address (bad octet value). X-Received-From: 2001:bc8:30d7:101::1 Cc: Aurelien Jarno Subject: [Qemu-devel] [PATCH 4/5] tcg/i386: use movbe instruction in qemu_ldst routines X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org The movbe instruction has been added on some Intel Atom CPUs and on recent Intel Haswell CPUs. It allows loading/storing a value and byte-swapping it at the same time. This patch detects the availability of this instruction and, when available, uses it in the qemu load/store routines in place of load/store + bswap. Note that for 16-bit unsigned loads, movbe + movzw is basically the same as movzw + bswap, so the patch doesn't touch this case. Signed-off-by: Aurelien Jarno --- tcg/i386/tcg-target.c | 152 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 107 insertions(+), 45 deletions(-) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index e247829..8fbb0be 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -99,18 +99,31 @@ static const int tcg_target_call_oarg_regs[] = { # define TCG_REG_L1 TCG_REG_EDX #endif +/* The host compiler should supply <cpuid.h> to enable runtime features + detection, as we're not going to go so far as our own inline assembly. + If not available, default values will be assumed. */ +#if defined(CONFIG_CPUID_H) +#include +#endif + /* For 32-bit, we are going to attempt to determine at runtime whether cmov - is available. However, the host compiler must supply <cpuid.h>, as we're - not going to go so far as our own inline assembly. */ + is available. 
*/ #if TCG_TARGET_REG_BITS == 64 # define have_cmov 1 #elif defined(CONFIG_CPUID_H) -#include static bool have_cmov; #else # define have_cmov 0 #endif +/* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are + going to attempt to determine at runtime whether movbe is available. */ +#if defined(CONFIG_CPUID_H) && defined(bit_MOVBE) +static bool have_movbe; +#else +# define have_movbe 0 +#endif + static uint8_t *tb_ret_addr; static void patch_reloc(uint8_t *code_ptr, int type, @@ -280,6 +293,8 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_MOVB_EvIz (0xc6) #define OPC_MOVL_EvIz (0xc7) #define OPC_MOVL_Iv (0xb8) +#define OPC_MOVBE_GyMy (0xf0 | P_EXT2) +#define OPC_MOVBE_MyGy (0xf1 | P_EXT2) #define OPC_MOVSBL (0xbe | P_EXT) #define OPC_MOVSWL (0xbf | P_EXT) #define OPC_MOVSLQ (0x63 | P_REXW) @@ -1363,8 +1378,13 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, break; case MO_SW: if (bswap) { - tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); - tcg_out_rolw_8(s, datalo); + if (have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, + datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); + tcg_out_rolw_8(s, datalo); + } tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo); } else { tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg, @@ -1372,16 +1392,25 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, } break; case MO_UL: - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); - if (bswap) { - tcg_out_bswap32(s, datalo); + if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + seg, datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); + if (bswap) { + tcg_out_bswap32(s, datalo); + } } break; #if TCG_TARGET_REG_BITS == 64 case MO_SL: if (bswap) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); - 
tcg_out_bswap32(s, datalo); + if (have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + seg, + datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); + tcg_out_bswap32(s, datalo); + } tcg_out_ext32s(s, datalo, datalo); } else { tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs); @@ -1390,29 +1419,34 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, #endif case MO_Q: if (TCG_TARGET_REG_BITS == 64) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg, - datalo, base, ofs); - if (bswap) { - tcg_out_bswap64(s, datalo); + if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_REXW + seg, + datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg, + datalo, base, ofs); + if (bswap) { + tcg_out_bswap64(s, datalo); + } } } else { + int opc = OPC_MOVL_GvEv; if (bswap) { int t = datalo; datalo = datahi; datahi = t; + if (have_movbe) { + opc = OPC_MOVBE_GyMy; + } } if (base != datalo) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datalo, base, ofs); - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datahi, base, ofs + 4); + tcg_out_modrm_offset(s, opc + seg, datalo, base, ofs); + tcg_out_modrm_offset(s, opc + seg, datahi, base, ofs + 4); } else { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datahi, base, ofs + 4); - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datalo, base, ofs); + tcg_out_modrm_offset(s, opc + seg, datahi, base, ofs + 4); + tcg_out_modrm_offset(s, opc + seg, datalo, base, ofs); } - if (bswap) { + if (bswap && opc != OPC_MOVBE_GyMy) { tcg_out_bswap32(s, datalo); tcg_out_bswap32(s, datahi); } @@ -1506,31 +1540,48 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, datalo, base, ofs); break; case MO_16: - if (bswap) { - tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); - tcg_out_rolw_8(s, scratch); - datalo = scratch; + if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + 
P_DATA16 + seg, + datalo, base, ofs); + } else { + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); + tcg_out_rolw_8(s, scratch); + datalo = scratch; + } + tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg, + datalo, base, ofs); } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg, - datalo, base, ofs); break; case MO_32: - if (bswap) { - tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); - tcg_out_bswap32(s, scratch); - datalo = scratch; + if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datalo, base, ofs); + } else { + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); + tcg_out_bswap32(s, scratch); + datalo = scratch; + } + tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs); } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs); break; case MO_64: if (TCG_TARGET_REG_BITS == 64) { - if (bswap) { - tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo); - tcg_out_bswap64(s, scratch); - datalo = scratch; + if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + P_REXW + seg, + datalo, base, ofs); + } else { + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo); + tcg_out_bswap64(s, scratch); + datalo = scratch; + } + tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg, + datalo, base, ofs); } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg, - datalo, base, ofs); + } else if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datahi, base, ofs); + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datalo, base, ofs + 4); } else if (bswap) { tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi); tcg_out_bswap32(s, scratch); @@ -2167,13 +2218,24 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_target_init(TCGContext *s) { - /* For 32-bit, 99% certainty that we're running on hardware that supports - cmov, but we still need to check. In case cmov is not available, we'll - use a small forward branch. 
*/ -#ifndef have_cmov +#if !(defined(have_cmov) && defined(have_movbe)) { unsigned a, b, c, d; - have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV)); + int ret; + ret = __get_cpuid(1, &a, &b, &c, &d); + +# ifndef have_cmov + /* For 32-bit, 99% certainty that we're running on hardware that + supports cmov, but we still need to check. In case cmov is not + available, we'll use a small forward branch. */ + have_cmov = ret && (d & bit_CMOV); +# endif + +# ifndef have_movbe + /* MOVBE is only available on Intel Atom and Haswell CPUs, so we + need to probe for it. */ + have_movbe = ret && (c & bit_MOVBE); +# endif } #endif