From patchwork Fri Nov 2 05:35:01 2012 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: malc X-Patchwork-Id: 196487 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [208.118.235.17]) (using TLSv1 with cipher AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id 8486E2C0358 for ; Fri, 2 Nov 2012 16:35:19 +1100 (EST) Received: from localhost ([::1]:40421 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1TU9uX-0004iH-ME for incoming@patchwork.ozlabs.org; Fri, 02 Nov 2012 01:35:17 -0400 Received: from eggs.gnu.org ([208.118.235.92]:43293) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1TU9uO-0004gz-1v for qemu-devel@nongnu.org; Fri, 02 Nov 2012 01:35:10 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1TU9uL-0004pU-Hr for qemu-devel@nongnu.org; Fri, 02 Nov 2012 01:35:07 -0400 Received: from fe01x03-cgp.akado.ru ([77.232.31.164]:59779 helo=akado.ru) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1TU9uK-0004ic-Qv for qemu-devel@nongnu.org; Fri, 02 Nov 2012 01:35:05 -0400 Received: from [10.0.66.9] ([10.0.66.9] verified) by fe01-cgp.akado.ru (CommuniGate Pro SMTP 5.2.13) with ESMTPS id 360921815; Fri, 02 Nov 2012 09:35:01 +0400 Date: Fri, 2 Nov 2012 09:35:01 +0400 (MSK) From: malc X-X-Sender: malc@linmac To: Yeongkyoon Lee In-Reply-To: <1351667065-16274-1-git-send-email-yeongkyoon.lee@samsung.com> Message-ID: References: <1351667065-16274-1-git-send-email-yeongkyoon.lee@samsung.com> User-Agent: Alpine 2.00 (LNX 1167 2008-08-23) MIME-Version: 1.0 X-detected-operating-system: by eggs.gnu.org: FreeBSD 8.x X-Received-From: 77.232.31.164 Cc: blauwirbel@gmail.com, qemu-devel@nongnu.org, aurelien@aurel32.net, rth@twiddle.net Subject: Re: [Qemu-devel] [PATCH 
v8 0/3] tcg: enhance code generation quality for qemu_ld/st IRs X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org On Wed, 31 Oct 2012, Yeongkyoon Lee wrote: > Here is the 8th version of the series optimizing TCG qemu_ld/st code generation. > > v8: > - Rebase [..snip..] FWIW, here's a ppc32 implementation of your idea. Thanks for explaining the motivation behind certain aspects in our private discussion. diff --git a/configure b/configure index 4a54a8b..e42ad64 100755 --- a/configure +++ b/configure @@ -3844,7 +3844,7 @@ upper() { } case "$cpu" in - i386|x86_64) + i386|x86_64|ppc) echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_target_mak ;; esac diff --git a/exec-all.h b/exec-all.h index ad6d22b..c3b3e50 100644 --- a/exec-all.h +++ b/exec-all.h @@ -337,6 +337,9 @@ extern uintptr_t tci_tb_ptr; # define GETRA() ((uintptr_t)__builtin_return_address(0)) # define GETPC_LDST() ((uintptr_t)(GETRA() + 7 + \ *(int32_t *)((void *)GETRA() + 3) - 1)) +# elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64) +# define GETRA() ((uintptr_t)__builtin_return_address(0)) +# define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() + 4)) - 1)) # else # error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!" 
# endif diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c index 60b7b92..ec10fa8 100644 --- a/tcg/ppc/tcg-target.c +++ b/tcg/ppc/tcg-target.c @@ -39,8 +39,6 @@ static uint8_t *tb_ret_addr; #define LR_OFFSET 4 #endif -#define FAST_PATH - #ifndef GUEST_BASE #define GUEST_BASE 0 #endif @@ -520,6 +518,37 @@ static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg) #if defined(CONFIG_SOFTMMU) +static void add_qemu_ldst_label (TCGContext *s, + int is_ld, + int opc, + int data_reg, + int data_reg2, + int addrlo_reg, + int addrhi_reg, + int mem_index, + uint8_t *raddr, + uint8_t *label_ptr) +{ + int idx; + TCGLabelQemuLdst *label; + + if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) { + tcg_abort(); + } + + idx = s->nb_qemu_ldst_labels++; + label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx]; + label->is_ld = is_ld; + label->opc = opc; + label->datalo_reg = data_reg; + label->datahi_reg = data_reg2; + label->addrlo_reg = addrlo_reg; + label->addrhi_reg = addrhi_reg; + label->mem_index = mem_index; + label->raddr = raddr; + label->label_ptr[0] = label_ptr; +} + #include "../../softmmu_defs.h" /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr, @@ -541,34 +570,11 @@ static const void * const qemu_st_helpers[4] = { }; #endif -static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) +static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2, + int addr_reg, int addr_reg2, int s_bits, + int offset1, int offset2, uint8_t **label_ptr) { - int addr_reg, data_reg, data_reg2, r0, r1, rbase, bswap; -#ifdef CONFIG_SOFTMMU - int mem_index, s_bits, r2, ir; - void *label1_ptr, *label2_ptr; -#if TARGET_LONG_BITS == 64 - int addr_reg2; -#endif -#endif - - data_reg = *args++; - if (opc == 3) - data_reg2 = *args++; - else - data_reg2 = 0; - addr_reg = *args++; - -#ifdef CONFIG_SOFTMMU -#if TARGET_LONG_BITS == 64 - addr_reg2 = *args++; -#endif - mem_index = *args; - s_bits = opc & 3; - r0 = 3; - r1 = 4; - r2 = 0; - rbase 
= 0; + uint16_t retranst; tcg_out32 (s, (RLWINM | RA (r0) @@ -582,7 +588,7 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) tcg_out32 (s, (LWZU | RT (r1) | RA (r0) - | offsetof (CPUArchState, tlb_table[mem_index][0].addr_read) + | offset1 ) ); tcg_out32 (s, (RLWINM @@ -600,77 +606,57 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) tcg_out32 (s, CMP | BF (6) | RA (addr_reg2) | RB (r1)); tcg_out32 (s, CRAND | BT (7, CR_EQ) | BA (6, CR_EQ) | BB (7, CR_EQ)); #endif + *label_ptr = s->code_ptr; + retranst = ((uint16_t *) s->code_ptr)[1] & ~3; + tcg_out32 (s, BC | BI (7, CR_EQ) | retranst | BO_COND_FALSE); - label1_ptr = s->code_ptr; -#ifdef FAST_PATH - tcg_out32 (s, BC | BI (7, CR_EQ) | BO_COND_TRUE); -#endif - - /* slow path */ - ir = 3; - tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0); -#if TARGET_LONG_BITS == 32 - tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); -#else -#ifdef TCG_TARGET_CALL_ALIGN_ARGS - ir |= 1; -#endif - tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg2); - tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); -#endif - tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index); - - tcg_out_call (s, (tcg_target_long) qemu_ld_helpers[s_bits], 1); - switch (opc) { - case 0|4: - tcg_out32 (s, EXTSB | RA (data_reg) | RS (3)); - break; - case 1|4: - tcg_out32 (s, EXTSH | RA (data_reg) | RS (3)); - break; - case 0: - case 1: - case 2: - if (data_reg != 3) - tcg_out_mov (s, TCG_TYPE_I32, data_reg, 3); - break; - case 3: - if (data_reg == 3) { - if (data_reg2 == 4) { - tcg_out_mov (s, TCG_TYPE_I32, 0, 4); - tcg_out_mov (s, TCG_TYPE_I32, 4, 3); - tcg_out_mov (s, TCG_TYPE_I32, 3, 0); - } - else { - tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3); - tcg_out_mov (s, TCG_TYPE_I32, 3, 4); - } - } - else { - if (data_reg != 4) tcg_out_mov (s, TCG_TYPE_I32, data_reg, 4); - if (data_reg2 != 3) tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3); - } - break; - } - label2_ptr = s->code_ptr; - tcg_out32 (s, B); - - /* label1: fast path */ -#ifdef 
FAST_PATH - reloc_pc14 (label1_ptr, (tcg_target_long) s->code_ptr); -#endif - - /* r0 now contains &env->tlb_table[mem_index][index].addr_read */ + /* r0 now contains &env->tlb_table[mem_index][index].addr_x */ tcg_out32 (s, (LWZ | RT (r0) | RA (r0) - | (offsetof (CPUTLBEntry, addend) - - offsetof (CPUTLBEntry, addr_read)) - )); + | offset2 + ) + ); /* r0 = env->tlb_table[mem_index][index].addend */ tcg_out32 (s, ADD | RT (r0) | RA (r0) | RB (addr_reg)); /* r0 = env->tlb_table[mem_index][index].addend + addr */ +} + +static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) +{ + int addr_reg, addr_reg2, data_reg, data_reg2, r0, r1, rbase, bswap; +#ifdef CONFIG_SOFTMMU + int mem_index, s_bits, r2; + uint8_t *label_ptr; +#endif + + data_reg = *args++; + if (opc == 3) + data_reg2 = *args++; + else + data_reg2 = 0; + addr_reg = *args++; + +#ifdef CONFIG_SOFTMMU +#if TARGET_LONG_BITS == 64 + addr_reg2 = *args++; +#else + addr_reg2 = 0; +#endif + mem_index = *args; + s_bits = opc & 3; + r0 = 3; + r1 = 4; + r2 = 0; + rbase = 0; + + tcg_out_tlb_check ( + s, r0, r1, r2, addr_reg, addr_reg2, s_bits, + offsetof (CPUArchState, tlb_table[mem_index][0].addr_read), + offsetof (CPUTLBEntry, addend) - offsetof (CPUTLBEntry, addr_read), + &label_ptr + ); #else /* !CONFIG_SOFTMMU */ r0 = addr_reg; r1 = 3; @@ -736,21 +722,26 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc) } break; } - #ifdef CONFIG_SOFTMMU - reloc_pc24 (label2_ptr, (tcg_target_long) s->code_ptr); + add_qemu_ldst_label (s, + 1, + opc, + data_reg, + data_reg2, + addr_reg, + addr_reg2, + mem_index, + s->code_ptr, + label_ptr); #endif } static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc) { - int addr_reg, r0, r1, data_reg, data_reg2, bswap, rbase; + int addr_reg, addr_reg2, r0, r1, data_reg, data_reg2, bswap, rbase; #ifdef CONFIG_SOFTMMU - int mem_index, r2, ir; - void *label1_ptr, *label2_ptr; -#if TARGET_LONG_BITS == 64 - int addr_reg2; -#endif + int 
mem_index, r2; + uint8_t *label_ptr; #endif data_reg = *args++; @@ -763,6 +754,8 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc) #ifdef CONFIG_SOFTMMU #if TARGET_LONG_BITS == 64 addr_reg2 = *args++; +#else + addr_reg2 = 0; #endif mem_index = *args; r0 = 3; @@ -770,41 +763,89 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc) r2 = 0; rbase = 0; - tcg_out32 (s, (RLWINM - | RA (r0) - | RS (addr_reg) - | SH (32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)) - | MB (32 - (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)) - | ME (31 - CPU_TLB_ENTRY_BITS) - ) - ); - tcg_out32 (s, ADD | RT (r0) | RA (r0) | RB (TCG_AREG0)); - tcg_out32 (s, (LWZU - | RT (r1) - | RA (r0) - | offsetof (CPUArchState, tlb_table[mem_index][0].addr_write) - ) - ); - tcg_out32 (s, (RLWINM - | RA (r2) - | RS (addr_reg) - | SH (0) - | MB ((32 - opc) & 31) - | ME (31 - TARGET_PAGE_BITS) - ) + tcg_out_tlb_check ( + s, r0, r1, r2, addr_reg, addr_reg2, opc & 3, + offsetof (CPUArchState, tlb_table[mem_index][0].addr_write), + offsetof (CPUTLBEntry, addend) - offsetof (CPUTLBEntry, addr_write), + &label_ptr ); +#else /* !CONFIG_SOFTMMU */ + r0 = addr_reg; + r1 = 3; + rbase = GUEST_BASE ? 
TCG_GUEST_BASE_REG : 0; +#endif - tcg_out32 (s, CMP | (7 << 23) | RA (r2) | RB (r1)); -#if TARGET_LONG_BITS == 64 - tcg_out32 (s, LWZ | RT (r1) | RA (r0) | 4); - tcg_out32 (s, CMP | BF (6) | RA (addr_reg2) | RB (r1)); - tcg_out32 (s, CRAND | BT (7, CR_EQ) | BA (6, CR_EQ) | BB (7, CR_EQ)); +#ifdef TARGET_WORDS_BIGENDIAN + bswap = 0; +#else + bswap = 1; #endif + switch (opc) { + case 0: + tcg_out32 (s, STBX | SAB (data_reg, rbase, r0)); + break; + case 1: + if (bswap) + tcg_out32 (s, STHBRX | SAB (data_reg, rbase, r0)); + else + tcg_out32 (s, STHX | SAB (data_reg, rbase, r0)); + break; + case 2: + if (bswap) + tcg_out32 (s, STWBRX | SAB (data_reg, rbase, r0)); + else + tcg_out32 (s, STWX | SAB (data_reg, rbase, r0)); + break; + case 3: + if (bswap) { + tcg_out32 (s, ADDI | RT (r1) | RA (r0) | 4); + tcg_out32 (s, STWBRX | SAB (data_reg, rbase, r0)); + tcg_out32 (s, STWBRX | SAB (data_reg2, rbase, r1)); + } + else { +#ifdef CONFIG_USE_GUEST_BASE + tcg_out32 (s, STWX | SAB (data_reg2, rbase, r0)); + tcg_out32 (s, ADDI | RT (r1) | RA (r0) | 4); + tcg_out32 (s, STWX | SAB (data_reg, rbase, r1)); +#else + tcg_out32 (s, STW | RS (data_reg2) | RA (r0)); + tcg_out32 (s, STW | RS (data_reg) | RA (r0) | 4); +#endif + } + break; + } - label1_ptr = s->code_ptr; -#ifdef FAST_PATH - tcg_out32 (s, BC | BI (7, CR_EQ) | BO_COND_TRUE); +#ifdef CONFIG_SOFTMMU + add_qemu_ldst_label (s, + 0, + opc, + data_reg, + data_reg2, + addr_reg, + addr_reg2, + mem_index, + s->code_ptr, + label_ptr); #endif +} + +#if defined(CONFIG_SOFTMMU) +static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label) +{ + int s_bits; + int ir; + int opc = label->opc; + int mem_index = label->mem_index; + int data_reg = label->datalo_reg; + int data_reg2 = label->datahi_reg; + int addr_reg = label->addrlo_reg; + uint8_t *raddr = label->raddr; + uint8_t **label_ptr = &label->label_ptr[0]; + + s_bits = opc & 3; + + /* resolve label address */ + reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr); 
/* slow path */ ir = 3; @@ -815,7 +856,75 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc) #ifdef TCG_TARGET_CALL_ALIGN_ARGS ir |= 1; #endif - tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg2); + tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg); + tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); +#endif + tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index); + tcg_out_call (s, (tcg_target_long) qemu_ld_helpers[s_bits], 1); + tcg_out32 (s, B | 8); + tcg_out32 (s, (tcg_target_long) raddr); + switch (opc) { + case 0|4: + tcg_out32 (s, EXTSB | RA (data_reg) | RS (3)); + break; + case 1|4: + tcg_out32 (s, EXTSH | RA (data_reg) | RS (3)); + break; + case 0: + case 1: + case 2: + if (data_reg != 3) + tcg_out_mov (s, TCG_TYPE_I32, data_reg, 3); + break; + case 3: + if (data_reg == 3) { + if (data_reg2 == 4) { + tcg_out_mov (s, TCG_TYPE_I32, 0, 4); + tcg_out_mov (s, TCG_TYPE_I32, 4, 3); + tcg_out_mov (s, TCG_TYPE_I32, 3, 0); + } + else { + tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3); + tcg_out_mov (s, TCG_TYPE_I32, 3, 4); + } + } + else { + if (data_reg != 4) tcg_out_mov (s, TCG_TYPE_I32, data_reg, 4); + if (data_reg2 != 3) tcg_out_mov (s, TCG_TYPE_I32, data_reg2, 3); + } + break; + } + /* Jump to the code corresponding to next IR of qemu_st */ + tcg_out_b (s, 0, (tcg_target_long) raddr); +} + +static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label) +{ + int s_bits; + int ir; + int opc = label->opc; + int mem_index = label->mem_index; + int data_reg = label->datalo_reg; + int data_reg2 = label->datahi_reg; + int addr_reg = label->addrlo_reg; + uint8_t *raddr = label->raddr; + uint8_t **label_ptr = &label->label_ptr[0]; + + s_bits = opc & 3; + + /* resolve label address */ + reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr); + + /* slow path */ + ir = 3; + tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0); +#if TARGET_LONG_BITS == 32 + tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); +#else +#ifdef TCG_TARGET_CALL_ALIGN_ARGS + ir 
|= 1; +#endif + tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg); tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); #endif @@ -851,74 +960,28 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc) tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index); tcg_out_call (s, (tcg_target_long) qemu_st_helpers[opc], 1); - label2_ptr = s->code_ptr; - tcg_out32 (s, B); - - /* label1: fast path */ -#ifdef FAST_PATH - reloc_pc14 (label1_ptr, (tcg_target_long) s->code_ptr); -#endif - - tcg_out32 (s, (LWZ - | RT (r0) - | RA (r0) - | (offsetof (CPUTLBEntry, addend) - - offsetof (CPUTLBEntry, addr_write)) - )); - /* r0 = env->tlb_table[mem_index][index].addend */ - tcg_out32 (s, ADD | RT (r0) | RA (r0) | RB (addr_reg)); - /* r0 = env->tlb_table[mem_index][index].addend + addr */ - -#else /* !CONFIG_SOFTMMU */ - r0 = addr_reg; - r1 = 3; - rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0; -#endif + tcg_out32 (s, B | 8); + tcg_out32 (s, (tcg_target_long) raddr); + tcg_out_b (s, 0, (tcg_target_long) raddr); +} -#ifdef TARGET_WORDS_BIGENDIAN - bswap = 0; -#else - bswap = 1; -#endif - switch (opc) { - case 0: - tcg_out32 (s, STBX | SAB (data_reg, rbase, r0)); - break; - case 1: - if (bswap) - tcg_out32 (s, STHBRX | SAB (data_reg, rbase, r0)); - else - tcg_out32 (s, STHX | SAB (data_reg, rbase, r0)); - break; - case 2: - if (bswap) - tcg_out32 (s, STWBRX | SAB (data_reg, rbase, r0)); - else - tcg_out32 (s, STWX | SAB (data_reg, rbase, r0)); - break; - case 3: - if (bswap) { - tcg_out32 (s, ADDI | RT (r1) | RA (r0) | 4); - tcg_out32 (s, STWBRX | SAB (data_reg, rbase, r0)); - tcg_out32 (s, STWBRX | SAB (data_reg2, rbase, r1)); +void tcg_out_tb_finalize(TCGContext *s) +{ + int i; + TCGLabelQemuLdst *label; + + /* qemu_ld/st slow paths */ + for (i = 0; i < s->nb_qemu_ldst_labels; i++) { + label = (TCGLabelQemuLdst *) &s->qemu_ldst_labels[i]; + if (label->is_ld) { + tcg_out_qemu_ld_slow_path (s, label); } else { -#ifdef CONFIG_USE_GUEST_BASE - tcg_out32 (s, STWX | SAB (data_reg2, 
rbase, r0)); - tcg_out32 (s, ADDI | RT (r1) | RA (r0) | 4); - tcg_out32 (s, STWX | SAB (data_reg, rbase, r1)); -#else - tcg_out32 (s, STW | RS (data_reg2) | RA (r0)); - tcg_out32 (s, STW | RS (data_reg) | RA (r0) | 4); -#endif + tcg_out_qemu_st_slow_path (s, label); } - break; } - -#ifdef CONFIG_SOFTMMU - reloc_pc24 (label2_ptr, (tcg_target_long) s->code_ptr); -#endif } +#endif static void tcg_target_qemu_prologue (TCGContext *s) {