From patchwork Thu Sep 16 15:34:02 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: "H.J. Lu" X-Patchwork-Id: 64979 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) by ozlabs.org (Postfix) with SMTP id 533581007D2 for ; Fri, 17 Sep 2010 01:34:24 +1000 (EST) Received: (qmail 5147 invoked by alias); 16 Sep 2010 15:34:20 -0000 Received: (qmail 4743 invoked by uid 22791); 16 Sep 2010 15:34:16 -0000 X-SWARE-Spam-Status: No, hits=-1.8 required=5.0 tests=AWL, BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, FREEMAIL_FROM, RCVD_IN_DNSWL_NONE X-Spam-Check-By: sourceware.org Received: from mail-vw0-f47.google.com (HELO mail-vw0-f47.google.com) (209.85.212.47) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Thu, 16 Sep 2010 15:34:06 +0000 Received: by vws9 with SMTP id 9so1098648vws.20 for ; Thu, 16 Sep 2010 08:34:04 -0700 (PDT) MIME-Version: 1.0 Received: by 10.220.128.204 with SMTP id l12mr1911132vcs.102.1284651243218; Thu, 16 Sep 2010 08:34:03 -0700 (PDT) Received: by 10.220.202.9 with HTTP; Thu, 16 Sep 2010 08:34:02 -0700 (PDT) In-Reply-To: <4C914AC3.1040702@redhat.com> References: <20100913145422.GA21719@intel.com> <4C914AC3.1040702@redhat.com> Date: Thu, 16 Sep 2010 08:34:02 -0700 Message-ID: Subject: Re: PATCH: Pad short functions for Atom From: "H.J. Lu" To: Richard Henderson Cc: "H.J. Lu" , gcc-patches@gcc.gnu.org, Uros Bizjak X-IsSubscribed: yes Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org On Wed, Sep 15, 2010 at 3:37 PM, Richard Henderson wrote: > On 09/13/2010 07:54 AM, H.J. 
Lu wrote: >>        xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); >> +      /* Pad stack IP move with 4 instructions.  2 NOPs count as 1 >> +         instruction.  */ >> +      if (TARGET_PAD_SHORT_FUNCTION) >> +     output_asm_insn (".byte 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, " >> +                      "0x90, 0x90", xops); > > What's wrong with "nop" instead of ".byte"? > Similarly for "return_nops". Done. >> +/* Pad short funtion to 4 instructions.   */ >> + >> +static void >> +ix86_pad_short_function (void) >> +{ >> +  edge e; >> +  edge_iterator ei; >> + >> +  /* Set up block info for each basic block.  */ >> +  alloc_aux_for_blocks (sizeof (struct block_info_def)); >> + >> +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds) >> +    { >> +      rtx ret = BB_END (e->src); >> +      if (JUMP_P (ret) && GET_CODE (PATTERN (ret)) == RETURN) >> +     { >> +       int insn_count = ix86_count_insn (e->src); >> + >> +       /* Pad short function.  */ >> +       if (insn_count < 4) >> +         { >> +           emit_jump_insn_before (gen_return_nops (GEN_INT (insn_count)), >> +                                  ret); >> +           delete_insn (ret); >> +         } >> +     } >> +    } > > Surely this is overkill.  Why not assume that any function > with more than a single basic block is not short?  You can > then significantly simplify these two functions. Done. > I'll also say that this will break the as-yet unsubmitted > Windows x64 SEH code.  In order to reduce the size of the > unwind information, the form of epilogues is tightly > constrained.  See > >  http://msdn.microsoft.com/en-US/library/tawsa7cb%28v=VS.80%29.aspx > > In order to work with SEH, you'll need to put the nops > before the epilogue, and not attach them to the return pattern. > Does this patch look OK for trunk? Thanks. 
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 19d6387..154df63 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1576,6 +1576,9 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_PAD_RETURNS */ m_AMD_MULTIPLE | m_CORE2 | m_GENERIC, + /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short function. */ + m_ATOM, + /* X86_TUNE_EXT_80387_CONSTANTS */ m_K6_GEODE | m_ATHLON_K8 | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC, @@ -8017,6 +8020,11 @@ ix86_code_end (void) xops[0] = gen_rtx_REG (Pmode, regno); xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); + /* Pad stack IP move with 4 instructions. 2 NOPs count as 1 + instruction. */ + if (TARGET_PAD_SHORT_FUNCTION) + output_asm_insn ("nop; nop; nop; nop; nop; nop; nop; nop", + xops); output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); output_asm_insn ("ret", xops); final_end_function (); @@ -27776,6 +27784,135 @@ ix86_pad_returns (void) } } +/* Count the non-debug, non-USE/CLOBBER insns in BB before any RETURN. + Stop counting and return 4 once 4 insns have been seen. */ + +static int +ix86_count_insn_bb (basic_block bb) +{ + rtx insn; + int insn_count = 0; + + /* Count number of instructions in this block. Return 4 if the number + of instructions >= 4. */ + FOR_BB_INSNS (bb, insn) + { + /* Only happens in exit blocks. Stop at the return. */ + if (JUMP_P (insn) + && GET_CODE (PATTERN (insn)) == RETURN) + break; + + if (NONDEBUG_INSN_P (insn) + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + { + insn_count++; + if (insn_count >= 4) + return insn_count; + } + } + + return insn_count; +} + + +/* Return the minimum number of instructions on a code path ending in + BB. Any result >= 4 means the path is not short. */ + +static int +ix86_count_insn (basic_block bb) +{ + edge e; + edge_iterator ei; + int insn_count = ix86_count_insn_bb (bb); + int min_insn_count; + + if (insn_count >= 4) + return insn_count; + + /* This block has fewer than 4 instructions. 
Count predecessor + edges of this block. -1 means no predecessor seen yet. */ + min_insn_count = -1; + FOR_EACH_EDGE (e, ei, bb->preds) + { + int count = insn_count + ix86_count_insn_bb (e->src); + + if (count < 4) + { + /* This block plus its predecessor have fewer than 4 + instructions. Check predecessor edges. */ + edge prev_e; + edge_iterator prev_ei; + int old_count = count; + bool has_prev_bb = false; + + count = 4; + FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds) + { + /* Check if the predecessor is the entry point. Not a short + function if it has more than 2 basic blocks. */ + if (prev_e->src == ENTRY_BLOCK_PTR) + { + has_prev_bb = false; + break; + } + has_prev_bb = true; + } + + if (!has_prev_bb) + count = old_count; + } + + if (min_insn_count < 0) + min_insn_count = count; + else if (count < min_insn_count) + min_insn_count = count; + } + + return min_insn_count < 0 ? insn_count : min_insn_count; +} + +/* Pad short function to 4 instructions. */ + +static void +ix86_pad_short_function (void) +{ + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds) + { + rtx ret = BB_END (e->src); + if (JUMP_P (ret) && GET_CODE (PATTERN (ret)) == RETURN) + { + int insn_count = ix86_count_insn (e->src); + + /* Pad short function. */ + if (insn_count < 4) + { + rtx nop = gen_nop (); + rtx insn = ret; + + /* Find epilogue. */ + while (insn + && (!NOTE_P (insn) + || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)) + insn = PREV_INSN (insn); + + if (!insn) + insn = ret; + + /* Two NOPs are counted as one instruction. */ + for (; insn_count < 4; insn_count++) + { + emit_insn_before (nop, insn); + emit_insn_before (nop, insn); + } + } + } + } +} + /* Implement machine specific optimizations. We implement padding of returns for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. 
*/ static void @@ -27783,7 +27920,9 @@ ix86_reorg (void) { if (optimize && optimize_function_for_speed_p (cfun)) { - if (TARGET_PAD_RETURNS) + if (TARGET_PAD_SHORT_FUNCTION) + ix86_pad_short_function (); + else if (TARGET_PAD_RETURNS) ix86_pad_returns (); #ifdef ASM_OUTPUT_MAX_SKIP_PAD if (TARGET_FOUR_JUMP_LIMIT) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index fe48efd..c28768f 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -299,6 +299,7 @@ enum ix86_tune_indices { X86_TUNE_USE_BT, X86_TUNE_USE_INCDEC, X86_TUNE_PAD_RETURNS, + X86_TUNE_PAD_SHORT_FUNCTION, X86_TUNE_EXT_80387_CONSTANTS, X86_TUNE_SHORTEN_X87_SSE, X86_TUNE_AVOID_VECTOR_DECODE, @@ -385,6 +386,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT] #define TARGET_USE_INCDEC ix86_tune_features[X86_TUNE_USE_INCDEC] #define TARGET_PAD_RETURNS ix86_tune_features[X86_TUNE_PAD_RETURNS] +#define TARGET_PAD_SHORT_FUNCTION \ + ix86_tune_features[X86_TUNE_PAD_SHORT_FUNCTION] #define TARGET_EXT_80387_CONSTANTS \ ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS] #define TARGET_SHORTEN_X87_SSE ix86_tune_features[X86_TUNE_SHORTEN_X87_SSE] diff --git a/gcc/testsuite/gcc.target/i386/pad-1.c b/gcc/testsuite/gcc.target/i386/pad-1.c new file mode 100644 index 0000000..87a9d6c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-1.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fomit-frame-pointer -mtune=generic -S" } */ +/* { dg-final { scan-assembler "rep" } } */ +/* { dg-final { scan-assembler-not "nop" } } */ + +void +foo () +{ +} diff --git a/gcc/testsuite/gcc.target/i386/pad-10.c b/gcc/testsuite/gcc.target/i386/pad-10.c new file mode 100644 index 0000000..6ba3b78 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-10.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-not "nop" } } */ +/* { dg-final { 
scan-assembler-not "rep" } } */ + +extern void bar (); + +int +foo2 (int z, int x) +{ + if (x == 1) + { + bar (); + return z; + } + else + return x + z; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-2.c b/gcc/testsuite/gcc.target/i386/pad-2.c new file mode 100644 index 0000000..e7659a9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-2.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-times "nop" 8 } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +void +foo () +{ +} diff --git a/gcc/testsuite/gcc.target/i386/pad-3.c b/gcc/testsuite/gcc.target/i386/pad-3.c new file mode 100644 index 0000000..52442b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-3.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-not "nop" } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +int s[8] = {1, 2, 3, 4, 5, 6, 7, 8}; +int d[8] = {11, 22, 33, 44, 55, 66, 77, 88}; + +void +foo () +{ + int i; + for (i = 0; i < 8; i++) + d[i] = s[i] + 0x1000; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-4.c b/gcc/testsuite/gcc.target/i386/pad-4.c new file mode 100644 index 0000000..a7033fa --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-4.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ilp32 } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S -fPIC" } */ +/* { dg-final { scan-assembler-times "nop; nop; nop; nop; nop; nop; nop; nop" 1 } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +extern int bar; + +int +foo () +{ + return bar; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-5a.c b/gcc/testsuite/gcc.target/i386/pad-5a.c new file mode 100644 index 0000000..4af419e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-5a.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ilp32 } */ +/* { dg-options "-O2 
-fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-times "nop" 2 } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +int +foo (int x, int y, int z) +{ + return x + y + z; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-5b.c b/gcc/testsuite/gcc.target/i386/pad-5b.c new file mode 100644 index 0000000..48ab446 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-5b.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-times "nop" 4 } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +int +foo (int x, int y, int z) +{ + return x + y + z; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-6a.c b/gcc/testsuite/gcc.target/i386/pad-6a.c new file mode 100644 index 0000000..379ba65 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-6a.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ilp32 } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-times "nop" 4 } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +int +foo (int x, int y) +{ + return x + y; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-6b.c b/gcc/testsuite/gcc.target/i386/pad-6b.c new file mode 100644 index 0000000..fdedca1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-6b.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-times "nop" 6 } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +int +foo (int x, int y) +{ + return x + y; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-7.c b/gcc/testsuite/gcc.target/i386/pad-7.c new file mode 100644 index 0000000..7a7493d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-7.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ilp32 } */ +/* { 
dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-not "nop" } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +int +foo (int x, int y, int z) +{ + return x + y + z + y; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-8.c b/gcc/testsuite/gcc.target/i386/pad-8.c new file mode 100644 index 0000000..d3163fb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-8.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-times "nop" 6 } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +int +foo (int x, int y) +{ + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pad-9.c b/gcc/testsuite/gcc.target/i386/pad-9.c new file mode 100644 index 0000000..572a1af --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pad-9.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -fomit-frame-pointer -march=atom -S" } */ +/* { dg-final { scan-assembler-times "nop" 4 } } */ +/* { dg-final { scan-assembler-not "rep" } } */ + +extern void bar (void); + +void +foo (int x) +{ + if (x) + bar (); +}