diff mbox

[v3,06/10] target/arm: optimize indirect branches

Message ID 1493187803-4510-7-git-send-email-cota@braap.org
State New
Headers show

Commit Message

Emilio Cota April 26, 2017, 6:23 a.m. UTC
Speed up indirect branches by jumping to the target if it is valid.

Softmmu measurements (see later commit for user-mode results):

Note: baseline (i.e. speedup == 1x) is QEMU v2.9.0.

- Impact on Boot time

| setup  | ARM debian jessie boot+shutdown time | stddev |
|--------+--------------------------------------+--------|
| v2.9.0 |                                 8.84 |   0.07 |
| +cross |                                 8.85 |   0.03 |
| +jr    |                                 8.83 |   0.06 |

-                            NBench, arm-softmmu (debian jessie guest). Host: Intel i7-4790K @ 4.00GHz

  1.3x +-+-------------------------------------------------------------------------------------------------------------+-+
       |                                                                                                                 |
       |   cross                                                          ####                                           |
 1.25x +cross+jr..........................................................#++#.........................................+-+
       |                                                        ####      #  #                                           |
       |                                                     +++#  #      #  #                                           |
       |                                      +++            ****  #      #  #                                           |
  1.2x +-+...................................####............*..*..#......#..#.........................................+-+
       |                                  ****  #            *  *  #      #  #     ####                                  |
       |                                  *  *  #            *  *  #      #  #     #  #                                  |
 1.15x +-+................................*..*..#............*..*..#......#..#.....#..#................................+-+
       |                                  *  *  #            *  *  #      #  #     #  #                                  |
       |                                  *  *  #      ####  *  *  #      #  #     #  #                                  |
       |                                  *  *  #      #  #  *  *  #      #  #     #  #                         ####     |
  1.1x +-+................................*..*..#......#..#..*..*..#......#..#.....#..#.........................#..#...+-+
       |                                  *  *  #      #  #  *  *  #      #  #     #  #                         #  #     |
       |                                  *  *  #      #  #  *  *  #      #  #     #  #                         #  #     |
 1.05x +-+..........................####..*..*..#......#..#..*..*..#......#..#.....#..#......+++............*****..#...+-+
       |                        *****  #  *  *  #      #  #  *  *  #  *****  #     #  #   +++ |    ****###  *   *  #     |
       |                        *+++*  #  *  *  #      #  #  *  *  #  *+++*  #  ****  #  *****###  *  *  #  *   *  #     |
       |     *****###  +++####  *   *  #  *  *  #  *****  #  *  *  #  *   *  #  *  *  #  * | *++#  *  *  #  *   *  #     |
    1x +-++-+*+++*-+#++****++#++*+-+*++#+-*++*++#-+*+++*-+#++*++*++#++*+-+*++#+-*++*++#-+*+++*-+#++*++*++#++*+-+*++#+-++-+
       |     *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *   *  #     |
       |     *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *   *  #     |
 0.95x +-+---*****###--****###--*****###--****###--*****###--****###--*****###--****###--*****###--****###--*****###---+-+
       ASSIGNMENT BITFIELD   FOURFP EMULATION   HUFFMAN   LU DECOMPOSITIONEURAL NNUMERIC SOSTRING SORT     hmean
  png: http://imgur.com/eOLmZNR

NB. 'cross' represents the previous commit.

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 target/arm/translate.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

Comments

Richard Henderson April 26, 2017, 7:54 a.m. UTC | #1
On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> +static bool gen_jr;...
>          case DISAS_JUMP:
> +            if (gen_jr) {

Why the variable?  Why not just try the goto_ptr for any DISAS_JUMP?


r~
Emilio Cota April 27, 2017, 3:20 a.m. UTC | #2
On Wed, Apr 26, 2017 at 09:54:07 +0200, Richard Henderson wrote:
> On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> >+static bool gen_jr;...
> >         case DISAS_JUMP:
> >+            if (gen_jr) {
> 
> Why the variable?  Why not just try the goto_ptr for any DISAS_JUMP?

We have code that assumes DISAS_JUMP implies "go to exec loop", e.g.:
            case 6: /* isb */
                /* We need to break the TB after this insn to execute
                 * self-modifying code correctly and also to take
                 * any pending interrupts immediately.
                 */
                gen_lookup_tb(s);
where gen_lookup_tb does:

/* Force a TB lookup after an instruction that changes the CPU state.  */
static inline void gen_lookup_tb(DisasContext *s)
{
    tcg_gen_movi_i32(cpu_R[15], s->pc & ~1);
    s->is_jmp = DISAS_JUMP;
}

Also, the gen_exception_* functions set DISAS_JUMP. I suspect we want to go
to the exec loop with those as well.

Testing shows that I'm onto something; if I remove the variable,
and note that I make sure DISAS_UPDATE is not falling through, I get
easily reproducible (~1 out of 5) freezes and other instability
(e.g. RCU lockup warnings) when booting + shutting down debian jessie
in the guest.

In v4 I've added a comment about this.

Thanks,

		E.
Aurelien Jarno April 27, 2017, 10:25 a.m. UTC | #3
On 2017-04-26 23:20, Emilio G. Cota wrote:
> On Wed, Apr 26, 2017 at 09:54:07 +0200, Richard Henderson wrote:
> > On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> > >+static bool gen_jr;...
> > >         case DISAS_JUMP:
> > >+            if (gen_jr) {
> > 
> > Why the variable?  Why not just try the goto_ptr for any DISAS_JUMP?
> 
> We have code that assumes DISAS_JUMP implies "go to exec loop", e.g.:
>             case 6: /* isb */
>                 /* We need to break the TB after this insn to execute
>                  * self-modifying code correctly and also to take
>                  * any pending interrupts immediately.
>                  */
>                 gen_lookup_tb(s);
> where gen_lookup_tb does:
> 
> /* Force a TB lookup after an instruction that changes the CPU state.  */
> static inline void gen_lookup_tb(DisasContext *s)
> {
>     tcg_gen_movi_i32(cpu_R[15], s->pc & ~1);
>     s->is_jmp = DISAS_JUMP;
> }

Changing the CPU state should already be taken into account in
lookup_tb_ptr through the cpu_get_tb_cpu_state function. Also interrupts
should be checked at the beginning of each TB.

> Also, the gen_exception_* functions set DISAS_JUMP. I suspect we want to go
> to the exec loop with those as well.

Technically all the code after the one generated by gen_exception_*
function is not executed, including the tcg_gen_exit_tb(0). Therefore
generating code using tcg_gen_exit_tb or or tcg_gen_lookup_and_goto_ptr
should no change the behaviour.

> Testing shows that I'm onto something; if I remove the variable,
> and note that I make sure DISAS_UPDATE is not falling through, I get
> easily reproducible (~1 out of 5) freezes and other instability
> (e.g. RCU lockup warnings) when booting + shutting down debian jessie
> in the guest.

I agree that always calling lookup_tb_ptr might be suboptimal in case we
know for sure that the lookup will fail or that there will be an exit
request at the beginning of the next TB. That said I found strange that
it causes issues, and I wonder if it just expose a bug somewhere.
diff mbox

Patch

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 02cad96..73595b4 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -65,6 +65,7 @@  static TCGv_i32 cpu_R[16];
 TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
 TCGv_i64 cpu_exclusive_addr;
 TCGv_i64 cpu_exclusive_val;
+static bool gen_jr;
 
 /* FIXME:  These should be removed.  */
 static TCGv_i32 cpu_F0s, cpu_F1s;
@@ -221,6 +222,7 @@  static void store_reg(DisasContext *s, int reg, TCGv_i32 var)
          */
         tcg_gen_andi_i32(var, var, s->thumb ? ~1 : ~3);
         s->is_jmp = DISAS_JUMP;
+        gen_jr = true;
     }
     tcg_gen_mov_i32(cpu_R[reg], var);
     tcg_temp_free_i32(var);
@@ -893,6 +895,7 @@  static inline void gen_bx_im(DisasContext *s, uint32_t addr)
         tcg_temp_free_i32(tmp);
     }
     tcg_gen_movi_i32(cpu_R[15], addr & ~1);
+    gen_jr = true;
 }
 
 /* Set PC and Thumb state from var.  var is marked as dead.  */
@@ -902,6 +905,7 @@  static inline void gen_bx(DisasContext *s, TCGv_i32 var)
     tcg_gen_andi_i32(cpu_R[15], var, ~1);
     tcg_gen_andi_i32(var, var, 1);
     store_cpu_field(var, thumb);
+    gen_jr = true;
 }
 
 /* Variant of store_reg which uses branch&exchange logic when storing
@@ -12034,6 +12038,16 @@  void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
             gen_set_pc_im(dc, dc->pc);
             /* fall through */
         case DISAS_JUMP:
+            if (gen_jr) {
+                TCGv addr = tcg_temp_new();
+
+                gen_jr = false;
+                tcg_gen_extu_i32_tl(addr, cpu_R[15]);
+                tcg_gen_lookup_and_goto_ptr(addr);
+                tcg_temp_free(addr);
+                break;
+            }
+            /* fall through */
         default:
             /* indicate that the hash table must be used to find the next TB */
             tcg_gen_exit_tb(0);