Patchwork [google,4.7] atomic update of profile counters (issue6965050)

login
register
mail settings
Submitter Rong Xu
Date Dec. 19, 2012, 8:08 p.m.
Message ID <20121219200828.73DB9106927@rong.mtv.corp.google.com>
Download mbox | patch
Permalink /patch/207513/
State New
Headers show

Comments

Rong Xu - Dec. 19, 2012, 8:08 p.m.
Hi,

This patch adds support for atomically updating the profile counters.
Tested with google internal benchmarks and fdo kernel build.

Thanks,

-Rong

2012-12-19  Rong Xu  <xur@google.com>

	* gcc/common.opt: Add -fprofile-gen-atomic option.
	* gcc/gcov-io.h: Add profile atomic update support.
	* gcc/tree-profile.c (gimple_init_edge_profiler): Ditto.
	(gimple_gen_edge_profiler): Ditto.
	* libgcc/libgcov.c (__gcov_one_value_profiler_body): Ditto.
	(__gcov_one_value_profiler_body_atomic): Ditto.
	(__gcov_one_value_profiler_atomic): Ditto.
	(__gcov_indirect_call_profiler_atomic): Ditto.


--
This patch is available for review at http://codereview.appspot.com/6965050
Xinliang David Li - Dec. 20, 2012, 12:24 a.m.
This looks good to me for google branches. Useful for trunk too.

David

On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
> Hi,
>
> This patch adds support for atomically updating the profile counters.
> Tested with google internal benchmarks and fdo kernel build.
>
> Thanks,
>
> -Rong
>
> 2012-12-19  Rong Xu  <xur@google.com>
>
>         * gcc/common.opt: Add -fprofile-gen-atomic option.
>         * gcc/gcov-io.h: Add profile atomic update support.
>         * gcc/tree-profile.c (gimple_init_edge_profiler): Ditto.
>         (gimple_gen_edge_profiler): Ditto.
>         * libgcc/libgcov.c (__gcov_one_value_profiler_body): Ditto.
>         (__gcov_one_value_profiler_body_atomic): Ditto.
>         (__gcov_one_value_profiler_atomic): Ditto.
>         (__gcov_indirect_call_profiler_atomic): Ditto.
>
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt      (revision 194562)
> +++ gcc/common.opt      (working copy)
> @@ -1754,6 +1754,15 @@ fprofile-dump
>  Common Report Var(flag_profile_dump) Init(0) Optimization
>  Dump CFG profile for comparison.
>
> +; fprofile-gen-atomic=0: disable atomic updates.
> +; fprofile-gen-atomic=1: atomically update edge profile counters.
> +; fprofile-gen-atomic=2: atomically update value profile counters.
> +; fprofile-gen-atomic=3: atomically update edge and value profile counters.
> +; other values will be ignored (fall back to the default of 0).
> +fprofile-gen-atomic=
> +Common Joined UInteger Report Var(flag_profile_gen_atomic) Init(0) Optimization
> +fprofile-gen-atomic=[0..3] Atomically increments for profile counters.
> +
>  fprofile-generate
>  Common
>  Enable common options for generating profile info for profile feedback directed optimizations
> Index: gcc/gcov-io.h
> ===================================================================
> --- gcc/gcov-io.h       (revision 194562)
> +++ gcc/gcov-io.h       (working copy)
> @@ -300,6 +300,14 @@ typedef unsigned gcov_type_unsigned __attribute__
>
>  #endif  /* BITS_PER_UNIT == 8  */
>
> +#if LONG_LONG_TYPE_SIZE > 32
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_8
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_8
> +#else
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_4
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_4
> +#endif
> +
>  #undef EXTRACT_MODULE_ID_FROM_GLOBAL_ID
>  #undef EXTRACT_FUNC_ID_FROM_GLOBAL_ID
>  #undef GEN_FUNC_GLOBAL_ID
> @@ -322,6 +330,18 @@ typedef unsigned gcov_type_unsigned __attribute__
>  typedef unsigned gcov_unsigned_t;
>  typedef unsigned gcov_position_t;
>
> +#if LONG_LONG_TYPE_SIZE > 32
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_8
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_8
> +#else
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_4
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_4
> +#endif
> +#define PROFILE_GEN_EDGE_ATOMIC (flag_profile_gen_atomic == 1 || \
> +                                 flag_profile_gen_atomic == 3)
> +#define PROFILE_GEN_VALUE_ATOMIC (flag_profile_gen_atomic == 2 || \
> +                                  flag_profile_gen_atomic == 3)
> +
>  /* gcov_type is typedef'd elsewhere for the compiler */
>  #if IN_GCOV
>  #define GCOV_LINKAGE static
> Index: gcc/tree-profile.c
> ===================================================================
> --- gcc/tree-profile.c  (revision 194562)
> +++ gcc/tree-profile.c  (working copy)
> @@ -471,7 +471,12 @@ gimple_init_edge_profiler (void)
>               = build_function_type_list (void_type_node,
>                                           gcov_type_ptr, gcov_type_node,
>                                           NULL_TREE);
> -      tree_one_value_profiler_fn
> +      if (PROFILE_GEN_VALUE_ATOMIC)
> +        tree_one_value_profiler_fn
> +             = build_fn_decl ("__gcov_one_value_profiler_atomic",
> +                                    one_value_profiler_fn_type);
> +      else
> +        tree_one_value_profiler_fn
>               = build_fn_decl ("__gcov_one_value_profiler",
>                                      one_value_profiler_fn_type);
>        TREE_NOTHROW (tree_one_value_profiler_fn) = 1;
> @@ -487,7 +492,12 @@ gimple_init_edge_profiler (void)
>                                        gcov_type_ptr, gcov_type_node,
>                                        ptr_void,
>                                        ptr_void, NULL_TREE);
> -      tree_indirect_call_profiler_fn
> +      if (PROFILE_GEN_VALUE_ATOMIC)
> +        tree_indirect_call_profiler_fn
> +             = build_fn_decl ("__gcov_indirect_call_profiler_atomic",
> +                                    ic_profiler_fn_type);
> +      else
> +        tree_indirect_call_profiler_fn
>               = build_fn_decl ("__gcov_indirect_call_profiler",
>                                      ic_profiler_fn_type);
>        TREE_NOTHROW (tree_indirect_call_profiler_fn) = 1;
> @@ -563,21 +573,37 @@ gimple_gen_edge_profiler (int edgeno, edge e)
>       gets re-set in tree_profiling.  */
>    if (gcov_type_tmp_var == NULL_TREE)
>      gcov_type_tmp_var = create_tmp_reg (gcov_type_node, "PROF_edge_counter");
> -  ref = tree_coverage_counter_ref (GCOV_COUNTER_ARCS, edgeno);
> +
> +  if (PROFILE_GEN_EDGE_ATOMIC)
> +    ref = tree_coverage_counter_addr (GCOV_COUNTER_ARCS, edgeno);
> +  else
> +    ref = tree_coverage_counter_ref (GCOV_COUNTER_ARCS, edgeno);
> +
>    one = build_int_cst (gcov_type_node, 1);
> -  stmt1 = gimple_build_assign (gcov_type_tmp_var, ref);
> -  gimple_assign_set_lhs (stmt1, make_ssa_name (gcov_type_tmp_var, stmt1));
> -  find_referenced_vars_in (stmt1);
> -  stmt2 = gimple_build_assign_with_ops (PLUS_EXPR, gcov_type_tmp_var,
> -                                       gimple_assign_lhs (stmt1), one);
> -  gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
> -  stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
> +  if (PROFILE_GEN_EDGE_ATOMIC)
> +    {
> +      /* __sync_fetch_and_add_8 (&counter, 1); */
> +      stmt3 = gimple_build_call (builtin_decl_explicit
> +                                    (GCOV_TYPE_SYNC_FETCH_AND_ADD),
> +                                 2, ref, one);
> +      find_referenced_vars_in (stmt3);
> +    }
> +  else
> +    {
> +      stmt1 = gimple_build_assign (gcov_type_tmp_var, ref);
> +      gimple_assign_set_lhs (stmt1, make_ssa_name (gcov_type_tmp_var, stmt1));
> +      find_referenced_vars_in (stmt1);
> +      stmt2 = gimple_build_assign_with_ops (PLUS_EXPR, gcov_type_tmp_var,
> +                                       gimple_assign_lhs (stmt1), one);
> +      gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
> +      stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
>
> -  if (flag_profile_generate_sampling)
> -    pointer_set_insert (instrumentation_to_be_sampled, stmt1);
> +      if (flag_profile_generate_sampling)
> +        pointer_set_insert (instrumentation_to_be_sampled, stmt1);
>
> -  gsi_insert_on_edge (e, stmt1);
> -  gsi_insert_on_edge (e, stmt2);
> +      gsi_insert_on_edge (e, stmt1);
> +      gsi_insert_on_edge (e, stmt2);
> +    }
>    gsi_insert_on_edge (e, stmt3);
>  }
>
> Index: libgcc/libgcov.c
> ===================================================================
> --- libgcc/libgcov.c    (revision 194562)
> +++ libgcc/libgcov.c    (working copy)
> @@ -1632,6 +1632,22 @@ __gcov_one_value_profiler_body (gcov_type *counter
>    counters[2]++;
>  }
>
> +/* Atomic update version of __gcov_one_value_profiler_body().  */
> +static inline void
> +__gcov_one_value_profiler_body_atomic (gcov_type *counters, gcov_type value)
> +{
> +  if (value == counters[0])
> +    GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[1], 1);
> +  else if (counters[1] == 0)
> +    {
> +      counters[1] = 1;
> +      counters[0] = value;
> +    }
> +  else
> +    GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[1], -1);
> +  GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[2], 1);
> +}
> +
>  #ifdef L_gcov_indirect_call_topn_profiler
>  /* Tries to keep track the most frequent N values in the counters where
>     N is specified by parameter TOPN_VAL. To track top N values, 2*N counter
> @@ -1740,6 +1756,12 @@ __gcov_one_value_profiler (gcov_type *counters, gc
>  {
>    __gcov_one_value_profiler_body (counters, value);
>  }
> +
> +void
> +__gcov_one_value_profiler_atomic (gcov_type *counters, gcov_type value)
> +{
> +  __gcov_one_value_profiler_body_atomic (counters, value);
> +}
>  #endif
>
>  #ifdef L_gcov_indirect_call_profiler
> @@ -1774,6 +1796,17 @@ __gcov_indirect_call_profiler (gcov_type* counter,
>           && *(void **) cur_func == *(void **) callee_func))
>      __gcov_one_value_profiler_body (counter, value);
>  }
> +
> +/* Atomic update version of __gcov_indirect_call_profiler().  */
> +void
> +__gcov_indirect_call_profiler_atomic (gcov_type* counter, gcov_type value,
> +                                      void* cur_func, void* callee_func)
> +{
> +  if (cur_func == callee_func
> +      || (VTABLE_USES_DESCRIPTORS && callee_func
> +          && *(void **) cur_func == *(void **) callee_func))
> +    __gcov_one_value_profiler_body_atomic (counter, value);
> +}
>  #endif
>
>
> @@ -2089,9 +2122,11 @@ EXPORT_SYMBOL (__gcov_merge_reusedist);
>
>  EXPORT_SYMBOL (__gcov_average_profiler);
>  EXPORT_SYMBOL (__gcov_indirect_call_profiler);
> +EXPORT_SYMBOL (__gcov_indirect_call_profiler_atomic);
>  EXPORT_SYMBOL (__gcov_interval_profiler);
>  EXPORT_SYMBOL (__gcov_ior_profiler);
>  EXPORT_SYMBOL (__gcov_one_value_profiler);
> +EXPORT_SYMBOL (__gcov_one_value_profiler_atomic);
>  EXPORT_SYMBOL (__gcov_pow2_profiler);
>
>  #endif /* __GCOV_KERNEL__ */
>
> --
> This patch is available for review at http://codereview.appspot.com/6965050
Andrew Pinski - Dec. 20, 2012, 12:29 a.m.
On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
> Hi,
>
> This patch adds support for atomically updating the profile counters.
> Tested with google internal benchmarks and fdo kernel build.

I think you should use the __atomic_ functions instead of __sync_
functions as they allow better performance for simple counters as you
can use __ATOMIC_RELAXED.

And this would be useful for the trunk also.  I was going to implement
this exact thing this week but some other important stuff came up.

Thanks,
Andrew Pinski


>
> Thanks,
>
> -Rong
>
> 2012-12-19  Rong Xu  <xur@google.com>
>
>         * gcc/common.opt: Add -fprofile-gen-atomic option.
>         * gcc/gcov-io.h: Add profile atomic update support.
>         * gcc/tree-profile.c (gimple_init_edge_profiler): Ditto.
>         (gimple_gen_edge_profiler): Ditto.
>         * libgcc/libgcov.c (__gcov_one_value_profiler_body): Ditto.
>         (__gcov_one_value_profiler_body_atomic): Ditto.
>         (__gcov_one_value_profiler_atomic): Ditto.
>         (__gcov_indirect_call_profiler_atomic): Ditto.
>
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt      (revision 194562)
> +++ gcc/common.opt      (working copy)
> @@ -1754,6 +1754,15 @@ fprofile-dump
>  Common Report Var(flag_profile_dump) Init(0) Optimization
>  Dump CFG profile for comparison.
>
> +; fprofile-gen-atomic=0: disable atomic updates.
> +; fprofile-gen-atomic=1: atomically update edge profile counters.
> +; fprofile-gen-atomic=2: atomically update value profile counters.
> +; fprofile-gen-atomic=3: atomically update edge and value profile counters.
> +; other values will be ignored (fall back to the default of 0).
> +fprofile-gen-atomic=
> +Common Joined UInteger Report Var(flag_profile_gen_atomic) Init(0) Optimization
> +fprofile-gen-atomic=[0..3] Atomically increments for profile counters.
> +
>  fprofile-generate
>  Common
>  Enable common options for generating profile info for profile feedback directed optimizations
> Index: gcc/gcov-io.h
> ===================================================================
> --- gcc/gcov-io.h       (revision 194562)
> +++ gcc/gcov-io.h       (working copy)
> @@ -300,6 +300,14 @@ typedef unsigned gcov_type_unsigned __attribute__
>
>  #endif  /* BITS_PER_UNIT == 8  */
>
> +#if LONG_LONG_TYPE_SIZE > 32
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_8
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_8
> +#else
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_4
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_4
> +#endif
> +
>  #undef EXTRACT_MODULE_ID_FROM_GLOBAL_ID
>  #undef EXTRACT_FUNC_ID_FROM_GLOBAL_ID
>  #undef GEN_FUNC_GLOBAL_ID
> @@ -322,6 +330,18 @@ typedef unsigned gcov_type_unsigned __attribute__
>  typedef unsigned gcov_unsigned_t;
>  typedef unsigned gcov_position_t;
>
> +#if LONG_LONG_TYPE_SIZE > 32
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_8
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_8
> +#else
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_4
> +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_4
> +#endif
> +#define PROFILE_GEN_EDGE_ATOMIC (flag_profile_gen_atomic == 1 || \
> +                                 flag_profile_gen_atomic == 3)
> +#define PROFILE_GEN_VALUE_ATOMIC (flag_profile_gen_atomic == 2 || \
> +                                  flag_profile_gen_atomic == 3)
> +
>  /* gcov_type is typedef'd elsewhere for the compiler */
>  #if IN_GCOV
>  #define GCOV_LINKAGE static
> Index: gcc/tree-profile.c
> ===================================================================
> --- gcc/tree-profile.c  (revision 194562)
> +++ gcc/tree-profile.c  (working copy)
> @@ -471,7 +471,12 @@ gimple_init_edge_profiler (void)
>               = build_function_type_list (void_type_node,
>                                           gcov_type_ptr, gcov_type_node,
>                                           NULL_TREE);
> -      tree_one_value_profiler_fn
> +      if (PROFILE_GEN_VALUE_ATOMIC)
> +        tree_one_value_profiler_fn
> +             = build_fn_decl ("__gcov_one_value_profiler_atomic",
> +                                    one_value_profiler_fn_type);
> +      else
> +        tree_one_value_profiler_fn
>               = build_fn_decl ("__gcov_one_value_profiler",
>                                      one_value_profiler_fn_type);
>        TREE_NOTHROW (tree_one_value_profiler_fn) = 1;
> @@ -487,7 +492,12 @@ gimple_init_edge_profiler (void)
>                                        gcov_type_ptr, gcov_type_node,
>                                        ptr_void,
>                                        ptr_void, NULL_TREE);
> -      tree_indirect_call_profiler_fn
> +      if (PROFILE_GEN_VALUE_ATOMIC)
> +        tree_indirect_call_profiler_fn
> +             = build_fn_decl ("__gcov_indirect_call_profiler_atomic",
> +                                    ic_profiler_fn_type);
> +      else
> +        tree_indirect_call_profiler_fn
>               = build_fn_decl ("__gcov_indirect_call_profiler",
>                                      ic_profiler_fn_type);
>        TREE_NOTHROW (tree_indirect_call_profiler_fn) = 1;
> @@ -563,21 +573,37 @@ gimple_gen_edge_profiler (int edgeno, edge e)
>       gets re-set in tree_profiling.  */
>    if (gcov_type_tmp_var == NULL_TREE)
>      gcov_type_tmp_var = create_tmp_reg (gcov_type_node, "PROF_edge_counter");
> -  ref = tree_coverage_counter_ref (GCOV_COUNTER_ARCS, edgeno);
> +
> +  if (PROFILE_GEN_EDGE_ATOMIC)
> +    ref = tree_coverage_counter_addr (GCOV_COUNTER_ARCS, edgeno);
> +  else
> +    ref = tree_coverage_counter_ref (GCOV_COUNTER_ARCS, edgeno);
> +
>    one = build_int_cst (gcov_type_node, 1);
> -  stmt1 = gimple_build_assign (gcov_type_tmp_var, ref);
> -  gimple_assign_set_lhs (stmt1, make_ssa_name (gcov_type_tmp_var, stmt1));
> -  find_referenced_vars_in (stmt1);
> -  stmt2 = gimple_build_assign_with_ops (PLUS_EXPR, gcov_type_tmp_var,
> -                                       gimple_assign_lhs (stmt1), one);
> -  gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
> -  stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
> +  if (PROFILE_GEN_EDGE_ATOMIC)
> +    {
> +      /* __sync_fetch_and_add_8 (&counter, 1); */
> +      stmt3 = gimple_build_call (builtin_decl_explicit
> +                                    (GCOV_TYPE_SYNC_FETCH_AND_ADD),
> +                                 2, ref, one);
> +      find_referenced_vars_in (stmt3);
> +    }
> +  else
> +    {
> +      stmt1 = gimple_build_assign (gcov_type_tmp_var, ref);
> +      gimple_assign_set_lhs (stmt1, make_ssa_name (gcov_type_tmp_var, stmt1));
> +      find_referenced_vars_in (stmt1);
> +      stmt2 = gimple_build_assign_with_ops (PLUS_EXPR, gcov_type_tmp_var,
> +                                       gimple_assign_lhs (stmt1), one);
> +      gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
> +      stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
>
> -  if (flag_profile_generate_sampling)
> -    pointer_set_insert (instrumentation_to_be_sampled, stmt1);
> +      if (flag_profile_generate_sampling)
> +        pointer_set_insert (instrumentation_to_be_sampled, stmt1);
>
> -  gsi_insert_on_edge (e, stmt1);
> -  gsi_insert_on_edge (e, stmt2);
> +      gsi_insert_on_edge (e, stmt1);
> +      gsi_insert_on_edge (e, stmt2);
> +    }
>    gsi_insert_on_edge (e, stmt3);
>  }
>
> Index: libgcc/libgcov.c
> ===================================================================
> --- libgcc/libgcov.c    (revision 194562)
> +++ libgcc/libgcov.c    (working copy)
> @@ -1632,6 +1632,22 @@ __gcov_one_value_profiler_body (gcov_type *counter
>    counters[2]++;
>  }
>
> +/* Atomic update version of __gcov_one_value_profiler_body().  */
> +static inline void
> +__gcov_one_value_profiler_body_atomic (gcov_type *counters, gcov_type value)
> +{
> +  if (value == counters[0])
> +    GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[1], 1);
> +  else if (counters[1] == 0)
> +    {
> +      counters[1] = 1;
> +      counters[0] = value;
> +    }
> +  else
> +    GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[1], -1);
> +  GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[2], 1);
> +}
> +
>  #ifdef L_gcov_indirect_call_topn_profiler
>  /* Tries to keep track the most frequent N values in the counters where
>     N is specified by parameter TOPN_VAL. To track top N values, 2*N counter
> @@ -1740,6 +1756,12 @@ __gcov_one_value_profiler (gcov_type *counters, gc
>  {
>    __gcov_one_value_profiler_body (counters, value);
>  }
> +
> +void
> +__gcov_one_value_profiler_atomic (gcov_type *counters, gcov_type value)
> +{
> +  __gcov_one_value_profiler_body_atomic (counters, value);
> +}
>  #endif
>
>  #ifdef L_gcov_indirect_call_profiler
> @@ -1774,6 +1796,17 @@ __gcov_indirect_call_profiler (gcov_type* counter,
>           && *(void **) cur_func == *(void **) callee_func))
>      __gcov_one_value_profiler_body (counter, value);
>  }
> +
> +/* Atomic update version of __gcov_indirect_call_profiler().  */
> +void
> +__gcov_indirect_call_profiler_atomic (gcov_type* counter, gcov_type value,
> +                                      void* cur_func, void* callee_func)
> +{
> +  if (cur_func == callee_func
> +      || (VTABLE_USES_DESCRIPTORS && callee_func
> +          && *(void **) cur_func == *(void **) callee_func))
> +    __gcov_one_value_profiler_body_atomic (counter, value);
> +}
>  #endif
>
>
> @@ -2089,9 +2122,11 @@ EXPORT_SYMBOL (__gcov_merge_reusedist);
>
>  EXPORT_SYMBOL (__gcov_average_profiler);
>  EXPORT_SYMBOL (__gcov_indirect_call_profiler);
> +EXPORT_SYMBOL (__gcov_indirect_call_profiler_atomic);
>  EXPORT_SYMBOL (__gcov_interval_profiler);
>  EXPORT_SYMBOL (__gcov_ior_profiler);
>  EXPORT_SYMBOL (__gcov_one_value_profiler);
> +EXPORT_SYMBOL (__gcov_one_value_profiler_atomic);
>  EXPORT_SYMBOL (__gcov_pow2_profiler);
>
>  #endif /* __GCOV_KERNEL__ */
>
> --
> This patch is available for review at http://codereview.appspot.com/6965050
Rong Xu - Dec. 20, 2012, 12:56 a.m.
On Wed, Dec 19, 2012 at 4:29 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
> > Hi,
> >
> > > This patch adds support for atomically updating the profile counters.
> > Tested with google internal benchmarks and fdo kernel build.
>
> I think you should use the __atomic_ functions instead of __sync_
> functions as they allow better performance for simple counters as you
> can use __ATOMIC_RELAXED.

You are right. I think __ATOMIC_RELAXED should be OK here.
Thanks for the suggestion.

>
> And this would be useful for the trunk also.  I was going to implement
> this exact thing this week but some other important stuff came up.

I'll post trunk patch later.

>
> Thanks,
> Andrew Pinski
>
>
> >
> > Thanks,
> >
> > -Rong
> >
> > 2012-12-19  Rong Xu  <xur@google.com>
> >
> >         * gcc/common.opt: Add -fprofile-gen-atomic option.
> >         * gcc/gcov-io.h: Add profile atomic update support.
> >         * gcc/tree-profile.c (gimple_init_edge_profiler): Ditto.
> >         (gimple_gen_edge_profiler): Ditto.
> >         * libgcc/libgcov.c (__gcov_one_value_profiler_body): Ditto.
> >         (__gcov_one_value_profiler_body_atomic): Ditto.
> >         (__gcov_one_value_profiler_atomic): Ditto.
> >         (__gcov_indirect_call_profiler_atomic): Ditto.
> >
> > Index: gcc/common.opt
> > ===================================================================
> > --- gcc/common.opt      (revision 194562)
> > +++ gcc/common.opt      (working copy)
> > @@ -1754,6 +1754,15 @@ fprofile-dump
> >  Common Report Var(flag_profile_dump) Init(0) Optimization
> >  Dump CFG profile for comparison.
> >
> > +; fprofile-gen-atomic=0: disable atomic updates.
> > +; fprofile-gen-atomic=1: atomically update edge profile counters.
> > +; fprofile-gen-atomic=2: atomically update value profile counters.
> > +; fprofile-gen-atomic=3: atomically update edge and value profile counters.
> > +; other values will be ignored (fall back to the default of 0).
> > +fprofile-gen-atomic=
> > +Common Joined UInteger Report Var(flag_profile_gen_atomic) Init(0) Optimization
> > +fprofile-gen-atomic=[0..3] Atomically increments for profile counters.
> > +
> >  fprofile-generate
> >  Common
> >  Enable common options for generating profile info for profile feedback directed optimizations
> > Index: gcc/gcov-io.h
> > ===================================================================
> > --- gcc/gcov-io.h       (revision 194562)
> > +++ gcc/gcov-io.h       (working copy)
> > @@ -300,6 +300,14 @@ typedef unsigned gcov_type_unsigned __attribute__
> >
> >  #endif  /* BITS_PER_UNIT == 8  */
> >
> > +#if LONG_LONG_TYPE_SIZE > 32
> > +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_8
> > +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_8
> > +#else
> > +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_4
> > +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_4
> > +#endif
> > +
> >  #undef EXTRACT_MODULE_ID_FROM_GLOBAL_ID
> >  #undef EXTRACT_FUNC_ID_FROM_GLOBAL_ID
> >  #undef GEN_FUNC_GLOBAL_ID
> > @@ -322,6 +330,18 @@ typedef unsigned gcov_type_unsigned __attribute__
> >  typedef unsigned gcov_unsigned_t;
> >  typedef unsigned gcov_position_t;
> >
> > +#if LONG_LONG_TYPE_SIZE > 32
> > +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_8
> > +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_8
> > +#else
> > +#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_4
> > +#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_4
> > +#endif
> > +#define PROFILE_GEN_EDGE_ATOMIC (flag_profile_gen_atomic == 1 || \
> > +                                 flag_profile_gen_atomic == 3)
> > +#define PROFILE_GEN_VALUE_ATOMIC (flag_profile_gen_atomic == 2 || \
> > +                                  flag_profile_gen_atomic == 3)
> > +
> >  /* gcov_type is typedef'd elsewhere for the compiler */
> >  #if IN_GCOV
> >  #define GCOV_LINKAGE static
> > Index: gcc/tree-profile.c
> > ===================================================================
> > --- gcc/tree-profile.c  (revision 194562)
> > +++ gcc/tree-profile.c  (working copy)
> > @@ -471,7 +471,12 @@ gimple_init_edge_profiler (void)
> >               = build_function_type_list (void_type_node,
> >                                           gcov_type_ptr, gcov_type_node,
> >                                           NULL_TREE);
> > -      tree_one_value_profiler_fn
> > +      if (PROFILE_GEN_VALUE_ATOMIC)
> > +        tree_one_value_profiler_fn
> > +             = build_fn_decl ("__gcov_one_value_profiler_atomic",
> > +                                    one_value_profiler_fn_type);
> > +      else
> > +        tree_one_value_profiler_fn
> >               = build_fn_decl ("__gcov_one_value_profiler",
> >                                      one_value_profiler_fn_type);
> >        TREE_NOTHROW (tree_one_value_profiler_fn) = 1;
> > @@ -487,7 +492,12 @@ gimple_init_edge_profiler (void)
> >                                        gcov_type_ptr, gcov_type_node,
> >                                        ptr_void,
> >                                        ptr_void, NULL_TREE);
> > -      tree_indirect_call_profiler_fn
> > +      if (PROFILE_GEN_VALUE_ATOMIC)
> > +        tree_indirect_call_profiler_fn
> > +             = build_fn_decl ("__gcov_indirect_call_profiler_atomic",
> > +                                    ic_profiler_fn_type);
> > +      else
> > +        tree_indirect_call_profiler_fn
> >               = build_fn_decl ("__gcov_indirect_call_profiler",
> >                                      ic_profiler_fn_type);
> >        TREE_NOTHROW (tree_indirect_call_profiler_fn) = 1;
> > @@ -563,21 +573,37 @@ gimple_gen_edge_profiler (int edgeno, edge e)
> >       gets re-set in tree_profiling.  */
> >    if (gcov_type_tmp_var == NULL_TREE)
> >      gcov_type_tmp_var = create_tmp_reg (gcov_type_node, "PROF_edge_counter");
> > -  ref = tree_coverage_counter_ref (GCOV_COUNTER_ARCS, edgeno);
> > +
> > +  if (PROFILE_GEN_EDGE_ATOMIC)
> > +    ref = tree_coverage_counter_addr (GCOV_COUNTER_ARCS, edgeno);
> > +  else
> > +    ref = tree_coverage_counter_ref (GCOV_COUNTER_ARCS, edgeno);
> > +
> >    one = build_int_cst (gcov_type_node, 1);
> > -  stmt1 = gimple_build_assign (gcov_type_tmp_var, ref);
> > -  gimple_assign_set_lhs (stmt1, make_ssa_name (gcov_type_tmp_var, stmt1));
> > -  find_referenced_vars_in (stmt1);
> > -  stmt2 = gimple_build_assign_with_ops (PLUS_EXPR, gcov_type_tmp_var,
> > -                                       gimple_assign_lhs (stmt1), one);
> > -  gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
> > -  stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
> > +  if (PROFILE_GEN_EDGE_ATOMIC)
> > +    {
> > +      /* __sync_fetch_and_add_8 (&counter, 1); */
> > +      stmt3 = gimple_build_call (builtin_decl_explicit
> > +                                    (GCOV_TYPE_SYNC_FETCH_AND_ADD),
> > +                                 2, ref, one);
> > +      find_referenced_vars_in (stmt3);
> > +    }
> > +  else
> > +    {
> > +      stmt1 = gimple_build_assign (gcov_type_tmp_var, ref);
> > +      gimple_assign_set_lhs (stmt1, make_ssa_name (gcov_type_tmp_var, stmt1));
> > +      find_referenced_vars_in (stmt1);
> > +      stmt2 = gimple_build_assign_with_ops (PLUS_EXPR, gcov_type_tmp_var,
> > +                                       gimple_assign_lhs (stmt1), one);
> > +      gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
> > +      stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
> >
> > -  if (flag_profile_generate_sampling)
> > -    pointer_set_insert (instrumentation_to_be_sampled, stmt1);
> > +      if (flag_profile_generate_sampling)
> > +        pointer_set_insert (instrumentation_to_be_sampled, stmt1);
> >
> > -  gsi_insert_on_edge (e, stmt1);
> > -  gsi_insert_on_edge (e, stmt2);
> > +      gsi_insert_on_edge (e, stmt1);
> > +      gsi_insert_on_edge (e, stmt2);
> > +    }
> >    gsi_insert_on_edge (e, stmt3);
> >  }
> >
> > Index: libgcc/libgcov.c
> > ===================================================================
> > --- libgcc/libgcov.c    (revision 194562)
> > +++ libgcc/libgcov.c    (working copy)
> > @@ -1632,6 +1632,22 @@ __gcov_one_value_profiler_body (gcov_type *counter
> >    counters[2]++;
> >  }
> >
> > +/* Atomic update version of __gcov_one_value_profiler_body().  */
> > +static inline void
> > +__gcov_one_value_profiler_body_atomic (gcov_type *counters, gcov_type value)
> > +{
> > +  if (value == counters[0])
> > +    GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[1], 1);
> > +  else if (counters[1] == 0)
> > +    {
> > +      counters[1] = 1;
> > +      counters[0] = value;
> > +    }
> > +  else
> > +    GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[1], -1);
> > +  GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[2], 1);
> > +}
> > +
> >  #ifdef L_gcov_indirect_call_topn_profiler
> >  /* Tries to keep track the most frequent N values in the counters where
> >     N is specified by parameter TOPN_VAL. To track top N values, 2*N counter
> > @@ -1740,6 +1756,12 @@ __gcov_one_value_profiler (gcov_type *counters, gc
> >  {
> >    __gcov_one_value_profiler_body (counters, value);
> >  }
> > +
> > +void
> > +__gcov_one_value_profiler_atomic (gcov_type *counters, gcov_type value)
> > +{
> > +  __gcov_one_value_profiler_body_atomic (counters, value);
> > +}
> >  #endif
> >
> >  #ifdef L_gcov_indirect_call_profiler
> > @@ -1774,6 +1796,17 @@ __gcov_indirect_call_profiler (gcov_type* counter,
> >           && *(void **) cur_func == *(void **) callee_func))
> >      __gcov_one_value_profiler_body (counter, value);
> >  }
> > +
> > +/* Atomic update version of __gcov_indirect_call_profiler().  */
> > +void
> > +__gcov_indirect_call_profiler_atomic (gcov_type* counter, gcov_type value,
> > +                                      void* cur_func, void* callee_func)
> > +{
> > +  if (cur_func == callee_func
> > +      || (VTABLE_USES_DESCRIPTORS && callee_func
> > +          && *(void **) cur_func == *(void **) callee_func))
> > +    __gcov_one_value_profiler_body_atomic (counter, value);
> > +}
> >  #endif
> >
> >
> > @@ -2089,9 +2122,11 @@ EXPORT_SYMBOL (__gcov_merge_reusedist);
> >
> >  EXPORT_SYMBOL (__gcov_average_profiler);
> >  EXPORT_SYMBOL (__gcov_indirect_call_profiler);
> > +EXPORT_SYMBOL (__gcov_indirect_call_profiler_atomic);
> >  EXPORT_SYMBOL (__gcov_interval_profiler);
> >  EXPORT_SYMBOL (__gcov_ior_profiler);
> >  EXPORT_SYMBOL (__gcov_one_value_profiler);
> > +EXPORT_SYMBOL (__gcov_one_value_profiler_atomic);
> >  EXPORT_SYMBOL (__gcov_pow2_profiler);
> >
> >  #endif /* __GCOV_KERNEL__ */
> >
> > --
> > This patch is available for review at http://codereview.appspot.com/6965050
Jan Hubicka - Dec. 20, 2012, 4:20 p.m.
> On Wed, Dec 19, 2012 at 4:29 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> >
> > On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
> > > Hi,
> > >
> > > This patch adds the supprot of atomic update the profile counters.
> > > Tested with google internal benchmarks and fdo kernel build.
> >
> > I think you should use the __atomic_ functions instead of __sync_
> > functions as they allow better performance for simple counters as you
> > can use __ATOMIC_RELAXED.
> 
> You are right. I think __ATOMIC_RELAXED should be OK here.
> Thanks for the suggestion.
> 
> >
> > And this would be useful for the trunk also.  I was going to implement
> > this exact thing this week but some other important stuff came up.
> 
> I'll post trunk patch later.

Yes, I like that patch, too. Even if the costs are quite high (and this is why
atomic updates were sort of voted down in the past), the alternative of using TLS
has problems with too much per-thread memory.

While there are even more alternatives, like recording the changes and
committing them in blocks (say at function return), I guess some solution is
better than no solution.

Thanks,
Honza
Andrew Pinski - Dec. 20, 2012, 4:57 p.m.
On Thu, Dec 20, 2012 at 8:20 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> On Wed, Dec 19, 2012 at 4:29 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>> >
>> > On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
>> > > Hi,
>> > >
>> > > This patch adds the supprot of atomic update the profile counters.
>> > > Tested with google internal benchmarks and fdo kernel build.
>> >
>> > I think you should use the __atomic_ functions instead of __sync_
>> > functions as they allow better performance for simple counters as you
>> > can use __ATOMIC_RELAXED.
>>
>> You are right. I think __ATOMIC_RELAXED should be OK here.
>> Thanks for the suggestion.
>>
>> >
>> > And this would be useful for the trunk also.  I was going to implement
>> > this exact thing this week but some other important stuff came up.
>>
>> I'll post trunk patch later.
>
> Yes, I like that patch, too. Even if the costs are quite high (and this is why
> atomic updates was sort of voted down in the past) the alternative of using TLS
> has problems with too-much per-thread memory.

Actually, sometimes (on some processors) atomic increments are cheaper
than doing a regular increment, mainly because there is an
instruction which can handle it in the L2 cache rather than populating
the L1.   Octeon is one such processor where this is true.

Thanks,
Andrew Pinski

>
> While there are even more alternatives, like recording the changes and
> commmiting them in blocks (say at function return), I guess some solution is
> better than no solution.
>
> Thanks,
> Honza
Rong Xu - Dec. 20, 2012, 7:35 p.m.
We have this patch primarily for getting valid profile counts. We
observe that for some heavily threaded programs, we are getting poor
counters due to data races in the counter updates (e.g. the counter value is
only 15% of what it is supposed to be for a 10-thread program).

In general, enabling atomic updates slows down programs (for some
of my toy programs, it causes a 3x slowdown). That is the reason I use
options to control value and edge profile counts.

-Rong

On Thu, Dec 20, 2012 at 8:57 AM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Thu, Dec 20, 2012 at 8:20 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>>> On Wed, Dec 19, 2012 at 4:29 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>>> >
>>> > On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
>>> > > Hi,
>>> > >
>>> > > This patch adds the supprot of atomic update the profile counters.
>>> > > Tested with google internal benchmarks and fdo kernel build.
>>> >
>>> > I think you should use the __atomic_ functions instead of __sync_
>>> > functions as they allow better performance for simple counters as you
>>> > can use __ATOMIC_RELAXED.
>>>
>>> You are right. I think __ATOMIC_RELAXED should be OK here.
>>> Thanks for the suggestion.
>>>
>>> >
>>> > And this would be useful for the trunk also.  I was going to implement
>>> > this exact thing this week but some other important stuff came up.
>>>
>>> I'll post trunk patch later.
>>
>> Yes, I like that patch, too. Even if the costs are quite high (and this is why
>> atomic updates was sort of voted down in the past) the alternative of using TLS
>> has problems with too-much per-thread memory.
>
> Actually sometimes (on some processors) atomic increments are cheaper
> than doing a regular incremental.  Mainly because there is an
> instruction which can handle it in the L2 cache rather than populating
> the L1.   Octeon is one such processor where this is true.
>
> Thanks,
> Andrew Pinski
>
>>
>> While there are even more alternatives, like recording the changes and
>> commmiting them in blocks (say at function return), I guess some solution is
>> better than no solution.
>>
>> Thanks,
>> Honza
Andrew Pinski - Dec. 20, 2012, 7:42 p.m.
On Thu, Dec 20, 2012 at 11:35 AM, Rong Xu <xur@google.com> wrote:
> we have this patch primarily for getting valid profile counts. we
> observe that for some high-threaded programs, we are getting poor
> counter due to data racing of counter update (like counter value is
> only 15% of what it supposed to be for a 10-thread program).

I have seen much worse on Octeon running a 32-threaded program.  I
think it was only 1% of what it should have been.


>
> In general, enabling atomic updates slows down programs. (for my some
> of my toy programs, it has 3x slow down.) And that the reason I use
> options to control value and edge profile count.

I think on Octeon, the atomic updates would be a speedup because of
the atomic instruction which was added explicitly for incrementing a
statistics counter.  Internally at Cavium, I might just turn this on
by default as it even helps the one thread case :).

Thanks,
Andrew Pinski

>
> -Rong
>
> On Thu, Dec 20, 2012 at 8:57 AM, Andrew Pinski <pinskia@gmail.com> wrote:
>> On Thu, Dec 20, 2012 at 8:20 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>>>> On Wed, Dec 19, 2012 at 4:29 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>>>> >
>>>> > On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
>>>> > > Hi,
>>>> > >
>>>> > > This patch adds the supprot of atomic update the profile counters.
>>>> > > Tested with google internal benchmarks and fdo kernel build.
>>>> >
>>>> > I think you should use the __atomic_ functions instead of __sync_
>>>> > functions as they allow better performance for simple counters as you
>>>> > can use __ATOMIC_RELAXED.
>>>>
>>>> You are right. I think __ATOMIC_RELAXED should be OK here.
>>>> Thanks for the suggestion.
>>>>
>>>> >
>>>> > And this would be useful for the trunk also.  I was going to implement
>>>> > this exact thing this week but some other important stuff came up.
>>>>
>>>> I'll post trunk patch later.
>>>
>>> Yes, I like that patch, too. Even if the costs are quite high (and this is why
>>> atomic updates was sort of voted down in the past) the alternative of using TLS
>>> has problems with too-much per-thread memory.
>>
>> Actually sometimes (on some processors) atomic increments are cheaper
>> than doing a regular incremental.  Mainly because there is an
>> instruction which can handle it in the L2 cache rather than populating
>> the L1.   Octeon is one such processor where this is true.
>>
>> Thanks,
>> Andrew Pinski
>>
>>>
>>> While there are even more alternatives, like recording the changes and
>>> commmiting them in blocks (say at function return), I guess some solution is
>>> better than no solution.
>>>
>>> Thanks,
>>> Honza
Jan Hubicka - Dec. 21, 2012, 9:13 a.m.
> On Thu, Dec 20, 2012 at 8:20 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> >> On Wed, Dec 19, 2012 at 4:29 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> >> >
> >> > On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
> >> > > Hi,
> >> > >
> >> > > This patch adds the supprot of atomic update the profile counters.
> >> > > Tested with google internal benchmarks and fdo kernel build.
> >> >
> >> > I think you should use the __atomic_ functions instead of __sync_
> >> > functions as they allow better performance for simple counters as you
> >> > can use __ATOMIC_RELAXED.
> >>
> >> You are right. I think __ATOMIC_RELAXED should be OK here.
> >> Thanks for the suggestion.
> >>
> >> >
> >> > And this would be useful for the trunk also.  I was going to implement
> >> > this exact thing this week but some other important stuff came up.
> >>
> >> I'll post trunk patch later.
> >
> > Yes, I like that patch, too. Even if the costs are quite high (and this is why
> > atomic updates was sort of voted down in the past) the alternative of using TLS
> > has problems with too-much per-thread memory.
> 
> Actually sometimes (on some processors) atomic increments are cheaper
> than doing a regular incremental.  Mainly because there is an
> instruction which can handle it in the L2 cache rather than populating
> the L1.   Octeon is one such processor where this is true.

One reason for large divergence may be the fact that we optimize the counter
update code.  Perhaps declaring counters volatile will prevent load/store motion
and reduce the racing, too.

Honza
> 
> Thanks,
> Andrew Pinski
> 
> >
> > While there are even more alternatives, like recording the changes and
> > commmiting them in blocks (say at function return), I guess some solution is
> > better than no solution.
> >
> > Thanks,
> > Honza
Richard Guenther - Dec. 21, 2012, 9:55 a.m.
On Fri, Dec 21, 2012 at 10:13 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> On Thu, Dec 20, 2012 at 8:20 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> >> On Wed, Dec 19, 2012 at 4:29 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>> >> >
>> >> > On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
>> >> > > Hi,
>> >> > >
>> >> > > This patch adds the supprot of atomic update the profile counters.
>> >> > > Tested with google internal benchmarks and fdo kernel build.
>> >> >
>> >> > I think you should use the __atomic_ functions instead of __sync_
>> >> > functions as they allow better performance for simple counters as you
>> >> > can use __ATOMIC_RELAXED.
>> >>
>> >> You are right. I think __ATOMIC_RELAXED should be OK here.
>> >> Thanks for the suggestion.
>> >>
>> >> >
>> >> > And this would be useful for the trunk also.  I was going to implement
>> >> > this exact thing this week but some other important stuff came up.
>> >>
>> >> I'll post trunk patch later.
>> >
>> > Yes, I like that patch, too. Even if the costs are quite high (and this is why
>> > atomic updates was sort of voted down in the past) the alternative of using TLS
>> > has problems with too-much per-thread memory.
>>
>> Actually sometimes (on some processors) atomic increments are cheaper
>> than doing a regular incremental.  Mainly because there is an
>> instruction which can handle it in the L2 cache rather than populating
>> the L1.   Octeon is one such processor where this is true.
>
> One reason for large divergence may be the fact that we optimize the counter
> update code.  Perhaps declaring counters volatile will prevent load/store motion
> and reduce the racing, too.

Well, that will make it slower, too.  The best benchmark to check is tramp3d
for all this stuff.  I remember that ICC when it had a function call for each
counter update was about 100000x slower instrumented than w/o instrumentation
(that is, I never waited long enough to make it finish even one iteration ...)

Thus, it's very important that counter updates are subject to loop
invariant / store motion (and SCEV const-prop)!  GCC does a wonderful job
here at the moment; please do not regress here.

Richard.

> Honza
>>
>> Thanks,
>> Andrew Pinski
>>
>> >
>> > While there are even more alternatives, like recording the changes and
>> > commmiting them in blocks (say at function return), I guess some solution is
>> > better than no solution.
>> >
>> > Thanks,
>> > Honza
Jan Hubicka - Dec. 21, 2012, 10:36 a.m.
> On Fri, Dec 21, 2012 at 10:13 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> >> On Thu, Dec 20, 2012 at 8:20 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> >> >> On Wed, Dec 19, 2012 at 4:29 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> >> >> >
> >> >> > On Wed, Dec 19, 2012 at 12:08 PM, Rong Xu <xur@google.com> wrote:
> >> >> > > Hi,
> >> >> > >
> >> >> > > This patch adds the supprot of atomic update the profile counters.
> >> >> > > Tested with google internal benchmarks and fdo kernel build.
> >> >> >
> >> >> > I think you should use the __atomic_ functions instead of __sync_
> >> >> > functions as they allow better performance for simple counters as you
> >> >> > can use __ATOMIC_RELAXED.
> >> >>
> >> >> You are right. I think __ATOMIC_RELAXED should be OK here.
> >> >> Thanks for the suggestion.
> >> >>
> >> >> >
> >> >> > And this would be useful for the trunk also.  I was going to implement
> >> >> > this exact thing this week but some other important stuff came up.
> >> >>
> >> >> I'll post trunk patch later.
> >> >
> >> > Yes, I like that patch, too. Even if the costs are quite high (and this is why
> >> > atomic updates was sort of voted down in the past) the alternative of using TLS
> >> > has problems with too-much per-thread memory.
> >>
> >> Actually sometimes (on some processors) atomic increments are cheaper
> >> than doing a regular incremental.  Mainly because there is an
> >> instruction which can handle it in the L2 cache rather than populating
> >> the L1.   Octeon is one such processor where this is true.
> >
> > One reason for large divergence may be the fact that we optimize the counter
> > update code.  Perhaps declaring counters volatile will prevent load/store motion
> > and reduce the racing, too.
> 
> Well, that will make it slower, too.  The best benchmark to check is tramp3d
> for all this stuff.  I remember that ICC when it had a function call for each
> counter update was about 100000x slower instrumented than w/o instrumentation
> (that is, I never waited long enough to make it finish even one iteration ...)
> 
> Thus, it's very important that counter updates are subject to loop
> invariant / store
> motion (and SCEV const-prop)!  GCC does a wonderful job here at the moment,
> please do not regress here.

Well, this feature is enabled by a user switch.  I do not think we should change
the default behaviour...

Which makes me ask: the patch is very isolated (i.e. enabled by command line
only) and has obvious value for the end user.  Would it be fine for stage3?

Honza

Patch

Index: gcc/common.opt
===================================================================
--- gcc/common.opt	(revision 194562)
+++ gcc/common.opt	(working copy)
@@ -1754,6 +1754,15 @@  fprofile-dump
 Common Report Var(flag_profile_dump) Init(0) Optimization
 Dump CFG profile for comparison.
 
+; fprofile-gen-atomic=0: disable atomic updates.
+; fprofile-gen-atomic=1: atomically update edge profile counters.
+; fprofile-gen-atomic=2: atomically update value profile counters.
+; fprofile-gen-atomic=3: atomically update edge and value profile counters.
+; other values will be ignored (fall back to the default of 0).
+fprofile-gen-atomic=
+Common Joined UInteger Report Var(flag_profile_gen_atomic) Init(0) Optimization
+fprofile-gen-atomic=[0..3] Atomically increments for profile counters.
+
 fprofile-generate
 Common
 Enable common options for generating profile info for profile feedback directed optimizations
Index: gcc/gcov-io.h
===================================================================
--- gcc/gcov-io.h	(revision 194562)
+++ gcc/gcov-io.h	(working copy)
@@ -300,6 +300,14 @@  typedef unsigned gcov_type_unsigned __attribute__
 
 #endif  /* BITS_PER_UNIT == 8  */
 
+#if LONG_LONG_TYPE_SIZE > 32
+#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_8
+#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_8
+#else
+#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_4
+#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_4
+#endif
+
 #undef EXTRACT_MODULE_ID_FROM_GLOBAL_ID
 #undef EXTRACT_FUNC_ID_FROM_GLOBAL_ID
 #undef GEN_FUNC_GLOBAL_ID
@@ -322,6 +330,18 @@  typedef unsigned gcov_type_unsigned __attribute__
 typedef unsigned gcov_unsigned_t;
 typedef unsigned gcov_position_t;
 
+#if LONG_LONG_TYPE_SIZE > 32
+#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_8
+#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_8
+#else
+#define GCOV_TYPE_SYNC_FETCH_AND_ADD_FN __sync_fetch_and_add_4
+#define GCOV_TYPE_SYNC_FETCH_AND_ADD BUILT_IN_SYNC_FETCH_AND_ADD_4
+#endif
+#define PROFILE_GEN_EDGE_ATOMIC (flag_profile_gen_atomic == 1 || \
+                                 flag_profile_gen_atomic == 3)
+#define PROFILE_GEN_VALUE_ATOMIC (flag_profile_gen_atomic == 2 || \
+                                  flag_profile_gen_atomic == 3)
+
 /* gcov_type is typedef'd elsewhere for the compiler */
 #if IN_GCOV
 #define GCOV_LINKAGE static
Index: gcc/tree-profile.c
===================================================================
--- gcc/tree-profile.c	(revision 194562)
+++ gcc/tree-profile.c	(working copy)
@@ -471,7 +471,12 @@  gimple_init_edge_profiler (void)
 	      = build_function_type_list (void_type_node,
 					  gcov_type_ptr, gcov_type_node,
 					  NULL_TREE);
-      tree_one_value_profiler_fn
+      if (PROFILE_GEN_VALUE_ATOMIC)
+        tree_one_value_profiler_fn
+	      = build_fn_decl ("__gcov_one_value_profiler_atomic",
+				     one_value_profiler_fn_type);
+      else
+        tree_one_value_profiler_fn
 	      = build_fn_decl ("__gcov_one_value_profiler",
 				     one_value_profiler_fn_type);
       TREE_NOTHROW (tree_one_value_profiler_fn) = 1;
@@ -487,7 +492,12 @@  gimple_init_edge_profiler (void)
                                       gcov_type_ptr, gcov_type_node,
                                       ptr_void,
                                       ptr_void, NULL_TREE);
-      tree_indirect_call_profiler_fn
+      if (PROFILE_GEN_VALUE_ATOMIC)
+        tree_indirect_call_profiler_fn
+	      = build_fn_decl ("__gcov_indirect_call_profiler_atomic",
+				     ic_profiler_fn_type);
+      else
+        tree_indirect_call_profiler_fn
 	      = build_fn_decl ("__gcov_indirect_call_profiler",
 				     ic_profiler_fn_type);
       TREE_NOTHROW (tree_indirect_call_profiler_fn) = 1;
@@ -563,21 +573,37 @@  gimple_gen_edge_profiler (int edgeno, edge e)
      gets re-set in tree_profiling.  */
   if (gcov_type_tmp_var == NULL_TREE)
     gcov_type_tmp_var = create_tmp_reg (gcov_type_node, "PROF_edge_counter");
-  ref = tree_coverage_counter_ref (GCOV_COUNTER_ARCS, edgeno);
+
+  if (PROFILE_GEN_EDGE_ATOMIC)
+    ref = tree_coverage_counter_addr (GCOV_COUNTER_ARCS, edgeno);
+  else 
+    ref = tree_coverage_counter_ref (GCOV_COUNTER_ARCS, edgeno);
+
   one = build_int_cst (gcov_type_node, 1);
-  stmt1 = gimple_build_assign (gcov_type_tmp_var, ref);
-  gimple_assign_set_lhs (stmt1, make_ssa_name (gcov_type_tmp_var, stmt1));
-  find_referenced_vars_in (stmt1);
-  stmt2 = gimple_build_assign_with_ops (PLUS_EXPR, gcov_type_tmp_var,
-					gimple_assign_lhs (stmt1), one);
-  gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
-  stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
+  if (PROFILE_GEN_EDGE_ATOMIC)
+    {
+      /* __sync_fetch_and_add_8 (&counter, 1); */
+      stmt3 = gimple_build_call (builtin_decl_explicit
+                                    (GCOV_TYPE_SYNC_FETCH_AND_ADD),
+                                 2, ref, one);
+      find_referenced_vars_in (stmt3);
+    }
+  else
+    {
+      stmt1 = gimple_build_assign (gcov_type_tmp_var, ref);
+      gimple_assign_set_lhs (stmt1, make_ssa_name (gcov_type_tmp_var, stmt1));
+      find_referenced_vars_in (stmt1);
+      stmt2 = gimple_build_assign_with_ops (PLUS_EXPR, gcov_type_tmp_var,
+            				gimple_assign_lhs (stmt1), one);
+      gimple_assign_set_lhs (stmt2, make_ssa_name (gcov_type_tmp_var, stmt2));
+      stmt3 = gimple_build_assign (unshare_expr (ref), gimple_assign_lhs (stmt2));
 
-  if (flag_profile_generate_sampling)
-    pointer_set_insert (instrumentation_to_be_sampled, stmt1);
+      if (flag_profile_generate_sampling)
+        pointer_set_insert (instrumentation_to_be_sampled, stmt1);
 
-  gsi_insert_on_edge (e, stmt1);
-  gsi_insert_on_edge (e, stmt2);
+      gsi_insert_on_edge (e, stmt1);
+      gsi_insert_on_edge (e, stmt2);
+    }
   gsi_insert_on_edge (e, stmt3);
 }
 
Index: libgcc/libgcov.c
===================================================================
--- libgcc/libgcov.c	(revision 194562)
+++ libgcc/libgcov.c	(working copy)
@@ -1632,6 +1632,22 @@  __gcov_one_value_profiler_body (gcov_type *counter
   counters[2]++;
 }
 
+/* Atomic update version of __gcov_one_value_profiler_body().  */
+static inline void
+__gcov_one_value_profiler_body_atomic (gcov_type *counters, gcov_type value)
+{
+  if (value == counters[0])
+    GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[1], 1);
+  else if (counters[1] == 0)
+    {
+      counters[1] = 1;
+      counters[0] = value;
+    }
+  else
+    GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[1], -1);
+  GCOV_TYPE_SYNC_FETCH_AND_ADD_FN (&counters[2], 1);
+}
+
 #ifdef L_gcov_indirect_call_topn_profiler
 /* Tries to keep track the most frequent N values in the counters where
    N is specified by parameter TOPN_VAL. To track top N values, 2*N counter
@@ -1740,6 +1756,12 @@  __gcov_one_value_profiler (gcov_type *counters, gc
 {
   __gcov_one_value_profiler_body (counters, value);
 }
+
+void
+__gcov_one_value_profiler_atomic (gcov_type *counters, gcov_type value)
+{
+  __gcov_one_value_profiler_body_atomic (counters, value);
+}
 #endif
 
 #ifdef L_gcov_indirect_call_profiler
@@ -1774,6 +1796,17 @@  __gcov_indirect_call_profiler (gcov_type* counter,
 	  && *(void **) cur_func == *(void **) callee_func))
     __gcov_one_value_profiler_body (counter, value);
 }
+
+/* Atomic update version of __gcov_indirect_call_profiler().  */
+void
+__gcov_indirect_call_profiler_atomic (gcov_type* counter, gcov_type value,
+                                      void* cur_func, void* callee_func)
+{
+  if (cur_func == callee_func
+      || (VTABLE_USES_DESCRIPTORS && callee_func
+          && *(void **) cur_func == *(void **) callee_func))
+    __gcov_one_value_profiler_body_atomic (counter, value);
+}
 #endif
 
 
@@ -2089,9 +2122,11 @@  EXPORT_SYMBOL (__gcov_merge_reusedist);
 
 EXPORT_SYMBOL (__gcov_average_profiler);
 EXPORT_SYMBOL (__gcov_indirect_call_profiler);
+EXPORT_SYMBOL (__gcov_indirect_call_profiler_atomic);
 EXPORT_SYMBOL (__gcov_interval_profiler);
 EXPORT_SYMBOL (__gcov_ior_profiler);
 EXPORT_SYMBOL (__gcov_one_value_profiler);
+EXPORT_SYMBOL (__gcov_one_value_profiler_atomic);
 EXPORT_SYMBOL (__gcov_pow2_profiler);
 
 #endif /* __GCOV_KERNEL__ */