diff mbox series

LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend

Message ID 20230821010453.3916192-1-juzhe.zhong@rivai.ai
State New
Headers show
Series LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend | expand

Commit Message

juzhe.zhong@rivai.ai Aug. 21, 2023, 1:04 a.m. UTC
This patch exports 'compute_antinout_edge' and 'compute_earliest' to global scope;
they are going to be used in the VSETVL PASS of the RISC-V backend.

The demand fusion is the fusion of VSETVL information to emit VSETVL which dominate and pre-config for most
of the RVV instructions in order to elide redundant VSETVLs.

For example:

for
 for
  for
    if (cond)
      VSETVL demand 1: SEW/LMUL = 16 and TU policy
    else
      VSETVL demand 2: SEW = 32

VSETVL pass should be able to fuse demand 1 and demand 2 into new demand: SEW = 32, LMUL = M2, TU policy.
Then emit such a VSETVL at the outermost level of the for loop to get the most optimal codegen and run-time execution.

Currently the VSETVL PASS Phase 3 (demand fusion) is really messy, unreliable, and unmaintainable.
Also, I recently read the dragon book and Morgan's book again, and I found that "earliest" allows us to do the
demand fusion in a very reliable and optimal way.

So, this patch exports these 2 functions which are very helpful for VSETVL pass.

gcc/ChangeLog:

	* lcm.cc (compute_antinout_edge): Export as global use.
	(compute_earliest): Ditto.
	(compute_rev_insert_delete): Ditto.
	* lcm.h (compute_antinout_edge): Ditto.
	(compute_earliest): Ditto.

---
 gcc/lcm.cc | 7 ++-----
 gcc/lcm.h  | 3 +++
 2 files changed, 5 insertions(+), 5 deletions(-)

Comments

Richard Biener Aug. 21, 2023, 7:09 a.m. UTC | #1
On Mon, 21 Aug 2023, Juzhe-Zhong wrote:

> This patch exports 'compute_antinout_edge' and 'compute_earliest' as global scope
> which is going to be used in VSETVL PASS of RISC-V backend.
> 
> The demand fusion is the fusion of VSETVL information to emit VSETVL which dominate and pre-config for most
> of the RVV instructions in order to elide redundant VSETVLs.
> 
> For exmaple:
> 
> for
>  for
>   for
>     if (cond}
>       VSETVL demand 1: SEW/LMUL = 16 and TU policy
>     else
>       VSETVL demand 2: SEW = 32
> 
> VSETVL pass should be able to fuse demand 1 and demand 2 into new demand: SEW = 32, LMUL = M2, TU policy.
> Then emit such VSETVL at the outmost of the for loop to get the most optimal codegen and run-time execution.
> 
> Currenty the VSETVL PASS Phase 3 (demand fusion) is really messy and un-reliable as well as un-maintainable.
> And, I recently read dragon book and morgan's book again, I found there "earliest" can allow us to do the
> demand fusion in a very reliable and optimal way.
> 
> So, this patch exports these 2 functions which are very helpful for VSETVL pass.

It would be nice to put these internal functions into a class or a
namespace given their non LCM name.  I don't see how you are going
to use these intermediate DF functions - they are just necessary
to compute pre_edge_lcm_avs which I see you already do.  Just to say
you are possibly going to blow up compile-time complexity of your
VSETVL dataflow problem?

> gcc/ChangeLog:
> 
> 	* lcm.cc (compute_antinout_edge): Export as global use.
> 	(compute_earliest): Ditto.
> 	(compute_rev_insert_delete): Ditto.
> 	* lcm.h (compute_antinout_edge): Ditto.
> 	(compute_earliest): Ditto.
> 
> ---
>  gcc/lcm.cc | 7 ++-----
>  gcc/lcm.h  | 3 +++
>  2 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/gcc/lcm.cc b/gcc/lcm.cc
> index 94a3ed43aea..03421e490e4 100644
> --- a/gcc/lcm.cc
> +++ b/gcc/lcm.cc
> @@ -56,9 +56,6 @@ along with GCC; see the file COPYING3.  If not see
>  #include "lcm.h"
>  
>  /* Edge based LCM routines.  */
> -static void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> -static void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> -			      sbitmap *, sbitmap *, sbitmap *);
>  static void compute_laterin (struct edge_list *, sbitmap *, sbitmap *,
>  			     sbitmap *, sbitmap *);
>  static void compute_insert_delete (struct edge_list *edge_list, sbitmap *,
> @@ -79,7 +76,7 @@ static void compute_rev_insert_delete (struct edge_list *edge_list, sbitmap *,
>     This is done based on the flow graph, and not on the pred-succ lists.
>     Other than that, its pretty much identical to compute_antinout.  */
>  
> -static void
> +void
>  compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
>  		       sbitmap *antout)
>  {
> @@ -170,7 +167,7 @@ compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
>  
>  /* Compute the earliest vector for edge based lcm.  */
>  
> -static void
> +void
>  compute_earliest (struct edge_list *edge_list, int n_exprs, sbitmap *antin,
>  		  sbitmap *antout, sbitmap *avout, sbitmap *kill,
>  		  sbitmap *earliest)
> diff --git a/gcc/lcm.h b/gcc/lcm.h
> index e08339352e0..7145d6fc46d 100644
> --- a/gcc/lcm.h
> +++ b/gcc/lcm.h
> @@ -31,4 +31,7 @@ extern struct edge_list *pre_edge_rev_lcm (int, sbitmap *,
>  					   sbitmap *, sbitmap *,
>  					   sbitmap *, sbitmap **,
>  					   sbitmap **);
> +extern void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> +extern void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> +			      sbitmap *, sbitmap *, sbitmap *);
>  #endif /* GCC_LCM_H */
>
juzhe.zhong@rivai.ai Aug. 21, 2023, 7:24 a.m. UTC | #2
Hi, Richi.

This patch is how I use LCM functions:
https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627953.html 

>> they are just necessary
>> to compute pre_edge_lcm_avs which I see you already do.  

In Phase 4 I use pre_edge_lcm_av to do PRE for the current VSETVL CFG information.
However, it's not enough since I need phase 3 fuse VSETVL information to get better codegen.

This is how I use the functions:

+      /* Compute global availability.  */
       compute_available (m_vector_manager->vector_comp,
 			 m_vector_manager->vector_kill,
 			 m_vector_manager->vector_avout,
 			 m_vector_manager->vector_avin);
-      changed_p |= cleanup_illegal_dirty_blocks ();
+      /* Compute global anticipatability.  */
+      compute_antinout_edge (m_vector_manager->vector_antic,
+			     m_vector_manager->vector_transp,
+			     m_vector_manager->vector_antin,
+			     m_vector_manager->vector_antout);
+      /* Compute earliestness.  */
+      compute_earliest (m_vector_manager->vector_edge_list,
+			m_vector_manager->vector_exprs.length (),
+			m_vector_manager->vector_antin,
+			m_vector_manager->vector_antout,
+			m_vector_manager->vector_avout,
+			m_vector_manager->vector_kill,
+			m_vector_manager->vector_earliest);
+      changed_p |= earliest_fusion ();
You can see I explicitly call 'compute_earliest' followed by 'earliest_fusion'. I need the result from 'compute_earliest' to do the VSETVL fusion; that's the information 'pre_edge_lcm_av' didn't give us.
>> Just to say
>> you are possibly going to blow up compile-time complexity of your
>> VSETVL dataflow problem?

No, I export 'compute_earliest' as global because 'pre_edge_lcm_av' didn't give us the 'earliest' result that I need.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-21 15:09
To: Juzhe-Zhong
CC: gcc-patches; jeffreyalaw
Subject: Re: [PATCH] LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend
On Mon, 21 Aug 2023, Juzhe-Zhong wrote:
 
> This patch exports 'compute_antinout_edge' and 'compute_earliest' as global scope
> which is going to be used in VSETVL PASS of RISC-V backend.
> 
> The demand fusion is the fusion of VSETVL information to emit VSETVL which dominate and pre-config for most
> of the RVV instructions in order to elide redundant VSETVLs.
> 
> For exmaple:
> 
> for
>  for
>   for
>     if (cond}
>       VSETVL demand 1: SEW/LMUL = 16 and TU policy
>     else
>       VSETVL demand 2: SEW = 32
> 
> VSETVL pass should be able to fuse demand 1 and demand 2 into new demand: SEW = 32, LMUL = M2, TU policy.
> Then emit such VSETVL at the outmost of the for loop to get the most optimal codegen and run-time execution.
> 
> Currenty the VSETVL PASS Phase 3 (demand fusion) is really messy and un-reliable as well as un-maintainable.
> And, I recently read dragon book and morgan's book again, I found there "earliest" can allow us to do the
> demand fusion in a very reliable and optimal way.
> 
> So, this patch exports these 2 functions which are very helpful for VSETVL pass.
 
It would be nice to put these internal functions into a class or a
namespace given their non LCM name.  I don't see how you are going
to use these intermediate DF functions - they are just necessary
to compute pre_edge_lcm_avs which I see you already do.  Just to say
you are possibly going to blow up compile-time complexity of your
VSETVL dataflow problem?
 
> gcc/ChangeLog:
> 
> * lcm.cc (compute_antinout_edge): Export as global use.
> (compute_earliest): Ditto.
> (compute_rev_insert_delete): Ditto.
> * lcm.h (compute_antinout_edge): Ditto.
> (compute_earliest): Ditto.
> 
> ---
>  gcc/lcm.cc | 7 ++-----
>  gcc/lcm.h  | 3 +++
>  2 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/gcc/lcm.cc b/gcc/lcm.cc
> index 94a3ed43aea..03421e490e4 100644
> --- a/gcc/lcm.cc
> +++ b/gcc/lcm.cc
> @@ -56,9 +56,6 @@ along with GCC; see the file COPYING3.  If not see
>  #include "lcm.h"
>  
>  /* Edge based LCM routines.  */
> -static void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> -static void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> -       sbitmap *, sbitmap *, sbitmap *);
>  static void compute_laterin (struct edge_list *, sbitmap *, sbitmap *,
>       sbitmap *, sbitmap *);
>  static void compute_insert_delete (struct edge_list *edge_list, sbitmap *,
> @@ -79,7 +76,7 @@ static void compute_rev_insert_delete (struct edge_list *edge_list, sbitmap *,
>     This is done based on the flow graph, and not on the pred-succ lists.
>     Other than that, its pretty much identical to compute_antinout.  */
>  
> -static void
> +void
>  compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
>         sbitmap *antout)
>  {
> @@ -170,7 +167,7 @@ compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
>  
>  /* Compute the earliest vector for edge based lcm.  */
>  
> -static void
> +void
>  compute_earliest (struct edge_list *edge_list, int n_exprs, sbitmap *antin,
>    sbitmap *antout, sbitmap *avout, sbitmap *kill,
>    sbitmap *earliest)
> diff --git a/gcc/lcm.h b/gcc/lcm.h
> index e08339352e0..7145d6fc46d 100644
> --- a/gcc/lcm.h
> +++ b/gcc/lcm.h
> @@ -31,4 +31,7 @@ extern struct edge_list *pre_edge_rev_lcm (int, sbitmap *,
>     sbitmap *, sbitmap *,
>     sbitmap *, sbitmap **,
>     sbitmap **);
> +extern void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> +extern void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> +       sbitmap *, sbitmap *, sbitmap *);
>  #endif /* GCC_LCM_H */
>
juzhe.zhong@rivai.ai Aug. 21, 2023, 7:46 a.m. UTC | #3
Hi. Richi.
I'd like to share more details that I want to do in VSETVL PASS.

Consider this following case:

for
  for 
    for
      ...
         for
	     VSETVL demand: RATIO = 32 and TU policy.

For this simple case, 'pre_edge_lcm_av' can perfectly work for us, will hoist "vsetvli e32,tu" to the outer-most loop.

However, for this case:
  for
  for 
    for
      ...
         for
	   if (...)
	     VSETVL 1 demand: RATIO = 32 and TU policy.
	   else if (...)
	     VSETVL 2 demand: SEW = 16.
	   else
	     VSETVL 3 demand: MU policy.

'pre_edge_lcm_av' is not sufficient to give us optimal codegen since VSETVL 1,  VSETVL 2 and VSETVL 3 are 3 different VSETVL demands
'pre_edge_lcm_av' can only hoist one of them. Such case I can easily produce by RVV intrinsic and they are already in our RVV testsuite.

To get the optimal codegen for this case, we need what I call "demand fusion", which fuses all "compatible" VSETVLs into a single VSETVL
and then emits it to avoid redundant VSETVLs.

In this case, we should be able to fuse VSETVL 1, VSETVL 2 and VSETVL 3 into a single new VSETVL demand: SEW = 16, LMUL = MF2, TU, MU.
Instead of giving 'pre_edge_lcm_av' 3 VSETVL demands (VSETVL 1/2/3), I give 'pre_edge_lcm_av' only a single new VSETVL demand.
Then, LCM PRE can hoist such fused VSETVL to the outer-most loop. So the program will be transformed as:

VSETVL SEW = 16, LMUL = MF2, TU, MU
  for
  for 
    for
      ...
         for
	   if (...) 
	     .....   no vsetvl insn.
	   else if (...)
	     ....  no vsetvl insn.

	   else
	     ....  no vsetvl insn.

So, how to do the demand fusion in this case? 
Before this patch and the following RISC-V refactor patch, I did it explicitly with my own decision algorithm.
Meaning I calculated at which location of the program doing the VSETVL fusion is correct and optimal.

However, I found that "compute_earliest" can do the job of calculating the location in the program at which to do the VSETVL fusion, and it
turns out to be a much more reliable and reasonable approach than mine.

So that's why I export those 2 functions for us to use in Phase 3 (Demand fusion) in the RISC-V backend VSETVL PASS.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-21 15:09
To: Juzhe-Zhong
CC: gcc-patches; jeffreyalaw
Subject: Re: [PATCH] LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend
On Mon, 21 Aug 2023, Juzhe-Zhong wrote:
 
> This patch exports 'compute_antinout_edge' and 'compute_earliest' as global scope
> which is going to be used in VSETVL PASS of RISC-V backend.
> 
> The demand fusion is the fusion of VSETVL information to emit VSETVL which dominate and pre-config for most
> of the RVV instructions in order to elide redundant VSETVLs.
> 
> For exmaple:
> 
> for
>  for
>   for
>     if (cond}
>       VSETVL demand 1: SEW/LMUL = 16 and TU policy
>     else
>       VSETVL demand 2: SEW = 32
> 
> VSETVL pass should be able to fuse demand 1 and demand 2 into new demand: SEW = 32, LMUL = M2, TU policy.
> Then emit such VSETVL at the outmost of the for loop to get the most optimal codegen and run-time execution.
> 
> Currenty the VSETVL PASS Phase 3 (demand fusion) is really messy and un-reliable as well as un-maintainable.
> And, I recently read dragon book and morgan's book again, I found there "earliest" can allow us to do the
> demand fusion in a very reliable and optimal way.
> 
> So, this patch exports these 2 functions which are very helpful for VSETVL pass.
 
It would be nice to put these internal functions into a class or a
namespace given their non LCM name.  I don't see how you are going
to use these intermediate DF functions - they are just necessary
to compute pre_edge_lcm_avs which I see you already do.  Just to say
you are possibly going to blow up compile-time complexity of your
VSETVL dataflow problem?
 
> gcc/ChangeLog:
> 
> * lcm.cc (compute_antinout_edge): Export as global use.
> (compute_earliest): Ditto.
> (compute_rev_insert_delete): Ditto.
> * lcm.h (compute_antinout_edge): Ditto.
> (compute_earliest): Ditto.
> 
> ---
>  gcc/lcm.cc | 7 ++-----
>  gcc/lcm.h  | 3 +++
>  2 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/gcc/lcm.cc b/gcc/lcm.cc
> index 94a3ed43aea..03421e490e4 100644
> --- a/gcc/lcm.cc
> +++ b/gcc/lcm.cc
> @@ -56,9 +56,6 @@ along with GCC; see the file COPYING3.  If not see
>  #include "lcm.h"
>  
>  /* Edge based LCM routines.  */
> -static void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> -static void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> -       sbitmap *, sbitmap *, sbitmap *);
>  static void compute_laterin (struct edge_list *, sbitmap *, sbitmap *,
>       sbitmap *, sbitmap *);
>  static void compute_insert_delete (struct edge_list *edge_list, sbitmap *,
> @@ -79,7 +76,7 @@ static void compute_rev_insert_delete (struct edge_list *edge_list, sbitmap *,
>     This is done based on the flow graph, and not on the pred-succ lists.
>     Other than that, its pretty much identical to compute_antinout.  */
>  
> -static void
> +void
>  compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
>         sbitmap *antout)
>  {
> @@ -170,7 +167,7 @@ compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
>  
>  /* Compute the earliest vector for edge based lcm.  */
>  
> -static void
> +void
>  compute_earliest (struct edge_list *edge_list, int n_exprs, sbitmap *antin,
>    sbitmap *antout, sbitmap *avout, sbitmap *kill,
>    sbitmap *earliest)
> diff --git a/gcc/lcm.h b/gcc/lcm.h
> index e08339352e0..7145d6fc46d 100644
> --- a/gcc/lcm.h
> +++ b/gcc/lcm.h
> @@ -31,4 +31,7 @@ extern struct edge_list *pre_edge_rev_lcm (int, sbitmap *,
>     sbitmap *, sbitmap *,
>     sbitmap *, sbitmap **,
>     sbitmap **);
> +extern void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> +extern void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> +       sbitmap *, sbitmap *, sbitmap *);
>  #endif /* GCC_LCM_H */
>
Richard Biener Aug. 21, 2023, 7:50 a.m. UTC | #4
On Mon, 21 Aug 2023, juzhe.zhong@rivai.ai wrote:

> Hi. Richi.
> I'd like to share more details that I want to do in VSETVL PASS.
> 
> Consider this following case:
> 
> for
>   for 
>     for
>       ...
>          for
> 	     VSETVL demand: RATIO = 32 and TU policy.
> 
> For this simple case, 'pre_edge_lcm_av' can perfectly work for us, will hoist "vsetvli e32,tu" to the outer-most loop.
> 
> However, for this case:
>   for
>   for 
>     for
>       ...
>          for
> 	   if (...)
> 	     VSETVL 1 demand: RATIO = 32 and TU policy.
> 	   else if (...)
> 	     VSETVL 2 demand: SEW = 16.
> 	   else
> 	     VSETVL 3 demand: MU policy.
> 
> 'pre_edge_lcm_av' is not sufficient to give us optimal codegen since VSETVL 1,  VSETVL 2 and VSETVL 3 are 3 different VSETVL demands
> 'pre_edge_lcm_av' can only hoist one of them. Such case I can easily produce by RVV intrinsic and they are already in our RVV testsuite.
> 
> To get the optimal codegen for this case,  We need I call it as "Demand fusion" which is fusing all "compatible" VSETVLs into a single VSETVL
> then set them to avoid redundant VSETVLs.
> 
> In this case, we should be able to fuse VSETVL 1, VSETVL 2 and VSETVL 3 into new VSETVL demand : SEW = 16, LMUL = MF2, TU, MU into a single 
> new VSETVL demand. Instead of giving 'pre_edge_lcm_av' 3 VSETVL demands (VSETVL 1/2/3). I give 'pre_edge_lcm_av' only single 1 new VSETVL demand.
> Then, LCM PRE can hoist such fused VSETVL to the outer-most loop. So the program will be transformed as:
> 
> VSETVL SEW = 16, LMUL = MF2, TU, MU
>   for
>   for 
>     for
>       ...
>          for
> 	   if (...) 
> 	     .....   no vsetvl insn.
> 	   else if (...)
> 	     ....  no vsetvl insn.
> 
> 	   else
> 	     ....  no vsetvl insn.
> 
> So, how to do the demand fusion in this case? 
> Before this patch and following RISC-V refactor patch, I do it explictly with my own decide algorithm.
> Meaning I calculate which location of the program to do the VSETVL fusion is correct and optimal.
> 
> However, I found "compute_earliest" can help us to do the job for calculating the location of the program to do VSETVL fusion and
> turns out it's a quite more reliable and reasonable approach than I do.
> 
> So that's why I export those 2 functions for us to be use in Phase 3 (Demand fusion) in RISC-V backend VSETVL PASS.

Thanks for the explanation, exporting the functions is OK.

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-21 15:09
> To: Juzhe-Zhong
> CC: gcc-patches; jeffreyalaw
> Subject: Re: [PATCH] LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend
> On Mon, 21 Aug 2023, Juzhe-Zhong wrote:
>  
> > This patch exports 'compute_antinout_edge' and 'compute_earliest' as global scope
> > which is going to be used in VSETVL PASS of RISC-V backend.
> > 
> > The demand fusion is the fusion of VSETVL information to emit VSETVL which dominate and pre-config for most
> > of the RVV instructions in order to elide redundant VSETVLs.
> > 
> > For exmaple:
> > 
> > for
> >  for
> >   for
> >     if (cond}
> >       VSETVL demand 1: SEW/LMUL = 16 and TU policy
> >     else
> >       VSETVL demand 2: SEW = 32
> > 
> > VSETVL pass should be able to fuse demand 1 and demand 2 into new demand: SEW = 32, LMUL = M2, TU policy.
> > Then emit such VSETVL at the outmost of the for loop to get the most optimal codegen and run-time execution.
> > 
> > Currenty the VSETVL PASS Phase 3 (demand fusion) is really messy and un-reliable as well as un-maintainable.
> > And, I recently read dragon book and morgan's book again, I found there "earliest" can allow us to do the
> > demand fusion in a very reliable and optimal way.
> > 
> > So, this patch exports these 2 functions which are very helpful for VSETVL pass.
>  
> It would be nice to put these internal functions into a class or a
> namespace given their non LCM name.  I don't see how you are going
> to use these intermediate DF functions - they are just necessary
> to compute pre_edge_lcm_avs which I see you already do.  Just to say
> you are possibly going to blow up compile-time complexity of your
> VSETVL dataflow problem?
>  
> > gcc/ChangeLog:
> > 
> > * lcm.cc (compute_antinout_edge): Export as global use.
> > (compute_earliest): Ditto.
> > (compute_rev_insert_delete): Ditto.
> > * lcm.h (compute_antinout_edge): Ditto.
> > (compute_earliest): Ditto.
> > 
> > ---
> >  gcc/lcm.cc | 7 ++-----
> >  gcc/lcm.h  | 3 +++
> >  2 files changed, 5 insertions(+), 5 deletions(-)
> > 
> > diff --git a/gcc/lcm.cc b/gcc/lcm.cc
> > index 94a3ed43aea..03421e490e4 100644
> > --- a/gcc/lcm.cc
> > +++ b/gcc/lcm.cc
> > @@ -56,9 +56,6 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "lcm.h"
> >  
> >  /* Edge based LCM routines.  */
> > -static void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> > -static void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> > -       sbitmap *, sbitmap *, sbitmap *);
> >  static void compute_laterin (struct edge_list *, sbitmap *, sbitmap *,
> >       sbitmap *, sbitmap *);
> >  static void compute_insert_delete (struct edge_list *edge_list, sbitmap *,
> > @@ -79,7 +76,7 @@ static void compute_rev_insert_delete (struct edge_list *edge_list, sbitmap *,
> >     This is done based on the flow graph, and not on the pred-succ lists.
> >     Other than that, its pretty much identical to compute_antinout.  */
> >  
> > -static void
> > +void
> >  compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
> >         sbitmap *antout)
> >  {
> > @@ -170,7 +167,7 @@ compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
> >  
> >  /* Compute the earliest vector for edge based lcm.  */
> >  
> > -static void
> > +void
> >  compute_earliest (struct edge_list *edge_list, int n_exprs, sbitmap *antin,
> >    sbitmap *antout, sbitmap *avout, sbitmap *kill,
> >    sbitmap *earliest)
> > diff --git a/gcc/lcm.h b/gcc/lcm.h
> > index e08339352e0..7145d6fc46d 100644
> > --- a/gcc/lcm.h
> > +++ b/gcc/lcm.h
> > @@ -31,4 +31,7 @@ extern struct edge_list *pre_edge_rev_lcm (int, sbitmap *,
> >     sbitmap *, sbitmap *,
> >     sbitmap *, sbitmap **,
> >     sbitmap **);
> > +extern void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> > +extern void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> > +       sbitmap *, sbitmap *, sbitmap *);
> >  #endif /* GCC_LCM_H */
> > 
>  
>
juzhe.zhong@rivai.ai Aug. 21, 2023, 8:06 a.m. UTC | #5
>> Thanks for the explanation, exporting the functions is OK.
Thanks.

>> It would be nice to put these internal functions into a class or a
>> namespace given their non LCM name.
Hi, Richi. I saw there is a function "extern void compute_available (sbitmap *, sbitmap *, sbitmap *, sbitmap *);"
which is already exported as global.

Do you mean I add those 2 functions (I export this patch) and "compute_avaialble" which has already been exported
into namespace lcm like this:

namespace lcm
{
  compute_available 
compute_antinout_edge
compute_earliest
}
?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-21 15:50
To: juzhe.zhong@rivai.ai
CC: gcc-patches; jeffreyalaw
Subject: Re: Re: [PATCH] LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend
On Mon, 21 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> Hi. Richi.
> I'd like to share more details that I want to do in VSETVL PASS.
> 
> Consider this following case:
> 
> for
>   for 
>     for
>       ...
>          for
>      VSETVL demand: RATIO = 32 and TU policy.
> 
> For this simple case, 'pre_edge_lcm_av' can perfectly work for us, will hoist "vsetvli e32,tu" to the outer-most loop.
> 
> However, for this case:
>   for
>   for 
>     for
>       ...
>          for
>    if (...)
>      VSETVL 1 demand: RATIO = 32 and TU policy.
>    else if (...)
>      VSETVL 2 demand: SEW = 16.
>    else
>      VSETVL 3 demand: MU policy.
> 
> 'pre_edge_lcm_av' is not sufficient to give us optimal codegen since VSETVL 1,  VSETVL 2 and VSETVL 3 are 3 different VSETVL demands
> 'pre_edge_lcm_av' can only hoist one of them. Such case I can easily produce by RVV intrinsic and they are already in our RVV testsuite.
> 
> To get the optimal codegen for this case,  We need I call it as "Demand fusion" which is fusing all "compatible" VSETVLs into a single VSETVL
> then set them to avoid redundant VSETVLs.
> 
> In this case, we should be able to fuse VSETVL 1, VSETVL 2 and VSETVL 3 into new VSETVL demand : SEW = 16, LMUL = MF2, TU, MU into a single 
> new VSETVL demand. Instead of giving 'pre_edge_lcm_av' 3 VSETVL demands (VSETVL 1/2/3). I give 'pre_edge_lcm_av' only single 1 new VSETVL demand.
> Then, LCM PRE can hoist such fused VSETVL to the outer-most loop. So the program will be transformed as:
> 
> VSETVL SEW = 16, LMUL = MF2, TU, MU
>   for
>   for 
>     for
>       ...
>          for
>    if (...) 
>      .....   no vsetvl insn.
>    else if (...)
>      ....  no vsetvl insn.
> 
>    else
>      ....  no vsetvl insn.
> 
> So, how to do the demand fusion in this case? 
> Before this patch and following RISC-V refactor patch, I do it explictly with my own decide algorithm.
> Meaning I calculate which location of the program to do the VSETVL fusion is correct and optimal.
> 
> However, I found "compute_earliest" can help us to do the job for calculating the location of the program to do VSETVL fusion and
> turns out it's a quite more reliable and reasonable approach than I do.
> 
> So that's why I export those 2 functions for us to be use in Phase 3 (Demand fusion) in RISC-V backend VSETVL PASS.
 
Thanks for the explanation, exporting the functions is OK.
 
Richard.
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-21 15:09
> To: Juzhe-Zhong
> CC: gcc-patches; jeffreyalaw
> Subject: Re: [PATCH] LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend
> On Mon, 21 Aug 2023, Juzhe-Zhong wrote:
>  
> > This patch exports 'compute_antinout_edge' and 'compute_earliest' as global scope
> > which is going to be used in VSETVL PASS of RISC-V backend.
> > 
> > The demand fusion is the fusion of VSETVL information to emit VSETVL which dominate and pre-config for most
> > of the RVV instructions in order to elide redundant VSETVLs.
> > 
> > For exmaple:
> > 
> > for
> >  for
> >   for
> >     if (cond}
> >       VSETVL demand 1: SEW/LMUL = 16 and TU policy
> >     else
> >       VSETVL demand 2: SEW = 32
> > 
> > VSETVL pass should be able to fuse demand 1 and demand 2 into new demand: SEW = 32, LMUL = M2, TU policy.
> > Then emit such VSETVL at the outmost of the for loop to get the most optimal codegen and run-time execution.
> > 
> > Currenty the VSETVL PASS Phase 3 (demand fusion) is really messy and un-reliable as well as un-maintainable.
> > And, I recently read dragon book and morgan's book again, I found there "earliest" can allow us to do the
> > demand fusion in a very reliable and optimal way.
> > 
> > So, this patch exports these 2 functions which are very helpful for VSETVL pass.
>  
> It would be nice to put these internal functions into a class or a
> namespace given their non LCM name.  I don't see how you are going
> to use these intermediate DF functions - they are just necessary
> to compute pre_edge_lcm_avs which I see you already do.  Just to say
> you are possibly going to blow up compile-time complexity of your
> VSETVL dataflow problem?
>  
> > gcc/ChangeLog:
> > 
> > * lcm.cc (compute_antinout_edge): Export as global use.
> > (compute_earliest): Ditto.
> > (compute_rev_insert_delete): Ditto.
> > * lcm.h (compute_antinout_edge): Ditto.
> > (compute_earliest): Ditto.
> > 
> > ---
> >  gcc/lcm.cc | 7 ++-----
> >  gcc/lcm.h  | 3 +++
> >  2 files changed, 5 insertions(+), 5 deletions(-)
> > 
> > diff --git a/gcc/lcm.cc b/gcc/lcm.cc
> > index 94a3ed43aea..03421e490e4 100644
> > --- a/gcc/lcm.cc
> > +++ b/gcc/lcm.cc
> > @@ -56,9 +56,6 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "lcm.h"
> >  
> >  /* Edge based LCM routines.  */
> > -static void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> > -static void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> > -       sbitmap *, sbitmap *, sbitmap *);
> >  static void compute_laterin (struct edge_list *, sbitmap *, sbitmap *,
> >       sbitmap *, sbitmap *);
> >  static void compute_insert_delete (struct edge_list *edge_list, sbitmap *,
> > @@ -79,7 +76,7 @@ static void compute_rev_insert_delete (struct edge_list *edge_list, sbitmap *,
> >     This is done based on the flow graph, and not on the pred-succ lists.
> >     Other than that, its pretty much identical to compute_antinout.  */
> >  
> > -static void
> > +void
> >  compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
> >         sbitmap *antout)
> >  {
> > @@ -170,7 +167,7 @@ compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
> >  
> >  /* Compute the earliest vector for edge based lcm.  */
> >  
> > -static void
> > +void
> >  compute_earliest (struct edge_list *edge_list, int n_exprs, sbitmap *antin,
> >    sbitmap *antout, sbitmap *avout, sbitmap *kill,
> >    sbitmap *earliest)
> > diff --git a/gcc/lcm.h b/gcc/lcm.h
> > index e08339352e0..7145d6fc46d 100644
> > --- a/gcc/lcm.h
> > +++ b/gcc/lcm.h
> > @@ -31,4 +31,7 @@ extern struct edge_list *pre_edge_rev_lcm (int, sbitmap *,
> >     sbitmap *, sbitmap *,
> >     sbitmap *, sbitmap **,
> >     sbitmap **);
> > +extern void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
> > +extern void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
> > +       sbitmap *, sbitmap *, sbitmap *);
> >  #endif /* GCC_LCM_H */
> > 
>  
>
Lehua Ding Aug. 21, 2023, 9:32 a.m. UTC | #6
Committed, thanks Richard.

On 2023/8/21 17:12, Richard Biener via Gcc-patches wrote:
> On Mon, 21 Aug 2023, juzhe.zhong@rivai.ai wrote:
> 
>> Hi, Richi.
>>
>> I found when I try this in lcm.h:
>>
>> namespace lcm {
>> void compute_available (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
>> void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
>> void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *, sbitmap *,
>>                         sbitmap *, sbitmap *);
>> } // namespace lcm
>>
>>
>> Then I need to add namespace lcm for these 3 functions in lcm.cc too.
>> However, they are not located in the same location. So I need to do this:
>>
>> namspace lcm {
>> compute_antinout_edge
>> compute_earliest
>> }
>> ...
>> namspace lcm {
>> compute_available
>> }
>>
>> I think it's a little bit ugly since some functions in lcm.cc belongs to LCM namespace, some are not.
>>
>> And we already have compute_available that has non LCM name.
>> May be this patch is better and OK?
> 
> The original patch is OK.
> 
> Richard.
> 
>> Thanks.
>>
>>
>> juzhe.zhong@rivai.ai
>>   
>> From: juzhe.zhong@rivai.ai
>> Date: 2023-08-21 16:06
>> To: rguenther
>> CC: gcc-patches; jeffreyalaw
>> Subject: Re: Re: [PATCH] LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend
>>>> Thanks for the explanation, exporting the functions is OK.
>> Thanks.
>>
>>>> It would be nice to put these internal functions into a class or a
>>>> namespace given their non LCM name.
>> Hi, Richi. I saw there is a function "extern void compute_available (sbitmap *, sbitmap *, sbitmap *, sbitmap *);"
>> which is already exported as global.
>>
>> Do you mean I add those 2 functions (I export this patch) and "compute_avaialble" which has already been exported
>> into namespace lcm like this:
>>
>> namespace lcm
>> {
>>    compute_available
>> compute_antinout_edge
>> compute_earliest
>> }
>> ?
>>
>> Thanks.
>>
>>
>> juzhe.zhong@rivai.ai
>>   
>> From: Richard Biener
>> Date: 2023-08-21 15:50
>> To: juzhe.zhong@rivai.ai
>> CC: gcc-patches; jeffreyalaw
>> Subject: Re: Re: [PATCH] LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend
>> On Mon, 21 Aug 2023, juzhe.zhong@rivai.ai wrote:
>>   
>>> Hi. Richi.
>>> I'd like to share more details that I want to do in VSETVL PASS.
>>>
>>> Consider this following case:
>>>
>>> for
>>>    for
>>>      for
>>>        ...
>>>           for
>>>       VSETVL demand: RATIO = 32 and TU policy.
>>>
>>> For this simple case, 'pre_edge_lcm_av' can perfectly work for us, will hoist "vsetvli e32,tu" to the outer-most loop.
>>>
>>> However, for this case:
>>>    for
>>>    for
>>>      for
>>>        ...
>>>           for
>>>     if (...)
>>>       VSETVL 1 demand: RATIO = 32 and TU policy.
>>>     else if (...)
>>>       VSETVL 2 demand: SEW = 16.
>>>     else
>>>       VSETVL 3 demand: MU policy.
>>>
>>> 'pre_edge_lcm_av' is not sufficient to give us optimal codegen since VSETVL 1,  VSETVL 2 and VSETVL 3 are 3 different VSETVL demands
>>> 'pre_edge_lcm_av' can only hoist one of them. Such case I can easily produce by RVV intrinsic and they are already in our RVV testsuite.
>>>
>>> To get the optimal codegen for this case,  We need I call it as "Demand fusion" which is fusing all "compatible" VSETVLs into a single VSETVL
>>> then set them to avoid redundant VSETVLs.
>>>
>>> In this case, we should be able to fuse VSETVL 1, VSETVL 2 and VSETVL 3 into new VSETVL demand : SEW = 16, LMUL = MF2, TU, MU into a single
>>> new VSETVL demand. Instead of giving 'pre_edge_lcm_av' 3 VSETVL demands (VSETVL 1/2/3). I give 'pre_edge_lcm_av' only single 1 new VSETVL demand.
>>> Then, LCM PRE can hoist such fused VSETVL to the outer-most loop. So the program will be transformed as:
>>>
>>> VSETVL SEW = 16, LMUL = MF2, TU, MU
>>>    for
>>>    for
>>>      for
>>>        ...
>>>           for
>>>     if (...)
>>>       .....   no vsetvl insn.
>>>     else if (...)
>>>       ....  no vsetvl insn.
>>>
>>>     else
>>>       ....  no vsetvl insn.
>>>
>>> So, how to do the demand fusion in this case?
>>> Before this patch and the following RISC-V refactor patch, I do it explicitly with my own decision algorithm.
>>> Meaning I calculate which location of the program to do the VSETVL fusion is correct and optimal.
>>>
>>> However, I found "compute_earliest" can help us to do the job for calculating the location of the program to do VSETVL fusion and
>>> it turns out to be a much more reliable and reasonable approach than mine.
>>>
>>> So that's why I export those 2 functions for us to be use in Phase 3 (Demand fusion) in RISC-V backend VSETVL PASS.
>>   
>> Thanks for the explanation, exporting the functions is OK.
>>   
>> Richard.
>>   
>>> Thanks.
>>>
>>>
>>> juzhe.zhong@rivai.ai
>>>   
>>> From: Richard Biener
>>> Date: 2023-08-21 15:09
>>> To: Juzhe-Zhong
>>> CC: gcc-patches; jeffreyalaw
>>> Subject: Re: [PATCH] LCM: Export 2 helpful functions as global for VSETVL PASS use in RISC-V backend
>>> On Mon, 21 Aug 2023, Juzhe-Zhong wrote:
>>>   
>>>> This patch exports 'compute_antinout_edge' and 'compute_earliest' as global scope
>>>> which is going to be used in VSETVL PASS of RISC-V backend.
>>>>
>>>> The demand fusion is the fusion of VSETVL information to emit VSETVL which dominate and pre-config for most
>>>> of the RVV instructions in order to elide redundant VSETVLs.
>>>>
>>>> For example:
>>>>
>>>> for
>>>>   for
>>>>    for
>>>>      if (cond)
>>>>        VSETVL demand 1: SEW/LMUL = 16 and TU policy
>>>>      else
>>>>        VSETVL demand 2: SEW = 32
>>>>
>>>> VSETVL pass should be able to fuse demand 1 and demand 2 into new demand: SEW = 32, LMUL = M2, TU policy.
>>>> Then emit such VSETVL at the outermost level of the for loop to get the most optimal codegen and run-time execution.
>>>>
>>>> Currently the VSETVL PASS Phase 3 (demand fusion) is really messy and unreliable as well as unmaintainable.
>>>> And, I recently read dragon book and morgan's book again, I found there "earliest" can allow us to do the
>>>> demand fusion in a very reliable and optimal way.
>>>>
>>>> So, this patch exports these 2 functions which are very helpful for VSETVL pass.
>>>   
>>> It would be nice to put these internal functions into a class or a
>>> namespace given their non LCM name.  I don't see how you are going
>>> to use these intermediate DF functions - they are just necessary
>>> to compute pre_edge_lcm_avs which I see you already do.  Just to say
>>> you are possibly going to blow up compile-time complexity of your
>>> VSETVL dataflow problem?
>>>   
>>>> gcc/ChangeLog:
>>>>
>>>> * lcm.cc (compute_antinout_edge): Export as global use.
>>>> (compute_earliest): Ditto.
>>>> (compute_rev_insert_delete): Ditto.
>>>> * lcm.h (compute_antinout_edge): Ditto.
>>>> (compute_earliest): Ditto.
>>>>
>>>> ---
>>>>   gcc/lcm.cc | 7 ++-----
>>>>   gcc/lcm.h  | 3 +++
>>>>   2 files changed, 5 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/gcc/lcm.cc b/gcc/lcm.cc
>>>> index 94a3ed43aea..03421e490e4 100644
>>>> --- a/gcc/lcm.cc
>>>> +++ b/gcc/lcm.cc
>>>> @@ -56,9 +56,6 @@ along with GCC; see the file COPYING3.  If not see
>>>>   #include "lcm.h"
>>>>   
>>>>   /* Edge based LCM routines.  */
>>>> -static void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
>>>> -static void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
>>>> -       sbitmap *, sbitmap *, sbitmap *);
>>>>   static void compute_laterin (struct edge_list *, sbitmap *, sbitmap *,
>>>>        sbitmap *, sbitmap *);
>>>>   static void compute_insert_delete (struct edge_list *edge_list, sbitmap *,
>>>> @@ -79,7 +76,7 @@ static void compute_rev_insert_delete (struct edge_list *edge_list, sbitmap *,
>>>>      This is done based on the flow graph, and not on the pred-succ lists.
>>>>      Other than that, its pretty much identical to compute_antinout.  */
>>>>   
>>>> -static void
>>>> +void
>>>>   compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
>>>>          sbitmap *antout)
>>>>   {
>>>> @@ -170,7 +167,7 @@ compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
>>>>   
>>>>   /* Compute the earliest vector for edge based lcm.  */
>>>>   
>>>> -static void
>>>> +void
>>>>   compute_earliest (struct edge_list *edge_list, int n_exprs, sbitmap *antin,
>>>>     sbitmap *antout, sbitmap *avout, sbitmap *kill,
>>>>     sbitmap *earliest)
>>>> diff --git a/gcc/lcm.h b/gcc/lcm.h
>>>> index e08339352e0..7145d6fc46d 100644
>>>> --- a/gcc/lcm.h
>>>> +++ b/gcc/lcm.h
>>>> @@ -31,4 +31,7 @@ extern struct edge_list *pre_edge_rev_lcm (int, sbitmap *,
>>>>      sbitmap *, sbitmap *,
>>>>      sbitmap *, sbitmap **,
>>>>      sbitmap **);
>>>> +extern void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
>>>> +extern void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
>>>> +       sbitmap *, sbitmap *, sbitmap *);
>>>>   #endif /* GCC_LCM_H */
>>>>
>>>   
>>>
>>   
>>
>
diff mbox series

Patch

diff --git a/gcc/lcm.cc b/gcc/lcm.cc
index 94a3ed43aea..03421e490e4 100644
--- a/gcc/lcm.cc
+++ b/gcc/lcm.cc
@@ -56,9 +56,6 @@  along with GCC; see the file COPYING3.  If not see
 #include "lcm.h"
 
 /* Edge based LCM routines.  */
-static void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
-static void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
-			      sbitmap *, sbitmap *, sbitmap *);
 static void compute_laterin (struct edge_list *, sbitmap *, sbitmap *,
 			     sbitmap *, sbitmap *);
 static void compute_insert_delete (struct edge_list *edge_list, sbitmap *,
@@ -79,7 +76,7 @@  static void compute_rev_insert_delete (struct edge_list *edge_list, sbitmap *,
    This is done based on the flow graph, and not on the pred-succ lists.
    Other than that, its pretty much identical to compute_antinout.  */
 
-static void
+void
 compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
 		       sbitmap *antout)
 {
@@ -170,7 +167,7 @@  compute_antinout_edge (sbitmap *antloc, sbitmap *transp, sbitmap *antin,
 
 /* Compute the earliest vector for edge based lcm.  */
 
-static void
+void
 compute_earliest (struct edge_list *edge_list, int n_exprs, sbitmap *antin,
 		  sbitmap *antout, sbitmap *avout, sbitmap *kill,
 		  sbitmap *earliest)
diff --git a/gcc/lcm.h b/gcc/lcm.h
index e08339352e0..7145d6fc46d 100644
--- a/gcc/lcm.h
+++ b/gcc/lcm.h
@@ -31,4 +31,7 @@  extern struct edge_list *pre_edge_rev_lcm (int, sbitmap *,
 					   sbitmap *, sbitmap *,
 					   sbitmap *, sbitmap **,
 					   sbitmap **);
+extern void compute_antinout_edge (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
+extern void compute_earliest (struct edge_list *, int, sbitmap *, sbitmap *,
+			      sbitmap *, sbitmap *, sbitmap *);
 #endif /* GCC_LCM_H */