Message ID | 000601ceb44f$a6db2980$f4917c80$@arm.com |
---|---|
State | New |
Headers | show |
On 18/09/13 10:15, bin.cheng wrote: > > >> -----Original Message----- >> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >> owner@gcc.gnu.org] On Behalf Of bin.cheng >> Sent: Monday, September 02, 2013 3:09 PM >> To: Richard Earnshaw >> Cc: gcc-patches@gcc.gnu.org >> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >> >> >> >>> -----Original Message----- >>> From: Richard Earnshaw >>> Sent: Thursday, August 29, 2013 9:06 PM >>> To: Bin Cheng >>> Cc: gcc-patches@gcc.gnu.org >>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>> >>> On 28/08/13 08:00, bin.cheng wrote: >>>> Hi, >>>> >>>> This patch refines scaled address expression on ARM. It supports >>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>> to legitimize "base + index * scale + offset" with "reg <- base + >>>> offset; reg >>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>> + function >>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>> mentioned transformation by calling to try_multiplier_address. >>>> Hoping we can improve it in the future. >>>> >>>> With this patch: >>>> 1) "base+index*scale" is recognized. >>> >>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form. >>> So this shouldn't be necessary. Can you identify where this >> non-canoncial form is being generated? >>> >> >> Oh, for now ivopt constructs "index*scale" to test whether backend >> supports scaled addressing mode, which is not valid on ARM, so I was going >> to construct "base + index*scale" instead. Since "base + index * scale" > is not >> canonical form, I will construct the canonical form and drop this part of > the >> patch. >> >> Is rest of this patch OK? >> > Hi Richard, I removed the part over which you concerned and created this > updated patch. > > Is it OK? > > Thanks. > bin > > 2013-09-18 Bin Cheng <bin.cheng@arm.com> > > * config/arm/arm.c (try_multiplier_address): New function. 
> (thumb2_legitimize_address): New function. > (arm_legitimize_address): Call try_multiplier_address and > thumb2_legitimize_address. > > > 6-arm-scaled_address-20130918.txt > > > Index: gcc/config/arm/arm.c > =================================================================== > --- gcc/config/arm/arm.c (revision 200774) > +++ gcc/config/arm/arm.c (working copy) > @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) > } > } > > +/* Try to find address expression like base + index * scale + offset > + in X. If we find one, force base + offset into register and > + construct new expression reg + index * scale; return the new > + address expression if it's valid. Otherwise return X. */ > +static rtx > +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) > +{ > + rtx tmp, base_reg, new_rtx; > + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; > + > + gcc_assert (GET_CODE (x) == PLUS); > + > + /* Try to find and record base/index/scale/offset in X. */ > + if (GET_CODE (XEXP (x, 1)) == MULT) > + { > + tmp = XEXP (x, 0); > + index = XEXP (XEXP (x, 1), 0); > + scale = XEXP (XEXP (x, 1), 1); > + if (GET_CODE (tmp) != PLUS) > + return x; > + > + base = XEXP (tmp, 0); > + offset = XEXP (tmp, 1); > + } > + else > + { > + tmp = XEXP (x, 0); > + offset = XEXP (x, 1); > + if (GET_CODE (tmp) != PLUS) > + return x; > + > + base = XEXP (tmp, 0); > + scale = XEXP (tmp, 1); > + if (GET_CODE (base) == MULT) > + { > + tmp = base; > + base = scale; > + scale = tmp; > + } > + if (GET_CODE (scale) != MULT) > + return x; > + > + index = XEXP (scale, 0); > + scale = XEXP (scale, 1); > + } > + > + if (CONST_INT_P (base)) > + { > + tmp = base; > + base = offset; > + offset = tmp; > + } > + > + if (CONST_INT_P (index)) > + { > + tmp = index; > + index = scale; > + scale = tmp; > + } > + > + /* ARM only supports constant scale in address. 
*/ > + if (!CONST_INT_P (scale)) > + return x; > + > + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) > + return x; > + > + /* Only register/constant are allowed in each part. */ > + if (!symbol_mentioned_p (base) > + && !symbol_mentioned_p (offset) > + && !symbol_mentioned_p (index) > + && !symbol_mentioned_p (scale)) > + { It would be easier to do this at the top of the function -- if (symbol_mentioned_p (x)) return x; > + /* Force "base+offset" into register and construct > + "register+index*scale". Return the new expression > + only if it's valid. */ > + tmp = gen_rtx_PLUS (SImode, base, offset); > + base_reg = force_reg (SImode, tmp); > + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); > + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); > + return new_rtx; I can't help thinking that this is backwards. That is, you want to split out the mult expression and use offset addressing in the addresses itself. That's likely to lead to either better CSE, or more induction vars. Furthermore, scaled addressing modes are likely to be more expensive than simple offset addressing modes on at least some cores. R.
On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote: > On 18/09/13 10:15, bin.cheng wrote: >> >> >>> -----Original Message----- >>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>> Sent: Monday, September 02, 2013 3:09 PM >>> To: Richard Earnshaw >>> Cc: gcc-patches@gcc.gnu.org >>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>> >>> >>> >>>> -----Original Message----- >>>> From: Richard Earnshaw >>>> Sent: Thursday, August 29, 2013 9:06 PM >>>> To: Bin Cheng >>>> Cc: gcc-patches@gcc.gnu.org >>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>> >>>> On 28/08/13 08:00, bin.cheng wrote: >>>>> Hi, >>>>> >>>>> This patch refines scaled address expression on ARM. It supports >>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>> to legitimize "base + index * scale + offset" with "reg <- base + >>>>> offset; reg >>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>> + function >>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>> mentioned transformation by calling to try_multiplier_address. >>>>> Hoping we can improve it in the future. >>>>> >>>>> With this patch: >>>>> 1) "base+index*scale" is recognized. >>>> >>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form. >>>> So this shouldn't be necessary. Can you identify where this >>> non-canoncial form is being generated? >>>> >>> >>> Oh, for now ivopt constructs "index*scale" to test whether backend >>> supports scaled addressing mode, which is not valid on ARM, so I was going >>> to construct "base + index*scale" instead. Since "base + index * scale" >> is not >>> canonical form, I will construct the canonical form and drop this part of >> the >>> patch. >>> >>> Is rest of this patch OK? >>> >> Hi Richard, I removed the part over which you concerned and created this >> updated patch. >> >> Is it OK? 
>> >> Thanks. >> bin >> >> 2013-09-18 Bin Cheng <bin.cheng@arm.com> >> >> * config/arm/arm.c (try_multiplier_address): New function. >> (thumb2_legitimize_address): New function. >> (arm_legitimize_address): Call try_multiplier_address and >> thumb2_legitimize_address. >> >> >> 6-arm-scaled_address-20130918.txt >> >> >> Index: gcc/config/arm/arm.c >> =================================================================== >> --- gcc/config/arm/arm.c (revision 200774) >> +++ gcc/config/arm/arm.c (working copy) >> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >> } >> } >> >> +/* Try to find address expression like base + index * scale + offset >> + in X. If we find one, force base + offset into register and >> + construct new expression reg + index * scale; return the new >> + address expression if it's valid. Otherwise return X. */ >> +static rtx >> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) >> +{ >> + rtx tmp, base_reg, new_rtx; >> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; >> + >> + gcc_assert (GET_CODE (x) == PLUS); >> + >> + /* Try to find and record base/index/scale/offset in X. 
*/ >> + if (GET_CODE (XEXP (x, 1)) == MULT) >> + { >> + tmp = XEXP (x, 0); >> + index = XEXP (XEXP (x, 1), 0); >> + scale = XEXP (XEXP (x, 1), 1); >> + if (GET_CODE (tmp) != PLUS) >> + return x; >> + >> + base = XEXP (tmp, 0); >> + offset = XEXP (tmp, 1); >> + } >> + else >> + { >> + tmp = XEXP (x, 0); >> + offset = XEXP (x, 1); >> + if (GET_CODE (tmp) != PLUS) >> + return x; >> + >> + base = XEXP (tmp, 0); >> + scale = XEXP (tmp, 1); >> + if (GET_CODE (base) == MULT) >> + { >> + tmp = base; >> + base = scale; >> + scale = tmp; >> + } >> + if (GET_CODE (scale) != MULT) >> + return x; >> + >> + index = XEXP (scale, 0); >> + scale = XEXP (scale, 1); >> + } >> + >> + if (CONST_INT_P (base)) >> + { >> + tmp = base; >> + base = offset; >> + offset = tmp; >> + } >> + >> + if (CONST_INT_P (index)) >> + { >> + tmp = index; >> + index = scale; >> + scale = tmp; >> + } >> + >> + /* ARM only supports constant scale in address. */ >> + if (!CONST_INT_P (scale)) >> + return x; >> + >> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >> + return x; >> + >> + /* Only register/constant are allowed in each part. */ >> + if (!symbol_mentioned_p (base) >> + && !symbol_mentioned_p (offset) >> + && !symbol_mentioned_p (index) >> + && !symbol_mentioned_p (scale)) >> + { > > It would be easier to do this at the top of the function -- > if (symbol_mentioned_p (x)) > return x; > > >> + /* Force "base+offset" into register and construct >> + "register+index*scale". Return the new expression >> + only if it's valid. */ >> + tmp = gen_rtx_PLUS (SImode, base, offset); >> + base_reg = force_reg (SImode, tmp); >> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >> + return new_rtx; > > I can't help thinking that this is backwards. That is, you want to > split out the mult expression and use offset addressing in the addresses > itself. That's likely to lead to either better CSE, or more induction Thanks to your review. 
Actually base+offset is more likely loop invariant than scaled expression, just as reported in pr57540. The bug is observed in spec2k bzip/gzip, and hurts arm in hot loops. The loop induction variable doesn't matter that much compared to the invariant because we are in RTL now. > vars. Furthermore, scaled addressing modes are likely to be more > expensive than simple offset addressing modes on at least some cores. The purpose is to CSE (as you pointed out) or hoist loop invariant. As far as addressing mode is concerned, though we may guide the transformation once we have a reliable address cost model, we don't have the information on whether base+offset is invariant, so it's difficult to handle in the backend, right? What do you think about this? Thanks, bin
On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng <amker.cheng@gmail.com> wrote: > On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote: >> On 18/09/13 10:15, bin.cheng wrote: >>> >>> >>>> -----Original Message----- >>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>> Sent: Monday, September 02, 2013 3:09 PM >>>> To: Richard Earnshaw >>>> Cc: gcc-patches@gcc.gnu.org >>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>> >>>> >>>> >>>>> -----Original Message----- >>>>> From: Richard Earnshaw >>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>> To: Bin Cheng >>>>> Cc: gcc-patches@gcc.gnu.org >>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>> >>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>> Hi, >>>>>> >>>>>> This patch refines scaled address expression on ARM. It supports >>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>>> to legitimize "base + index * scale + offset" with "reg <- base + >>>>>> offset; reg >>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>> + function >>>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>> Hoping we can improve it in the future. >>>>>> >>>>>> With this patch: >>>>>> 1) "base+index*scale" is recognized. >>>>> >>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form. >>>>> So this shouldn't be necessary. Can you identify where this >>>> non-canoncial form is being generated? >>>>> >>>> >>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>> supports scaled addressing mode, which is not valid on ARM, so I was going >>>> to construct "base + index*scale" instead. Since "base + index * scale" >>> is not >>>> canonical form, I will construct the canonical form and drop this part of >>> the >>>> patch. 
>>>> >>>> Is rest of this patch OK? >>>> >>> Hi Richard, I removed the part over which you concerned and created this >>> updated patch. >>> >>> Is it OK? >>> >>> Thanks. >>> bin >>> >>> 2013-09-18 Bin Cheng <bin.cheng@arm.com> >>> >>> * config/arm/arm.c (try_multiplier_address): New function. >>> (thumb2_legitimize_address): New function. >>> (arm_legitimize_address): Call try_multiplier_address and >>> thumb2_legitimize_address. >>> >>> >>> 6-arm-scaled_address-20130918.txt >>> >>> >>> Index: gcc/config/arm/arm.c >>> =================================================================== >>> --- gcc/config/arm/arm.c (revision 200774) >>> +++ gcc/config/arm/arm.c (working copy) >>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>> } >>> } >>> >>> +/* Try to find address expression like base + index * scale + offset >>> + in X. If we find one, force base + offset into register and >>> + construct new expression reg + index * scale; return the new >>> + address expression if it's valid. Otherwise return X. */ >>> +static rtx >>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) >>> +{ >>> + rtx tmp, base_reg, new_rtx; >>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; >>> + >>> + gcc_assert (GET_CODE (x) == PLUS); >>> + >>> + /* Try to find and record base/index/scale/offset in X. 
*/ >>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>> + { >>> + tmp = XEXP (x, 0); >>> + index = XEXP (XEXP (x, 1), 0); >>> + scale = XEXP (XEXP (x, 1), 1); >>> + if (GET_CODE (tmp) != PLUS) >>> + return x; >>> + >>> + base = XEXP (tmp, 0); >>> + offset = XEXP (tmp, 1); >>> + } >>> + else >>> + { >>> + tmp = XEXP (x, 0); >>> + offset = XEXP (x, 1); >>> + if (GET_CODE (tmp) != PLUS) >>> + return x; >>> + >>> + base = XEXP (tmp, 0); >>> + scale = XEXP (tmp, 1); >>> + if (GET_CODE (base) == MULT) >>> + { >>> + tmp = base; >>> + base = scale; >>> + scale = tmp; >>> + } >>> + if (GET_CODE (scale) != MULT) >>> + return x; >>> + >>> + index = XEXP (scale, 0); >>> + scale = XEXP (scale, 1); >>> + } >>> + >>> + if (CONST_INT_P (base)) >>> + { >>> + tmp = base; >>> + base = offset; >>> + offset = tmp; >>> + } >>> + >>> + if (CONST_INT_P (index)) >>> + { >>> + tmp = index; >>> + index = scale; >>> + scale = tmp; >>> + } >>> + >>> + /* ARM only supports constant scale in address. */ >>> + if (!CONST_INT_P (scale)) >>> + return x; >>> + >>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>> + return x; >>> + >>> + /* Only register/constant are allowed in each part. */ >>> + if (!symbol_mentioned_p (base) >>> + && !symbol_mentioned_p (offset) >>> + && !symbol_mentioned_p (index) >>> + && !symbol_mentioned_p (scale)) >>> + { >> >> It would be easier to do this at the top of the function -- >> if (symbol_mentioned_p (x)) >> return x; >> >> >>> + /* Force "base+offset" into register and construct >>> + "register+index*scale". Return the new expression >>> + only if it's valid. */ >>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>> + base_reg = force_reg (SImode, tmp); >>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>> + return new_rtx; >> >> I can't help thinking that this is backwards. That is, you want to >> split out the mult expression and use offset addressing in the addresses >> itself. 
That's likely to lead to either better CSE, or more induction > Thanks to your review. > > Actually base+offset is more likely loop invariant than scaled > expression, just as reported in pr57540. The bug is observed in > spec2k bzip/gzip, and hurts arm in hot loops. The loop induction > variable doesn't matter that much comparing to invariant because we > are in RTL now. > >> vars. Furthermore, scaled addressing modes are likely to be more >> expensive than simple offset addressing modes on at least some cores. > The purpose is to CSE (as you pointed out) or hoist loop invariant. > As for addressing mode is concerned, Though we may guide the > transformation once we have reliable address cost mode, we don't have > the information if base+offset is invariant, so it's difficult to > handle in backend, right? > > What do you think about this? > Additionally, there is no induction variable issue here. The memory reference we are facing during expand are not lowered by gimple IVOPT, which means either it's outside loop, or doesn't have induction variable address. Generally, there are three passes which lowers memory reference: gimple strength reduction picks opportunity outside loop; gimple IVOPT handles references with induction variable addresses; references not handled by SLSR/IVOPT are handled by RTL expand, which makes bad choice of address re-association. I think Yufeng also noticed the problem and are trying with patch like: http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html After thinking twice, I some kind of think we should not re-associate addresses during expanding, because of lacking of context information. Take base + scaled_index + offset as an example in PR57540, we just don't know if "base+offset" is loop invariant from either backend or RTL expander. Any comments? Thanks, bin
On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng <amker.cheng@gmail.com> wrote: > On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng <amker.cheng@gmail.com> wrote: >> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote: >>> On 18/09/13 10:15, bin.cheng wrote: >>>> >>>> >>>>> -----Original Message----- >>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>>> Sent: Monday, September 02, 2013 3:09 PM >>>>> To: Richard Earnshaw >>>>> Cc: gcc-patches@gcc.gnu.org >>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>>> >>>>> >>>>> >>>>>> -----Original Message----- >>>>>> From: Richard Earnshaw >>>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>>> To: Bin Cheng >>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>>> >>>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>>> Hi, >>>>>>> >>>>>>> This patch refines scaled address expression on ARM. It supports >>>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>>>> to legitimize "base + index * scale + offset" with "reg <- base + >>>>>>> offset; reg >>>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>>> + function >>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>>> Hoping we can improve it in the future. >>>>>>> >>>>>>> With this patch: >>>>>>> 1) "base+index*scale" is recognized. >>>>>> >>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form. >>>>>> So this shouldn't be necessary. Can you identify where this >>>>> non-canoncial form is being generated? >>>>>> >>>>> >>>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>>> supports scaled addressing mode, which is not valid on ARM, so I was going >>>>> to construct "base + index*scale" instead. 
Since "base + index * scale" >>>> is not >>>>> canonical form, I will construct the canonical form and drop this part of >>>> the >>>>> patch. >>>>> >>>>> Is rest of this patch OK? >>>>> >>>> Hi Richard, I removed the part over which you concerned and created this >>>> updated patch. >>>> >>>> Is it OK? >>>> >>>> Thanks. >>>> bin >>>> >>>> 2013-09-18 Bin Cheng <bin.cheng@arm.com> >>>> >>>> * config/arm/arm.c (try_multiplier_address): New function. >>>> (thumb2_legitimize_address): New function. >>>> (arm_legitimize_address): Call try_multiplier_address and >>>> thumb2_legitimize_address. >>>> >>>> >>>> 6-arm-scaled_address-20130918.txt >>>> >>>> >>>> Index: gcc/config/arm/arm.c >>>> =================================================================== >>>> --- gcc/config/arm/arm.c (revision 200774) >>>> +++ gcc/config/arm/arm.c (working copy) >>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>>> } >>>> } >>>> >>>> +/* Try to find address expression like base + index * scale + offset >>>> + in X. If we find one, force base + offset into register and >>>> + construct new expression reg + index * scale; return the new >>>> + address expression if it's valid. Otherwise return X. */ >>>> +static rtx >>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) >>>> +{ >>>> + rtx tmp, base_reg, new_rtx; >>>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; >>>> + >>>> + gcc_assert (GET_CODE (x) == PLUS); >>>> + >>>> + /* Try to find and record base/index/scale/offset in X. 
*/ >>>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>>> + { >>>> + tmp = XEXP (x, 0); >>>> + index = XEXP (XEXP (x, 1), 0); >>>> + scale = XEXP (XEXP (x, 1), 1); >>>> + if (GET_CODE (tmp) != PLUS) >>>> + return x; >>>> + >>>> + base = XEXP (tmp, 0); >>>> + offset = XEXP (tmp, 1); >>>> + } >>>> + else >>>> + { >>>> + tmp = XEXP (x, 0); >>>> + offset = XEXP (x, 1); >>>> + if (GET_CODE (tmp) != PLUS) >>>> + return x; >>>> + >>>> + base = XEXP (tmp, 0); >>>> + scale = XEXP (tmp, 1); >>>> + if (GET_CODE (base) == MULT) >>>> + { >>>> + tmp = base; >>>> + base = scale; >>>> + scale = tmp; >>>> + } >>>> + if (GET_CODE (scale) != MULT) >>>> + return x; >>>> + >>>> + index = XEXP (scale, 0); >>>> + scale = XEXP (scale, 1); >>>> + } >>>> + >>>> + if (CONST_INT_P (base)) >>>> + { >>>> + tmp = base; >>>> + base = offset; >>>> + offset = tmp; >>>> + } >>>> + >>>> + if (CONST_INT_P (index)) >>>> + { >>>> + tmp = index; >>>> + index = scale; >>>> + scale = tmp; >>>> + } >>>> + >>>> + /* ARM only supports constant scale in address. */ >>>> + if (!CONST_INT_P (scale)) >>>> + return x; >>>> + >>>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>>> + return x; >>>> + >>>> + /* Only register/constant are allowed in each part. */ >>>> + if (!symbol_mentioned_p (base) >>>> + && !symbol_mentioned_p (offset) >>>> + && !symbol_mentioned_p (index) >>>> + && !symbol_mentioned_p (scale)) >>>> + { >>> >>> It would be easier to do this at the top of the function -- >>> if (symbol_mentioned_p (x)) >>> return x; >>> >>> >>>> + /* Force "base+offset" into register and construct >>>> + "register+index*scale". Return the new expression >>>> + only if it's valid. */ >>>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>>> + base_reg = force_reg (SImode, tmp); >>>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>>> + return new_rtx; >>> >>> I can't help thinking that this is backwards. 
That is, you want to >>> split out the mult expression and use offset addressing in the addresses >>> itself. That's likely to lead to either better CSE, or more induction >> Thanks to your review. >> >> Actually base+offset is more likely loop invariant than scaled >> expression, just as reported in pr57540. The bug is observed in >> spec2k bzip/gzip, and hurts arm in hot loops. The loop induction >> variable doesn't matter that much comparing to invariant because we >> are in RTL now. >> >>> vars. Furthermore, scaled addressing modes are likely to be more >>> expensive than simple offset addressing modes on at least some cores. >> The purpose is to CSE (as you pointed out) or hoist loop invariant. >> As for addressing mode is concerned, Though we may guide the >> transformation once we have reliable address cost mode, we don't have >> the information if base+offset is invariant, so it's difficult to >> handle in backend, right? >> >> What do you think about this? >> > > Additionally, there is no induction variable issue here. The memory > reference we are facing during expand are not lowered by gimple IVOPT, > which means either it's outside loop, or doesn't have induction > variable address. > > Generally, there are three passes which lowers memory reference: > gimple strength reduction picks opportunity outside loop; gimple IVOPT > handles references with induction variable addresses; references not > handled by SLSR/IVOPT are handled by RTL expand, which makes bad > choice of address re-association. I think Yufeng also noticed the > problem and are trying with patch like: > http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html > http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html > > After thinking twice, I some kind of think we should not re-associate > addresses during expanding, because of lacking of context information. 
> Take base + scaled_index + offset as an example in PR57540, we just > don't know if "base+offset" is loop invariant from either backend or > RTL expander. > > Any comments? The issue is that re-association and address lowering is not integrated with optimizers such as CSE and invariant motion. This means it's hard to arrive at optimal results and all passes are just "guessing" and applying heuristics that the programmer thought are appropriate for his machine. I have no easy way out but think that we lower addresses too late (at least fully). Loop optimizers would like to see the complex memory reference forms but starting with IVOPTs lowering all addresses would likely be a win in the end (there are CSE, LIM and re-association passes around after/at this point as well). Back in the 4.3 days when I for the first time attempted to introduce MEM_REF I forcefully lowered all memory accesses and addresses very early (during gimplification basically). That didn't work out well. So my suggestion to anyone who wants to try something is to apply an additional lowering to the whole GIMPLE, lowering things like handled-component references and addresses (and also things like bitfield accesses). Richard. > Thanks, > bin > > > > -- > Best Regards.
On Fri, Nov 29, 2013 at 6:44 PM, Richard Biener <richard.guenther@gmail.com> wrote: > On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng <amker.cheng@gmail.com> wrote: >> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng <amker.cheng@gmail.com> wrote: >>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote: >>>> On 18/09/13 10:15, bin.cheng wrote: >>>>> >>>>> >>>>>> -----Original Message----- >>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>>>> Sent: Monday, September 02, 2013 3:09 PM >>>>>> To: Richard Earnshaw >>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>>>> >>>>>> >>>>>> >>>>>>> -----Original Message----- >>>>>>> From: Richard Earnshaw >>>>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>>>> To: Bin Cheng >>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>>>> >>>>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>>>> Hi, >>>>>>>> >>>>>>>> This patch refines scaled address expression on ARM. It supports >>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>>>>> to legitimize "base + index * scale + offset" with "reg <- base + >>>>>>>> offset; reg >>>>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>>>> + function >>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>>>> Hoping we can improve it in the future. >>>>>>>> >>>>>>>> With this patch: >>>>>>>> 1) "base+index*scale" is recognized. >>>>>>> >>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form. >>>>>>> So this shouldn't be necessary. Can you identify where this >>>>>> non-canoncial form is being generated? 
>>>>>>> >>>>>> >>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>>>> supports scaled addressing mode, which is not valid on ARM, so I was going >>>>>> to construct "base + index*scale" instead. Since "base + index * scale" >>>>> is not >>>>>> canonical form, I will construct the canonical form and drop this part of >>>>> the >>>>>> patch. >>>>>> >>>>>> Is rest of this patch OK? >>>>>> >>>>> Hi Richard, I removed the part over which you concerned and created this >>>>> updated patch. >>>>> >>>>> Is it OK? >>>>> >>>>> Thanks. >>>>> bin >>>>> >>>>> 2013-09-18 Bin Cheng <bin.cheng@arm.com> >>>>> >>>>> * config/arm/arm.c (try_multiplier_address): New function. >>>>> (thumb2_legitimize_address): New function. >>>>> (arm_legitimize_address): Call try_multiplier_address and >>>>> thumb2_legitimize_address. >>>>> >>>>> >>>>> 6-arm-scaled_address-20130918.txt >>>>> >>>>> >>>>> Index: gcc/config/arm/arm.c >>>>> =================================================================== >>>>> --- gcc/config/arm/arm.c (revision 200774) >>>>> +++ gcc/config/arm/arm.c (working copy) >>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>>>> } >>>>> } >>>>> >>>>> +/* Try to find address expression like base + index * scale + offset >>>>> + in X. If we find one, force base + offset into register and >>>>> + construct new expression reg + index * scale; return the new >>>>> + address expression if it's valid. Otherwise return X. */ >>>>> +static rtx >>>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) >>>>> +{ >>>>> + rtx tmp, base_reg, new_rtx; >>>>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; >>>>> + >>>>> + gcc_assert (GET_CODE (x) == PLUS); >>>>> + >>>>> + /* Try to find and record base/index/scale/offset in X. 
*/ >>>>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>>>> + { >>>>> + tmp = XEXP (x, 0); >>>>> + index = XEXP (XEXP (x, 1), 0); >>>>> + scale = XEXP (XEXP (x, 1), 1); >>>>> + if (GET_CODE (tmp) != PLUS) >>>>> + return x; >>>>> + >>>>> + base = XEXP (tmp, 0); >>>>> + offset = XEXP (tmp, 1); >>>>> + } >>>>> + else >>>>> + { >>>>> + tmp = XEXP (x, 0); >>>>> + offset = XEXP (x, 1); >>>>> + if (GET_CODE (tmp) != PLUS) >>>>> + return x; >>>>> + >>>>> + base = XEXP (tmp, 0); >>>>> + scale = XEXP (tmp, 1); >>>>> + if (GET_CODE (base) == MULT) >>>>> + { >>>>> + tmp = base; >>>>> + base = scale; >>>>> + scale = tmp; >>>>> + } >>>>> + if (GET_CODE (scale) != MULT) >>>>> + return x; >>>>> + >>>>> + index = XEXP (scale, 0); >>>>> + scale = XEXP (scale, 1); >>>>> + } >>>>> + >>>>> + if (CONST_INT_P (base)) >>>>> + { >>>>> + tmp = base; >>>>> + base = offset; >>>>> + offset = tmp; >>>>> + } >>>>> + >>>>> + if (CONST_INT_P (index)) >>>>> + { >>>>> + tmp = index; >>>>> + index = scale; >>>>> + scale = tmp; >>>>> + } >>>>> + >>>>> + /* ARM only supports constant scale in address. */ >>>>> + if (!CONST_INT_P (scale)) >>>>> + return x; >>>>> + >>>>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>>>> + return x; >>>>> + >>>>> + /* Only register/constant are allowed in each part. */ >>>>> + if (!symbol_mentioned_p (base) >>>>> + && !symbol_mentioned_p (offset) >>>>> + && !symbol_mentioned_p (index) >>>>> + && !symbol_mentioned_p (scale)) >>>>> + { >>>> >>>> It would be easier to do this at the top of the function -- >>>> if (symbol_mentioned_p (x)) >>>> return x; >>>> >>>> >>>>> + /* Force "base+offset" into register and construct >>>>> + "register+index*scale". Return the new expression >>>>> + only if it's valid. 
*/ >>>>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>>>> + base_reg = force_reg (SImode, tmp); >>>>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>>>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>>>> + return new_rtx; >>>> >>>> I can't help thinking that this is backwards. That is, you want to >>>> split out the mult expression and use offset addressing in the addresses >>>> itself. That's likely to lead to either better CSE, or more induction >>> Thanks to your review. >>> >>> Actually base+offset is more likely loop invariant than scaled >>> expression, just as reported in pr57540. The bug is observed in >>> spec2k bzip/gzip, and hurts arm in hot loops. The loop induction >>> variable doesn't matter that much comparing to invariant because we >>> are in RTL now. >>> >>>> vars. Furthermore, scaled addressing modes are likely to be more >>>> expensive than simple offset addressing modes on at least some cores. >>> The purpose is to CSE (as you pointed out) or hoist loop invariant. >>> As for addressing mode is concerned, Though we may guide the >>> transformation once we have reliable address cost mode, we don't have >>> the information if base+offset is invariant, so it's difficult to >>> handle in backend, right? >>> >>> What do you think about this? >>> >> >> Additionally, there is no induction variable issue here. The memory >> reference we are facing during expand are not lowered by gimple IVOPT, >> which means either it's outside loop, or doesn't have induction >> variable address. >> >> Generally, there are three passes which lowers memory reference: >> gimple strength reduction picks opportunity outside loop; gimple IVOPT >> handles references with induction variable addresses; references not >> handled by SLSR/IVOPT are handled by RTL expand, which makes bad >> choice of address re-association. 
I think Yufeng also noticed the >> problem and are trying with patch like: >> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html >> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html >> >> After thinking twice, I some kind of think we should not re-associate >> addresses during expanding, because of lacking of context information. >> Take base + scaled_index + offset as an example in PR57540, we just >> don't know if "base+offset" is loop invariant from either backend or >> RTL expander. >> >> Any comments? > > The issue is that re-association and address lowering is not integrated > with optimizers such as CSE and invariant motion. This means it's > hard to arrive at optimal results and all passes are just "guessing" > and applying heuristics that the programmer thought are appropriate > for his machine. > > I have no easy way out but think that we lower addresses too late > (at least fully). Loop optimizers would like to see the complex > memory reference forms but starting with IVOPTs lowering all > addresses would likely be a win in the end (there are CSE, LIM > and re-association passes around after/at this point as well). > > Back in the 4.3 days when I for the first time attempted to introduce > MEM_REF I forcefully lowered all memory accesses and addresses > very early (during gimplification basically). That didn't work out well. > > So my suggestion to anyone who wants to try something is > to apply an additional lowering to the whole GIMPLE, lowering > things like handled-component references and addresses > (and also things like bitfield accesses). Yes, once in expander, there will be no enough information. Since we don't need to handle references with induction variable addresses after IVOPT, it's not difficult to take invariant into consideration when re-associating. Yet I have no clue how CSE should be considered. 
Nevertheless, forcing "base+scaled*index+offset" into a register and using register-indirect addressing, as in PR57540, during expand is not good. Thanks, bin
On Fri, Nov 29, 2013 at 12:03 PM, Bin.Cheng <amker.cheng@gmail.com> wrote: > On Fri, Nov 29, 2013 at 6:44 PM, Richard Biener > <richard.guenther@gmail.com> wrote: >> On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng <amker.cheng@gmail.com> wrote: >>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng <amker.cheng@gmail.com> wrote: >>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw <rearnsha@arm.com> wrote: >>>>> On 18/09/13 10:15, bin.cheng wrote: >>>>>> >>>>>> >>>>>>> -----Original Message----- >>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>>>>> Sent: Monday, September 02, 2013 3:09 PM >>>>>>> To: Richard Earnshaw >>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>>>>> >>>>>>> >>>>>>> >>>>>>>> -----Original Message----- >>>>>>>> From: Richard Earnshaw >>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>>>>> To: Bin Cheng >>>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>>>>> >>>>>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>>>>> Hi, >>>>>>>>> >>>>>>>>> This patch refines scaled address expression on ARM. It supports >>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>>>>>> to legitimize "base + index * scale + offset" with "reg <- base + >>>>>>>>> offset; reg >>>>>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>>>>> + function >>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>>>>> Hoping we can improve it in the future. >>>>>>>>> >>>>>>>>> With this patch: >>>>>>>>> 1) "base+index*scale" is recognized. >>>>>>>> >>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form. >>>>>>>> So this shouldn't be necessary. Can you identify where this >>>>>>> non-canoncial form is being generated? 
>>>>>>>> >>>>>>> >>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was going >>>>>>> to construct "base + index*scale" instead. Since "base + index * scale" >>>>>> is not >>>>>>> canonical form, I will construct the canonical form and drop this part of >>>>>> the >>>>>>> patch. >>>>>>> >>>>>>> Is rest of this patch OK? >>>>>>> >>>>>> Hi Richard, I removed the part over which you concerned and created this >>>>>> updated patch. >>>>>> >>>>>> Is it OK? >>>>>> >>>>>> Thanks. >>>>>> bin >>>>>> >>>>>> 2013-09-18 Bin Cheng <bin.cheng@arm.com> >>>>>> >>>>>> * config/arm/arm.c (try_multiplier_address): New function. >>>>>> (thumb2_legitimize_address): New function. >>>>>> (arm_legitimize_address): Call try_multiplier_address and >>>>>> thumb2_legitimize_address. >>>>>> >>>>>> >>>>>> 6-arm-scaled_address-20130918.txt >>>>>> >>>>>> >>>>>> Index: gcc/config/arm/arm.c >>>>>> =================================================================== >>>>>> --- gcc/config/arm/arm.c (revision 200774) >>>>>> +++ gcc/config/arm/arm.c (working copy) >>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>>>>> } >>>>>> } >>>>>> >>>>>> +/* Try to find address expression like base + index * scale + offset >>>>>> + in X. If we find one, force base + offset into register and >>>>>> + construct new expression reg + index * scale; return the new >>>>>> + address expression if it's valid. Otherwise return X. */ >>>>>> +static rtx >>>>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) >>>>>> +{ >>>>>> + rtx tmp, base_reg, new_rtx; >>>>>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; >>>>>> + >>>>>> + gcc_assert (GET_CODE (x) == PLUS); >>>>>> + >>>>>> + /* Try to find and record base/index/scale/offset in X. 
*/ >>>>>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>>>>> + { >>>>>> + tmp = XEXP (x, 0); >>>>>> + index = XEXP (XEXP (x, 1), 0); >>>>>> + scale = XEXP (XEXP (x, 1), 1); >>>>>> + if (GET_CODE (tmp) != PLUS) >>>>>> + return x; >>>>>> + >>>>>> + base = XEXP (tmp, 0); >>>>>> + offset = XEXP (tmp, 1); >>>>>> + } >>>>>> + else >>>>>> + { >>>>>> + tmp = XEXP (x, 0); >>>>>> + offset = XEXP (x, 1); >>>>>> + if (GET_CODE (tmp) != PLUS) >>>>>> + return x; >>>>>> + >>>>>> + base = XEXP (tmp, 0); >>>>>> + scale = XEXP (tmp, 1); >>>>>> + if (GET_CODE (base) == MULT) >>>>>> + { >>>>>> + tmp = base; >>>>>> + base = scale; >>>>>> + scale = tmp; >>>>>> + } >>>>>> + if (GET_CODE (scale) != MULT) >>>>>> + return x; >>>>>> + >>>>>> + index = XEXP (scale, 0); >>>>>> + scale = XEXP (scale, 1); >>>>>> + } >>>>>> + >>>>>> + if (CONST_INT_P (base)) >>>>>> + { >>>>>> + tmp = base; >>>>>> + base = offset; >>>>>> + offset = tmp; >>>>>> + } >>>>>> + >>>>>> + if (CONST_INT_P (index)) >>>>>> + { >>>>>> + tmp = index; >>>>>> + index = scale; >>>>>> + scale = tmp; >>>>>> + } >>>>>> + >>>>>> + /* ARM only supports constant scale in address. */ >>>>>> + if (!CONST_INT_P (scale)) >>>>>> + return x; >>>>>> + >>>>>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>>>>> + return x; >>>>>> + >>>>>> + /* Only register/constant are allowed in each part. */ >>>>>> + if (!symbol_mentioned_p (base) >>>>>> + && !symbol_mentioned_p (offset) >>>>>> + && !symbol_mentioned_p (index) >>>>>> + && !symbol_mentioned_p (scale)) >>>>>> + { >>>>> >>>>> It would be easier to do this at the top of the function -- >>>>> if (symbol_mentioned_p (x)) >>>>> return x; >>>>> >>>>> >>>>>> + /* Force "base+offset" into register and construct >>>>>> + "register+index*scale". Return the new expression >>>>>> + only if it's valid. 
*/ >>>>>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>>>>> + base_reg = force_reg (SImode, tmp); >>>>>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>>>>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>>>>> + return new_rtx; >>>>> >>>>> I can't help thinking that this is backwards. That is, you want to >>>>> split out the mult expression and use offset addressing in the addresses >>>>> itself. That's likely to lead to either better CSE, or more induction >>>> Thanks to your review. >>>> >>>> Actually base+offset is more likely loop invariant than scaled >>>> expression, just as reported in pr57540. The bug is observed in >>>> spec2k bzip/gzip, and hurts arm in hot loops. The loop induction >>>> variable doesn't matter that much comparing to invariant because we >>>> are in RTL now. >>>> >>>>> vars. Furthermore, scaled addressing modes are likely to be more >>>>> expensive than simple offset addressing modes on at least some cores. >>>> The purpose is to CSE (as you pointed out) or hoist loop invariant. >>>> As for addressing mode is concerned, Though we may guide the >>>> transformation once we have reliable address cost mode, we don't have >>>> the information if base+offset is invariant, so it's difficult to >>>> handle in backend, right? >>>> >>>> What do you think about this? >>>> >>> >>> Additionally, there is no induction variable issue here. The memory >>> reference we are facing during expand are not lowered by gimple IVOPT, >>> which means either it's outside loop, or doesn't have induction >>> variable address. >>> >>> Generally, there are three passes which lowers memory reference: >>> gimple strength reduction picks opportunity outside loop; gimple IVOPT >>> handles references with induction variable addresses; references not >>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad >>> choice of address re-association. 
I think Yufeng also noticed the >>> problem and are trying with patch like: >>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html >>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html >>> >>> After thinking twice, I some kind of think we should not re-associate >>> addresses during expanding, because of lacking of context information. >>> Take base + scaled_index + offset as an example in PR57540, we just >>> don't know if "base+offset" is loop invariant from either backend or >>> RTL expander. >>> >>> Any comments? >> >> The issue is that re-association and address lowering is not integrated >> with optimizers such as CSE and invariant motion. This means it's >> hard to arrive at optimal results and all passes are just "guessing" >> and applying heuristics that the programmer thought are appropriate >> for his machine. >> >> I have no easy way out but think that we lower addresses too late >> (at least fully). Loop optimizers would like to see the complex >> memory reference forms but starting with IVOPTs lowering all >> addresses would likely be a win in the end (there are CSE, LIM >> and re-association passes around after/at this point as well). >> >> Back in the 4.3 days when I for the first time attempted to introduce >> MEM_REF I forcefully lowered all memory accesses and addresses >> very early (during gimplification basically). That didn't work out well. >> >> So my suggestion to anyone who wants to try something is >> to apply an additional lowering to the whole GIMPLE, lowering >> things like handled-component references and addresses >> (and also things like bitfield accesses). > Yes, once in expander, there will be no enough information. Since we > don't need to handle references with induction variable addresses > after IVOPT, it's not difficult to take invariant into consideration > when re-associating. Yet I have no clue how CSE should be considered. 
You'd need to detect that (a + b)*c can be computed differently if both a*c and b*c are available for example. There is even a PR about this somewhere. > Nevertheless, force "base+scaled*index+offset" into register and use > reg directed addressing mode like PR57540 in expand is not good. Sure ... this is why we have IVOPTs and SLSR. But both would work ok with lowered input I think and would simply build up a "leveled" IL suitable for the target. IVOPTs also does some simple CSE considerations AFAIK. In the end what matters for speed is loops after all ;) Richard. > Thanks, > bin > > -- > Best Regards.
On 11/29/13 07:52, Bin.Cheng wrote: > On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com> wrote: >> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com> wrote: >>> On 18/09/13 10:15, bin.cheng wrote: >>>> >>>> >>>>> -----Original Message----- >>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>>> Sent: Monday, September 02, 2013 3:09 PM >>>>> To: Richard Earnshaw >>>>> Cc: gcc-patches@gcc.gnu.org >>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>>> >>>>> >>>>> >>>>>> -----Original Message----- >>>>>> From: Richard Earnshaw >>>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>>> To: Bin Cheng >>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>>> >>>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>>> Hi, >>>>>>> >>>>>>> This patch refines scaled address expression on ARM. It supports >>>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>>>> to legitimize "base + index * scale + offset" with "reg<- base + >>>>>>> offset; reg >>>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>>> + function >>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>>> Hoping we can improve it in the future. >>>>>>> >>>>>>> With this patch: >>>>>>> 1) "base+index*scale" is recognized. >>>>>> >>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form. >>>>>> So this shouldn't be necessary. Can you identify where this >>>>> non-canoncial form is being generated? >>>>>> >>>>> >>>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>>> supports scaled addressing mode, which is not valid on ARM, so I was going >>>>> to construct "base + index*scale" instead. 
Since "base + index * scale" >>>> is not >>>>> canonical form, I will construct the canonical form and drop this part of >>>> the >>>>> patch. >>>>> >>>>> Is rest of this patch OK? >>>>> >>>> Hi Richard, I removed the part over which you concerned and created this >>>> updated patch. >>>> >>>> Is it OK? >>>> >>>> Thanks. >>>> bin >>>> >>>> 2013-09-18 Bin Cheng<bin.cheng@arm.com> >>>> >>>> * config/arm/arm.c (try_multiplier_address): New function. >>>> (thumb2_legitimize_address): New function. >>>> (arm_legitimize_address): Call try_multiplier_address and >>>> thumb2_legitimize_address. >>>> >>>> >>>> 6-arm-scaled_address-20130918.txt >>>> >>>> >>>> Index: gcc/config/arm/arm.c >>>> =================================================================== >>>> --- gcc/config/arm/arm.c (revision 200774) >>>> +++ gcc/config/arm/arm.c (working copy) >>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>>> } >>>> } >>>> >>>> +/* Try to find address expression like base + index * scale + offset >>>> + in X. If we find one, force base + offset into register and >>>> + construct new expression reg + index * scale; return the new >>>> + address expression if it's valid. Otherwise return X. */ >>>> +static rtx >>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) >>>> +{ >>>> + rtx tmp, base_reg, new_rtx; >>>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; >>>> + >>>> + gcc_assert (GET_CODE (x) == PLUS); >>>> + >>>> + /* Try to find and record base/index/scale/offset in X. 
*/ >>>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>>> + { >>>> + tmp = XEXP (x, 0); >>>> + index = XEXP (XEXP (x, 1), 0); >>>> + scale = XEXP (XEXP (x, 1), 1); >>>> + if (GET_CODE (tmp) != PLUS) >>>> + return x; >>>> + >>>> + base = XEXP (tmp, 0); >>>> + offset = XEXP (tmp, 1); >>>> + } >>>> + else >>>> + { >>>> + tmp = XEXP (x, 0); >>>> + offset = XEXP (x, 1); >>>> + if (GET_CODE (tmp) != PLUS) >>>> + return x; >>>> + >>>> + base = XEXP (tmp, 0); >>>> + scale = XEXP (tmp, 1); >>>> + if (GET_CODE (base) == MULT) >>>> + { >>>> + tmp = base; >>>> + base = scale; >>>> + scale = tmp; >>>> + } >>>> + if (GET_CODE (scale) != MULT) >>>> + return x; >>>> + >>>> + index = XEXP (scale, 0); >>>> + scale = XEXP (scale, 1); >>>> + } >>>> + >>>> + if (CONST_INT_P (base)) >>>> + { >>>> + tmp = base; >>>> + base = offset; >>>> + offset = tmp; >>>> + } >>>> + >>>> + if (CONST_INT_P (index)) >>>> + { >>>> + tmp = index; >>>> + index = scale; >>>> + scale = tmp; >>>> + } >>>> + >>>> + /* ARM only supports constant scale in address. */ >>>> + if (!CONST_INT_P (scale)) >>>> + return x; >>>> + >>>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>>> + return x; >>>> + >>>> + /* Only register/constant are allowed in each part. */ >>>> + if (!symbol_mentioned_p (base) >>>> +&& !symbol_mentioned_p (offset) >>>> +&& !symbol_mentioned_p (index) >>>> +&& !symbol_mentioned_p (scale)) >>>> + { >>> >>> It would be easier to do this at the top of the function -- >>> if (symbol_mentioned_p (x)) >>> return x; >>> >>> >>>> + /* Force "base+offset" into register and construct >>>> + "register+index*scale". Return the new expression >>>> + only if it's valid. */ >>>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>>> + base_reg = force_reg (SImode, tmp); >>>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>>> + return new_rtx; >>> >>> I can't help thinking that this is backwards. 
That is, you want to >>> split out the mult expression and use offset addressing in the addresses >>> itself. That's likely to lead to either better CSE, or more induction >> Thanks to your review. >> >> Actually base+offset is more likely loop invariant than scaled >> expression, just as reported in pr57540. The bug is observed in >> spec2k bzip/gzip, and hurts arm in hot loops. The loop induction >> variable doesn't matter that much comparing to invariant because we >> are in RTL now. >> >>> vars. Furthermore, scaled addressing modes are likely to be more >>> expensive than simple offset addressing modes on at least some cores. >> The purpose is to CSE (as you pointed out) or hoist loop invariant. >> As for addressing mode is concerned, Though we may guide the >> transformation once we have reliable address cost mode, we don't have >> the information if base+offset is invariant, so it's difficult to >> handle in backend, right? >> >> What do you think about this? >> > > Additionally, there is no induction variable issue here. The memory > reference we are facing during expand are not lowered by gimple IVOPT, > which means either it's outside loop, or doesn't have induction > variable address. > > Generally, there are three passes which lowers memory reference: > gimple strength reduction picks opportunity outside loop; gimple IVOPT > handles references with induction variable addresses; references not > handled by SLSR/IVOPT are handled by RTL expand, which makes bad > choice of address re-association. I think Yufeng also noticed the > problem and are trying with patch like: > http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html > http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html Yes, when it comes to addressing expression, the re-association in RTL expand is quite sensitive to the available address modes on the target and its address legitimization routines. 
Both patches I proposed try to make the RTL expansion more canonicalized on addressing expressions, especially on ARM. It is done by essentially enabling simplify_gen_binary () to work on a bigger RTX node. > After thinking twice, I some kind of think we should not re-associate > addresses during expanding, because of lacking of context information. > Take base + scaled_index + offset as an example in PR57540, we just > don't know if "base+offset" is loop invariant from either backend or > RTL expander. I'm getting less convinced by re-associating base with offset unconditionally. One counter example is typedef int arr_1[20]; void foo (arr_1 a1, int i) { a1[i+10] = 1; } I'm experimenting a patch to get the immediate offset in the above example to be the last addend in the address computing (as mentioned in http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the following code-gen: add r1, r0, r1, asl #2 mov r3, #1 str r3, [r1, #40] With your patch applied, the effort will be reverted to add r0, r0, #40 mov r3, #1 str r3, [r0, r1, asl #2] I briefly looked into PR57540. I noticed that you are trying to tackle a loop invariant in a hot loop: .L5: add lr, sp, #2064 ////loop invariant add r2, r2, #1 add r3, lr, r3, asl #2 ldr r3, [r3, #-2064] cmp r3, #0 bge .L5 uxtb r2, r2 Looking into the example code: void foo () { int parent [ 258 * 2 ]; for (i = 1; i <= alphaSize; i++) { while (parent[k] >= 0) { k = parent[k]; ... } ... The loop invariant is part of address computing for a stack variable. The immediate 2064 is the offset of the variable on the stack frame rather than what we normally expect, e.g. part of indexing; it is a back-end specific issue and there is nothing the mid-end can do (the mem_ref lowering cannot help either). One possible solution may be to force the base address of an array on stack to a REG, before the RTL expand does anything 'smart'. Is it something you think worth giving a try? Just my two cents. Thanks, Yufeng
On Fri, Nov 29, 2013 at 12:46 PM, Yufeng Zhang <Yufeng.Zhang@arm.com> wrote: > On 11/29/13 07:52, Bin.Cheng wrote: >> >> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com> wrote: >>> >>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com> >>> wrote: >>>> >>>> On 18/09/13 10:15, bin.cheng wrote: >>>>> >>>>> >>>>> >>>>>> -----Original Message----- >>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>>>> Sent: Monday, September 02, 2013 3:09 PM >>>>>> To: Richard Earnshaw >>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>>>> >>>>>> >>>>>> >>>>>>> -----Original Message----- >>>>>>> From: Richard Earnshaw >>>>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>>>> To: Bin Cheng >>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>>>> >>>>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>>>> >>>>>>>> Hi, >>>>>>>> >>>>>>>> This patch refines scaled address expression on ARM. It supports >>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base + >>>>>>>> offset; reg >>>>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>>>> + function >>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>>>> Hoping we can improve it in the future. >>>>>>>> >>>>>>>> With this patch: >>>>>>>> 1) "base+index*scale" is recognized. >>>>>>> >>>>>>> >>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical >>>>>>> form. >>>>>>> So this shouldn't be necessary. Can you identify where this >>>>>> >>>>>> non-canoncial form is being generated? 
>>>>>>> >>>>>>> >>>>>> >>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>>>> supports scaled addressing mode, which is not valid on ARM, so I was >>>>>> going >>>>>> to construct "base + index*scale" instead. Since "base + index * >>>>>> scale" >>>>> >>>>> is not >>>>>> >>>>>> canonical form, I will construct the canonical form and drop this part >>>>>> of >>>>> >>>>> the >>>>>> >>>>>> patch. >>>>>> >>>>>> Is rest of this patch OK? >>>>>> >>>>> Hi Richard, I removed the part over which you concerned and created >>>>> this >>>>> updated patch. >>>>> >>>>> Is it OK? >>>>> >>>>> Thanks. >>>>> bin >>>>> >>>>> 2013-09-18 Bin Cheng<bin.cheng@arm.com> >>>>> >>>>> * config/arm/arm.c (try_multiplier_address): New function. >>>>> (thumb2_legitimize_address): New function. >>>>> (arm_legitimize_address): Call try_multiplier_address and >>>>> thumb2_legitimize_address. >>>>> >>>>> >>>>> 6-arm-scaled_address-20130918.txt >>>>> >>>>> >>>>> Index: gcc/config/arm/arm.c >>>>> =================================================================== >>>>> --- gcc/config/arm/arm.c (revision 200774) >>>>> +++ gcc/config/arm/arm.c (working copy) >>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>>>> } >>>>> } >>>>> >>>>> +/* Try to find address expression like base + index * scale + offset >>>>> + in X. If we find one, force base + offset into register and >>>>> + construct new expression reg + index * scale; return the new >>>>> + address expression if it's valid. Otherwise return X. */ >>>>> +static rtx >>>>> +try_multiplier_address (rtx x, enum machine_mode mode >>>>> ATTRIBUTE_UNUSED) >>>>> +{ >>>>> + rtx tmp, base_reg, new_rtx; >>>>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = >>>>> NULL_RTX; >>>>> + >>>>> + gcc_assert (GET_CODE (x) == PLUS); >>>>> + >>>>> + /* Try to find and record base/index/scale/offset in X. 
*/ >>>>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>>>> + { >>>>> + tmp = XEXP (x, 0); >>>>> + index = XEXP (XEXP (x, 1), 0); >>>>> + scale = XEXP (XEXP (x, 1), 1); >>>>> + if (GET_CODE (tmp) != PLUS) >>>>> + return x; >>>>> + >>>>> + base = XEXP (tmp, 0); >>>>> + offset = XEXP (tmp, 1); >>>>> + } >>>>> + else >>>>> + { >>>>> + tmp = XEXP (x, 0); >>>>> + offset = XEXP (x, 1); >>>>> + if (GET_CODE (tmp) != PLUS) >>>>> + return x; >>>>> + >>>>> + base = XEXP (tmp, 0); >>>>> + scale = XEXP (tmp, 1); >>>>> + if (GET_CODE (base) == MULT) >>>>> + { >>>>> + tmp = base; >>>>> + base = scale; >>>>> + scale = tmp; >>>>> + } >>>>> + if (GET_CODE (scale) != MULT) >>>>> + return x; >>>>> + >>>>> + index = XEXP (scale, 0); >>>>> + scale = XEXP (scale, 1); >>>>> + } >>>>> + >>>>> + if (CONST_INT_P (base)) >>>>> + { >>>>> + tmp = base; >>>>> + base = offset; >>>>> + offset = tmp; >>>>> + } >>>>> + >>>>> + if (CONST_INT_P (index)) >>>>> + { >>>>> + tmp = index; >>>>> + index = scale; >>>>> + scale = tmp; >>>>> + } >>>>> + >>>>> + /* ARM only supports constant scale in address. */ >>>>> + if (!CONST_INT_P (scale)) >>>>> + return x; >>>>> + >>>>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>>>> + return x; >>>>> + >>>>> + /* Only register/constant are allowed in each part. */ >>>>> + if (!symbol_mentioned_p (base) >>>>> +&& !symbol_mentioned_p (offset) >>>>> +&& !symbol_mentioned_p (index) >>>>> +&& !symbol_mentioned_p (scale)) >>>>> + { >>>> >>>> >>>> It would be easier to do this at the top of the function -- >>>> if (symbol_mentioned_p (x)) >>>> return x; >>>> >>>> >>>>> + /* Force "base+offset" into register and construct >>>>> + "register+index*scale". Return the new expression >>>>> + only if it's valid. 
*/ >>>>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>>>> + base_reg = force_reg (SImode, tmp); >>>>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>>>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>>>> + return new_rtx; >>>> >>>> >>>> I can't help thinking that this is backwards. That is, you want to >>>> split out the mult expression and use offset addressing in the addresses >>>> itself. That's likely to lead to either better CSE, or more induction >>> >>> Thanks to your review. >>> >>> Actually base+offset is more likely loop invariant than scaled >>> expression, just as reported in pr57540. The bug is observed in >>> spec2k bzip/gzip, and hurts arm in hot loops. The loop induction >>> variable doesn't matter that much comparing to invariant because we >>> are in RTL now. >>> >>>> vars. Furthermore, scaled addressing modes are likely to be more >>>> expensive than simple offset addressing modes on at least some cores. >>> >>> The purpose is to CSE (as you pointed out) or hoist loop invariant. >>> As for addressing mode is concerned, Though we may guide the >>> transformation once we have reliable address cost mode, we don't have >>> the information if base+offset is invariant, so it's difficult to >>> handle in backend, right? >>> >>> What do you think about this? >>> >> >> Additionally, there is no induction variable issue here. The memory >> reference we are facing during expand are not lowered by gimple IVOPT, >> which means either it's outside loop, or doesn't have induction >> variable address. >> >> Generally, there are three passes which lowers memory reference: >> gimple strength reduction picks opportunity outside loop; gimple IVOPT >> handles references with induction variable addresses; references not >> handled by SLSR/IVOPT are handled by RTL expand, which makes bad >> choice of address re-association. 
I think Yufeng also noticed the >> problem and are trying with patch like: >> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html >> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html > > > Yes, when it comes to addressing expression, the re-association in RTL > expand is quite sensitive to the available address modes on the target and > its address legitimization routines. Both patches I proposed try to make > the RTL expansion more canonicalized on addressing expressions, especially > on ARM. It is done by essentially enabling simplify_gen_binary () to work > on a bigger RTX node. > > >> After thinking twice, I some kind of think we should not re-associate >> addresses during expanding, because of lacking of context information. >> Take base + scaled_index + offset as an example in PR57540, we just >> don't know if "base+offset" is loop invariant from either backend or >> RTL expander. > > > I'm getting less convinced by re-associating base with offset > unconditionally. One counter example is > > typedef int arr_1[20]; > void foo (arr_1 a1, int i) > { > a1[i+10] = 1; > } > > I'm experimenting a patch to get the immediate offset in the above example > to be the last addend in the address computing (as mentioned in > http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the > following code-gen: > > add r1, r0, r1, asl #2 > mov r3, #1 > str r3, [r1, #40] > > With your patch applied, the effort will be reverted to > > add r0, r0, #40 > mov r3, #1 > str r3, [r0, r1, asl #2] > > > I briefly looked into PR57540. I noticed that you are trying to tackle a > loop invariant in a hot loop: > > .L5: > add lr, sp, #2064 ////loop invariant > add r2, r2, #1 > add r3, lr, r3, asl #2 > ldr r3, [r3, #-2064] > cmp r3, #0 > bge .L5 > uxtb r2, r2 Why does RTL invariant motion not move it? Richard. 
> Looking into the example code: > > void > foo () > { > int parent [ 258 * 2 ]; > for (i = 1; i <= alphaSize; i++) > { > while (parent[k] >= 0) > { > k = parent[k]; > ... > } > ... > > The loop invariant is part of address computing for a stack variable. The > immediate 2064 is the offset of the variable on the stack frame rather than > what we normally expect, e.g. part of indexing; it is a back-end specific > issue and there is nothing the mid-end can do (the mem_ref lowering cannot > help either). One possible solution may be to force the base address of an > array on stack to a REG, before the RTL expand does anything 'smart'. Is it > something you think worth giving a try? > > Just my two cents. > > Thanks, > Yufeng >
On 11/29/13 12:02, Richard Biener wrote: > On Fri, Nov 29, 2013 at 12:46 PM, Yufeng Zhang<Yufeng.Zhang@arm.com> wrote: >> On 11/29/13 07:52, Bin.Cheng wrote: >>> >>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com> wrote: >>>> >>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com> >>>> wrote: >>>>> >>>>> On 18/09/13 10:15, bin.cheng wrote: >>>>>> >>>>>> >>>>>> >>>>>>> -----Original Message----- >>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>>>>> Sent: Monday, September 02, 2013 3:09 PM >>>>>>> To: Richard Earnshaw >>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>>>>> >>>>>>> >>>>>>> >>>>>>>> -----Original Message----- >>>>>>>> From: Richard Earnshaw >>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>>>>> To: Bin Cheng >>>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>>>>> >>>>>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>>>>> >>>>>>>>> Hi, >>>>>>>>> >>>>>>>>> This patch refines scaled address expression on ARM. It supports >>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base + >>>>>>>>> offset; reg >>>>>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>>>>> + function >>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>>>>> Hoping we can improve it in the future. >>>>>>>>> >>>>>>>>> With this patch: >>>>>>>>> 1) "base+index*scale" is recognized. >>>>>>>> >>>>>>>> >>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical >>>>>>>> form. >>>>>>>> So this shouldn't be necessary. Can you identify where this >>>>>>> >>>>>>> non-canoncial form is being generated? 
>>>>>>>> >>>>>>>> >>>>>>> >>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was >>>>>>> going >>>>>>> to construct "base + index*scale" instead. Since "base + index * >>>>>>> scale" >>>>>> >>>>>> is not >>>>>>> >>>>>>> canonical form, I will construct the canonical form and drop this part >>>>>>> of >>>>>> >>>>>> the >>>>>>> >>>>>>> patch. >>>>>>> >>>>>>> Is rest of this patch OK? >>>>>>> >>>>>> Hi Richard, I removed the part over which you concerned and created >>>>>> this >>>>>> updated patch. >>>>>> >>>>>> Is it OK? >>>>>> >>>>>> Thanks. >>>>>> bin >>>>>> >>>>>> 2013-09-18 Bin Cheng<bin.cheng@arm.com> >>>>>> >>>>>> * config/arm/arm.c (try_multiplier_address): New function. >>>>>> (thumb2_legitimize_address): New function. >>>>>> (arm_legitimize_address): Call try_multiplier_address and >>>>>> thumb2_legitimize_address. >>>>>> >>>>>> >>>>>> 6-arm-scaled_address-20130918.txt >>>>>> >>>>>> >>>>>> Index: gcc/config/arm/arm.c >>>>>> =================================================================== >>>>>> --- gcc/config/arm/arm.c (revision 200774) >>>>>> +++ gcc/config/arm/arm.c (working copy) >>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>>>>> } >>>>>> } >>>>>> >>>>>> +/* Try to find address expression like base + index * scale + offset >>>>>> + in X. If we find one, force base + offset into register and >>>>>> + construct new expression reg + index * scale; return the new >>>>>> + address expression if it's valid. Otherwise return X. */ >>>>>> +static rtx >>>>>> +try_multiplier_address (rtx x, enum machine_mode mode >>>>>> ATTRIBUTE_UNUSED) >>>>>> +{ >>>>>> + rtx tmp, base_reg, new_rtx; >>>>>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = >>>>>> NULL_RTX; >>>>>> + >>>>>> + gcc_assert (GET_CODE (x) == PLUS); >>>>>> + >>>>>> + /* Try to find and record base/index/scale/offset in X. 
*/ >>>>>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>>>>> + { >>>>>> + tmp = XEXP (x, 0); >>>>>> + index = XEXP (XEXP (x, 1), 0); >>>>>> + scale = XEXP (XEXP (x, 1), 1); >>>>>> + if (GET_CODE (tmp) != PLUS) >>>>>> + return x; >>>>>> + >>>>>> + base = XEXP (tmp, 0); >>>>>> + offset = XEXP (tmp, 1); >>>>>> + } >>>>>> + else >>>>>> + { >>>>>> + tmp = XEXP (x, 0); >>>>>> + offset = XEXP (x, 1); >>>>>> + if (GET_CODE (tmp) != PLUS) >>>>>> + return x; >>>>>> + >>>>>> + base = XEXP (tmp, 0); >>>>>> + scale = XEXP (tmp, 1); >>>>>> + if (GET_CODE (base) == MULT) >>>>>> + { >>>>>> + tmp = base; >>>>>> + base = scale; >>>>>> + scale = tmp; >>>>>> + } >>>>>> + if (GET_CODE (scale) != MULT) >>>>>> + return x; >>>>>> + >>>>>> + index = XEXP (scale, 0); >>>>>> + scale = XEXP (scale, 1); >>>>>> + } >>>>>> + >>>>>> + if (CONST_INT_P (base)) >>>>>> + { >>>>>> + tmp = base; >>>>>> + base = offset; >>>>>> + offset = tmp; >>>>>> + } >>>>>> + >>>>>> + if (CONST_INT_P (index)) >>>>>> + { >>>>>> + tmp = index; >>>>>> + index = scale; >>>>>> + scale = tmp; >>>>>> + } >>>>>> + >>>>>> + /* ARM only supports constant scale in address. */ >>>>>> + if (!CONST_INT_P (scale)) >>>>>> + return x; >>>>>> + >>>>>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>>>>> + return x; >>>>>> + >>>>>> + /* Only register/constant are allowed in each part. */ >>>>>> + if (!symbol_mentioned_p (base) >>>>>> +&& !symbol_mentioned_p (offset) >>>>>> +&& !symbol_mentioned_p (index) >>>>>> +&& !symbol_mentioned_p (scale)) >>>>>> + { >>>>> >>>>> >>>>> It would be easier to do this at the top of the function -- >>>>> if (symbol_mentioned_p (x)) >>>>> return x; >>>>> >>>>> >>>>>> + /* Force "base+offset" into register and construct >>>>>> + "register+index*scale". Return the new expression >>>>>> + only if it's valid. 
*/ >>>>>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>>>>> + base_reg = force_reg (SImode, tmp); >>>>>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>>>>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>>>>> + return new_rtx; >>>>> >>>>> >>>>> I can't help thinking that this is backwards. That is, you want to >>>>> split out the mult expression and use offset addressing in the addresses >>>>> itself. That's likely to lead to either better CSE, or more induction >>>> >>>> Thanks to your review. >>>> >>>> Actually base+offset is more likely loop invariant than scaled >>>> expression, just as reported in pr57540. The bug is observed in >>>> spec2k bzip/gzip, and hurts arm in hot loops. The loop induction >>>> variable doesn't matter that much comparing to invariant because we >>>> are in RTL now. >>>> >>>>> vars. Furthermore, scaled addressing modes are likely to be more >>>>> expensive than simple offset addressing modes on at least some cores. >>>> >>>> The purpose is to CSE (as you pointed out) or hoist loop invariant. >>>> As for addressing mode is concerned, Though we may guide the >>>> transformation once we have reliable address cost mode, we don't have >>>> the information if base+offset is invariant, so it's difficult to >>>> handle in backend, right? >>>> >>>> What do you think about this? >>>> >>> >>> Additionally, there is no induction variable issue here. The memory >>> reference we are facing during expand are not lowered by gimple IVOPT, >>> which means either it's outside loop, or doesn't have induction >>> variable address. >>> >>> Generally, there are three passes which lowers memory reference: >>> gimple strength reduction picks opportunity outside loop; gimple IVOPT >>> handles references with induction variable addresses; references not >>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad >>> choice of address re-association. 
I think Yufeng also noticed the >>> problem and are trying with patch like: >>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html >>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html >> >> >> Yes, when it comes to addressing expression, the re-association in RTL >> expand is quite sensitive to the available address modes on the target and >> its address legitimization routines. Both patches I proposed try to make >> the RTL expansion more canonicalized on addressing expressions, especially >> on ARM. It is done by essentially enabling simplify_gen_binary () to work >> on a bigger RTX node. >> >> >>> After thinking twice, I some kind of think we should not re-associate >>> addresses during expanding, because of lacking of context information. >>> Take base + scaled_index + offset as an example in PR57540, we just >>> don't know if "base+offset" is loop invariant from either backend or >>> RTL expander. >> >> >> I'm getting less convinced by re-associating base with offset >> unconditionally. One counter example is >> >> typedef int arr_1[20]; >> void foo (arr_1 a1, int i) >> { >> a1[i+10] = 1; >> } >> >> I'm experimenting a patch to get the immediate offset in the above example >> to be the last addend in the address computing (as mentioned in >> http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the >> following code-gen: >> >> add r1, r0, r1, asl #2 >> mov r3, #1 >> str r3, [r1, #40] >> >> With your patch applied, the effort will be reverted to >> >> add r0, r0, #40 >> mov r3, #1 >> str r3, [r0, r1, asl #2] >> >> >> I briefly looked into PR57540. I noticed that you are trying to tackle a >> loop invariant in a hot loop: >> >> .L5: >> add lr, sp, #2064 ////loop invariant >> add r2, r2, #1 >> add r3, lr, r3, asl #2 >> ldr r3, [r3, #-2064] >> cmp r3, #0 >> bge .L5 >> uxtb r2, r2 > > Why does RTL invariant motion not move it? I haven't looked at the issue in detail. I think Bin will be able to answer it. 
The IM won't solve the entire problem, as ideally we'd like to see the '- 2064' happen earlier, so that we'll get .L5: add r3, lr, r3, asl #2 ldr r3, [sp, r3, asl #2] cmp r3, #0 bge .L5 uxtb r2, r2 Thanks, Yufeng
On 11/29/13 10:44, Richard Biener wrote: > On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng<amker.cheng@gmail.com> wrote: >> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com> wrote: >>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com> wrote: >>>> On 18/09/13 10:15, bin.cheng wrote: >>>>> >>>>> >>>>>> -----Original Message----- >>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>>>> Sent: Monday, September 02, 2013 3:09 PM >>>>>> To: Richard Earnshaw >>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>>>> >>>>>> >>>>>> >>>>>>> -----Original Message----- >>>>>>> From: Richard Earnshaw >>>>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>>>> To: Bin Cheng >>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>>>> >>>>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>>>> Hi, >>>>>>>> >>>>>>>> This patch refines scaled address expression on ARM. It supports >>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also tries >>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base + >>>>>>>> offset; reg >>>>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>>>> + function >>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does the >>>>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>>>> Hoping we can improve it in the future. >>>>>>>> >>>>>>>> With this patch: >>>>>>>> 1) "base+index*scale" is recognized. >>>>>>> >>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical form. >>>>>>> So this shouldn't be necessary. Can you identify where this >>>>>> non-canoncial form is being generated? 
>>>>>>> >>>>>> >>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>>>> supports scaled addressing mode, which is not valid on ARM, so I was going >>>>>> to construct "base + index*scale" instead. Since "base + index * scale" >>>>> is not >>>>>> canonical form, I will construct the canonical form and drop this part of >>>>> the >>>>>> patch. >>>>>> >>>>>> Is rest of this patch OK? >>>>>> >>>>> Hi Richard, I removed the part over which you concerned and created this >>>>> updated patch. >>>>> >>>>> Is it OK? >>>>> >>>>> Thanks. >>>>> bin >>>>> >>>>> 2013-09-18 Bin Cheng<bin.cheng@arm.com> >>>>> >>>>> * config/arm/arm.c (try_multiplier_address): New function. >>>>> (thumb2_legitimize_address): New function. >>>>> (arm_legitimize_address): Call try_multiplier_address and >>>>> thumb2_legitimize_address. >>>>> >>>>> >>>>> 6-arm-scaled_address-20130918.txt >>>>> >>>>> >>>>> Index: gcc/config/arm/arm.c >>>>> =================================================================== >>>>> --- gcc/config/arm/arm.c (revision 200774) >>>>> +++ gcc/config/arm/arm.c (working copy) >>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>>>> } >>>>> } >>>>> >>>>> +/* Try to find address expression like base + index * scale + offset >>>>> + in X. If we find one, force base + offset into register and >>>>> + construct new expression reg + index * scale; return the new >>>>> + address expression if it's valid. Otherwise return X. */ >>>>> +static rtx >>>>> +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) >>>>> +{ >>>>> + rtx tmp, base_reg, new_rtx; >>>>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; >>>>> + >>>>> + gcc_assert (GET_CODE (x) == PLUS); >>>>> + >>>>> + /* Try to find and record base/index/scale/offset in X. 
*/ >>>>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>>>> + { >>>>> + tmp = XEXP (x, 0); >>>>> + index = XEXP (XEXP (x, 1), 0); >>>>> + scale = XEXP (XEXP (x, 1), 1); >>>>> + if (GET_CODE (tmp) != PLUS) >>>>> + return x; >>>>> + >>>>> + base = XEXP (tmp, 0); >>>>> + offset = XEXP (tmp, 1); >>>>> + } >>>>> + else >>>>> + { >>>>> + tmp = XEXP (x, 0); >>>>> + offset = XEXP (x, 1); >>>>> + if (GET_CODE (tmp) != PLUS) >>>>> + return x; >>>>> + >>>>> + base = XEXP (tmp, 0); >>>>> + scale = XEXP (tmp, 1); >>>>> + if (GET_CODE (base) == MULT) >>>>> + { >>>>> + tmp = base; >>>>> + base = scale; >>>>> + scale = tmp; >>>>> + } >>>>> + if (GET_CODE (scale) != MULT) >>>>> + return x; >>>>> + >>>>> + index = XEXP (scale, 0); >>>>> + scale = XEXP (scale, 1); >>>>> + } >>>>> + >>>>> + if (CONST_INT_P (base)) >>>>> + { >>>>> + tmp = base; >>>>> + base = offset; >>>>> + offset = tmp; >>>>> + } >>>>> + >>>>> + if (CONST_INT_P (index)) >>>>> + { >>>>> + tmp = index; >>>>> + index = scale; >>>>> + scale = tmp; >>>>> + } >>>>> + >>>>> + /* ARM only supports constant scale in address. */ >>>>> + if (!CONST_INT_P (scale)) >>>>> + return x; >>>>> + >>>>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>>>> + return x; >>>>> + >>>>> + /* Only register/constant are allowed in each part. */ >>>>> + if (!symbol_mentioned_p (base) >>>>> +&& !symbol_mentioned_p (offset) >>>>> +&& !symbol_mentioned_p (index) >>>>> +&& !symbol_mentioned_p (scale)) >>>>> + { >>>> >>>> It would be easier to do this at the top of the function -- >>>> if (symbol_mentioned_p (x)) >>>> return x; >>>> >>>> >>>>> + /* Force "base+offset" into register and construct >>>>> + "register+index*scale". Return the new expression >>>>> + only if it's valid. 
*/ >>>>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>>>> + base_reg = force_reg (SImode, tmp); >>>>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>>>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>>>> + return new_rtx; >>>> >>>> I can't help thinking that this is backwards. That is, you want to >>>> split out the mult expression and use offset addressing in the addresses >>>> itself. That's likely to lead to either better CSE, or more induction >>> Thanks to your review. >>> >>> Actually base+offset is more likely loop invariant than scaled >>> expression, just as reported in pr57540. The bug is observed in >>> spec2k bzip/gzip, and hurts arm in hot loops. The loop induction >>> variable doesn't matter that much comparing to invariant because we >>> are in RTL now. >>> >>>> vars. Furthermore, scaled addressing modes are likely to be more >>>> expensive than simple offset addressing modes on at least some cores. >>> The purpose is to CSE (as you pointed out) or hoist loop invariant. >>> As for addressing mode is concerned, Though we may guide the >>> transformation once we have reliable address cost mode, we don't have >>> the information if base+offset is invariant, so it's difficult to >>> handle in backend, right? >>> >>> What do you think about this? >>> >> >> Additionally, there is no induction variable issue here. The memory >> reference we are facing during expand are not lowered by gimple IVOPT, >> which means either it's outside loop, or doesn't have induction >> variable address. >> >> Generally, there are three passes which lowers memory reference: >> gimple strength reduction picks opportunity outside loop; gimple IVOPT >> handles references with induction variable addresses; references not >> handled by SLSR/IVOPT are handled by RTL expand, which makes bad >> choice of address re-association. 
I think Yufeng also noticed the >> problem and are trying with patch like: >> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html >> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html >> >> After thinking twice, I some kind of think we should not re-associate >> addresses during expanding, because of lacking of context information. >> Take base + scaled_index + offset as an example in PR57540, we just >> don't know if "base+offset" is loop invariant from either backend or >> RTL expander. >> >> Any comments? > > The issue is that re-association and address lowering is not integrated > with optimizers such as CSE and invariant motion. This means it's > hard to arrive at optimal results and all passes are just "guessing" > and applying heuristics that the programmer thought are appropriate > for his machine. > > I have no easy way out but think that we lower addresses too late > (at least fully). Loop optimizers would like to see the complex > memory reference forms but starting with IVOPTs lowering all > addresses would likely be a win in the end (there are CSE, LIM > and re-association passes around after/at this point as well). > > Back in the 4.3 days when I for the first time attempted to introduce > MEM_REF I forcefully lowered all memory accesses and addresses > very early (during gimplification basically). That didn't work out well. > > So my suggestion to anyone who wants to try something is > to apply an additional lowering to the whole GIMPLE, lowering > things like handled-component references and addresses > (and also things like bitfield accesses). This idea of additional lowering is interesting. I found this wiki page describing the memory reference flattening in GIMPLE: http://gcc.gnu.org/wiki/MemRef I wonder whether there is more information or notes you can share, things like common pitfalls, implication in alias analysis, etc. I'd like to do some experiment with it when I have time. Regards, Yufeng
On 29/11/13 11:46, Yufeng Zhang wrote: > On 11/29/13 07:52, Bin.Cheng wrote: >> After thinking twice, I some kind of think we should not re-associate >> addresses during expanding, because of lacking of context information. >> Take base + scaled_index + offset as an example in PR57540, we just >> don't know if "base+offset" is loop invariant from either backend or >> RTL expander. > > I'm getting less convinced by re-associating base with offset > unconditionally. One counter example is > > typedef int arr_1[20]; > void foo (arr_1 a1, int i) > { > a1[i+10] = 1; > } > > I'm experimenting a patch to get the immediate offset in the above > example to be the last addend in the address computing (as mentioned in > http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the > following code-gen: > > add r1, r0, r1, asl #2 > mov r3, #1 > str r3, [r1, #40] > > With your patch applied, the effort will be reverted to > > add r0, r0, #40 > mov r3, #1 > str r3, [r0, r1, asl #2] > And another one is: typedef int arr_1[20]; void foo (arr_1 a1, int i) { a1[i+10] = 1; a1[i+11] = 1; } This should compile to: add r1, r0, r1, asl #2 mov r3, #1 str r3, [r1, #40] str r3, [r1, #44] And which on Thumb2 should then collapse to: add r1, r0, r1, asl #2 mov r3, #1 strd r3, r3, [r1, #40] With your patch I don't see any chance of being able to get to this situation. (BTW, we currently generate: mov r3, #1 add r1, r1, #10 add r2, r0, r1, asl #2 str r3, [r0, r1, asl #2] str r3, [r2, #4] which is insane). I think I see where you're coming from on the original testcase, but I think you're trying to solve the wrong problem. In your test case the base is an eliminable register, which is likely to be replaced with an offset expression during register allocation. The problem then, I think, is that the cost of these virtual registers is treated the same as any other pseudo register, when it may really have the cost of a PLUS expression. 
Perhaps the cost of using an eliminable register should be raised in rtx_costs() (treating them as equivalent to (PLUS (reg) (CONST_INT (TBD)))), so that loop optimizations will try to hoist suitable sub-expressions out of the loop and replace them with real pseudos. R.
On Sat, Nov 30, 2013 at 12:34 AM, Richard Earnshaw <rearnsha@arm.com> wrote: > On 29/11/13 11:46, Yufeng Zhang wrote: >> On 11/29/13 07:52, Bin.Cheng wrote: >>> After thinking twice, I some kind of think we should not re-associate >>> addresses during expanding, because of lacking of context information. >>> Take base + scaled_index + offset as an example in PR57540, we just >>> don't know if "base+offset" is loop invariant from either backend or >>> RTL expander. >> >> I'm getting less convinced by re-associating base with offset >> unconditionally. One counter example is >> >> typedef int arr_1[20]; >> void foo (arr_1 a1, int i) >> { >> a1[i+10] = 1; >> } >> >> I'm experimenting a patch to get the immediate offset in the above >> example to be the last addend in the address computing (as mentioned in >> http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the >> following code-gen: >> >> add r1, r0, r1, asl #2 >> mov r3, #1 >> str r3, [r1, #40] >> >> With your patch applied, the effort will be reverted to >> >> add r0, r0, #40 >> mov r3, #1 >> str r3, [r0, r1, asl #2] >> > > And another one is: > > > > typedef int arr_1[20]; > void foo (arr_1 a1, int i) > { > a1[i+10] = 1; > a1[i+11] = 1; > } > > This should compile to: > > add r1, r0, r1, asl #2 > mov r3, #1 > str r3, [r1, #40] > str r3, [r1, #44] > > And which on Thumb2 should then collapse to: > > add r1, r0, r1, asl #2 > mov r3, #1 > strd r3, r3, [r1, #40] > > With your patch I don't see any chance of being able to get to this > situation. > > (BTW, we currently generate: > > mov r3, #1 > add r1, r1, #10 > add r2, r0, r1, asl #2 > str r3, [r0, r1, asl #2] > str r3, [r2, #4] > > which is insane). The two memory references share common sub expressions, SLSR is designed to handle this case, and it should be improved to handle. The original patch are only used to pick up cases not handled by SLSR and IVOPT. 
Anyway, as you saw from the previous message, doing the refactoring during expand is not a good practice; without enough CSE/INVARIANT information, there will always be caught and missed opportunities, which is why I think another lowering besides SLSR/IVOPT on gimple might be a win. Thanks, bin > > I think I see where you're coming from on the original testcase, but I > think you're trying to solve the wrong problem. In your test case the > base is an eliminable register, which is likely to be replaced with an > offset expression during register allocation. The problem then, I > think, is that the cost of these virtual registers is treated the same > as any other pseudo register, when it may really have the cost of a PLUS > expression. > > Perhaps the cost of using an eliminable register should be raised in > rtx_costs() (treating them as equivalent to (PLUS (reg) (CONST_INT > (TBD))), so that loop optimizations will try to hoist suitable > sub-expressions out the loop and replace them with real pseudos. > > R. > > > > > > > >
On Sat, Nov 30, 2013 at 12:34 AM, Richard Earnshaw <rearnsha@arm.com> wrote: > On 29/11/13 11:46, Yufeng Zhang wrote: >> On 11/29/13 07:52, Bin.Cheng wrote: >>> After thinking twice, I some kind of think we should not re-associate >>> addresses during expanding, because of lacking of context information. >>> Take base + scaled_index + offset as an example in PR57540, we just >>> don't know if "base+offset" is loop invariant from either backend or >>> RTL expander. >> >> I'm getting less convinced by re-associating base with offset >> unconditionally. One counter example is >> >> typedef int arr_1[20]; >> void foo (arr_1 a1, int i) >> { >> a1[i+10] = 1; >> } >> >> I'm experimenting a patch to get the immediate offset in the above >> example to be the last addend in the address computing (as mentioned in >> http://gcc.gnu.org/ml/gcc/2013-11/msg00581.html), aiming to get the >> following code-gen: >> >> add r1, r0, r1, asl #2 >> mov r3, #1 >> str r3, [r1, #40] >> >> With your patch applied, the effort will be reverted to >> >> add r0, r0, #40 >> mov r3, #1 >> str r3, [r0, r1, asl #2] >> > > And another one is: > > > > typedef int arr_1[20]; > void foo (arr_1 a1, int i) > { > a1[i+10] = 1; > a1[i+11] = 1; > } > > This should compile to: > > add r1, r0, r1, asl #2 > mov r3, #1 > str r3, [r1, #40] > str r3, [r1, #44] > > And which on Thumb2 should then collapse to: > > add r1, r0, r1, asl #2 > mov r3, #1 > strd r3, r3, [r1, #40] > > With your patch I don't see any chance of being able to get to this > situation. > > (BTW, we currently generate: > > mov r3, #1 > add r1, r1, #10 > add r2, r0, r1, asl #2 > str r3, [r0, r1, asl #2] > str r3, [r2, #4] > > which is insane). > > I think I see where you're coming from on the original testcase, but I > think you're trying to solve the wrong problem. In your test case the > base is an eliminable register, which is likely to be replaced with an > offset expression during register allocation. 
The problem then, I > think, is that the cost of these virtual registers is treated the same > as any other pseudo register, when it may really have the cost of a PLUS > expression. > > Perhaps the cost of using an eliminable register should be raised in > rtx_costs() (treating them as equivalent to (PLUS (reg) (CONST_INT > (TBD))), so that loop optimizations will try to hoist suitable > sub-expressions out the loop and replace them with real pseudos. > I now have access to the code. The gimple before expanding is like: <bb 6>: # j_26 = PHI <j_9(8), 0(5)> # k_29 = PHI <k_8(8), k_24(5)> j_9 = j_26 + 1; k_8 = parent[k_29]; if (k_8 >= 0) goto <bb 8>; else goto <bb 7>; The rtl generated after expanding is like: 88: NOTE_INSN_BASIC_BLOCK 7 89: r174:SI=r174:SI+0x1 90: r191:SI=r173:SI<<0x2 91: r190:SI=r105:SI+r191:SI 92: r173:SI=[r190:SI-0x810] with r105 == virtual_stack_vars_rtx, and it will be instantiated into frame_pointer_rtx in vregs pass: 88: NOTE_INSN_BASIC_BLOCK 7 89: r174:SI=r174:SI+0x1 90: r191:SI=r173:SI<<0x2 91: r190:SI=sfp:SI+r191:SI 92: r173:SI=[r190:SI-0x810] As you pointed out, sfp is not hoisted as a high cost invariant. I am not sure if loop-invariant will hoist a single pseudo reg even it's assigned with a higher cost. But before the invariant problem, the choice made by RTL expand is bad because it hides the CSE opportunity, because (sfp + r173<<2 - 0x810) == (sp + r173<<2), (sfp-0x810) can be folded into (sp), then we can embed the shift instruction in scaled addressing mode [sp + r173 << 2]. Thanks, bin
On Fri, Nov 29, 2013 at 5:10 PM, Yufeng Zhang <Yufeng.Zhang@arm.com> wrote: > On 11/29/13 10:44, Richard Biener wrote: >> >> On Fri, Nov 29, 2013 at 8:52 AM, Bin.Cheng<amker.cheng@gmail.com> wrote: >>> >>> On Thu, Nov 28, 2013 at 8:06 PM, Bin.Cheng<amker.cheng@gmail.com> wrote: >>>> >>>> On Thu, Nov 28, 2013 at 6:48 PM, Richard Earnshaw<rearnsha@arm.com> >>>> wrote: >>>>> >>>>> On 18/09/13 10:15, bin.cheng wrote: >>>>>> >>>>>> >>>>>> >>>>>>> -----Original Message----- >>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches- >>>>>>> owner@gcc.gnu.org] On Behalf Of bin.cheng >>>>>>> Sent: Monday, September 02, 2013 3:09 PM >>>>>>> To: Richard Earnshaw >>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>> Subject: RE: [PATCH ARM]Refine scaled address expression on ARM >>>>>>> >>>>>>> >>>>>>> >>>>>>>> -----Original Message----- >>>>>>>> From: Richard Earnshaw >>>>>>>> Sent: Thursday, August 29, 2013 9:06 PM >>>>>>>> To: Bin Cheng >>>>>>>> Cc: gcc-patches@gcc.gnu.org >>>>>>>> Subject: Re: [PATCH ARM]Refine scaled address expression on ARM >>>>>>>> >>>>>>>> On 28/08/13 08:00, bin.cheng wrote: >>>>>>>>> >>>>>>>>> Hi, >>>>>>>>> >>>>>>>>> This patch refines scaled address expression on ARM. It supports >>>>>>>>> "base+index*scale" in arm_legitimate_address_outer_p. It also >>>>>>>>> tries >>>>>>>>> to legitimize "base + index * scale + offset" with "reg<- base + >>>>>>>>> offset; reg >>>>>>>>> + index * scale" by introducing thumb2_legitimize_address. For now >>>>>>>>> + function >>>>>>>>> thumb2_legitimize_address is a kind of placeholder and just does >>>>>>>>> the >>>>>>>>> mentioned transformation by calling to try_multiplier_address. >>>>>>>>> Hoping we can improve it in the future. >>>>>>>>> >>>>>>>>> With this patch: >>>>>>>>> 1) "base+index*scale" is recognized. >>>>>>>> >>>>>>>> >>>>>>>> That's because (PLUS (REG) (MULT (REG) (CONST))) is not canonical >>>>>>>> form. >>>>>>>> So this shouldn't be necessary. 
Can you identify where this >>>>>>> >>>>>>> non-canoncial form is being generated? >>>>>>>> >>>>>>>> >>>>>>> >>>>>>> Oh, for now ivopt constructs "index*scale" to test whether backend >>>>>>> supports scaled addressing mode, which is not valid on ARM, so I was >>>>>>> going >>>>>>> to construct "base + index*scale" instead. Since "base + index * >>>>>>> scale" >>>>>> >>>>>> is not >>>>>>> >>>>>>> canonical form, I will construct the canonical form and drop this >>>>>>> part of >>>>>> >>>>>> the >>>>>>> >>>>>>> patch. >>>>>>> >>>>>>> Is rest of this patch OK? >>>>>>> >>>>>> Hi Richard, I removed the part over which you concerned and created >>>>>> this >>>>>> updated patch. >>>>>> >>>>>> Is it OK? >>>>>> >>>>>> Thanks. >>>>>> bin >>>>>> >>>>>> 2013-09-18 Bin Cheng<bin.cheng@arm.com> >>>>>> >>>>>> * config/arm/arm.c (try_multiplier_address): New function. >>>>>> (thumb2_legitimize_address): New function. >>>>>> (arm_legitimize_address): Call try_multiplier_address and >>>>>> thumb2_legitimize_address. >>>>>> >>>>>> >>>>>> 6-arm-scaled_address-20130918.txt >>>>>> >>>>>> >>>>>> Index: gcc/config/arm/arm.c >>>>>> =================================================================== >>>>>> --- gcc/config/arm/arm.c (revision 200774) >>>>>> +++ gcc/config/arm/arm.c (working copy) >>>>>> @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) >>>>>> } >>>>>> } >>>>>> >>>>>> +/* Try to find address expression like base + index * scale + offset >>>>>> + in X. If we find one, force base + offset into register and >>>>>> + construct new expression reg + index * scale; return the new >>>>>> + address expression if it's valid. Otherwise return X. 
*/ >>>>>> +static rtx >>>>>> +try_multiplier_address (rtx x, enum machine_mode mode >>>>>> ATTRIBUTE_UNUSED) >>>>>> +{ >>>>>> + rtx tmp, base_reg, new_rtx; >>>>>> + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = >>>>>> NULL_RTX; >>>>>> + >>>>>> + gcc_assert (GET_CODE (x) == PLUS); >>>>>> + >>>>>> + /* Try to find and record base/index/scale/offset in X. */ >>>>>> + if (GET_CODE (XEXP (x, 1)) == MULT) >>>>>> + { >>>>>> + tmp = XEXP (x, 0); >>>>>> + index = XEXP (XEXP (x, 1), 0); >>>>>> + scale = XEXP (XEXP (x, 1), 1); >>>>>> + if (GET_CODE (tmp) != PLUS) >>>>>> + return x; >>>>>> + >>>>>> + base = XEXP (tmp, 0); >>>>>> + offset = XEXP (tmp, 1); >>>>>> + } >>>>>> + else >>>>>> + { >>>>>> + tmp = XEXP (x, 0); >>>>>> + offset = XEXP (x, 1); >>>>>> + if (GET_CODE (tmp) != PLUS) >>>>>> + return x; >>>>>> + >>>>>> + base = XEXP (tmp, 0); >>>>>> + scale = XEXP (tmp, 1); >>>>>> + if (GET_CODE (base) == MULT) >>>>>> + { >>>>>> + tmp = base; >>>>>> + base = scale; >>>>>> + scale = tmp; >>>>>> + } >>>>>> + if (GET_CODE (scale) != MULT) >>>>>> + return x; >>>>>> + >>>>>> + index = XEXP (scale, 0); >>>>>> + scale = XEXP (scale, 1); >>>>>> + } >>>>>> + >>>>>> + if (CONST_INT_P (base)) >>>>>> + { >>>>>> + tmp = base; >>>>>> + base = offset; >>>>>> + offset = tmp; >>>>>> + } >>>>>> + >>>>>> + if (CONST_INT_P (index)) >>>>>> + { >>>>>> + tmp = index; >>>>>> + index = scale; >>>>>> + scale = tmp; >>>>>> + } >>>>>> + >>>>>> + /* ARM only supports constant scale in address. */ >>>>>> + if (!CONST_INT_P (scale)) >>>>>> + return x; >>>>>> + >>>>>> + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) >>>>>> + return x; >>>>>> + >>>>>> + /* Only register/constant are allowed in each part. 
*/ >>>>>> + if (!symbol_mentioned_p (base) >>>>>> +&& !symbol_mentioned_p (offset) >>>>>> +&& !symbol_mentioned_p (index) >>>>>> +&& !symbol_mentioned_p (scale)) >>>>>> + { >>>>> >>>>> >>>>> It would be easier to do this at the top of the function -- >>>>> if (symbol_mentioned_p (x)) >>>>> return x; >>>>> >>>>> >>>>>> + /* Force "base+offset" into register and construct >>>>>> + "register+index*scale". Return the new expression >>>>>> + only if it's valid. */ >>>>>> + tmp = gen_rtx_PLUS (SImode, base, offset); >>>>>> + base_reg = force_reg (SImode, tmp); >>>>>> + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); >>>>>> + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); >>>>>> + return new_rtx; >>>>> >>>>> >>>>> I can't help thinking that this is backwards. That is, you want to >>>>> split out the mult expression and use offset addressing in the >>>>> addresses >>>>> itself. That's likely to lead to either better CSE, or more induction >>>> >>>> Thanks to your review. >>>> >>>> Actually base+offset is more likely loop invariant than scaled >>>> expression, just as reported in pr57540. The bug is observed in >>>> spec2k bzip/gzip, and hurts arm in hot loops. The loop induction >>>> variable doesn't matter that much comparing to invariant because we >>>> are in RTL now. >>>> >>>>> vars. Furthermore, scaled addressing modes are likely to be more >>>>> expensive than simple offset addressing modes on at least some cores. >>>> >>>> The purpose is to CSE (as you pointed out) or hoist loop invariant. >>>> As for addressing mode is concerned, Though we may guide the >>>> transformation once we have reliable address cost mode, we don't have >>>> the information if base+offset is invariant, so it's difficult to >>>> handle in backend, right? >>>> >>>> What do you think about this? >>>> >>> >>> Additionally, there is no induction variable issue here. 
The memory >>> reference we are facing during expand are not lowered by gimple IVOPT, >>> which means either it's outside loop, or doesn't have induction >>> variable address. >>> >>> Generally, there are three passes which lowers memory reference: >>> gimple strength reduction picks opportunity outside loop; gimple IVOPT >>> handles references with induction variable addresses; references not >>> handled by SLSR/IVOPT are handled by RTL expand, which makes bad >>> choice of address re-association. I think Yufeng also noticed the >>> problem and are trying with patch like: >>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02878.html >>> http://gcc.gnu.org/ml/gcc-patches/2013-11/msg03339.html >>> >>> After thinking twice, I kind of think we should not re-associate >>> addresses during expanding, because of lack of context information. >>> Take base + scaled_index + offset as an example in PR57540, we just >>> don't know if "base+offset" is loop invariant from either backend or >>> RTL expander. >>> >>> Any comments? >> >> >> The issue is that re-association and address lowering is not integrated >> with optimizers such as CSE and invariant motion. This means it's >> hard to arrive at optimal results and all passes are just "guessing" >> and applying heuristics that the programmer thought are appropriate >> for his machine. >> >> I have no easy way out but think that we lower addresses too late >> (at least fully). Loop optimizers would like to see the complex >> memory reference forms but starting with IVOPTs lowering all >> addresses would likely be a win in the end (there are CSE, LIM >> and re-association passes around after/at this point as well). >> >> Back in the 4.3 days when I for the first time attempted to introduce >> MEM_REF I forcefully lowered all memory accesses and addresses >> very early (during gimplification basically). That didn't work out well. 
>> >> So my suggestion to anyone who wants to try something is >> to apply an additional lowering to the whole GIMPLE, lowering >> things like handled-component references and addresses >> (and also things like bitfield accesses). > > > This idea of additional lowering is interesting. I found this wiki page > describing the memory reference flattening in GIMPLE: > > http://gcc.gnu.org/wiki/MemRef > > I wonder whether there is more information or notes you can share, things > like common pitfalls, implication in alias analysis, etc. I'd like to do > some experiment with it when I have time. The most notable pitfall is that you lose disambiguations that work on access-paths (disambiguations look at a single 'tree' only). The later you do the lowering the less is the effect - but it will definitely hit the instruction scheduler for example. Currently we save the full "tree" memory access in MEM_EXPR during expansion which is where we lower the accesses to RTL. If you lower earlier you cannot save that fancy MEM_EXPR. The other pitfall is that more variables become address-taken if they are variable-indexed as the memory access becomes indirect in the IL. This then requires proper points-to information to disambiguate against other indirect references (but this points-to info should be available, so this isn't a big concern IMHO). That said, "late" lowering in a target specific way (that is, with TARGET_MEM_REFs) is still worthwhile to try. Richard. > Regards, > Yufeng >
Index: gcc/config/arm/arm.c =================================================================== --- gcc/config/arm/arm.c (revision 200774) +++ gcc/config/arm/arm.c (working copy) @@ -6652,6 +6654,106 @@ legitimize_tls_address (rtx x, rtx reg) } } +/* Try to find address expression like base + index * scale + offset + in X. If we find one, force base + offset into register and + construct new expression reg + index * scale; return the new + address expression if it's valid. Otherwise return X. */ +static rtx +try_multiplier_address (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) +{ + rtx tmp, base_reg, new_rtx; + rtx base = NULL_RTX, index = NULL_RTX, scale = NULL_RTX, offset = NULL_RTX; + + gcc_assert (GET_CODE (x) == PLUS); + + /* Try to find and record base/index/scale/offset in X. */ + if (GET_CODE (XEXP (x, 1)) == MULT) + { + tmp = XEXP (x, 0); + index = XEXP (XEXP (x, 1), 0); + scale = XEXP (XEXP (x, 1), 1); + if (GET_CODE (tmp) != PLUS) + return x; + + base = XEXP (tmp, 0); + offset = XEXP (tmp, 1); + } + else + { + tmp = XEXP (x, 0); + offset = XEXP (x, 1); + if (GET_CODE (tmp) != PLUS) + return x; + + base = XEXP (tmp, 0); + scale = XEXP (tmp, 1); + if (GET_CODE (base) == MULT) + { + tmp = base; + base = scale; + scale = tmp; + } + if (GET_CODE (scale) != MULT) + return x; + + index = XEXP (scale, 0); + scale = XEXP (scale, 1); + } + + if (CONST_INT_P (base)) + { + tmp = base; + base = offset; + offset = tmp; + } + + if (CONST_INT_P (index)) + { + tmp = index; + index = scale; + scale = tmp; + } + + /* ARM only supports constant scale in address. */ + if (!CONST_INT_P (scale)) + return x; + + if (GET_MODE (base) != SImode || GET_MODE (index) != SImode) + return x; + + /* Only register/constant are allowed in each part. */ + if (!symbol_mentioned_p (base) + && !symbol_mentioned_p (offset) + && !symbol_mentioned_p (index) + && !symbol_mentioned_p (scale)) + { + /* Force "base+offset" into register and construct + "register+index*scale". 
Return the new expression + only if it's valid. */ + tmp = gen_rtx_PLUS (SImode, base, offset); + base_reg = force_reg (SImode, tmp); + tmp = gen_rtx_fmt_ee (MULT, SImode, index, scale); + new_rtx = gen_rtx_PLUS (SImode, base_reg, tmp); + return new_rtx; + } + + return x; +} + +/* Try machine-dependent ways of modifying an illegitimate Thumb2 address + to be legitimate. If we find one, return the new address. + + TODO: legitimize_address for Thumb2. */ +static rtx +thumb2_legitimize_address (rtx x, rtx orig_x ATTRIBUTE_UNUSED, + enum machine_mode mode) +{ + if (GET_CODE (x) == PLUS) + return try_multiplier_address (x, mode); + + return x; +} + /* Try machine-dependent ways of modifying an illegitimate address to be legitimate. If we find one, return the new, valid address. */ rtx @@ -6659,9 +6761,9 @@ arm_legitimize_address (rtx x, rtx orig_x, enum ma { if (!TARGET_ARM) { - /* TODO: legitimize_address for Thumb2. */ if (TARGET_THUMB2) - return x; + return thumb2_legitimize_address (x, orig_x, mode); + return thumb_legitimize_address (x, orig_x, mode); } @@ -6673,6 +6775,10 @@ arm_legitimize_address (rtx x, rtx orig_x, enum ma rtx xop0 = XEXP (x, 0); rtx xop1 = XEXP (x, 1); + rtx new_rtx = try_multiplier_address (x, mode); + if (new_rtx != x) + return new_rtx; + if (CONSTANT_P (xop0) && !symbol_mentioned_p (xop0)) xop0 = force_reg (SImode, xop0);