Message ID | 20100609100453.GC18910@kam.mff.cuni.cz |
---|---|
State | New |
Headers | show |
On Wed, Jun 9, 2010 at 12:04 PM, Jan Hubicka <hubicka@ucw.cz> wrote: > Hi, > in the profiles, bitmap_ior_into shows as second most expensive function. It > is mostly because it is collecting cache misses from dataflow that I hope to > improve a bit somewhat by adding more obstacks still. > > Looking at mispredict profiles I however noticed that we have there difficult > to predict branch in internal loop while looking for changes. The return value > of bitmap_ior_into is ignored in all hot calls, but it is possible to manually > ifconvert the loop (we won't do it as it introduce extra store) to get reduce > the problem. I will later do some measurements if it makes sense to have two > variants - one tracking changes and one not, but I want to be finished with > flattening first. > > I also noticed that bitmap.c code walks the array backwards. We won't reverse > it at -O3 since we unroll first. It should be better on every modern CPU to > reverse the direction. > > The patch seems to cut about 5-15% off the profile of bitmap_ior_into, but still > keeps function second most hot. > > Bootstrapped/regtested x86_64-linux, OK? Hm, storing unconditionally is probably not a win on all archs. The loop reversals are ok. Thanks, Richard. > Honza > > * bitmap.c (bitmap_and): Walk array forward. > (bitmap_and_compl_into): Likewise. > (bitmap_xor): Likewise. > (bitmap_xor_into): Likewise. > (bitmap_equal_p): Likewise. > (bitmap_intersect_p): Likewise. > (bitmap_intersect_compl_p): Likewise. > (bitmap_ior_and_into): Likewise. > (bitmap_elt_copy): Optimize test for changes; walk forwards. > (bitmap_and_compl): Likewise. > (bitmap_elt_ior): Likewise. > Index: bitmap.c > =================================================================== > --- bitmap.c (revision 160447) > +++ bitmap.c (working copy) > @@ -908,7 +908,7 @@ bitmap_and (bitmap dst, const_bitmap a, > dst_elt = bitmap_elt_insert_after (dst, dst_prev, a_elt->indx); > else > dst_elt->indx = a_elt->indx; > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = a_elt->bits[ix] & b_elt->bits[ix]; > > @@ -960,7 +960,7 @@ bitmap_and_into (bitmap a, const_bitmap > unsigned ix; > BITMAP_WORD ior = 0; > > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = a_elt->bits[ix] & b_elt->bits[ix]; > > @@ -991,13 +991,14 @@ bitmap_elt_copy (bitmap dst, bitmap_elem > if (!changed && dst_elt && dst_elt->indx == src_elt->indx) > { > unsigned ix; > + BITMAP_WORD changed_here = 0; > > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > - if (src_elt->bits[ix] != dst_elt->bits[ix]) > - { > - dst_elt->bits[ix] = src_elt->bits[ix]; > - changed = true; > - } > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > + { > + changed_here |= src_elt->bits[ix] ^ dst_elt->bits[ix]; > + dst_elt->bits[ix] = src_elt->bits[ix]; > + } > + changed = changed_here != 0; > } > else > { > @@ -1056,17 +1057,16 @@ bitmap_and_compl (bitmap dst, const_bitm > > if (!changed && dst_elt && dst_elt->indx == a_elt->indx) > { > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + BITMAP_WORD changed_here = 0; > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = a_elt->bits[ix] & ~b_elt->bits[ix]; > > - if (dst_elt->bits[ix] != r) > - { > - changed = true; > - dst_elt->bits[ix] = r; > - } > + changed_here |= dst_elt->bits[ix] ^ r; > + dst_elt->bits[ix] = r; > ior |= r; > } > + changed = changed_here != 0; > } > else > { > @@ -1082,7 +1082,7 @@ bitmap_and_compl (bitmap dst, const_bitm > new_element = false; > } > > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = a_elt->bits[ix] & ~b_elt->bits[ix]; > > @@ -1158,16 +1158,18 @@ bitmap_and_compl_into (bitmap a, const_b > /* Matching elts, generate A &= ~B. */ > unsigned ix; > BITMAP_WORD ior = 0; > + BITMAP_WORD changed_here = 0; > > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD cleared = a_elt->bits[ix] & b_elt->bits[ix]; > BITMAP_WORD r = a_elt->bits[ix] ^ cleared; > > a_elt->bits[ix] = r; > - changed |= cleared; > + changed_here |= cleared; > ior |= r; > } > + changed = changed_here != 0; > next = a_elt->next; > if (!ior) > bitmap_element_free (a, a_elt); > @@ -1453,7 +1455,7 @@ bitmap_compl_and_into (bitmap a, const_b > unsigned ix; > BITMAP_WORD ior = 0; > > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD cleared = a_elt->bits[ix] & b_elt->bits[ix]; > BITMAP_WORD r = b_elt->bits[ix] ^ cleared; > @@ -1494,15 +1496,14 @@ bitmap_elt_ior (bitmap dst, bitmap_eleme > > if (!changed && dst_elt && dst_elt->indx == a_elt->indx) > { > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + BITMAP_WORD changed_here = 0; > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = a_elt->bits[ix] | b_elt->bits[ix]; > - if (r != dst_elt->bits[ix]) > - { > - dst_elt->bits[ix] = r; > - changed = true; > - } > + changed_here |= r ^ dst_elt->bits[ix]; > + dst_elt->bits[ix] = r; > } > + changed = changed_here != 0; > } > else > { > @@ -1511,7 +1512,7 @@ bitmap_elt_ior (bitmap dst, bitmap_eleme > dst_elt = bitmap_elt_insert_after (dst, dst_prev, a_elt->indx); > else > dst_elt->indx = a_elt->indx; > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = a_elt->bits[ix] | b_elt->bits[ix]; > dst_elt->bits[ix] = r; > @@ -1650,7 +1651,7 @@ bitmap_xor (bitmap dst, const_bitmap a, > dst_elt = bitmap_elt_insert_after (dst, dst_prev, a_elt->indx); > else > dst_elt->indx = a_elt->indx; > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = a_elt->bits[ix] ^ b_elt->bits[ix]; > > @@ -1735,7 +1736,7 @@ bitmap_xor_into (bitmap a, const_bitmap > BITMAP_WORD ior = 0; > bitmap_element *next = a_elt->next; > > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = a_elt->bits[ix] ^ b_elt->bits[ix]; > > @@ -1772,7 +1773,7 @@ bitmap_equal_p (const_bitmap a, const_bi > { > if (a_elt->indx != b_elt->indx) > return false; > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > if (a_elt->bits[ix] != b_elt->bits[ix]) > return false; > } > @@ -1797,7 +1798,7 @@ bitmap_intersect_p (const_bitmap a, cons > b_elt = b_elt->next; > else > { > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > if (a_elt->bits[ix] & b_elt->bits[ix]) > return true; > a_elt = a_elt->next; > @@ -1824,7 +1825,7 @@ bitmap_intersect_compl_p (const_bitmap a > b_elt = b_elt->next; > else > { > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > if (a_elt->bits[ix] & ~b_elt->bits[ix]) > return true; > a_elt = a_elt->next; > @@ -1880,7 +1881,7 @@ bitmap_ior_and_compl (bitmap dst, const_ > > BITMAP_WORD ior = 0; > tmp_elt.indx = b_elt->indx; > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > BITMAP_WORD r = b_elt->bits[ix] & ~kill_elt->bits[ix]; > ior |= r; > @@ -1998,7 +1999,7 @@ bitmap_ior_and_into (bitmap a, const_bit > > overall = 0; > and_elt.indx = b_elt->indx; > - for (ix = BITMAP_ELEMENT_WORDS; ix--;) > + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) > { > and_elt.bits[ix] = b_elt->bits[ix] & c_elt->bits[ix]; > overall |= and_elt.bits[ix]; >
Jan Hubicka <hubicka@ucw.cz> writes: > > The patch seems to cut about 5-15% off the profile of bitmap_ior_into, but still > keeps function second most hot. Perhaps it could be vectorized to use 128 bit SSE registers? The vectorizer is not enabled in a normal build, but it could be for that function. -Andi
> Jan Hubicka <hubicka@ucw.cz> writes: > > > > The patch seems to cut about 5-15% off the profile of bitmap_ior_into, but still > > keeps function second most hot. > > Perhaps it could be vectorized to use 128 bit SSE registers? > The vectorizer is not enabled in a normal build, but > it could be for that function. Yeah, vectorization is something I was thinking of as incremental step. It might even make sense to just write it using our vector extensions by hand. Honza > > -Andi > > -- > ak@linux.intel.com -- Speaking for myself only.
On 06/09/2010 04:01 PM, Jan Hubicka wrote: > Yeah, vectorization is something I was thinking of as incremental step. > It might even make sense to just write it using our vector extensions by hand. I had a patch to do this, maybe I can dig it up and fix the conflicts. :-) It didn't give a substantial speedup, but maybe I can find a few more tricks if I look at it again. Part of it changed the bitmap_obstacks to actually use an allocation pool instead. Paolo
> On 06/09/2010 04:01 PM, Jan Hubicka wrote: >> Yeah, vectorization is something I was thinking of as incremental step. >> It might even make sense to just write it using our vector extensions by hand. > > I had a patch to do this, maybe I can dig it up and fix the conflicts. > :-) It didn't give a substantial speedup, but maybe I can find a few > more tricks if I look at it again. > > Part of it changed the bitmap_obstacks to actually use an allocation > pool instead. Yeah, using pools instead of obstack would be nice cleanup. Pool is definitly more fitting model here. But I suspect it won't speedup much, since obstack are quite effective. Honza > > Paolo
On 06/09/2010 08:42 PM, Jan Hubicka wrote: >> On 06/09/2010 04:01 PM, Jan Hubicka wrote: >>> Yeah, vectorization is something I was thinking of as incremental step. >>> It might even make sense to just write it using our vector extensions by hand. >> >> I had a patch to do this, maybe I can dig it up and fix the conflicts. >> :-) It didn't give a substantial speedup, but maybe I can find a few >> more tricks if I look at it again. >> >> Part of it changed the bitmap_obstacks to actually use an allocation >> pool instead. > > Yeah, using pools instead of obstack would be nice cleanup. Pool is definitly > more fitting model here. But I suspect it won't speedup much, since obstack > are quite effective. Yeah, I switched to pools only to more easily respect alignment of vectors, IIRC. Paolo
Index: bitmap.c =================================================================== --- bitmap.c (revision 160447) +++ bitmap.c (working copy) @@ -908,7 +908,7 @@ bitmap_and (bitmap dst, const_bitmap a, dst_elt = bitmap_elt_insert_after (dst, dst_prev, a_elt->indx); else dst_elt->indx = a_elt->indx; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = a_elt->bits[ix] & b_elt->bits[ix]; @@ -960,7 +960,7 @@ bitmap_and_into (bitmap a, const_bitmap unsigned ix; BITMAP_WORD ior = 0; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = a_elt->bits[ix] & b_elt->bits[ix]; @@ -991,13 +991,14 @@ bitmap_elt_copy (bitmap dst, bitmap_elem if (!changed && dst_elt && dst_elt->indx == src_elt->indx) { unsigned ix; + BITMAP_WORD changed_here = 0; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) - if (src_elt->bits[ix] != dst_elt->bits[ix]) - { - dst_elt->bits[ix] = src_elt->bits[ix]; - changed = true; - } + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) + { + changed_here |= src_elt->bits[ix] ^ dst_elt->bits[ix]; + dst_elt->bits[ix] = src_elt->bits[ix]; + } + changed = changed_here != 0; } else { @@ -1056,17 +1057,16 @@ bitmap_and_compl (bitmap dst, const_bitm if (!changed && dst_elt && dst_elt->indx == a_elt->indx) { - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + BITMAP_WORD changed_here = 0; + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = a_elt->bits[ix] & ~b_elt->bits[ix]; - if (dst_elt->bits[ix] != r) - { - changed = true; - dst_elt->bits[ix] = r; - } + changed_here |= dst_elt->bits[ix] ^ r; + dst_elt->bits[ix] = r; ior |= r; } + changed = changed_here != 0; } else { @@ -1082,7 +1082,7 @@ bitmap_and_compl (bitmap dst, const_bitm new_element = false; } - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = a_elt->bits[ix] & ~b_elt->bits[ix]; @@ -1158,16 +1158,18 @@ bitmap_and_compl_into (bitmap a, const_b /* Matching elts, generate A &= ~B. */ unsigned ix; BITMAP_WORD ior = 0; + BITMAP_WORD changed_here = 0; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD cleared = a_elt->bits[ix] & b_elt->bits[ix]; BITMAP_WORD r = a_elt->bits[ix] ^ cleared; a_elt->bits[ix] = r; - changed |= cleared; + changed_here |= cleared; ior |= r; } + changed = changed_here != 0; next = a_elt->next; if (!ior) bitmap_element_free (a, a_elt); @@ -1453,7 +1455,7 @@ bitmap_compl_and_into (bitmap a, const_b unsigned ix; BITMAP_WORD ior = 0; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD cleared = a_elt->bits[ix] & b_elt->bits[ix]; BITMAP_WORD r = b_elt->bits[ix] ^ cleared; @@ -1494,15 +1496,14 @@ bitmap_elt_ior (bitmap dst, bitmap_eleme if (!changed && dst_elt && dst_elt->indx == a_elt->indx) { - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + BITMAP_WORD changed_here = 0; + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = a_elt->bits[ix] | b_elt->bits[ix]; - if (r != dst_elt->bits[ix]) - { - dst_elt->bits[ix] = r; - changed = true; - } + changed_here |= r ^ dst_elt->bits[ix]; + dst_elt->bits[ix] = r; } + changed = changed_here != 0; } else { @@ -1511,7 +1512,7 @@ bitmap_elt_ior (bitmap dst, bitmap_eleme dst_elt = bitmap_elt_insert_after (dst, dst_prev, a_elt->indx); else dst_elt->indx = a_elt->indx; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = a_elt->bits[ix] | b_elt->bits[ix]; dst_elt->bits[ix] = r; @@ -1650,7 +1651,7 @@ bitmap_xor (bitmap dst, const_bitmap a, dst_elt = bitmap_elt_insert_after (dst, dst_prev, a_elt->indx); else dst_elt->indx = a_elt->indx; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = a_elt->bits[ix] ^ b_elt->bits[ix]; @@ -1735,7 +1736,7 @@ bitmap_xor_into (bitmap a, const_bitmap BITMAP_WORD ior = 0; bitmap_element *next = a_elt->next; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = a_elt->bits[ix] ^ b_elt->bits[ix]; @@ -1772,7 +1773,7 @@ bitmap_equal_p (const_bitmap a, const_bi { if (a_elt->indx != b_elt->indx) return false; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) if (a_elt->bits[ix] != b_elt->bits[ix]) return false; } @@ -1797,7 +1798,7 @@ bitmap_intersect_p (const_bitmap a, cons b_elt = b_elt->next; else { - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) if (a_elt->bits[ix] & b_elt->bits[ix]) return true; a_elt = a_elt->next; @@ -1824,7 +1825,7 @@ bitmap_intersect_compl_p (const_bitmap a b_elt = b_elt->next; else { - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) if (a_elt->bits[ix] & ~b_elt->bits[ix]) return true; a_elt = a_elt->next; @@ -1880,7 +1881,7 @@ bitmap_ior_and_compl (bitmap dst, const_ BITMAP_WORD ior = 0; tmp_elt.indx = b_elt->indx; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { BITMAP_WORD r = b_elt->bits[ix] & ~kill_elt->bits[ix]; ior |= r; @@ -1998,7 +1999,7 @@ bitmap_ior_and_into (bitmap a, const_bit overall = 0; and_elt.indx = b_elt->indx; - for (ix = BITMAP_ELEMENT_WORDS; ix--;) + for (ix = 0; ix < BITMAP_ELEMENT_WORDS; ix++) { and_elt.bits[ix] = b_elt->bits[ix] & c_elt->bits[ix]; overall |= and_elt.bits[ix];