Message ID | 52A61506.5000407@naturalbridge.com |
---|---|
State | New |
Headers | show |
Hi Kenny, Sorry for the slow response. Kenneth Zadeck <zadeck@naturalbridge.com> writes: > Index: gcc/wide-int.cc > =================================================================== > --- gcc/wide-int.cc (revision 205765) > +++ gcc/wide-int.cc (working copy) > @@ -1275,23 +1275,31 @@ wi::mul_internal (HOST_WIDE_INT *val, co > unsigned int j; > unsigned int blocks_needed = BLOCKS_NEEDED (prec); > unsigned int half_blocks_needed = blocks_needed * 2; > - /* The sizes here are scaled to support a 2x largest mode by 2x > - largest mode yielding a 4x largest mode result. This is what is > - needed by vpn. */ > > + /* The sizes here are scaled to support a 2*largest mode + a little > + by 2*largest mode + a little yielding a 4*largest mode + a > + little. This is what is needed by vpn. */ > unsigned HOST_HALF_WIDE_INT > - u[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; > + u[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; > + unsigned int ulen; > unsigned HOST_HALF_WIDE_INT > - v[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; > - /* The '2' in 'R' is because we are internally doing a full > + v[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; > + unsigned int vlen; > + unsigned int uvlen; > + unsigned int vallen; > + > + /* The '4' in 'R' is because we are internally doing a wide > multiply. */ > unsigned HOST_HALF_WIDE_INT > - r[2 * 4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; > - HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1; > + r[4 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; While we're here I think we should convert these arrays into allocas, so that we're not hard-coding the specialness of vpn in wide-int.cc. I.e. rather than having arrays of maximum size, use XALLOCAVEC to allocate a specific number of stack HWIs, once we know how many we actually need. That includes the new arrays added in the patch. > + /* True if the result needs to be negated. 
*/ > + bool is_neg = false; > > /* If the top level routine did not really pass in an overflow, then > just make sure that we never attempt to set it. */ > bool needs_overflow = (overflow != 0); > + const HOST_WIDE_INT zero = 0; > if (needs_overflow) > *overflow = false; > > @@ -1382,61 +1392,96 @@ wi::mul_internal (HOST_WIDE_INT *val, co > return 1; > } > > - /* We do unsigned mul and then correct it. */ > - wi_unpack (u, (const unsigned HOST_WIDE_INT *) op1val, op1len, > - half_blocks_needed, prec, SIGNED); > - wi_unpack (v, (const unsigned HOST_WIDE_INT *) op2val, op2len, > - half_blocks_needed, prec, SIGNED); > - > - /* The 2 is for a full mult. */ > - memset (r, 0, half_blocks_needed * 2 > - * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT); > + ulen = op1len * 2; > + vlen = op2len * 2; > + > + if (sgn == SIGNED) > + { > + if (wi::neg_p (op1)) > + { > + HOST_WIDE_INT nop1[2 * WIDE_INT_MAX_ELTS]; > + E.g. following on from the above: /* Add an extra HWI so that we can represent the negative of the minimum. */ HOST_WIDE_INT *nop1 = XALLOCAVEC (HOST_WIDE_INT, op1len + 1); > + int nop1len > + = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec + 1, SIGNED, 0); Not sure about the "prec + 1" here, since it could cause nop1len to be greater than the maximum length for "prec". And you go on to unpack nop1 in "prec" rather than "prec + 1". AIUI, once you've taken the absolutes, you've got two unsigned numbers of precision "prec" and create a "prec * 2" result, so naybe: sub_large (..., prec, UNSIGNED, 0) instead? > + is_neg = !is_neg; > + ulen = nop1len * 2; > + wi_unpack (u, (const unsigned HOST_WIDE_INT*)nop1, nop1len, > + ulen, prec, SIGNED); Back on the alloca theme, we'd have: u = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, ulen); before this and other unpacks. Note that there's supposed to be space after ")" in casts. Lots of instances in the patch :-) > + /* UNSIGNED mul. 
*/ > + /* For large positive integers inside regular wide-ints, we must > + explictly expand out the 1s to full width for the mul to be > + correct. Note that the top bit will never be set for a > + unsigned number in offset_int of widest-int. */ s/of/or/. But I think the comment is confusing because the unsignedness is a property of the multiplication rather than the inputs. We should never be doing an unsigned operation on widest_int to begin with. And if offset_ints are being treated as having a sign then the same is true there. If we ever do have an unsigned multiplication on those types for some reason then -1 will (rightly) be treated as the maximum unsigned value. Maybe easiest just to drop the note? Or if you don't like that then maybe: Note that this case is typically only used for variable-precision wide_ints. > + if (wi::neg_p (op1)) > + ulen = half_blocks_needed; > + if (wi::neg_p (op2)) > + vlen = half_blocks_needed; > + > + wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len, > + ulen, prec, SIGNED); > + > + wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len, > + vlen, prec, SIGNED); > + } > > - for (j = 0; j < half_blocks_needed; j++) > + uvlen = ulen + vlen; The alloca change here would be: r = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, uvlen); > + memset (r, 0, uvlen * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT); > + > + /* Do the actually multiply. */ > + for (j = 0; j < vlen; j++) > { > k = 0; > - for (i = 0; i < half_blocks_needed; i++) > + for (i = 0; i < ulen; i++) > { > t = ((unsigned HOST_WIDE_INT)u[i] * (unsigned HOST_WIDE_INT)v[j] > + r[i + j] + k); > r[i + j] = t & HALF_INT_MASK; > k = t >> HOST_BITS_PER_HALF_WIDE_INT; > } > - r[j + half_blocks_needed] = k; > + r[j + ulen] = k; > } > > - /* We did unsigned math above. For signed we must adjust the > - product (assuming we need to see that). */ > - if (sgn == SIGNED && (high || needs_overflow)) > + /* We did unsigned math above. 
For signed we may need to adjust the > + product if exactly 1 of the operands was negative. */ > + if (is_neg) > { > - unsigned HOST_WIDE_INT b; > - if (wi::neg_p (op1)) > + t = (unsigned HOST_WIDE_INT)(~r[0]) + 1; r[0] ^ HALF_INT_MASK is more portable than ~r[0] here. If HOST_WIDE_INT is int then HOST_HALF_WIDE_INT is short, and the short will be promoted to int before being inverted. E.g. with ~, a half-int of 0x0000 would become 0xffffffff rather than 0x0000ffff. (Same for the ~ inside the loop of course.) > + const HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1; Long line. But it looks like this is just HALF_INT_MASK. > @@ -1459,15 +1504,42 @@ wi::mul_internal (HOST_WIDE_INT *val, co > { > /* compute [prec] <- ([prec] * [prec]) >> [prec] */ > wi_pack ((unsigned HOST_WIDE_INT *) val, > - &r[half_blocks_needed], half_blocks_needed); > - return canonize (val, blocks_needed, prec); > + r, uvlen); This wi_pack now fits more naturally on a single line. > + vallen = canonize (val, (uvlen + 1) >> 1, prec); > + > + /* Shift is not always safe to write over one of the > + operands, so we must copy. */ > + HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS]; > + memcpy (tval, val, vallen * CHAR_BIT / HOST_BITS_PER_WIDE_INT); vallen * sizeof (HOST_WIDE_INT) would be more typical. But why not unpack into tval directly and avoid the copy? > +#if 0 > + static int cnt = 0; > + printf ("cnt=%d op1=", cnt++); > + for (i = 0; i < op1len; i++) > + printf ("0x%lx ", op1[i]); > + > + printf (" op2="); > + for (i = 0; i < op2len; i++) > + printf ("0x%lx ", op2[i]); > + printf (" result="); > + for (i = 0; i < vallen; i++) > + printf ("0x%lx ", val[i]); > + if (needs_overflow && *overflow) > + printf (" OVERFLOW"); > + printf ("\n"); > +#endif I think this should either be removed or put under: #define DEBUG_MUL 0 if (DEBUG_MUL) { } protection, so that at least it gets compile testing while disabled. Thanks, Richard
On 12/14/2013 06:40 AM, Richard Sandiford wrote: > Hi Kenny, > > Sorry for the slow response. > > Kenneth Zadeck <zadeck@naturalbridge.com> writes: >> Index: gcc/wide-int.cc >> =================================================================== >> --- gcc/wide-int.cc (revision 205765) >> +++ gcc/wide-int.cc (working copy) >> @@ -1275,23 +1275,31 @@ wi::mul_internal (HOST_WIDE_INT *val, co >> unsigned int j; >> unsigned int blocks_needed = BLOCKS_NEEDED (prec); >> unsigned int half_blocks_needed = blocks_needed * 2; >> - /* The sizes here are scaled to support a 2x largest mode by 2x >> - largest mode yielding a 4x largest mode result. This is what is >> - needed by vpn. */ >> >> + /* The sizes here are scaled to support a 2*largest mode + a little >> + by 2*largest mode + a little yielding a 4*largest mode + a >> + little. This is what is needed by vpn. */ >> unsigned HOST_HALF_WIDE_INT >> - u[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; >> + u[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; >> + unsigned int ulen; >> unsigned HOST_HALF_WIDE_INT >> - v[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; >> - /* The '2' in 'R' is because we are internally doing a full >> + v[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; >> + unsigned int vlen; >> + unsigned int uvlen; >> + unsigned int vallen; >> + >> + /* The '4' in 'R' is because we are internally doing a wide >> multiply. */ >> unsigned HOST_HALF_WIDE_INT >> - r[2 * 4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; >> - HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1; >> + r[4 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; > While we're here I think we should convert these arrays into allocas, > so that we're not hard-coding the specialness of vpn in wide-int.cc. > I.e. rather than having arrays of maximum size, use XALLOCAVEC to > allocate a specific number of stack HWIs, once we know how many > we actually need. 
That includes the new arrays added in the patch. I had thought of doing this but there is a part of me that has always thought of this as being more expensive, but in the scheme of things it is just noise here, so i will do it. >> + /* True if the result needs to be negated. */ >> + bool is_neg = false; >> >> /* If the top level routine did not really pass in an overflow, then >> just make sure that we never attempt to set it. */ >> bool needs_overflow = (overflow != 0); >> + const HOST_WIDE_INT zero = 0; >> if (needs_overflow) >> *overflow = false; >> >> @@ -1382,61 +1392,96 @@ wi::mul_internal (HOST_WIDE_INT *val, co >> return 1; >> } >> >> - /* We do unsigned mul and then correct it. */ >> - wi_unpack (u, (const unsigned HOST_WIDE_INT *) op1val, op1len, >> - half_blocks_needed, prec, SIGNED); >> - wi_unpack (v, (const unsigned HOST_WIDE_INT *) op2val, op2len, >> - half_blocks_needed, prec, SIGNED); >> - >> - /* The 2 is for a full mult. */ >> - memset (r, 0, half_blocks_needed * 2 >> - * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT); >> + ulen = op1len * 2; >> + vlen = op2len * 2; >> + >> + if (sgn == SIGNED) >> + { >> + if (wi::neg_p (op1)) >> + { >> + HOST_WIDE_INT nop1[2 * WIDE_INT_MAX_ELTS]; >> + > E.g. following on from the above: > > /* Add an extra HWI so that we can represent the negative of > the minimum. */ > HOST_WIDE_INT *nop1 = XALLOCAVEC (HOST_WIDE_INT, op1len + 1); > >> + int nop1len >> + = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec + 1, SIGNED, 0); > Not sure about the "prec + 1" here, since it could cause nop1len to be > greater than the maximum length for "prec". And you go on to unpack > nop1 in "prec" rather than "prec + 1". the unpacking with prec is the wrong part. That should have been prec + 1 > > AIUI, once you've taken the absolutes, you've got two unsigned numbers > of precision "prec" and create a "prec * 2" result, so naybe: > sub_large (..., prec, UNSIGNED, 0) instead? the signed and unsigned does not matter here. 
the prec + 1 means we never overflow. >> + is_neg = !is_neg; >> + ulen = nop1len * 2; >> + wi_unpack (u, (const unsigned HOST_WIDE_INT*)nop1, nop1len, >> + ulen, prec, SIGNED); > Back on the alloca theme, we'd have: > > u = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, ulen); > > before this and other unpacks. > > Note that there's supposed to be space after ")" in casts. Lots of > instances in the patch :-) > >> + /* UNSIGNED mul. */ >> + /* For large positive integers inside regular wide-ints, we must >> + explictly expand out the 1s to full width for the mul to be >> + correct. Note that the top bit will never be set for a >> + unsigned number in offset_int of widest-int. */ > s/of/or/. But I think the comment is confusing because the unsignedness > is a property of the multiplication rather than the inputs. We should never > be doing an unsigned operation on widest_int to begin with. And if > offset_ints are being treated as having a sign then the same is true there. > > If we ever do have an unsigned multiplication on those types for some reason > then -1 will (rightly) be treated as the maximum unsigned value. The real question here is "do you want to do an assert or do you want to explain what happens if the input comes in that way?" The current world is actually structured so that we never ask about overflow for the two larger classes because the reason that you used those classes was that you never wanted to have this discussion. So if you never ask about overflow, then it really does not matter because we are not going to return enough bits for you to care what happened on the inside. Of course that could change and someone could say that they wanted overflow on widest-int. Then the comment makes sense, with revisions, unless your review of the code that wants overflow on widest int suggests that they are just being stupid. > Maybe easiest just to drop the note? 
Or if you don't like that then maybe: > > Note that this case is typically only used for variable-precision > wide_ints. > >> + if (wi::neg_p (op1)) >> + ulen = half_blocks_needed; >> + if (wi::neg_p (op2)) >> + vlen = half_blocks_needed; >> + >> + wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len, >> + ulen, prec, SIGNED); >> + >> + wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len, >> + vlen, prec, SIGNED); >> + } >> >> - for (j = 0; j < half_blocks_needed; j++) >> + uvlen = ulen + vlen; > The alloca change here would be: > > r = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, uvlen); > >> + memset (r, 0, uvlen * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT); >> + >> + /* Do the actually multiply. */ >> + for (j = 0; j < vlen; j++) >> { >> k = 0; >> - for (i = 0; i < half_blocks_needed; i++) >> + for (i = 0; i < ulen; i++) >> { >> t = ((unsigned HOST_WIDE_INT)u[i] * (unsigned HOST_WIDE_INT)v[j] >> + r[i + j] + k); >> r[i + j] = t & HALF_INT_MASK; >> k = t >> HOST_BITS_PER_HALF_WIDE_INT; >> } >> - r[j + half_blocks_needed] = k; >> + r[j + ulen] = k; >> } >> >> - /* We did unsigned math above. For signed we must adjust the >> - product (assuming we need to see that). */ >> - if (sgn == SIGNED && (high || needs_overflow)) >> + /* We did unsigned math above. For signed we may need to adjust the >> + product if exactly 1 of the operands was negative. */ >> + if (is_neg) >> { >> - unsigned HOST_WIDE_INT b; >> - if (wi::neg_p (op1)) >> + t = (unsigned HOST_WIDE_INT)(~r[0]) + 1; > r[0] ^ HALF_INT_MASK is more portable than ~r[0] here. If HOST_WIDE_INT > is int then HOST_HALF_WIDE_INT is short, and the short will be promoted > to int before being inverted. E.g. with ~, a half-int of 0x0000 would > become 0xffffffff rather than 0x0000ffff. > > (Same for the ~ inside the loop of course.) > >> + const HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1; > Long line. But it looks like this is just HALF_INT_MASK. 
> >> @@ -1459,15 +1504,42 @@ wi::mul_internal (HOST_WIDE_INT *val, co >> { >> /* compute [prec] <- ([prec] * [prec]) >> [prec] */ >> wi_pack ((unsigned HOST_WIDE_INT *) val, >> - &r[half_blocks_needed], half_blocks_needed); >> - return canonize (val, blocks_needed, prec); >> + r, uvlen); > This wi_pack now fits more naturally on a single line. > >> + vallen = canonize (val, (uvlen + 1) >> 1, prec); >> + >> + /* Shift is not always safe to write over one of the >> + operands, so we must copy. */ >> + HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS]; >> + memcpy (tval, val, vallen * CHAR_BIT / HOST_BITS_PER_WIDE_INT); > vallen * sizeof (HOST_WIDE_INT) would be more typical. > But why not unpack into tval directly and avoid the copy? > >> +#if 0 >> + static int cnt = 0; >> + printf ("cnt=%d op1=", cnt++); >> + for (i = 0; i < op1len; i++) >> + printf ("0x%lx ", op1[i]); >> + >> + printf (" op2="); >> + for (i = 0; i < op2len; i++) >> + printf ("0x%lx ", op2[i]); >> + printf (" result="); >> + for (i = 0; i < vallen; i++) >> + printf ("0x%lx ", val[i]); >> + if (needs_overflow && *overflow) >> + printf (" OVERFLOW"); >> + printf ("\n"); >> +#endif i had forgotten to remove this from the patch, sorry. > I think this should either be removed or put under: > > #define DEBUG_MUL 0 > > if (DEBUG_MUL) > { > } > > protection, so that at least it gets compile testing while disabled. > > Thanks, > Richard
Kenneth Zadeck <zadeck@naturalbridge.com> writes: > On 12/14/2013 06:40 AM, Richard Sandiford wrote: >> Hi Kenny, >> >> Sorry for the slow response. >> >> Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>> Index: gcc/wide-int.cc >>> =================================================================== >>> --- gcc/wide-int.cc (revision 205765) >>> +++ gcc/wide-int.cc (working copy) >>> @@ -1275,23 +1275,31 @@ wi::mul_internal (HOST_WIDE_INT *val, co >>> unsigned int j; >>> unsigned int blocks_needed = BLOCKS_NEEDED (prec); >>> unsigned int half_blocks_needed = blocks_needed * 2; >>> - /* The sizes here are scaled to support a 2x largest mode by 2x >>> - largest mode yielding a 4x largest mode result. This is what is >>> - needed by vpn. */ >>> >>> + /* The sizes here are scaled to support a 2*largest mode + a little >>> + by 2*largest mode + a little yielding a 4*largest mode + a >>> + little. This is what is needed by vpn. */ >>> unsigned HOST_HALF_WIDE_INT >>> - u[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; >>> + u[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; >>> + unsigned int ulen; >>> unsigned HOST_HALF_WIDE_INT >>> - v[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; >>> - /* The '2' in 'R' is because we are internally doing a full >>> + v[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; >>> + unsigned int vlen; >>> + unsigned int uvlen; >>> + unsigned int vallen; >>> + >>> + /* The '4' in 'R' is because we are internally doing a wide >>> multiply. */ >>> unsigned HOST_HALF_WIDE_INT >>> - r[2 * 4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; >>> - HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1; >>> + r[4 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; >> While we're here I think we should convert these arrays into allocas, >> so that we're not hard-coding the specialness of vpn in wide-int.cc. >> I.e. 
rather than having arrays of maximum size, use XALLOCAVEC to >> allocate a specific number of stack HWIs, once we know how many >> we actually need. That includes the new arrays added in the patch. > I had thought of doing this but there is a part of me that has always > thought of this as being more expensive, but in the scheme of things it > is just noise here, so i will do it. Thanks. >>> + /* True if the result needs to be negated. */ >>> + bool is_neg = false; >>> >>> /* If the top level routine did not really pass in an overflow, then >>> just make sure that we never attempt to set it. */ >>> bool needs_overflow = (overflow != 0); >>> + const HOST_WIDE_INT zero = 0; >>> if (needs_overflow) >>> *overflow = false; >>> >>> @@ -1382,61 +1392,96 @@ wi::mul_internal (HOST_WIDE_INT *val, co >>> return 1; >>> } >>> >>> - /* We do unsigned mul and then correct it. */ >>> - wi_unpack (u, (const unsigned HOST_WIDE_INT *) op1val, op1len, >>> - half_blocks_needed, prec, SIGNED); >>> - wi_unpack (v, (const unsigned HOST_WIDE_INT *) op2val, op2len, >>> - half_blocks_needed, prec, SIGNED); >>> - >>> - /* The 2 is for a full mult. */ >>> - memset (r, 0, half_blocks_needed * 2 >>> - * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT); >>> + ulen = op1len * 2; >>> + vlen = op2len * 2; >>> + >>> + if (sgn == SIGNED) >>> + { >>> + if (wi::neg_p (op1)) >>> + { >>> + HOST_WIDE_INT nop1[2 * WIDE_INT_MAX_ELTS]; >>> + >> E.g. following on from the above: >> >> /* Add an extra HWI so that we can represent the negative of >> the minimum. */ >> HOST_WIDE_INT *nop1 = XALLOCAVEC (HOST_WIDE_INT, op1len + 1); >> >>> + int nop1len >>> + = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec + 1, SIGNED, >>> 0); >> Not sure about the "prec + 1" here, since it could cause nop1len to be >> greater than the maximum length for "prec". And you go on to unpack >> nop1 in "prec" rather than "prec + 1". > the unpacking with prec is the wrong part. 
That should have been prec + 1 Hmm, I still think it's the opposite, because... >> AIUI, once you've taken the absolutes, you've got two unsigned numbers >> of precision "prec" and create a "prec * 2" result, so naybe: >> sub_large (..., prec, UNSIGNED, 0) instead? > > the signed and unsigned does not matter here. the prec + 1 means we > never overflow. ...my point is that after this we treat the number as unsigned, so it isn't a question of whether the absolute value overflows the precision as a signed number but whether it overflows as an unsigned number. And it never will. It might need 1 extra HWI than the original input if op1len < blocks_needed, but it shouldn't need extra precision. E.g. say for 8-bit HWIs we start out with a signed -0x80 * -0x80. We convert that into an unsigned 0x80 * 0x80, which is still precision 8 * precision 8 -> precision 16. And we'd need to be able to handle those precision-8 values correctly (with no extra zero HWI) if we had an unsigned 0x80 * 0x80 from the outset. >>> + is_neg = !is_neg; >>> + ulen = nop1len * 2; >>> + wi_unpack (u, (const unsigned HOST_WIDE_INT*)nop1, nop1len, >>> + ulen, prec, SIGNED); >> Back on the alloca theme, we'd have: >> >> u = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, ulen); >> >> before this and other unpacks. >> >> Note that there's supposed to be space after ")" in casts. Lots of >> instances in the patch :-) >> >>> + /* UNSIGNED mul. */ >>> + /* For large positive integers inside regular wide-ints, we must >>> + explictly expand out the 1s to full width for the mul to be >>> + correct. Note that the top bit will never be set for a >>> + unsigned number in offset_int of widest-int. */ >> s/of/or/. But I think the comment is confusing because the unsignedness >> is a property of the multiplication rather than the inputs. We should never >> be doing an unsigned operation on widest_int to begin with. And if >> offset_ints are being treated as having a sign then the same is true there. 
>> >> If we ever do have an unsigned multiplication on those types for some reason >> then -1 will (rightly) be treated as the maximum unsigned value. > The real question here is "do you want to do an assert or do you want to > explain what happens if the input comes in that way?" Sorry, to be clear, I was only talking about the "Note..." part of the comment rather than the full thing. I think the first part of the comment and the code itself is fine. > The current world > is actually structured so that we never ask about overflow for the two > larger classes because the reason that you used those classes was that > you never wanted to have this discussion. So if you never ask about > overflow, then it really does not matter because we are not going to > return enough bits for you to care what happened on the inside. Of > course that could change and someone could say that they wanted overflow > on widest-int. Then the comment makes sense, with revisions, unless > your review of the code that wants overflow on widest int suggests that > they are just being stupid. But widest_int is now supposed to be at least 1 bit wider than widest input type (unlike previously where it was double the widest input type). So I can definitely see cases where we'd want to know whether a widest_int * widest_int result overflows. My point is that the widest_int * widest_int would normally be a signed multiplication rather than an unsigned multiplication, since the extra 1 bit of precision allows every operation to be signed. So it isn't a case of whether the top bit of a widest_int will be set, but whether we ever reach here for widest_int in the first place. We should be going down the sgn == SIGNED path rather than the sgn == UNSIGNED path. widest_int can represent an all-1s value, usually interpreted as -1. If we do go down this sgn == UNSIGNED path for widest_int then we will instead treat the all-1s value as the maximum unsigned number, just like for any other kind of wide int. 
As far as this function goes there really is no difference between wide_int, offset_int and widest_int. Which is good, because offset_int and widest_int should just be wide_ints that are optimised for a specific and fixed precision. Thanks, Richard
>> The current world >> is actually structured so that we never ask about overflow for the two >> larger classes because the reason that you used those classes was that >> you never wanted to have this discussion. So if you never ask about >> overflow, then it really does not matter because we are not going to >> return enough bits for you to care what happened on the inside. Of >> course that could change and someone could say that they wanted overflow >> on widest-int. Then the comment makes sense, with revisions, unless >> your review of the code that wants overflow on widest int suggests that >> they are just being stupid. > But widest_int is now supposed to be at least 1 bit wider than widest > input type (unlike previously where it was double the widest input type). > So I can definitely see cases where we'd want to know whether a > widest_int * widest_int result overflows. > > My point is that the widest_int * widest_int would normally be a signed > multiplication rather than an unsigned multiplication, since the extra > 1 bit of precision allows every operation to be signed. So it isn't > a case of whether the top bit of a widest_int will be set, but whether > we ever reach here for widest_int in the first place. We should be > going down the sgn == SIGNED path rather than the sgn == UNSIGNED path. > > widest_int can represent an all-1s value, usually interpreted as -1. > If we do go down this sgn == UNSIGNED path for widest_int then we will > instead treat the all-1s value as the maximum unsigned number, just like > for any other kind of wide int. > > As far as this function goes there really is no difference between > wide_int, offset_int and widest_int. Which is good, because offset_int > and widest_int should just be wide_ints that are optimised for a specific > and fixed precision. > > Thanks, > Richard I am now seriously regretting letting richi talk me into changing the size of the wide int buffer from being 2x of the largest mode on the machine. 
It was a terrible mistake AND i would guess making it smaller does not provide any real benefit. The problem is that when you use widest-int (and by analogy offset int) it should NEVER EVER overflow. Furthermore we need to change the interfaces for these two so that you cannot even ask!!!!!! (i do not believe that anyone does ask so the change would be small.) There are a huge set of bugs on the trunk that are "fixed" with wide-int because people wrote code for double-int thinking that it was infinite precision. So they never tested the cases of what happens when the size of the variable needed two HWIs. Most of those cases were resolved by making passes like tree-vrp use wide-int and then being explicit about the overflow on every operation, because with wide-int the issue is in your face since things overflow even for 32 bit numbers. However, with the current widest-int, we will only be safe for add and subtract by adding the extra bit. In multiply we are exposed. The perception is that widest-int is as good as infinite precision and no one will ever write the code to check if it overflowed because it only rarely happens.
On 12/14/2013 09:30 AM, Richard Sandiford wrote: >>>> + /* True if the result needs to be negated. */ >>>> + bool is_neg = false; >>>> >>>> /* If the top level routine did not really pass in an overflow, then >>>> just make sure that we never attempt to set it. */ >>>> bool needs_overflow = (overflow != 0); >>>> + const HOST_WIDE_INT zero = 0; >>>> if (needs_overflow) >>>> *overflow = false; >>>> >>>> @@ -1382,61 +1392,96 @@ wi::mul_internal (HOST_WIDE_INT *val, co >>>> return 1; >>>> } >>>> >>>> - /* We do unsigned mul and then correct it. */ >>>> - wi_unpack (u, (const unsigned HOST_WIDE_INT *) op1val, op1len, >>>> - half_blocks_needed, prec, SIGNED); >>>> - wi_unpack (v, (const unsigned HOST_WIDE_INT *) op2val, op2len, >>>> - half_blocks_needed, prec, SIGNED); >>>> - >>>> - /* The 2 is for a full mult. */ >>>> - memset (r, 0, half_blocks_needed * 2 >>>> - * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT); >>>> + ulen = op1len * 2; >>>> + vlen = op2len * 2; >>>> + >>>> + if (sgn == SIGNED) >>>> + { >>>> + if (wi::neg_p (op1)) >>>> + { >>>> + HOST_WIDE_INT nop1[2 * WIDE_INT_MAX_ELTS]; >>>> + >>> E.g. following on from the above: >>> >>> /* Add an extra HWI so that we can represent the negative of >>> the minimum. */ >>> HOST_WIDE_INT *nop1 = XALLOCAVEC (HOST_WIDE_INT, op1len + 1); >>> >>>> + int nop1len >>>> + = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec + 1, SIGNED, >>>> 0); >>> Not sure about the "prec + 1" here, since it could cause nop1len to be >>> greater than the maximum length for "prec". And you go on to unpack >>> nop1 in "prec" rather than "prec + 1". >> the unpacking with prec is the wrong part. That should have been prec + 1 > Hmm, I still think it's the opposite, because... > >>> AIUI, once you've taken the absolutes, you've got two unsigned numbers >>> of precision "prec" and create a "prec * 2" result, so naybe: >>> sub_large (..., prec, UNSIGNED, 0) instead? >> the signed and unsigned does not matter here. the prec + 1 means we >> never overflow. 
> ...my point is that after this we treat the number as unsigned, > so it isn't a question of whether the absolute value overflows > the precision as a signed number but whether it overflows as an > unsigned number. And it never will. It might need 1 extra HWI > than the original input if op1len < blocks_needed, but it shouldn't > need extra precision. > > E.g. say for 8-bit HWIs we start out with a signed -0x80 * -0x80. > We convert that into an unsigned 0x80 * 0x80, which is still > precision 8 * precision 8 -> precision 16. And we'd need to be > able to handle those precision-8 values correctly (with no extra > zero HWI) if we had an unsigned 0x80 * 0x80 from the outset. it will not work like that. You are thinking that i really can do an 8 bit X 8 bit multiply. I cannot. The case that i am worried about is 115 bits x 115 bits. in this case i have to make sure that the top bits of the top block are provisioned properly. If you think about your case above where bit 114 is set and the bits below it are zeros, then i need to invert it so that the bit 115 and above are zero. otherwise i get the top of the block polluted with the 1 bits which would mess up the top block multiplied by any of the lower blocks. however, you are, at a deeper level correct. if i change the parms to wi_unpack to UNSIGNED then it is likely you are correct. I will think this through today and see if i can find a counter example. but it may be true that if i invert the number, i always want the top bits of the block to be 0 which is what the sign parameter controls. >>>> + is_neg = !is_neg; >>>> + ulen = nop1len * 2; >>>> + wi_unpack (u, (const unsigned HOST_WIDE_INT*)nop1, nop1len, >>>> + ulen, prec, SIGNED); >>> Back on the alloca theme, we'd have: >>> >>> u = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, ulen); >>> >>> before this and other unpacks. >>> >>> Note that there's supposed to be space after ")" in casts. Lots of >>> instances in the patch :-) i actually did not know this. i will fix. 
>>> >>>> + /* UNSIGNED mul. */ >>>> + /* For large positive integers inside regular wide-ints, we must >>>> + explictly expand out the 1s to full width for the mul to be >>>> + correct. Note that the top bit will never be set for a >>>> + unsigned number in offset_int of widest-int. */ >>> s/of/or/. But I think the comment is confusing because the unsignedness >>> is a property of the multiplication rather than the inputs. We should never >>> be doing an unsigned operation on widest_int to begin with. And if >>> offset_ints are being treated as having a sign then the same is true there. >>> >>> If we ever do have an unsigned multiplication on those types for some reason >>> then -1 will (rightly) be treated as the maximum unsigned value. >> The real question here is "do you want to do an assert or do you want to >> explain what happens if the input comes in that way?" >
>> + vallen = canonize (val, (uvlen + 1) >> 1, prec); >> + >> + /* Shift is not always safe to write over one of the >> + operands, so we must copy. */ >> + HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS]; >> + memcpy (tval, val, vallen * CHAR_BIT / HOST_BITS_PER_WIDE_INT); > vallen * sizeof (HOST_WIDE_INT) would be more typical. > But why not unpack into tval directly and avoid the copy? I could special case this, but the old code was not correct for odd precisions.
Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>> The current world >>> is actually structured so that we never ask about overflow for the two >>> larger classes because the reason that you used those classes was that >>> you never wanted to have this discussion. So if you never ask about >>> overflow, then it really does not matter because we are not going to >>> return enough bits for you to care what happened on the inside. Of >>> course that could change and someone could say that they wanted overflow >>> on widest-int. Then the comment makes sense, with revisions, unless >>> your review of the code that wants overflow on widest int suggests that >>> they are just being stupid. >> But widest_int is now supposed to be at least 1 bit wider than widest >> input type (unlike previously where it was double the widest input type). >> So I can definitely see cases where we'd want to know whether a >> widest_int * widest_int result overflows. >> >> My point is that the widest_int * widest_int would normally be a signed >> multiplication rather than an unsigned multiplication, since the extra >> 1 bit of precision allows every operation to be signed. So it isn't >> a case of whether the top bit of a widest_int will be set, but whether >> we ever reach here for widest_int in the first place. We should be >> going down the sgn == SIGNED path rather than the sgn == UNSIGNED path. >> >> widest_int can represent an all-1s value, usually interpreted as -1. >> If we do go down this sgn == UNSIGNED path for widest_int then we will >> instead treat the all-1s value as the maximum unsigned number, just like >> for any other kind of wide int. >> >> As far as this function goes there really is no difference between >> wide_int, offset_int and widest_int. Which is good, because offset_int >> and widest_int should just be wide_ints that are optimised for a specific >> and fixed precision. 
>> >> Thanks, >> Richard > I am now seriously regretting letting richi talk me into changing the > size of the wide int buffer from being 2x of the largest mode on the > machine. It was a terrible mistake AND i would guess making it smaller > does not provide any real benefit. > > The problem is that when you use widest-int (and by analogy offset int) > it should NEVER EVER overflow. Furthermore we need to change the > interfaces for these two so that you cannot even ask!!!!!! (i do not > believe that anyone does ask so the change would be small.) offset_int * offset_int could overflow too, at least in the sense that there are combinations of valid offset_ints whose product can't be represented in an offset_int. E.g. (1ULL << 67) * (1ULL << 67). I think that was always the case. > There are a huge set of bugs on the trunk that are "fixed" with wide-int > because people wrote code for double-int thinking that it was infinite > precision. So they never tested the cases of what happens when the > size of the variable needed two HWIs. Most of those cases were > resolved by making passes like tree-vrp use wide-int and then being > explicit about the overflow on every operation, because with wide-int > the issue is in your face since things overflow even for 32 bit > numbers. However, with the current widest-int, we will only be safe for > add and subtract by adding the extra bit. In multiply we are exposed. > The perception is that widest-int is a good as infinite precision and no > one will ever write the code to check if it overflowed because it only > rarely happens. All operations can overflow. We would need 2 extra bits rather than 1 extra bit to stop addition overflowing, because the 1 extra bit we already have is to allow unsigned values to be treated as signed. But 2 extra bits is only good for one addition, not a chain of two additions. That's why ignoring overflow seems dangerous to me. 
The old wide-int way might have allowed any x * y to be represented, but if nothing checked whether x * y was bigger than expected then x * y + z could overflow. Thanks, Richard
Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>> + vallen = canonize (val, (uvlen + 1) >> 1, prec); >>> + >>> + /* Shift is not always safe to write over one of the >>> + operands, so we must copy. */ >>> + HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS]; >>> + memcpy (tval, val, vallen * CHAR_BIT / HOST_BITS_PER_WIDE_INT); > > >> vallen * sizeof (HOST_WIDE_INT) would be more typical. >> But why not unpack into tval directly and avoid the copy? > I could special case this, but the old code was not correct for odd > precisions. It's not really special-casing, since the pack is already local to this block. I.e. the patch had: wi_pack ((unsigned HOST_WIDE_INT *) val, r, uvlen); vallen = canonize (val, (uvlen + 1) >> 1, prec); /* Shift is not always safe to write over one of the operands, so we must copy. */ HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS]; memcpy (tval, val, vallen * CHAR_BIT / HOST_BITS_PER_WIDE_INT); vallen = wi::lrshift_large (val, tval, vallen, prec*2, prec, prec); and I think it should be: unsigned int tvallen = (uvlen + 1) >> 1; HOST_WIDE_INT *tval = XALLOCAVEC (HOST_WIDE_INT, tvallen); wi_pack ((unsigned HOST_WIDE_INT *) tval, r, tvallen); tvallen = canonize (tval, tvallen, prec); vallen = wi::lrshift_large (val, tval, tvallen, prec * 2, prec, prec); Thanks, Richard
On 12/15/2013 03:54 AM, Richard Sandiford wrote: > Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>>> The current world >>>> is actually structured so that we never ask about overflow for the two >>>> larger classes because the reason that you used those classes was that >>>> you never wanted to have this discussion. So if you never ask about >>>> overflow, then it really does not matter because we are not going to >>>> return enough bits for you to care what happened on the inside. Of >>>> course that could change and someone could say that they wanted overflow >>>> on widest-int. Then the comment makes sense, with revisions, unless >>>> your review of the code that wants overflow on widest int suggests that >>>> they are just being stupid. >>> But widest_int is now supposed to be at least 1 bit wider than widest >>> input type (unlike previously where it was double the widest input type). >>> So I can definitely see cases where we'd want to know whether a >>> widest_int * widest_int result overflows. >>> >>> My point is that the widest_int * widest_int would normally be a signed >>> multiplication rather than an unsigned multiplication, since the extra >>> 1 bit of precision allows every operation to be signed. So it isn't >>> a case of whether the top bit of a widest_int will be set, but whether >>> we ever reach here for widest_int in the first place. We should be >>> going down the sgn == SIGNED path rather than the sgn == UNSIGNED path. >>> >>> widest_int can represent an all-1s value, usually interpreted as -1. >>> If we do go down this sgn == UNSIGNED path for widest_int then we will >>> instead treat the all-1s value as the maximum unsigned number, just like >>> for any other kind of wide int. >>> >>> As far as this function goes there really is no difference between >>> wide_int, offset_int and widest_int. Which is good, because offset_int >>> and widest_int should just be wide_ints that are optimised for a specific >>> and fixed precision. 
>>> >>> Thanks, >>> Richard >> I am now seriously regretting letting richi talk me into changing the >> size of the wide int buffer from being 2x of the largest mode on the >> machine. It was a terrible mistake AND i would guess making it smaller >> does not provide any real benefit. >> >> The problem is that when you use widest-int (and by analogy offset int) >> it should NEVER EVER overflow. Furthermore we need to change the >> interfaces for these two so that you cannot even ask!!!!!! (i do not >> believe that anyone does ask so the change would be small.) > offset_int * offset_int could overflow too, at least in the sense that > there are combinations of valid offset_ints whose product can't be > represented in an offset_int. E.g. (1ULL << 67) * (1ULL << 67). > I think that was always the case. see answer below. > >> There are a huge set of bugs on the trunk that are "fixed" with wide-int >> because people wrote code for double-int thinking that it was infinite >> precision. So they never tested the cases of what happens when the >> size of the variable needed two HWIs. Most of those cases were >> resolved by making passes like tree-vrp use wide-int and then being >> explicit about the overflow on every operation, because with wide-int >> the issue is in your face since things overflow even for 32 bit >> numbers. However, with the current widest-int, we will only be safe for >> add and subtract by adding the extra bit. In multiply we are exposed. >> The perception is that widest-int is a good as infinite precision and no >> one will ever write the code to check if it overflowed because it only >> rarely happens. > All operations can overflow. We would need 2 extra bits rather than 1 > extra bit to stop addition overflowing, because the 1 extra bit we already > have is to allow unsigned values to be treated as signed. But 2 extra bits > is only good for one addition, not a chain of two additions. > > That's why ignoring overflow seems dangerous to me. 
The old wide-int > way might have allowed any x * y to be represented, but if nothing > checked whether x * y was bigger than expected then x * y + z could > overflow. > > Thanks, > Richard > > it is certainly true that in order to do an unbounded set of operations, you would have to check on every operation. so my suggestion that we should remove the checking from the infinite precision would not support this. but the reality is that there are currently no places in the compiler that do this. Currently all of the uses of widest-int are one or two operations, and the style of code writing is that you do these and then you deal with the overflow at the time that you convert the widest-int to a tree. I think that it is important to maintain the style of programming where, for a small finite number of computations, you do not need to check until you convert back. The problem with making the buffer size so tight is that we do not have adequate reserves to allow this style for any supportable type. I personally think that 2x + some small n is what we need to have. i am not as familiar with how this is used (or to be used when all of the offset math is converted to use wide-int), but there appear to be two uses of multiply. one is the "harmless mult by 3" and the other is where people are trying to compute the size of arrays. These last operations do need to be checked for overflow. The question here is do you want to force those operations to overflow individually or do you want to check when you convert out. Again, i think 2x + some small number is what we might want to consider. kenny
Kenneth Zadeck <zadeck@naturalbridge.com> writes: > it is certainly true that in order to do an unbounded set of operations, > you would have to check on every operation. so my suggestion that we > should remove the checking from the infinite precision would not support > this. but the reality is that there are currently no places in the > compiler that do this. > > Currently all of the uses of widest-int are one or two operations, and > the style of code writing is that you do these and then you deal with > the overflow at the time that you convert the widest-int to a tree. I > think that it is important to maintain the style of programming where > for a small finite number of computations do not need to check until > they convert back. > > The problem with making the buffer size so tight is that we do not have > an adequate reserves to allow this style for any supportable type. > I personally think that 2x + some small n is what we need to have. > > > i am not as familiar with how this is used (or to be used when all of > the offset math is converted to use wide-int), but there appear to be > two uses of multiply. one is the "harmless" mult by 3" and the other > is where people are trying to compute the size of arrays. These last > operations do need to be checked for overflow. The question here is > do you want to force those operations to overflow individually or do you > want to check when you convert out. Again, i think 2x + some small > number is what we might want to consider. It's a fair question, but personally I think checking for overflow on the operation is much more robust. Checking on conversion doesn't allow you to stop thinking about overflow, it just changes the way you think about it: rather than handling explicit overflow flags, you have to remember to ask "is the range of the unconverted result within the range of widest_int", which I bet it is something that would be easily forgotten once widest_int & co. are part of the furniture. E.g. 
the SPARC operation (picked only because I remember it): for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) { tree e0 = VECTOR_CST_ELT (arg0, i); tree e1 = VECTOR_CST_ELT (arg1, i); bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; tmp = wi::neg (e1, &neg1_ovf); tmp = wi::add (e0, tmp, SIGNED, &add1_ovf); if (wi::neg_p (tmp)) tmp = wi::neg (tmp, &neg2_ovf); else neg2_ovf = false; result = wi::add (result, tmp, SIGNED, &add2_ovf); overflow |= neg1_ovf | neg2_ovf | add1_ovf | add2_ovf; } gcc_assert (!overflow); return wide_int_to_tree (rtype, result); seems pretty natural. If instead it was modelled as a widest_int chain without overflow then it would be less obviously correct. Thanks, Richard
On 12/15/2013 11:40 AM, Richard Sandiford wrote: > Kenneth Zadeck <zadeck@naturalbridge.com> writes: >> it is certainly true that in order to do an unbounded set of operations, >> you would have to check on every operation. so my suggestion that we >> should remove the checking from the infinite precision would not support >> this. but the reality is that there are currently no places in the >> compiler that do this. >> >> Currently all of the uses of widest-int are one or two operations, and >> the style of code writing is that you do these and then you deal with >> the overflow at the time that you convert the widest-int to a tree. I >> think that it is important to maintain the style of programming where >> for a small finite number of computations do not need to check until >> they convert back. >> >> The problem with making the buffer size so tight is that we do not have >> an adequate reserves to allow this style for any supportable type. >> I personally think that 2x + some small n is what we need to have. >> >> >> i am not as familiar with how this is used (or to be used when all of >> the offset math is converted to use wide-int), but there appear to be >> two uses of multiply. one is the "harmless" mult by 3" and the other >> is where people are trying to compute the size of arrays. These last >> operations do need to be checked for overflow. The question here is >> do you want to force those operations to overflow individually or do you >> want to check when you convert out. Again, i think 2x + some small >> number is what we might want to consider. > It's a fair question, but personally I think checking for overflow > on the operation is much more robust. 
Checking on conversion doesn't > allow you to stop thinking about overflow, it just changes the way you > think about it: rather than handling explicit overflow flags, you have > to remember to ask "is the range of the unconverted result within the > range of widest_int", which I bet it is something that would be easily > forgotten once widest_int & co. are part of the furniture. > > E.g. the SPARC operation (picked only because I remember it): > > for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) > { > tree e0 = VECTOR_CST_ELT (arg0, i); > tree e1 = VECTOR_CST_ELT (arg1, i); > > bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; > > tmp = wi::neg (e1, &neg1_ovf); > tmp = wi::add (e0, tmp, SIGNED, &add1_ovf); > if (wi::neg_p (tmp)) > tmp = wi::neg (tmp, &neg2_ovf); > else > neg2_ovf = false; > result = wi::add (result, tmp, SIGNED, &add2_ovf); > overflow |= neg1_ovf | neg2_ovf | add1_ovf | add2_ovf; > } > > gcc_assert (!overflow); > > return wide_int_to_tree (rtype, result); > > seems pretty natural. If instead it was modelled as a widest_int > chain without overflow then it would be less obviously correct. > > Thanks, > Richard Let us for the sake of argument assume that this was common code rather than code in a particular port, because code in a particular port can know more about the environment than common code is allowed to. My main point is that this code is in wide-int not widest-int because at this level the writer of this code actually wants to model what the target wants to do. So doing the adds in precision and testing overflow is perfectly fine at every step. But this loop CANNOT be written in a style where you tested the overflow at the end because if this is common code you cannot make any assumptions about the largest mode on the machine. If the buffer was 2x + n in size, then it would be reasonably safe to assume that the number of elements in the vector could be represented in an integer and so you could wait till the end. 
I think that my point was that (and i feel a little uncomfortable putting words in richi's mouth but i believe that this was his point early on) was that he thinks of the widest int as an infinite precision representation. he was the one who was pushing for the entire rep to be done with a large internal (or perhaps unbounded) rep because he felt that this was more natural to not have to think about overflow. He wanted you to be able to chain a mult and a divide and not see the product get truncated before the divide was done. The rep that we have now really sucks with respect to this because widest int truncates if you are close to the largest precision on the machine and does not if you are small with respect to that. My other point is that while you think that the example above is nice, the experience with double-int is contrary to this. people will say (and test) the normal modes and anyone trying to use large modes will die a terrible death of a thousand cuts.
On 12/15/13 7:48 PM, Kenneth Zadeck wrote: > > On 12/15/2013 11:40 AM, Richard Sandiford wrote: >> Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>> it is certainly true that in order to do an unbounded set of operations, >>> you would have to check on every operation. so my suggestion that we >>> should remove the checking from the infinite precision would not support >>> this. but the reality is that there are currently no places in the >>> compiler that do this. >>> >>> Currently all of the uses of widest-int are one or two operations, and >>> the style of code writing is that you do these and then you deal with >>> the overflow at the time that you convert the widest-int to a tree. I >>> think that it is important to maintain the style of programming where >>> for a small finite number of computations do not need to check until >>> they convert back. >>> >>> The problem with making the buffer size so tight is that we do not have >>> an adequate reserves to allow this style for any supportable type. >>> I personally think that 2x + some small n is what we need to have. >>> >>> >>> i am not as familiar with how this is used (or to be used when all of >>> the offset math is converted to use wide-int), but there appear to be >>> two uses of multiply. one is the "harmless" mult by 3" and the other >>> is where people are trying to compute the size of arrays. These last >>> operations do need to be checked for overflow. The question here is >>> do you want to force those operations to overflow individually or do you >>> want to check when you convert out. Again, i think 2x + some small >>> number is what we might want to consider. >> It's a fair question, but personally I think checking for overflow >> on the operation is much more robust. 
Checking on conversion doesn't >> allow you to stop thinking about overflow, it just changes the way you >> think about it: rather than handling explicit overflow flags, you have >> to remember to ask "is the range of the unconverted result within the >> range of widest_int", which I bet it is something that would be easily >> forgotten once widest_int & co. are part of the furniture. >> >> E.g. the SPARC operation (picked only because I remember it): >> >> for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) >> { >> tree e0 = VECTOR_CST_ELT (arg0, i); >> tree e1 = VECTOR_CST_ELT (arg1, i); >> >> bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; >> >> tmp = wi::neg (e1, &neg1_ovf); >> tmp = wi::add (e0, tmp, SIGNED, &add1_ovf); >> if (wi::neg_p (tmp)) >> tmp = wi::neg (tmp, &neg2_ovf); >> else >> neg2_ovf = false; >> result = wi::add (result, tmp, SIGNED, &add2_ovf); >> overflow |= neg1_ovf | neg2_ovf | add1_ovf | add2_ovf; >> } >> >> gcc_assert (!overflow); >> >> return wide_int_to_tree (rtype, result); >> >> seems pretty natural. If instead it was modelled as a widest_int >> chain without overflow then it would be less obviously correct. >> >> Thanks, >> Richard > Let us for the sake of argument assume that this was common code rather > than code in a particular port, because code in a particular port can > know more about the environment than common code is allowed to. > > My main point is that this code is in wide-int not widest-int because at > this level the writer of this code actually wants to model what the > target wants to do. So doing the adds in precision and testing > overflow is perfectly fine at every step. But this loop CANNOT be > written in a style where you tested the overflow at the end because if > this is common code you cannot make any assumptions about the largest > mode on the machine. 
If the buffer was 2x + n in size, then it would > be reasonably safe to assume that the number of elements in the vector > could be represented in an integer and so you could wait till the end. > > I think that my point was that (and i feel a little uncomfortable > putting words in richi's mouth but i believe that this was his point > early on) was that he thinks of the widest int as an infinite precision > representation. he was the one who was pushing for the entire rep to > be done with a large internal (or perhaps unbounded) rep because he felt > that this was more natural to not have to think about overflow. He > wanted you to be able to chain a mult and a divide and not see the > product get truncated before the divide was done. The rep that we > have now really sucks with respect to this because widest int truncates > if you are close to the largest precision on the machine and does not if > you are small with respect to that. > > My other point is that while you think that the example above is nice, > the experience with double-int is contrary to this. people will say > (and test) the normal modes and anyone trying to use large modes will > die a terrible death of a thousand cuts. Well - the cases that matter in practice are 1) the things we have offset_int for - code that does bit vs. byte quantity calculations on addresses or address offsets. It used either HWI before (and probably still does, and thus is buggy) or double-int. The usual pattern was/is to do host_integerp (t, 0) and then TREE_LOW_CST (t) * BITS_PER_UNIT (oops) or blindly assume that doing things in double-int works (which it does in practice). 2) passes that want to know whether a single operation overflows the multiple-operation and then check overflow after-the-fact is seldom used - it is, mainly from the frontends which use trees and thus get a sticky TREE_OVERFLOW. 
Yes, infinite precision would make this work as well, and yes, originally I thought of basing all of wide-int on an internally infinite precision implementation (and luckily we are close enough that I may end up fixing the implementation detail to work that way ...). With the infinite precision internal rep you'd have explicit truncations and sign-/zero-extensions at the right point and failing to do that before conversion to tree/RTX could have been easily turned into ICEs saying we overflowed and nobody cared. Well. Let's see how the thing we have now works out. Richard. > >
On 12/16/2013 06:19 AM, Richard Biener wrote: > On 12/15/13 7:48 PM, Kenneth Zadeck wrote: >> On 12/15/2013 11:40 AM, Richard Sandiford wrote: >>> Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>>> it is certainly true that in order to do an unbounded set of operations, >>>> you would have to check on every operation. so my suggestion that we >>>> should remove the checking from the infinite precision would not support >>>> this. but the reality is that there are currently no places in the >>>> compiler that do this. >>>> >>>> Currently all of the uses of widest-int are one or two operations, and >>>> the style of code writing is that you do these and then you deal with >>>> the overflow at the time that you convert the widest-int to a tree. I >>>> think that it is important to maintain the style of programming where >>>> for a small finite number of computations do not need to check until >>>> they convert back. >>>> >>>> The problem with making the buffer size so tight is that we do not have >>>> an adequate reserves to allow this style for any supportable type. >>>> I personally think that 2x + some small n is what we need to have. >>>> >>>> >>>> i am not as familiar with how this is used (or to be used when all of >>>> the offset math is converted to use wide-int), but there appear to be >>>> two uses of multiply. one is the "harmless" mult by 3" and the other >>>> is where people are trying to compute the size of arrays. These last >>>> operations do need to be checked for overflow. The question here is >>>> do you want to force those operations to overflow individually or do you >>>> want to check when you convert out. Again, i think 2x + some small >>>> number is what we might want to consider. >>> It's a fair question, but personally I think checking for overflow >>> on the operation is much more robust. 
Checking on conversion doesn't >>> allow you to stop thinking about overflow, it just changes the way you >>> think about it: rather than handling explicit overflow flags, you have >>> to remember to ask "is the range of the unconverted result within the >>> range of widest_int", which I bet it is something that would be easily >>> forgotten once widest_int & co. are part of the furniture. >>> >>> E.g. the SPARC operation (picked only because I remember it): >>> >>> for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) >>> { >>> tree e0 = VECTOR_CST_ELT (arg0, i); >>> tree e1 = VECTOR_CST_ELT (arg1, i); >>> >>> bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; >>> >>> tmp = wi::neg (e1, &neg1_ovf); >>> tmp = wi::add (e0, tmp, SIGNED, &add1_ovf); >>> if (wi::neg_p (tmp)) >>> tmp = wi::neg (tmp, &neg2_ovf); >>> else >>> neg2_ovf = false; >>> result = wi::add (result, tmp, SIGNED, &add2_ovf); >>> overflow |= neg1_ovf | neg2_ovf | add1_ovf | add2_ovf; >>> } >>> >>> gcc_assert (!overflow); >>> >>> return wide_int_to_tree (rtype, result); >>> >>> seems pretty natural. If instead it was modelled as a widest_int >>> chain without overflow then it would be less obviously correct. >>> >>> Thanks, >>> Richard >> Let us for the sake of argument assume that this was common code rather >> than code in a particular port, because code in a particular port can >> know more about the environment than common code is allowed to. >> >> My main point is that this code is in wide-int not widest-int because at >> this level the writer of this code actually wants to model what the >> target wants to do. So doing the adds in precision and testing >> overflow is perfectly fine at every step. But this loop CANNOT be >> written in a style where you tested the overflow at the end because if >> this is common code you cannot make any assumptions about the largest >> mode on the machine. 
If the buffer was 2x + n in size, then it would >> be reasonably safe to assume that the number of elements in the vector >> could be represented in an integer and so you could wait till the end. >> >> I think that my point was that (and i feel a little uncomfortable >> putting words in richi's mouth but i believe that this was his point >> early on) was that he thinks of the widest int as an infinite precision >> representation. he was the one who was pushing for the entire rep to >> be done with a large internal (or perhaps unbounded) rep because he felt >> that this was more natural to not have to think about overflow. He >> wanted you to be able to chain a mult and a divide and not see the >> product get truncated before the divide was done. The rep that we >> have now really sucks with respect to this because widest int truncates >> if you are close to the largest precision on the machine and does not if >> you are small with respect to that. >> >> My other point is that while you think that the example above is nice, >> the experience with double-int is contrary to this. people will say >> (and test) the normal modes and anyone trying to use large modes will >> die a terrible death of a thousand cuts. > Well - the cases that matter in practice are > > 1) the things we have offset_int for - code that does bit vs. byte > quantity calculations on addresses or address offsets. It used > either HWI before (and probably still does, and thus is buggy) or > double-int. The usual patter was/is to do host_integerp (t, 0) > and then TREE_LOW_CST (t) * BITS_PER_UNIT (oops) or blindly assume > that doing things in double-int works (which it does in practice). > > 2) passes that want to know whether a single operation overflows > > the multiple-operation and then check overflow after-the-fact is > seldomly used - it is, mainly from the frontends which use trees > and thus get a sticky TREE_OVERFLOW. 
Yes, infinite precision > would make this work as well, and yes, originally I thought of > basing all of wide-int on an internally infinite precision > implementation (and luckily we are close enough that I may end > up fixing the implementation detail to work that way ...). > With the infinite precision internal rep you'd have explicit > truncations and sign-/zero-extensions at the right point and > failing to do that before conversion to tree/RTX could have been > easily turned into ICEs saying we overflowed and nobody cared. > > Well. Let's see how the thing we have now works out. richi, i think that you miss the point. right now on the x86-64, the max size is 128 +64 bits. On my private platform it is 256 + 64 bits. if, at the tree level, i do a widest-int multiply of two timode values that are large with respect to their precision, i get different answers on the two targets. 95% of the existing trunk bugs that we fixed were because people thought that double-int was big enough so that they did not have to care. But what you and richard are saying is that you do need to care (even though none of the code currently does care), but only for rarely executed cases. With widest-int and a larger buffer we could make that expectation true in all of the places where you get in, do some finite work and get out. In my opinion, being able to do this is the only reason for widest-int and if this does not work reliably, then i would be inclined to convert everything to the explicit precision where the definition of what is done is rock solid. Because to me, if the rep is not completely predictable, then it is not worth using. Kenny > Richard. > >>
Kenneth Zadeck <zadeck@naturalbridge.com> wrote: >On 12/16/2013 06:19 AM, Richard Biener wrote: >> On 12/15/13 7:48 PM, Kenneth Zadeck wrote: >>> On 12/15/2013 11:40 AM, Richard Sandiford wrote: >>>> Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>>>> it is certainly true that in order to do an unbounded set of >operations, >>>>> you would have to check on every operation. so my suggestion >that we >>>>> should remove the checking from the infinite precision would not >support >>>>> this. but the reality is that there are currently no places in >the >>>>> compiler that do this. >>>>> >>>>> Currently all of the uses of widest-int are one or two operations, >and >>>>> the style of code writing is that you do these and then you deal >with >>>>> the overflow at the time that you convert the widest-int to a >tree. I >>>>> think that it is important to maintain the style of programming >where >>>>> for a small finite number of computations do not need to check >until >>>>> they convert back. >>>>> >>>>> The problem with making the buffer size so tight is that we do not >have >>>>> an adequate reserves to allow this style for any supportable type. >>>>> I personally think that 2x + some small n is what we need to have. >>>>> >>>>> >>>>> i am not as familiar with how this is used (or to be used when all >of >>>>> the offset math is converted to use wide-int), but there appear to >be >>>>> two uses of multiply. one is the "harmless" mult by 3" and the >other >>>>> is where people are trying to compute the size of arrays. These >last >>>>> operations do need to be checked for overflow. The question >here is >>>>> do you want to force those operations to overflow individually or >do you >>>>> want to check when you convert out. Again, i think 2x + some >small >>>>> number is what we might want to consider. >>>> It's a fair question, but personally I think checking for overflow >>>> on the operation is much more robust. 
Checking on conversion >doesn't >>>> allow you to stop thinking about overflow, it just changes the way >you >>>> think about it: rather than handling explicit overflow flags, you >have >>>> to remember to ask "is the range of the unconverted result within >the >>>> range of widest_int", which I bet it is something that would be >easily >>>> forgotten once widest_int & co. are part of the furniture. >>>> >>>> E.g. the SPARC operation (picked only because I remember it): >>>> >>>> for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) >>>> { >>>> tree e0 = VECTOR_CST_ELT (arg0, i); >>>> tree e1 = VECTOR_CST_ELT (arg1, i); >>>> >>>> bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; >>>> >>>> tmp = wi::neg (e1, &neg1_ovf); >>>> tmp = wi::add (e0, tmp, SIGNED, &add1_ovf); >>>> if (wi::neg_p (tmp)) >>>> tmp = wi::neg (tmp, &neg2_ovf); >>>> else >>>> neg2_ovf = false; >>>> result = wi::add (result, tmp, SIGNED, &add2_ovf); >>>> overflow |= neg1_ovf | neg2_ovf | add1_ovf | add2_ovf; >>>> } >>>> >>>> gcc_assert (!overflow); >>>> >>>> return wide_int_to_tree (rtype, result); >>>> >>>> seems pretty natural. If instead it was modelled as a widest_int >>>> chain without overflow then it would be less obviously correct. >>>> >>>> Thanks, >>>> Richard >>> Let us for the sake of argument assume that this was common code >rather >>> than code in a particular port, because code in a particular port >can >>> know more about the environment than common code is allowed to. >>> >>> My main point is that this code is in wide-int not widest-int >because at >>> this level the writer of this code actually wants to model what the >>> target wants to do. So doing the adds in precision and testing >>> overflow is perfectly fine at every step. But this loop CANNOT be >>> written in a style where you tested the overflow at the end because >if >>> this is common code you cannot make any assumptions about the >largest >>> mode on the machine. 
If the buffer was 2x + n in size, then it >would >>> be reasonably safe to assume that the number of elements in the >vector >>> could be represented in an integer and so you could wait till the >end. >>> >>> I think that my point was that (and i feel a little uncomfortable >>> putting words in richi's mouth but i believe that this was his point >>> early on) was that he thinks of the widest int as an infinite >precision >>> representation. he was the one who was pushing for the entire rep >to >>> be done with a large internal (or perhaps unbounded) rep because he >felt >>> that this was more natural to not have to think about overflow. >He >>> wanted you to be able to chain a mult and a divide and not see the >>> product get truncated before the divide was done. The rep that we >>> have now really sucks with respect to this because widest int >truncates >>> if you are close to the largest precision on the machine and does >not if >>> you are small with respect to that. >>> >>> My other point is that while you think that the example above is >nice, >>> the experience with double-int is contrary to this. people will >say >>> (and test) the normal modes and anyone trying to use large modes >will >>> die a terrible death of a thousand cuts. >> Well - the cases that matter in practice are >> >> 1) the things we have offset_int for - code that does bit vs. byte >> quantity calculations on addresses or address offsets. It used >> either HWI before (and probably still does, and thus is buggy) or >> double-int. The usual patter was/is to do host_integerp (t, 0) >> and then TREE_LOW_CST (t) * BITS_PER_UNIT (oops) or blindly assume >> that doing things in double-int works (which it does in practice). >> >> 2) passes that want to know whether a single operation overflows >> >> the multiple-operation and then check overflow after-the-fact is >> seldomly used - it is, mainly from the frontends which use trees >> and thus get a sticky TREE_OVERFLOW. 
Yes, infinite precision >> would make this work as well, and yes, originally I thought of >> basing all of wide-int on an internally infinite precision >> implementation (and luckily we are close enough that I may end >> up fixing the implementation detail to work that way ...). >> With the infinite precision internal rep you'd have explicit >> truncations and sign-/zero-extensions at the right point and >> failing to do that before conversion to tree/RTX could have been >> easily turned into ICEs saying we overflowed and nobody cared. >> >> Well. Let's see how the thing we have now works out. >richi, > >i think that you miss the point. right now on the x86-64, the max >size >is 128 +64 bits. On my private platform it is 256 + 64 bits. > >if, at the tree level, i do a widest-int multiply of two timode values >that are large with respect to their precision, i get different answers >on the two targets. 95% of the existing trunk bugs that we fixed we >because people thought that double-int was big enough so that they did >not have to care. But what you and richard are saying is that you do >need to care (even though none of the code currently does care), but >only for rarely executed cases. With widest-int and a larger buffer we > >could can make that expectation true in all of the places where you get > >in, do some finite work and get out. In my opinion, being able to do >this is the only reason for widest-int and if this does not work >reliably, then i would be inclined to convert everything to the >explicit >precision where the definition of what is done is rock solid. >Because >to me, if the rep is not completely predictable, then it is not worth >using. At the moment we do not have the infinite precision code. We have offset._int which effectively has a precision and widest int which is indeed somewhat useless. We have fixed int where you specify your desired infinite precision which is better. 
Note that we have shifted the double-int host dependence to a widest-int target dependence, which is equally bad. Richard. >Kenny > > > >> Richard. >> >>>
On 12/16/2013 09:37 AM, Richard Biener wrote: > Kenneth Zadeck <zadeck@naturalbridge.com> wrote: >> On 12/16/2013 06:19 AM, Richard Biener wrote: >>> On 12/15/13 7:48 PM, Kenneth Zadeck wrote: >>>> On 12/15/2013 11:40 AM, Richard Sandiford wrote: >>>>> Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>>>>> it is certainly true that in order to do an unbounded set of >> operations, >>>>>> you would have to check on every operation. so my suggestion >> that we >>>>>> should remove the checking from the infinite precision would not >> support >>>>>> this. but the reality is that there are currently no places in >> the >>>>>> compiler that do this. >>>>>> >>>>>> Currently all of the uses of widest-int are one or two operations, >> and >>>>>> the style of code writing is that you do these and then you deal >> with >>>>>> the overflow at the time that you convert the widest-int to a >> tree. I >>>>>> think that it is important to maintain the style of programming >> where >>>>>> for a small finite number of computations do not need to check >> until >>>>>> they convert back. >>>>>> >>>>>> The problem with making the buffer size so tight is that we do not >> have >>>>>> an adequate reserves to allow this style for any supportable type. >>>>>> I personally think that 2x + some small n is what we need to have. >>>>>> >>>>>> >>>>>> i am not as familiar with how this is used (or to be used when all >> of >>>>>> the offset math is converted to use wide-int), but there appear to >> be >>>>>> two uses of multiply. one is the "harmless" mult by 3" and the >> other >>>>>> is where people are trying to compute the size of arrays. These >> last >>>>>> operations do need to be checked for overflow. The question >> here is >>>>>> do you want to force those operations to overflow individually or >> do you >>>>>> want to check when you convert out. Again, i think 2x + some >> small >>>>>> number is what we might want to consider. 
>>>>> It's a fair question, but personally I think checking for overflow >>>>> on the operation is much more robust. Checking on conversion >> doesn't >>>>> allow you to stop thinking about overflow, it just changes the way >> you >>>>> think about it: rather than handling explicit overflow flags, you >> have >>>>> to remember to ask "is the range of the unconverted result within >> the >>>>> range of widest_int", which I bet it is something that would be >> easily >>>>> forgotten once widest_int & co. are part of the furniture. >>>>> >>>>> E.g. the SPARC operation (picked only because I remember it): >>>>> >>>>> for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) >>>>> { >>>>> tree e0 = VECTOR_CST_ELT (arg0, i); >>>>> tree e1 = VECTOR_CST_ELT (arg1, i); >>>>> >>>>> bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; >>>>> >>>>> tmp = wi::neg (e1, &neg1_ovf); >>>>> tmp = wi::add (e0, tmp, SIGNED, &add1_ovf); >>>>> if (wi::neg_p (tmp)) >>>>> tmp = wi::neg (tmp, &neg2_ovf); >>>>> else >>>>> neg2_ovf = false; >>>>> result = wi::add (result, tmp, SIGNED, &add2_ovf); >>>>> overflow |= neg1_ovf | neg2_ovf | add1_ovf | add2_ovf; >>>>> } >>>>> >>>>> gcc_assert (!overflow); >>>>> >>>>> return wide_int_to_tree (rtype, result); >>>>> >>>>> seems pretty natural. If instead it was modelled as a widest_int >>>>> chain without overflow then it would be less obviously correct. >>>>> >>>>> Thanks, >>>>> Richard >>>> Let us for the sake of argument assume that this was common code >> rather >>>> than code in a particular port, because code in a particular port >> can >>>> know more about the environment than common code is allowed to. >>>> >>>> My main point is that this code is in wide-int not widest-int >> because at >>>> this level the writer of this code actually wants to model what the >>>> target wants to do. So doing the adds in precision and testing >>>> overflow is perfectly fine at every step. 
But this loop CANNOT be >>>> written in a style where you tested the overflow at the end because >> if >>>> this is common code you cannot make any assumptions about the >> largest >>>> mode on the machine. If the buffer was 2x + n in size, then it >> would >>>> be reasonably safe to assume that the number of elements in the >> vector >>>> could be represented in an integer and so you could wait till the >> end. >>>> I think that my point was that (and i feel a little uncomfortable >>>> putting words in richi's mouth but i believe that this was his point >>>> early on) was that he thinks of the widest int as an infinite >> precision >>>> representation. he was the one who was pushing for the entire rep >> to >>>> be done with a large internal (or perhaps unbounded) rep because he >> felt >>>> that this was more natural to not have to think about overflow. >> He >>>> wanted you to be able to chain a mult and a divide and not see the >>>> product get truncated before the divide was done. The rep that we >>>> have now really sucks with respect to this because widest int >> truncates >>>> if you are close to the largest precision on the machine and does >> not if >>>> you are small with respect to that. >>>> >>>> My other point is that while you think that the example above is >> nice, >>>> the experience with double-int is contrary to this. people will >> say >>>> (and test) the normal modes and anyone trying to use large modes >> will >>>> die a terrible death of a thousand cuts. >>> Well - the cases that matter in practice are >>> >>> 1) the things we have offset_int for - code that does bit vs. byte >>> quantity calculations on addresses or address offsets. It used >>> either HWI before (and probably still does, and thus is buggy) or >>> double-int. The usual patter was/is to do host_integerp (t, 0) >>> and then TREE_LOW_CST (t) * BITS_PER_UNIT (oops) or blindly assume >>> that doing things in double-int works (which it does in practice). 
>>> >>> 2) passes that want to know whether a single operation overflows >>> >>> the multiple-operation and then check overflow after-the-fact is >>> seldomly used - it is, mainly from the frontends which use trees >>> and thus get a sticky TREE_OVERFLOW. Yes, infinite precision >>> would make this work as well, and yes, originally I thought of >>> basing all of wide-int on an internally infinite precision >>> implementation (and luckily we are close enough that I may end >>> up fixing the implementation detail to work that way ...). >>> With the infinite precision internal rep you'd have explicit >>> truncations and sign-/zero-extensions at the right point and >>> failing to do that before conversion to tree/RTX could have been >>> easily turned into ICEs saying we overflowed and nobody cared. >>> >>> Well. Let's see how the thing we have now works out. >> richi, >> >> i think that you miss the point. right now on the x86-64, the max >> size >> is 128 +64 bits. On my private platform it is 256 + 64 bits. >> >> if, at the tree level, i do a widest-int multiply of two timode values >> that are large with respect to their precision, i get different answers >> on the two targets. 95% of the existing trunk bugs that we fixed we >> because people thought that double-int was big enough so that they did >> not have to care. But what you and richard are saying is that you do >> need to care (even though none of the code currently does care), but >> only for rarely executed cases. With widest-int and a larger buffer we >> >> could can make that expectation true in all of the places where you get >> >> in, do some finite work and get out. In my opinion, being able to do >> this is the only reason for widest-int and if this does not work >> reliably, then i would be inclined to convert everything to the >> explicit >> precision where the definition of what is done is rock solid. >> Because >> to me, if the rep is not completely predictable, then it is not worth >> using. 
> At the moment we do not have the infinite precision code. We have offset_int which effectively has a precision and widest int which is indeed somewhat useless. We have fixed int where you specify your desired infinite precision which is better. > > Note that we have shifted the double-int host dependence to a widest-int target dependence. Which is equally bad. > > Richard. yes, my point is that it does not need to be this bad, we can make the buffer large enough to hide this, at least for the kind of bounded computation that we have in the compiler. Having written most of this code, we tend to get in, do a couple of things and then get out. If we up the buffer size, that will always work reliably. I realize that it will not be perfect, in that if you have enough bounded size computation you would hit the limit. But right now every multiply (except the ones by 3 in the offset int code) is a source of failure. We can do a lot better than that by upping the buffer size. you and i have slightly different experiences here. i did not do a lot of the offset int code and so i am not as familiar with the potential for bugs here. But I did a significant amount of the widest-int code, and for that, 2x + 2 bits makes all the potential problems go away. my guess is that the non 3x multiplies in offset-int most likely need to be checked. >> Kenny >> >> >> >>> Richard. >>> >
Kenneth Zadeck <zadeck@naturalbridge.com> wrote: >On 12/16/2013 09:37 AM, Richard Biener wrote: >> Kenneth Zadeck <zadeck@naturalbridge.com> wrote: >>> On 12/16/2013 06:19 AM, Richard Biener wrote: >>>> On 12/15/13 7:48 PM, Kenneth Zadeck wrote: >>>>> On 12/15/2013 11:40 AM, Richard Sandiford wrote: >>>>>> Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>>>>>> it is certainly true that in order to do an unbounded set of >>> operations, >>>>>>> you would have to check on every operation. so my suggestion >>> that we >>>>>>> should remove the checking from the infinite precision would not >>> support >>>>>>> this. but the reality is that there are currently no places >in >>> the >>>>>>> compiler that do this. >>>>>>> >>>>>>> Currently all of the uses of widest-int are one or two >operations, >>> and >>>>>>> the style of code writing is that you do these and then you deal >>> with >>>>>>> the overflow at the time that you convert the widest-int to a >>> tree. I >>>>>>> think that it is important to maintain the style of programming >>> where >>>>>>> for a small finite number of computations do not need to check >>> until >>>>>>> they convert back. >>>>>>> >>>>>>> The problem with making the buffer size so tight is that we do >not >>> have >>>>>>> an adequate reserves to allow this style for any supportable >type. >>>>>>> I personally think that 2x + some small n is what we need to >have. >>>>>>> >>>>>>> >>>>>>> i am not as familiar with how this is used (or to be used when >all >>> of >>>>>>> the offset math is converted to use wide-int), but there appear >to >>> be >>>>>>> two uses of multiply. one is the "harmless" mult by 3" and >the >>> other >>>>>>> is where people are trying to compute the size of arrays. >These >>> last >>>>>>> operations do need to be checked for overflow. The question >>> here is >>>>>>> do you want to force those operations to overflow individually >or >>> do you >>>>>>> want to check when you convert out. 
Again, i think 2x + some >>> small >>>>>>> number is what we might want to consider. >>>>>> It's a fair question, but personally I think checking for >overflow >>>>>> on the operation is much more robust. Checking on conversion >>> doesn't >>>>>> allow you to stop thinking about overflow, it just changes the >way >>> you >>>>>> think about it: rather than handling explicit overflow flags, you >>> have >>>>>> to remember to ask "is the range of the unconverted result within >>> the >>>>>> range of widest_int", which I bet it is something that would be >>> easily >>>>>> forgotten once widest_int & co. are part of the furniture. >>>>>> >>>>>> E.g. the SPARC operation (picked only because I remember it): >>>>>> >>>>>> for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) >>>>>> { >>>>>> tree e0 = VECTOR_CST_ELT (arg0, i); >>>>>> tree e1 = VECTOR_CST_ELT (arg1, i); >>>>>> >>>>>> bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; >>>>>> >>>>>> tmp = wi::neg (e1, &neg1_ovf); >>>>>> tmp = wi::add (e0, tmp, SIGNED, &add1_ovf); >>>>>> if (wi::neg_p (tmp)) >>>>>> tmp = wi::neg (tmp, &neg2_ovf); >>>>>> else >>>>>> neg2_ovf = false; >>>>>> result = wi::add (result, tmp, SIGNED, &add2_ovf); >>>>>> overflow |= neg1_ovf | neg2_ovf | add1_ovf | >add2_ovf; >>>>>> } >>>>>> >>>>>> gcc_assert (!overflow); >>>>>> >>>>>> return wide_int_to_tree (rtype, result); >>>>>> >>>>>> seems pretty natural. If instead it was modelled as a widest_int >>>>>> chain without overflow then it would be less obviously correct. >>>>>> >>>>>> Thanks, >>>>>> Richard >>>>> Let us for the sake of argument assume that this was common code >>> rather >>>>> than code in a particular port, because code in a particular port >>> can >>>>> know more about the environment than common code is allowed to. >>>>> >>>>> My main point is that this code is in wide-int not widest-int >>> because at >>>>> this level the writer of this code actually wants to model what >the >>>>> target wants to do. 
So doing the adds in precision and testing >>>>> overflow is perfectly fine at every step. But this loop CANNOT >be >>>>> written in a style where you tested the overflow at the end >because >>> if >>>>> this is common code you cannot make any assumptions about the >>> largest >>>>> mode on the machine. If the buffer was 2x + n in size, then it >>> would >>>>> be reasonably safe to assume that the number of elements in the >>> vector >>>>> could be represented in an integer and so you could wait till the >>> end. >>>>> I think that my point was that (and i feel a little uncomfortable >>>>> putting words in richi's mouth but i believe that this was his >point >>>>> early on) was that he thinks of the widest int as an infinite >>> precision >>>>> representation. he was the one who was pushing for the entire >rep >>> to >>>>> be done with a large internal (or perhaps unbounded) rep because >he >>> felt >>>>> that this was more natural to not have to think about overflow. >>> He >>>>> wanted you to be able to chain a mult and a divide and not see the >>>>> product get truncated before the divide was done. The rep that >we >>>>> have now really sucks with respect to this because widest int >>> truncates >>>>> if you are close to the largest precision on the machine and does >>> not if >>>>> you are small with respect to that. >>>>> >>>>> My other point is that while you think that the example above is >>> nice, >>>>> the experience with double-int is contrary to this. people will >>> say >>>>> (and test) the normal modes and anyone trying to use large modes >>> will >>>>> die a terrible death of a thousand cuts. >>>> Well - the cases that matter in practice are >>>> >>>> 1) the things we have offset_int for - code that does bit vs. byte >>>> quantity calculations on addresses or address offsets. It used >>>> either HWI before (and probably still does, and thus is buggy) or >>>> double-int. 
The usual patter was/is to do host_integerp (t, 0) >>>> and then TREE_LOW_CST (t) * BITS_PER_UNIT (oops) or blindly assume >>>> that doing things in double-int works (which it does in practice). >>>> >>>> 2) passes that want to know whether a single operation overflows >>>> >>>> the multiple-operation and then check overflow after-the-fact is >>>> seldomly used - it is, mainly from the frontends which use trees >>>> and thus get a sticky TREE_OVERFLOW. Yes, infinite precision >>>> would make this work as well, and yes, originally I thought of >>>> basing all of wide-int on an internally infinite precision >>>> implementation (and luckily we are close enough that I may end >>>> up fixing the implementation detail to work that way ...). >>>> With the infinite precision internal rep you'd have explicit >>>> truncations and sign-/zero-extensions at the right point and >>>> failing to do that before conversion to tree/RTX could have been >>>> easily turned into ICEs saying we overflowed and nobody cared. >>>> >>>> Well. Let's see how the thing we have now works out. >>> richi, >>> >>> i think that you miss the point. right now on the x86-64, the max >>> size >>> is 128 +64 bits. On my private platform it is 256 + 64 bits. >>> >>> if, at the tree level, i do a widest-int multiply of two timode >values >>> that are large with respect to their precision, i get different >answers >>> on the two targets. 95% of the existing trunk bugs that we fixed >we >>> because people thought that double-int was big enough so that they >did >>> not have to care. But what you and richard are saying is that you do >>> need to care (even though none of the code currently does care), but >>> only for rarely executed cases. With widest-int and a larger buffer >we >>> >>> could can make that expectation true in all of the places where you >get >>> >>> in, do some finite work and get out. 
In my opinion, being able to >do >>> this is the only reason for widest-int and if this does not work >>> reliably, then i would be inclined to convert everything to the >>> explicit >>> precision where the definition of what is done is rock solid. >>> Because >>> to me, if the rep is not completely predictable, then it is not >worth >>> using. >> At the moment we do not have the infinite precision code. We have >offset._int which effectively has a precision and widest int which is >indeed somewhat useless. We have fixed int where you specify your >desired infinite precision which is better. >> >> Note that we have shifted the double-int host dependence to a >widest-int target dependence. Which is equally bad. >> >> Richard. >yes, my point is that it does not need to be this bad, we can make the >buffer large enough to hide this, at least for the kind of bounded >computation that we have in the compiler. Having written most of this > >code, we tend to get in, do a couple of things and then get out. If >we up the buffer size, that will always work reliably. > >I realize that it will not be perfect, in that if you have enough >bounded size computation you would hit the limit. But right now >every >multiply (except the ones by 3 in the offset int code) are a source of >failure. We can do a lot better than that by upping the buffer size. > >you and i have slightly different experiences here. i did not do a >lot of the offset int code and so i am not as familiar with the >potential for bugs here. But I did a significant amount of the >widest-int code, and for that, 2x + 2 bits makes all the potential >problems go away. > >my guess is that the non 3x multiplies in offset-int most likely need >to >be checked. As long as we have persistent widest ints we cannot make them large. Btw, most arithmetic is from user code and thus subject to language constraints, including overflow. Its only when the compiler introduces artificial ops that possible issues start. Richard. 
>>> Kenny >>> >>> >>> >>>> Richard. >>>> >>
On 12/16/2013 01:37 PM, Richard Biener wrote: > Kenneth Zadeck <zadeck@naturalbridge.com> wrote: >> On 12/16/2013 09:37 AM, Richard Biener wrote: >>> Kenneth Zadeck <zadeck@naturalbridge.com> wrote: >>>> On 12/16/2013 06:19 AM, Richard Biener wrote: >>>>> On 12/15/13 7:48 PM, Kenneth Zadeck wrote: >>>>>> On 12/15/2013 11:40 AM, Richard Sandiford wrote: >>>>>>> Kenneth Zadeck <zadeck@naturalbridge.com> writes: >>>>>>>> it is certainly true that in order to do an unbounded set of >>>> operations, >>>>>>>> you would have to check on every operation. so my suggestion >>>> that we >>>>>>>> should remove the checking from the infinite precision would not >>>> support >>>>>>>> this. but the reality is that there are currently no places >> in >>>> the >>>>>>>> compiler that do this. >>>>>>>> >>>>>>>> Currently all of the uses of widest-int are one or two >> operations, >>>> and >>>>>>>> the style of code writing is that you do these and then you deal >>>> with >>>>>>>> the overflow at the time that you convert the widest-int to a >>>> tree. I >>>>>>>> think that it is important to maintain the style of programming >>>> where >>>>>>>> for a small finite number of computations do not need to check >>>> until >>>>>>>> they convert back. >>>>>>>> >>>>>>>> The problem with making the buffer size so tight is that we do >> not >>>> have >>>>>>>> an adequate reserves to allow this style for any supportable >> type. >>>>>>>> I personally think that 2x + some small n is what we need to >> have. >>>>>>>> >>>>>>>> i am not as familiar with how this is used (or to be used when >> all >>>> of >>>>>>>> the offset math is converted to use wide-int), but there appear >> to >>>> be >>>>>>>> two uses of multiply. one is the "harmless" mult by 3" and >> the >>>> other >>>>>>>> is where people are trying to compute the size of arrays. >> These >>>> last >>>>>>>> operations do need to be checked for overflow. 
The question >>>> here is >>>>>>>> do you want to force those operations to overflow individually >> or >>>> do you >>>>>>>> want to check when you convert out. Again, i think 2x + some >>>> small >>>>>>>> number is what we might want to consider. >>>>>>> It's a fair question, but personally I think checking for >> overflow >>>>>>> on the operation is much more robust. Checking on conversion >>>> doesn't >>>>>>> allow you to stop thinking about overflow, it just changes the >> way >>>> you >>>>>>> think about it: rather than handling explicit overflow flags, you >>>> have >>>>>>> to remember to ask "is the range of the unconverted result within >>>> the >>>>>>> range of widest_int", which I bet it is something that would be >>>> easily >>>>>>> forgotten once widest_int & co. are part of the furniture. >>>>>>> >>>>>>> E.g. the SPARC operation (picked only because I remember it): >>>>>>> >>>>>>> for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) >>>>>>> { >>>>>>> tree e0 = VECTOR_CST_ELT (arg0, i); >>>>>>> tree e1 = VECTOR_CST_ELT (arg1, i); >>>>>>> >>>>>>> bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; >>>>>>> >>>>>>> tmp = wi::neg (e1, &neg1_ovf); >>>>>>> tmp = wi::add (e0, tmp, SIGNED, &add1_ovf); >>>>>>> if (wi::neg_p (tmp)) >>>>>>> tmp = wi::neg (tmp, &neg2_ovf); >>>>>>> else >>>>>>> neg2_ovf = false; >>>>>>> result = wi::add (result, tmp, SIGNED, &add2_ovf); >>>>>>> overflow |= neg1_ovf | neg2_ovf | add1_ovf | >> add2_ovf; >>>>>>> } >>>>>>> >>>>>>> gcc_assert (!overflow); >>>>>>> >>>>>>> return wide_int_to_tree (rtype, result); >>>>>>> >>>>>>> seems pretty natural. If instead it was modelled as a widest_int >>>>>>> chain without overflow then it would be less obviously correct. >>>>>>> >>>>>>> Thanks, >>>>>>> Richard >>>>>> Let us for the sake of argument assume that this was common code >>>> rather >>>>>> than code in a particular port, because code in a particular port >>>> can >>>>>> know more about the environment than common code is allowed to. 
>>>>>> >>>>>> My main point is that this code is in wide-int not widest-int >>>> because at >>>>>> this level the writer of this code actually wants to model what >> the >>>>>> target wants to do. So doing the adds in precision and testing >>>>>> overflow is perfectly fine at every step. But this loop CANNOT >> be >>>>>> written in a style where you tested the overflow at the end >> because >>>> if >>>>>> this is common code you cannot make any assumptions about the >>>> largest >>>>>> mode on the machine. If the buffer was 2x + n in size, then it >>>> would >>>>>> be reasonably safe to assume that the number of elements in the >>>> vector >>>>>> could be represented in an integer and so you could wait till the >>>> end. >>>>>> I think that my point was that (and i feel a little uncomfortable >>>>>> putting words in richi's mouth but i believe that this was his >> point >>>>>> early on) was that he thinks of the widest int as an infinite >>>> precision >>>>>> representation. he was the one who was pushing for the entire >> rep >>>> to >>>>>> be done with a large internal (or perhaps unbounded) rep because >> he >>>> felt >>>>>> that this was more natural to not have to think about overflow. >>>> He >>>>>> wanted you to be able to chain a mult and a divide and not see the >>>>>> product get truncated before the divide was done. The rep that >> we >>>>>> have now really sucks with respect to this because widest int >>>> truncates >>>>>> if you are close to the largest precision on the machine and does >>>> not if >>>>>> you are small with respect to that. >>>>>> >>>>>> My other point is that while you think that the example above is >>>> nice, >>>>>> the experience with double-int is contrary to this. people will >>>> say >>>>>> (and test) the normal modes and anyone trying to use large modes >>>> will >>>>>> die a terrible death of a thousand cuts. 
>>>>> Well - the cases that matter in practice are >>>>> >>>>> 1) the things we have offset_int for - code that does bit vs. byte >>>>> quantity calculations on addresses or address offsets. It used >>>>> either HWI before (and probably still does, and thus is buggy) or >>>>> double-int. The usual patter was/is to do host_integerp (t, 0) >>>>> and then TREE_LOW_CST (t) * BITS_PER_UNIT (oops) or blindly assume >>>>> that doing things in double-int works (which it does in practice). >>>>> >>>>> 2) passes that want to know whether a single operation overflows >>>>> >>>>> the multiple-operation and then check overflow after-the-fact is >>>>> seldomly used - it is, mainly from the frontends which use trees >>>>> and thus get a sticky TREE_OVERFLOW. Yes, infinite precision >>>>> would make this work as well, and yes, originally I thought of >>>>> basing all of wide-int on an internally infinite precision >>>>> implementation (and luckily we are close enough that I may end >>>>> up fixing the implementation detail to work that way ...). >>>>> With the infinite precision internal rep you'd have explicit >>>>> truncations and sign-/zero-extensions at the right point and >>>>> failing to do that before conversion to tree/RTX could have been >>>>> easily turned into ICEs saying we overflowed and nobody cared. >>>>> >>>>> Well. Let's see how the thing we have now works out. >>>> richi, >>>> >>>> i think that you miss the point. right now on the x86-64, the max >>>> size >>>> is 128 +64 bits. On my private platform it is 256 + 64 bits. >>>> >>>> if, at the tree level, i do a widest-int multiply of two timode >> values >>>> that are large with respect to their precision, i get different >> answers >>>> on the two targets. 95% of the existing trunk bugs that we fixed >> we >>>> because people thought that double-int was big enough so that they >> did >>>> not have to care. 
But what you and richard are saying is that you do >>>> need to care (even though none of the code currently does care), but >>>> only for rarely executed cases. With widest-int and a larger buffer >> we >>>> could can make that expectation true in all of the places where you >> get >>>> in, do some finite work and get out. In my opinion, being able to >> do >>>> this is the only reason for widest-int and if this does not work >>>> reliably, then i would be inclined to convert everything to the >>>> explicit >>>> precision where the definition of what is done is rock solid. >>>> Because >>>> to me, if the rep is not completely predictable, then it is not >> worth >>>> using. >>> At the moment we do not have the infinite precision code. We have >> offset._int which effectively has a precision and widest int which is >> indeed somewhat useless. We have fixed int where you specify your >> desired infinite precision which is better. >>> Note that we have shifted the double-int host dependence to a >> widest-int target dependence. Which is equally bad. >>> Richard. >> yes, my point is that it does not need to be this bad, we can make the >> buffer large enough to hide this, at least for the kind of bounded >> computation that we have in the compiler. Having written most of this >> >> code, we tend to get in, do a couple of things and then get out. If >> we up the buffer size, that will always work reliably. >> >> I realize that it will not be perfect, in that if you have enough >> bounded size computation you would hit the limit. But right now >> every >> multiply (except the ones by 3 in the offset int code) are a source of >> failure. We can do a lot better than that by upping the buffer size. >> >> you and i have slightly different experiences here. i did not do a >> lot of the offset int code and so i am not as familiar with the >> potential for bugs here. 
But I did a significant amount of the >> widest-int code, and for that, 2x + 2 bits makes all the potential >> problems go away. >> >> my guess is that the non 3x multiplies in offset-int most likely need >> to >> be checked. > As long as we have persistent widest ints we cannot make them large. Btw, most arithmetic is from user code and thus subject to language constraints, including overflow. It's only when the compiler introduces artificial ops that possible issues start. > > Richard. > When Richard S. made his last round of changes, which removed the performance issues with widest and offset ints, I was ready to concede that I was wrong in fighting you about your love of the pseudo infinite precision. It is not that I am disagreeing with your view that the persistent forms have to have a reasonable size, it is just that the cost that imposes on the correctness of the rest of the compiler is pretty steep. I think that it is clear that widest int is not the rep of choice where you have a choice between the default version and widest-int. kenny >>>> Kenny >>>> >>>> >>>> >>>>> Richard. >>>>> >
Index: gcc/wide-int.cc =================================================================== --- gcc/wide-int.cc (revision 205765) +++ gcc/wide-int.cc (working copy) @@ -1275,23 +1275,31 @@ wi::mul_internal (HOST_WIDE_INT *val, co unsigned int j; unsigned int blocks_needed = BLOCKS_NEEDED (prec); unsigned int half_blocks_needed = blocks_needed * 2; - /* The sizes here are scaled to support a 2x largest mode by 2x - largest mode yielding a 4x largest mode result. This is what is - needed by vpn. */ + /* The sizes here are scaled to support a 2*largest mode + a little + by 2*largest mode + a little yielding a 4*largest mode + a + little. This is what is needed by vpn. */ unsigned HOST_HALF_WIDE_INT - u[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; + u[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; + unsigned int ulen; unsigned HOST_HALF_WIDE_INT - v[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; - /* The '2' in 'R' is because we are internally doing a full + v[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; + unsigned int vlen; + unsigned int uvlen; + unsigned int vallen; + + /* The '4' in 'R' is because we are internally doing a wide multiply. */ unsigned HOST_HALF_WIDE_INT - r[2 * 4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT]; - HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1; + r[4 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT]; + + /* True if the result needs to be negated. */ + bool is_neg = false; /* If the top level routine did not really pass in an overflow, then just make sure that we never attempt to set it. */ bool needs_overflow = (overflow != 0); + const HOST_WIDE_INT zero = 0; if (needs_overflow) *overflow = false; @@ -1382,61 +1392,96 @@ wi::mul_internal (HOST_WIDE_INT *val, co return 1; } - /* We do unsigned mul and then correct it. 
*/ - wi_unpack (u, (const unsigned HOST_WIDE_INT *) op1val, op1len, - half_blocks_needed, prec, SIGNED); - wi_unpack (v, (const unsigned HOST_WIDE_INT *) op2val, op2len, - half_blocks_needed, prec, SIGNED); - - /* The 2 is for a full mult. */ - memset (r, 0, half_blocks_needed * 2 - * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT); + ulen = op1len * 2; + vlen = op2len * 2; + + if (sgn == SIGNED) + { + if (wi::neg_p (op1)) + { + HOST_WIDE_INT nop1[2 * WIDE_INT_MAX_ELTS]; + + int nop1len + = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec + 1, SIGNED, 0); + is_neg = !is_neg; + ulen = nop1len * 2; + wi_unpack (u, (const unsigned HOST_WIDE_INT*)nop1, nop1len, + ulen, prec, SIGNED); + } + else + wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len, + ulen, prec, SIGNED); + + if (wi::neg_p (op2)) + { + HOST_WIDE_INT nop2[2 * WIDE_INT_MAX_ELTS]; + + int nop2len + = wi::sub_large (nop2, &zero, 1, op2val, op2len, prec + 1, SIGNED, 0); + is_neg = !is_neg; + vlen = nop2len * 2; + wi_unpack (v, (const unsigned HOST_WIDE_INT*)nop2, nop2len, + vlen, prec, SIGNED); + } + else + wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len, + vlen, prec, SIGNED); + } + else + { + /* UNSIGNED mul. */ + /* For large positive integers inside regular wide-ints, we must + explictly expand out the 1s to full width for the mul to be + correct. Note that the top bit will never be set for a + unsigned number in offset_int of widest-int. */ + if (wi::neg_p (op1)) + ulen = half_blocks_needed; + if (wi::neg_p (op2)) + vlen = half_blocks_needed; + + wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len, + ulen, prec, SIGNED); + + wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len, + vlen, prec, SIGNED); + } - for (j = 0; j < half_blocks_needed; j++) + uvlen = ulen + vlen; + memset (r, 0, uvlen * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT); + + /* Do the actually multiply. 
*/ + for (j = 0; j < vlen; j++) { k = 0; - for (i = 0; i < half_blocks_needed; i++) + for (i = 0; i < ulen; i++) { t = ((unsigned HOST_WIDE_INT)u[i] * (unsigned HOST_WIDE_INT)v[j] + r[i + j] + k); r[i + j] = t & HALF_INT_MASK; k = t >> HOST_BITS_PER_HALF_WIDE_INT; } - r[j + half_blocks_needed] = k; + r[j + ulen] = k; } - /* We did unsigned math above. For signed we must adjust the - product (assuming we need to see that). */ - if (sgn == SIGNED && (high || needs_overflow)) + /* We did unsigned math above. For signed we may need to adjust the + product if exactly 1 of the operands was negative. */ + if (is_neg) { - unsigned HOST_WIDE_INT b; - if (wi::neg_p (op1)) + t = (unsigned HOST_WIDE_INT)(~r[0]) + 1; + r[0] = t & HALF_INT_MASK; + for (i = 1; i < uvlen; i++) { - b = 0; - for (i = 0; i < half_blocks_needed; i++) - { - t = (unsigned HOST_WIDE_INT)r[i + half_blocks_needed] - - (unsigned HOST_WIDE_INT)v[i] - b; - r[i + half_blocks_needed] = t & HALF_INT_MASK; - b = t >> (HOST_BITS_PER_WIDE_INT - 1); - } - } - if (wi::neg_p (op2)) - { - b = 0; - for (i = 0; i < half_blocks_needed; i++) - { - t = (unsigned HOST_WIDE_INT)r[i + half_blocks_needed] - - (unsigned HOST_WIDE_INT)u[i] - b; - r[i + half_blocks_needed] = t & HALF_INT_MASK; - b = t >> (HOST_BITS_PER_WIDE_INT - 1); - } + t = (unsigned HOST_WIDE_INT)(~r[i]) + (t >> HOST_BITS_PER_HALF_WIDE_INT); + r[i] = t & HALF_INT_MASK; } } - if (needs_overflow) + /* We only need to do the expensive check for overflow if the value + really was big enough to overflow. */ + if (needs_overflow && (uvlen >= half_blocks_needed)) { HOST_WIDE_INT top; + const HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1; /* For unsigned, overflow is true if any of the top bits are set. 
For signed, overflow is true if any of the top bits are not equal @@ -1450,7 +1495,7 @@ wi::mul_internal (HOST_WIDE_INT *val, co top &= mask; } - for (i = half_blocks_needed; i < half_blocks_needed * 2; i++) + for (i = half_blocks_needed; i < uvlen; i++) if (((HOST_WIDE_INT)(r[i] & mask)) != top) *overflow = true; } @@ -1459,15 +1504,42 @@ wi::mul_internal (HOST_WIDE_INT *val, co { /* compute [prec] <- ([prec] * [prec]) >> [prec] */ wi_pack ((unsigned HOST_WIDE_INT *) val, - &r[half_blocks_needed], half_blocks_needed); - return canonize (val, blocks_needed, prec); + r, uvlen); + vallen = canonize (val, (uvlen + 1) >> 1, prec); + + /* Shift is not always safe to write over one of the + operands, so we must copy. */ + HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS]; + memcpy (tval, val, vallen * CHAR_BIT / HOST_BITS_PER_WIDE_INT); + vallen = wi::lrshift_large (val, tval, vallen, prec*2, prec, prec); } else { + if (uvlen > half_blocks_needed) + uvlen = half_blocks_needed; /* compute [prec] <- ([prec] * [prec]) && ((1 << [prec]) - 1) */ - wi_pack ((unsigned HOST_WIDE_INT *) val, r, half_blocks_needed); - return canonize (val, blocks_needed, prec); + wi_pack ((unsigned HOST_WIDE_INT *) val, r, uvlen); + vallen = canonize (val, (uvlen + 1) >> 1, prec); } + +#if 0 + static int cnt = 0; + printf ("cnt=%d op1=", cnt++); + for (i = 0; i < op1len; i++) + printf ("0x%lx ", op1[i]); + + printf (" op2="); + for (i = 0; i < op2len; i++) + printf ("0x%lx ", op2[i]); + printf (" result="); + for (i = 0; i < vallen; i++) + printf ("0x%lx ", val[i]); + if (needs_overflow && *overflow) + printf (" OVERFLOW"); + printf ("\n"); +#endif + + return vallen; } /* Compute the population count of X. 
*/ Index: gcc/wide-int.h =================================================================== --- gcc/wide-int.h (revision 205765) +++ gcc/wide-int.h (working copy) @@ -2406,7 +2408,7 @@ wi::mul (const T1 &x, const T2 &y) } else result.set_len (mul_internal (val, xi.val, xi.len, yi.val, yi.len, - precision, UNSIGNED, 0, false)); + precision, SIGNED, 0, false)); return result; }