+ // Compute some values needed by the remaining shift algorithms
+ uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD;
+ uint32_t offset = shiftAmt / APINT_BITS_PER_WORD;
+
+ // If we are shifting whole words, just move whole words
+ if (wordShift == 0) {
+ for (uint32_t i = 0; i < offset; i++)
+ val[i] = 0;
+ for (uint32_t i = offset; i < getNumWords(); i++)
+ val[i] = pVal[i-offset];
+ return APInt(val,BitWidth).clearUnusedBits();
+ }
+
+ // Copy whole words from this to Result.
+ uint32_t i = getNumWords() - 1;
+ for (; i > offset; --i)
+ val[i] = pVal[i-offset] << wordShift |
+ pVal[i-offset-1] >> (APINT_BITS_PER_WORD - wordShift);
+ val[offset] = pVal[0] << wordShift;
+ for (i = 0; i < offset; ++i)
+ val[i] = 0;
+ return APInt(val, BitWidth).clearUnusedBits();
+}
+
+
+// Square Root - this method computes and returns the square root of "this".
+// Three mechanisms are used for computation. For small values (<= 5 bits),
+// a table lookup is done. This gets some performance for common cases. For
+// values using less than 52 bits, the value is converted to double and then
+// the libc sqrt function is called. The result is rounded and then converted
+// back to a uint64_t which is then used to construct the result. Finally,
+// the Babylonian method for computing square roots is used.
+APInt APInt::sqrt() const {
+
+ // Determine the magnitude of the value.
+ uint32_t magnitude = getActiveBits();
+
+ // Use a fast table for some small values. This also gets rid of some
+ // rounding errors in libc sqrt for small values.
+ if (magnitude <= 5) {
+ static const uint8_t results[32] = {
+ /* 0 */ 0,
+ /* 1- 2 */ 1, 1,
+ /* 3- 6 */ 2, 2, 2, 2,
+ /* 7-12 */ 3, 3, 3, 3, 3, 3,
+ /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ /* 31 */ 6
+ };
+ return APInt(BitWidth, results[ (isSingleWord() ? VAL : pVal[0]) ]);
+ }
+
+ // If the magnitude of the value fits in less than 52 bits (the precision of
+ // an IEEE double precision floating point value), then we can use the
+ // libc sqrt function which will probably use a hardware sqrt computation.
+ // This should be faster than the algorithm below.
+ if (magnitude < 52)
+ return APInt(BitWidth,
+ uint64_t(::round(::sqrt(double(isSingleWord()?VAL:pVal[0])))));
+
+ // Okay, all the short cuts are exhausted. We must compute it. The following
+ // is a classical Babylonian method for computing the square root. This code
+ // was adapted to APINt from a wikipedia article on such computations.
+ // See http://www.wikipedia.org/ and go to the page named
+ // Calculate_an_integer_square_root.
+ uint32_t nbits = BitWidth, i = 4;
+ APInt testy(BitWidth, 16);
+ APInt x_old(BitWidth, 1);
+ APInt x_new(BitWidth, 0);
+ APInt two(BitWidth, 2);
+
+ // Select a good starting value using binary logarithms.
+ for (;; i += 2, testy = testy.shl(2))
+ if (i >= nbits || this->ule(testy)) {
+ x_old = x_old.shl(i / 2);
+ break;
+ }
+
+ // Use the Babylonian method to arrive at the integer square root:
+ for (;;) {
+ x_new = (this->udiv(x_old) + x_old).udiv(two);
+ if (x_old.ule(x_new))
+ break;
+ x_old = x_new;
+ }
+
+ // Make sure we return the closest approximation
+ // NOTE: The rounding calculation below is correct. It will produce an
+ // off-by-one discrepancy with results from pari/gp. That discrepancy has been
+ // determined to be a rounding issue with pari/gp as it begins to use a
+ // floating point representation after 192 bits. There are no discrepancies
+ // between this algorithm and pari/gp for bit widths < 192 bits.
+ APInt square(x_old * x_old);
+ APInt nextSquare((x_old + 1) * (x_old +1));
+ if (this->ult(square))
+ return x_old;
+ else if (this->ule(nextSquare)) {
+ APInt midpoint((nextSquare - square).udiv(two));
+ APInt offset(*this - square);
+ if (offset.ult(midpoint))
+ return x_old;
+ else
+ return x_old + 1;
+ } else
+ assert(0 && "Error in APInt::sqrt computation");
+ return x_old + 1;
+}
+
+/// Implementation of Knuth's Algorithm D (Division of nonnegative integers)
+/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
+/// variables here have the same names as in the algorithm. Comments explain
+/// the algorithm and any deviation from it.
+static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
+ uint32_t m, uint32_t n) {
+ assert(u && "Must provide dividend");
+ assert(v && "Must provide divisor");
+ assert(q && "Must provide quotient");
+ assert(u != v && u != q && v != q && "Must us different memory");
+ assert(n>1 && "n must be > 1");
+
+ // Knuth uses the value b as the base of the number system. In our case b
+ // is 2^31 so we just set it to -1u.
+ uint64_t b = uint64_t(1) << 32;
+
+ DEBUG(cerr << "KnuthDiv: m=" << m << " n=" << n << '\n');
+ DEBUG(cerr << "KnuthDiv: original:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << u[i]);
+ DEBUG(cerr << " by");
+ DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << v[i-1]);
+ DEBUG(cerr << '\n');
+ // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of
+ // u and v by d. Note that we have taken Knuth's advice here to use a power
+ // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of
+ // 2 allows us to shift instead of multiply and it is easy to determine the
+ // shift amount from the leading zeros. We are basically normalizing the u
+ // and v so that its high bits are shifted to the top of v's range without
+ // overflow. Note that this can require an extra word in u so that u must
+ // be of length m+n+1.
+ uint32_t shift = CountLeadingZeros_32(v[n-1]);
+ uint32_t v_carry = 0;
+ uint32_t u_carry = 0;
+ if (shift) {
+ for (uint32_t i = 0; i < m+n; ++i) {
+ uint32_t u_tmp = u[i] >> (32 - shift);
+ u[i] = (u[i] << shift) | u_carry;
+ u_carry = u_tmp;
+ }
+ for (uint32_t i = 0; i < n; ++i) {
+ uint32_t v_tmp = v[i] >> (32 - shift);
+ v[i] = (v[i] << shift) | v_carry;
+ v_carry = v_tmp;
+ }
+ }
+ u[m+n] = u_carry;
+ DEBUG(cerr << "KnuthDiv: normal:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << u[i]);
+ DEBUG(cerr << " by");
+ DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << v[i-1]);
+ DEBUG(cerr << '\n');
+
+ // D2. [Initialize j.] Set j to m. This is the loop counter over the places.
+ int j = m;
+ do {
+ DEBUG(cerr << "KnuthDiv: quotient digit #" << j << '\n');
+ // D3. [Calculate q'.].
+ // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
+ // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
+ // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease
+ // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test
+ // on v[n-2] determines at high speed most of the cases in which the trial
+ // value qp is one too large, and it eliminates all cases where qp is two
+ // too large.
+ uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]);
+ DEBUG(cerr << "KnuthDiv: dividend == " << dividend << '\n');
+ uint64_t qp = dividend / v[n-1];
+ uint64_t rp = dividend % v[n-1];
+ if (qp == b || qp*v[n-2] > b*rp + u[j+n-2]) {
+ qp--;
+ rp += v[n-1];
+ if (rp < b && (qp == b || qp*v[n-2] > b*rp + u[j+n-2]))
+ qp--;
+ }
+ DEBUG(cerr << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n');
+
+ // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with
+ // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation
+ // consists of a simple multiplication by a one-place number, combined with
+ // a subtraction.
+ bool isNeg = false;
+ for (uint32_t i = 0; i < n; ++i) {
+ uint64_t u_tmp = uint64_t(u[j+i]) | (uint64_t(u[j+i+1]) << 32);
+ uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]);
+ bool borrow = subtrahend > u_tmp;
+ DEBUG(cerr << "KnuthDiv: u_tmp == " << u_tmp
+ << ", subtrahend == " << subtrahend
+ << ", borrow = " << borrow << '\n');
+
+ uint64_t result = u_tmp - subtrahend;
+ uint32_t k = j + i;
+ u[k++] = result & (b-1); // subtract low word
+ u[k++] = result >> 32; // subtract high word
+ while (borrow && k <= m+n) { // deal with borrow to the left
+ borrow = u[k] == 0;
+ u[k]--;
+ k++;
+ }
+ isNeg |= borrow;
+ DEBUG(cerr << "KnuthDiv: u[j+i] == " << u[j+i] << ", u[j+i+1] == " <<
+ u[j+i+1] << '\n');
+ }
+ DEBUG(cerr << "KnuthDiv: after subtraction:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]);
+ DEBUG(cerr << '\n');
+ // The digits (u[j+n]...u[j]) should be kept positive; if the result of
+ // this step is actually negative, (u[j+n]...u[j]) should be left as the
+ // true value plus b**(n+1), namely as the b's complement of
+ // the true value, and a "borrow" to the left should be remembered.
+ //
+ if (isNeg) {
+ bool carry = true; // true because b's complement is "complement + 1"
+ for (uint32_t i = 0; i <= m+n; ++i) {
+ u[i] = ~u[i] + carry; // b's complement
+ carry = carry && u[i] == 0;
+ }
+ }
+ DEBUG(cerr << "KnuthDiv: after complement:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]);
+ DEBUG(cerr << '\n');
+
+ // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was
+ // negative, go to step D6; otherwise go on to step D7.
+ q[j] = qp;
+ if (isNeg) {
+ // D6. [Add back]. The probability that this step is necessary is very
+ // small, on the order of only 2/b. Make sure that test data accounts for
+ // this possibility. Decrease q[j] by 1
+ q[j]--;
+ // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]).
+ // A carry will occur to the left of u[j+n], and it should be ignored
+ // since it cancels with the borrow that occurred in D4.
+ bool carry = false;
+ for (uint32_t i = 0; i < n; i++) {
+ uint32_t limit = std::min(u[j+i],v[i]);
+ u[j+i] += v[i] + carry;
+ carry = u[j+i] < limit || (carry && u[j+i] == limit);
+ }
+ u[j+n] += carry;
+ }
+ DEBUG(cerr << "KnuthDiv: after correction:");
+ DEBUG(for (int i = m+n; i >=0; i--) cerr <<" " << u[i]);
+ DEBUG(cerr << "\nKnuthDiv: digit result = " << q[j] << '\n');
+
+ // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3.
+ } while (--j >= 0);
+
+ DEBUG(cerr << "KnuthDiv: quotient:");
+ DEBUG(for (int i = m; i >=0; i--) cerr <<" " << q[i]);
+ DEBUG(cerr << '\n');
+
+ // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired
+ // remainder may be obtained by dividing u[...] by d. If r is non-null we
+ // compute the remainder (urem uses this).
+ if (r) {
+ // The value d is expressed by the "shift" value above since we avoided
+ // multiplication by d by using a shift left. So, all we have to do is
+ // shift right here. In order to mak
+ if (shift) {
+ uint32_t carry = 0;
+ DEBUG(cerr << "KnuthDiv: remainder:");
+ for (int i = n-1; i >= 0; i--) {
+ r[i] = (u[i] >> shift) | carry;
+ carry = u[i] << (32 - shift);
+ DEBUG(cerr << " " << r[i]);
+ }
+ } else {
+ for (int i = n-1; i >= 0; i--) {
+ r[i] = u[i];
+ DEBUG(cerr << " " << r[i]);
+ }
+ }
+ DEBUG(cerr << '\n');
+ }
+ DEBUG(cerr << std::setbase(10) << '\n');
+}
+
+void APInt::divide(const APInt LHS, uint32_t lhsWords,
+ const APInt &RHS, uint32_t rhsWords,
+ APInt *Quotient, APInt *Remainder)
+{
+ assert(lhsWords >= rhsWords && "Fractional result");
+
+ // First, compose the values into an array of 32-bit words instead of
+ // 64-bit words. This is a necessity of both the "short division" algorithm
+ // and the the Knuth "classical algorithm" which requires there to be native
+ // operations for +, -, and * on an m bit value with an m*2 bit result. We
+ // can't use 64-bit operands here because we don't have native results of
+ // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't
+ // work on large-endian machines.
+ uint64_t mask = ~0ull >> (sizeof(uint32_t)*8);
+ uint32_t n = rhsWords * 2;
+ uint32_t m = (lhsWords * 2) - n;
+
+ // Allocate space for the temporary values we need either on the stack, if
+ // it will fit, or on the heap if it won't.
+ uint32_t SPACE[128];
+ uint32_t *U = 0;
+ uint32_t *V = 0;
+ uint32_t *Q = 0;
+ uint32_t *R = 0;
+ if ((Remainder?4:3)*n+2*m+1 <= 128) {
+ U = &SPACE[0];
+ V = &SPACE[m+n+1];
+ Q = &SPACE[(m+n+1) + n];
+ if (Remainder)
+ R = &SPACE[(m+n+1) + n + (m+n)];