Fix a ValueTracking rule: RHS means operand 1, not 0. Add a simple

[oota-llvm.git] / lib / Transforms / Scalar / InstructionCombining.cpp
diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp

index 186d8378a4972cd2323ab101f4eae4545408dca0..62554a5e91fa056c81f88e3dfa5b2821c89c308a 100644 (file)
--- a/lib/Transforms/Scalar/InstructionCombining.cpp
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -352,8 +352,8 @@ namespace {
      /// properties that allow us to simplify its operands.
      bool SimplifyDemandedInstructionBits(Instruction &Inst);
          
-    Value *SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
-                                      uint64_t &UndefElts, unsigned Depth = 0);
+    Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+                                      APInt& UndefElts, unsigned Depth = 0);
        
      // FoldOpIntoPhi - Given a binary operator or cast instruction which has a
      // PHI node as operand #0, see if we can fold the instruction into the PHI
@@ -817,24 +817,88 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
      if (isa<UndefValue>(V))
        return 0;
      return UndefValue::get(VTy);
-  } else if (!V->hasOneUse()) {    // Other users may use these bits.
-    if (Depth != 0) {       // Not at the root.
-      // Just compute the KnownZero/KnownOne bits to simplify things downstream.
-      ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth);
-      return 0;
-    }
-    // If this is the root being simplified, allow it to have multiple uses,
-    // just set the DemandedMask to all bits.
-    DemandedMask = APInt::getAllOnesValue(BitWidth);
-  } else if (Depth == 6) {        // Limit search depth.
-    return 0;
    }
    
+  if (Depth == 6)        // Limit search depth.
+    return 0;
+  
    Instruction *I = dyn_cast<Instruction>(V);
    if (!I) return 0;        // Only analyze instructions.
-
+  
    APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
    APInt &RHSKnownZero = KnownZero, &RHSKnownOne = KnownOne;
+
+  // If there are multiple uses of this value and we aren't at the root, then
+  // we can't do any simplifications of the operands, because DemandedMask
+  // only reflects the bits demanded by *one* of the users.
+  if (Depth != 0 && !I->hasOneUse()) {
+    // Despite the fact that we can't simplify this instruction in all User's
+    // context, we can at least compute the knownzero/knownone bits, and we can
+    // do simplifications that apply to *just* the one user if we know that
+    // this instruction has a simpler value in that context.
+    if (I->getOpcode() == Instruction::And) {
+      // If either the LHS or the RHS are Zero, the result is zero.
+      ComputeMaskedBits(I->getOperand(1), DemandedMask,
+                        RHSKnownZero, RHSKnownOne, Depth+1);
+      ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero,
+                        LHSKnownZero, LHSKnownOne, Depth+1);
+      
+      // If all of the demanded bits are known 1 on one side, return the other.
+      // These bits cannot contribute to the result of the 'and' in this
+      // context.
+      if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == 
+          (DemandedMask & ~LHSKnownZero))
+        return I->getOperand(0);
+      if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == 
+          (DemandedMask & ~RHSKnownZero))
+        return I->getOperand(1);
+      
+      // If all of the demanded bits in the inputs are known zeros, return zero.
+      if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+        return Constant::getNullValue(VTy);
+      
+    } else if (I->getOpcode() == Instruction::Or) {
+      // We can simplify (X|Y) -> X or Y in the user's context if we know that
+      // only bits from X or Y are demanded.
+      
+      // If either the LHS or the RHS are One, the result is One.
+      ComputeMaskedBits(I->getOperand(1), DemandedMask, 
+                        RHSKnownZero, RHSKnownOne, Depth+1);
+      ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne, 
+                        LHSKnownZero, LHSKnownOne, Depth+1);
+      
+      // If all of the demanded bits are known zero on one side, return the
+      // other.  These bits cannot contribute to the result of the 'or' in this
+      // context.
+      if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == 
+          (DemandedMask & ~LHSKnownOne))
+        return I->getOperand(0);
+      if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == 
+          (DemandedMask & ~RHSKnownOne))
+        return I->getOperand(1);
+      
+      // If all of the potentially set bits on one side are known to be set on
+      // the other side, just use the 'other' side.
+      if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == 
+          (DemandedMask & (~RHSKnownZero)))
+        return I->getOperand(0);
+      if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == 
+          (DemandedMask & (~LHSKnownZero)))
+        return I->getOperand(1);
+    }
+    
+    // Compute the KnownZero/KnownOne bits to simplify things downstream.
+    ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth);
+    return 0;
+  }
+  
+  // If this is the root being simplified, allow it to have multiple uses,
+  // just set the DemandedMask to all bits so that we can try to simplify the
+  // operands.  This allows visitTruncInst (for example) to simplify the
+  // operand of a trunc without duplicating all the logic below.
+  if (Depth == 0 && !V->hasOneUse())
+    DemandedMask = APInt::getAllOnesValue(BitWidth);
+  
    switch (I->getOpcode()) {
    default:
      ComputeMaskedBits(I, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
@@ -1332,19 +1396,18 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
  
  
  /// SimplifyDemandedVectorElts - The specified value produces a vector with
-/// 64 or fewer elements.  DemandedElts contains the set of elements that are
+/// any number of elements. DemandedElts contains the set of elements that are
  /// actually used by the caller.  This method analyzes which elements of the
  /// operand are undef and returns that information in UndefElts.
  ///
  /// If the information about demanded elements can be used to simplify the
  /// operation, the operation is simplified, then the resultant value is
  /// returned.  This returns null if no change was made.
-Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
-                                                uint64_t &UndefElts,
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+                                                APInt& UndefElts,
                                                  unsigned Depth) {
    unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
-  assert(VWidth <= 64 && "Vector too wide to analyze!");
-  uint64_t EltMask = ~0ULL >> (64-VWidth);
+  APInt EltMask(APInt::getAllOnesValue(VWidth));
    assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
  
    if (isa<UndefValue>(V)) {
@@ -1363,12 +1426,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
  
      std::vector<Constant*> Elts;
      for (unsigned i = 0; i != VWidth; ++i)
-      if (!(DemandedElts & (1ULL << i))) {   // If not demanded, set to undef.
+      if (!DemandedElts[i]) {   // If not demanded, set to undef.
          Elts.push_back(Undef);
-        UndefElts |= (1ULL << i);
+        UndefElts.set(i);
        } else if (isa<UndefValue>(CP->getOperand(i))) {   // Already undef.
          Elts.push_back(Undef);
-        UndefElts |= (1ULL << i);
+        UndefElts.set(i);
        } else {                               // Otherwise, defined.
          Elts.push_back(CP->getOperand(i));
        }
@@ -1389,8 +1452,10 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
      Constant *Zero = Constant::getNullValue(EltTy);
      Constant *Undef = UndefValue::get(EltTy);
      std::vector<Constant*> Elts;
-    for (unsigned i = 0; i != VWidth; ++i)
-      Elts.push_back((DemandedElts & (1ULL << i)) ? Zero : Undef);
+    for (unsigned i = 0; i != VWidth; ++i) {
+      Constant *Elt = DemandedElts[i] ? Zero : Undef;
+      Elts.push_back(Elt);
+    }
      UndefElts = DemandedElts ^ EltMask;
      return ConstantVector::get(Elts);
    }
@@ -1418,7 +1483,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
    if (!I) return false;        // Only analyze instructions.
    
    bool MadeChange = false;
-  uint64_t UndefElts2;
+  APInt UndefElts2(VWidth, 0);
    Value *TmpV;
    switch (I->getOpcode()) {
    default: break;
@@ -1439,44 +1504,46 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
      // If this is inserting an element that isn't demanded, remove this
      // insertelement.
      unsigned IdxNo = Idx->getZExtValue();
-    if (IdxNo >= VWidth || (DemandedElts & (1ULL << IdxNo)) == 0)
+    if (IdxNo >= VWidth || !DemandedElts[IdxNo])
        return AddSoonDeadInstToWorklist(*I, 0);
      
      // Otherwise, the element inserted overwrites whatever was there, so the
      // input demanded set is simpler than the output set.
-    TmpV = SimplifyDemandedVectorElts(I->getOperand(0),
-                                      DemandedElts & ~(1ULL << IdxNo),
+    APInt DemandedElts2 = DemandedElts;
+    DemandedElts2.clear(IdxNo);
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2,
                                        UndefElts, Depth+1);
      if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
  
      // The inserted element is defined.
-    UndefElts &= ~(1ULL << IdxNo);
+    UndefElts.clear(IdxNo);
      break;
    }
    case Instruction::ShuffleVector: {
      ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
      uint64_t LHSVWidth =
        cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements();
-    uint64_t LeftDemanded = 0, RightDemanded = 0;
+    APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0);
      for (unsigned i = 0; i < VWidth; i++) {
-      if (DemandedElts & (1ULL << i)) {
+      if (DemandedElts[i]) {
          unsigned MaskVal = Shuffle->getMaskValue(i);
          if (MaskVal != -1u) {
            assert(MaskVal < LHSVWidth * 2 &&
                   "shufflevector mask index out of range!");
            if (MaskVal < LHSVWidth)
-            LeftDemanded |= 1ULL << MaskVal;
+            LeftDemanded.set(MaskVal);
            else
-            RightDemanded |= 1ULL << (MaskVal - LHSVWidth);
+            RightDemanded.set(MaskVal - LHSVWidth);
          }
        }
      }
  
+    APInt UndefElts4(LHSVWidth, 0);
      TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded,
-                                      UndefElts2, Depth+1);
+                                      UndefElts4, Depth+1);
      if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
  
-    uint64_t UndefElts3;
+    APInt UndefElts3(LHSVWidth, 0);
      TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded,
                                        UndefElts3, Depth+1);
      if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
@@ -1485,16 +1552,17 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
      for (unsigned i = 0; i < VWidth; i++) {
        unsigned MaskVal = Shuffle->getMaskValue(i);
        if (MaskVal == -1u) {
-        uint64_t NewBit = 1ULL << i;
-        UndefElts |= NewBit;
+        UndefElts.set(i);
        } else if (MaskVal < LHSVWidth) {
-        uint64_t NewBit = ((UndefElts2 >> MaskVal) & 1) << i;
-        NewUndefElts |= NewBit;
-        UndefElts |= NewBit;
+        if (UndefElts4[MaskVal]) {
+          NewUndefElts = true;
+          UndefElts.set(i);
+        }
        } else {
-        uint64_t NewBit = ((UndefElts3 >> (MaskVal - LHSVWidth)) & 1) << i;
-        NewUndefElts |= NewBit;
-        UndefElts |= NewBit;
+        if (UndefElts3[MaskVal - LHSVWidth]) {
+          NewUndefElts = true;
+          UndefElts.set(i);
+        }
        }
      }
  
@@ -1502,7 +1570,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
        // Add additional discovered undefs.
        std::vector<Constant*> Elts;
        for (unsigned i = 0; i < VWidth; ++i) {
-        if (UndefElts & (1ULL << i))
+        if (UndefElts[i])
            Elts.push_back(UndefValue::get(Type::Int32Ty));
          else
            Elts.push_back(ConstantInt::get(Type::Int32Ty,
@@ -1518,7 +1586,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
      const VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
      if (!VTy) break;
      unsigned InVWidth = VTy->getNumElements();
-    uint64_t InputDemandedElts = 0;
+    APInt InputDemandedElts(InVWidth, 0);
      unsigned Ratio;
  
      if (VWidth == InVWidth) {
@@ -1535,8 +1603,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
        // elements are live.
        Ratio = VWidth/InVWidth;
        for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
-        if (DemandedElts & (1ULL << OutIdx))
-          InputDemandedElts |= 1ULL << (OutIdx/Ratio);
+        if (DemandedElts[OutIdx])
+          InputDemandedElts.set(OutIdx/Ratio);
        }
      } else {
        // Untested so far.
@@ -1547,8 +1615,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
        // live.
        Ratio = InVWidth/VWidth;
        for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
-        if (DemandedElts & (1ULL << InIdx/Ratio))
-          InputDemandedElts |= 1ULL << InIdx;
+        if (DemandedElts[InIdx/Ratio])
+          InputDemandedElts.set(InIdx);
      }
      
      // div/rem demand all inputs, because they don't want divide by zero.
@@ -1566,8 +1634,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
        // then an output element is undef if the corresponding input element is
        // undef.
        for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
-        if (UndefElts2 & (1ULL << (OutIdx/Ratio)))
-          UndefElts |= 1ULL << OutIdx;
+        if (UndefElts2[OutIdx/Ratio])
+          UndefElts.set(OutIdx);
      } else if (VWidth < InVWidth) {
        assert(0 && "Unimp");
        // If there are more elements in the source than there are in the result,
@@ -1575,8 +1643,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
        // elements are undef.
        UndefElts = ~0ULL >> (64-VWidth);  // Start out all undef.
        for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
-        if ((UndefElts2 & (1ULL << InIdx)) == 0)    // Not undef?
-          UndefElts &= ~(1ULL << (InIdx/Ratio));    // Clear undef bit.
+        if (!UndefElts2[InIdx])            // Not undef?
+          UndefElts.clear(InIdx/Ratio);    // Clear undef bit.
      }
      break;
    }
@@ -6031,18 +6099,39 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
      if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) {
        if (Op0I->getOpcode() == Op1I->getOpcode() && Op0I->hasOneUse() &&
-          Op1I->hasOneUse() && Op0I->getOperand(1) == Op1I->getOperand(1) &&
-          I.isEquality()) {
+          Op1I->hasOneUse() && Op0I->getOperand(1) == Op1I->getOperand(1)) {
          switch (Op0I->getOpcode()) {
          default: break;
          case Instruction::Add:
          case Instruction::Sub:
          case Instruction::Xor:
-          // a+x icmp eq/ne b+x --> a icmp b
-          return new ICmpInst(I.getPredicate(), Op0I->getOperand(0),
-                              Op1I->getOperand(0));
+          if (I.isEquality())    // a+x icmp eq/ne b+x --> a icmp b
+            return new ICmpInst(I.getPredicate(), Op0I->getOperand(0),
+                                Op1I->getOperand(0));
+          // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+            if (CI->getValue().isSignBit()) {
+              ICmpInst::Predicate Pred = I.isSignedPredicate()
+                                             ? I.getUnsignedPredicate()
+                                             : I.getSignedPredicate();
+              return new ICmpInst(Pred, Op0I->getOperand(0),
+                                  Op1I->getOperand(0));
+            }
+            
+            if (CI->getValue().isMaxSignedValue()) {
+              ICmpInst::Predicate Pred = I.isSignedPredicate()
+                                             ? I.getUnsignedPredicate()
+                                             : I.getSignedPredicate();
+              Pred = I.getSwappedPredicate(Pred);
+              return new ICmpInst(Pred, Op0I->getOperand(0),
+                                  Op1I->getOperand(0));
+            }
+          }
            break;
          case Instruction::Mul:
+          if (!I.isEquality())
+            break;
+
            if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
              // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
              // Mask = -1 >> count-trailing-zeros(Cst).
@@ -6361,6 +6450,29 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
          else
            return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal, AddOne(RHS));
        }
+
+      if (LHSI->hasOneUse()) {
+        // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit))
+        if (!ICI.isEquality() && XorCST->getValue().isSignBit()) {
+          const APInt &SignBit = XorCST->getValue();
+          ICmpInst::Predicate Pred = ICI.isSignedPredicate()
+                                         ? ICI.getUnsignedPredicate()
+                                         : ICI.getSignedPredicate();
+          return new ICmpInst(Pred, LHSI->getOperand(0),
+                              ConstantInt::get(RHSV ^ SignBit));
+        }
+
+        // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A)
+        if (!ICI.isEquality() && XorCST->getValue().isMaxSignedValue()) {
+          const APInt &NotSignBit = XorCST->getValue();
+          ICmpInst::Predicate Pred = ICI.isSignedPredicate()
+                                         ? ICI.getUnsignedPredicate()
+                                         : ICI.getSignedPredicate();
+          Pred = ICI.getSwappedPredicate(Pred);
+          return new ICmpInst(Pred, LHSI->getOperand(0),
+                              ConstantInt::get(RHSV ^ NotSignBit));
+        }
+      }
      }
      break;
    case Instruction::And:         // (icmp pred (and X, AndCST), RHS)
@@ -6919,7 +7031,12 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
        MaskedValueIsZero(Op0,
                        APInt::getSignBit(I.getType()->getPrimitiveSizeInBits())))
      return BinaryOperator::CreateLShr(Op0, I.getOperand(1));
-  
+
+  // Arithmetic shifting an all-sign-bit value is a no-op.
+  unsigned NumSignBits = ComputeNumSignBits(Op0);
+  if (NumSignBits == Op0->getType()->getPrimitiveSizeInBits())
+    return ReplaceInstUsesWith(I, Op0);
+
    return 0;
  }
  
@@ -6960,7 +7077,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
  
  Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
                                                 BinaryOperator &I) {
-  bool isLeftShift    = I.getOpcode() == Instruction::Shl;
+  bool isLeftShift = I.getOpcode() == Instruction::Shl;
  
    // See if we can simplify any instructions used by the instruction whose sole 
    // purpose is to compute bits we don't care about.
@@ -7828,15 +7945,15 @@ Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
        break;
      case Instruction::ZExt: {
        DoXForm = NumCastsRemoved >= 1;
-      if (!DoXForm) {
+      if (!DoXForm && 0) {
          // If it's unnecessary to issue an AND to clear the high bits, it's
          // always profitable to do this xform.
-        Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, 
-                                           CI.getOpcode() == Instruction::SExt);
+        Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, false);
          APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
          if (MaskedValueIsZero(TryRes, Mask))
            return ReplaceInstUsesWith(CI, TryRes);
-        else if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
+        
+        if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
            if (TryI->use_empty())
              EraseInstFromFunction(*TryI);
        }
@@ -7844,7 +7961,7 @@ Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
      }
      case Instruction::SExt: {
        DoXForm = NumCastsRemoved >= 2;
-      if (!DoXForm && !isa<TruncInst>(SrcI)) {
+      if (!DoXForm && !isa<TruncInst>(SrcI) && 0) {
          // If we do not have to emit the truncate + sext pair, then it's always
          // profitable to do this xform.
          //
@@ -7854,12 +7971,12 @@ Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
          // t3 = sext i16 t2 to i32
          // !=
          // i32 t1
-        Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, 
-                                           CI.getOpcode() == Instruction::SExt);
+        Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, true);
          unsigned NumSignBits = ComputeNumSignBits(TryRes);
          if (NumSignBits > (DestBitSize - SrcBitSize))
            return ReplaceInstUsesWith(CI, TryRes);
-        else if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
+        
+        if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
            if (TryI->use_empty())
              EraseInstFromFunction(*TryI);
        }
@@ -7868,11 +7985,13 @@ Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
      }
      
      if (DoXForm) {
+      DOUT << "ICE: EvaluateInDifferentType converting expression type to avoid"
+           << " cast: " << CI;
        Value *Res = EvaluateInDifferentType(SrcI, DestTy, 
                                             CI.getOpcode() == Instruction::SExt);
        if (JustReplace)
-          // Just replace this cast with the result.
-          return ReplaceInstUsesWith(CI, Res);
+        // Just replace this cast with the result.
+        return ReplaceInstUsesWith(CI, Res);
  
        assert(Res->getType() == DestTy);
        switch (CI.getOpcode()) {
@@ -8157,32 +8276,35 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
  
    Value *Src = CI.getOperand(0);
  
-  // If this is a cast of a cast
-  if (CastInst *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
-    // If this is a TRUNC followed by a ZEXT then we are dealing with integral
-    // types and if the sizes are just right we can convert this into a logical
-    // 'and' which will be much cheaper than the pair of casts.
-    if (isa<TruncInst>(CSrc)) {
-      // Get the sizes of the types involved
-      Value *A = CSrc->getOperand(0);
-      uint32_t SrcSize = A->getType()->getPrimitiveSizeInBits();
-      uint32_t MidSize = CSrc->getType()->getPrimitiveSizeInBits();
-      uint32_t DstSize = CI.getType()->getPrimitiveSizeInBits();
-      // If we're actually extending zero bits and the trunc is a no-op
-      if (MidSize < DstSize && SrcSize == DstSize) {
-        // Replace both of the casts with an And of the type mask.
-        APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
-        Constant *AndConst = ConstantInt::get(AndValue);
-        Instruction *And = 
-          BinaryOperator::CreateAnd(CSrc->getOperand(0), AndConst);
-        // Unfortunately, if the type changed, we need to cast it back.
-        if (And->getType() != CI.getType()) {
-          And->setName(CSrc->getName()+".mask");
-          InsertNewInstBefore(And, CI);
-          And = CastInst::CreateIntegerCast(And, CI.getType(), false/*ZExt*/);
-        }
-        return And;
-      }
+  // If this is a TRUNC followed by a ZEXT then we are dealing with integral
+  // types and if the sizes are just right we can convert this into a logical
+  // 'and' which will be much cheaper than the pair of casts.
+  if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) {   // A->B->C cast
+    // Get the sizes of the types involved.  We know that the intermediate type
+    // will be smaller than A or C, but don't know the relation between A and C.
+    Value *A = CSrc->getOperand(0);
+    unsigned SrcSize = A->getType()->getPrimitiveSizeInBits();
+    unsigned MidSize = CSrc->getType()->getPrimitiveSizeInBits();
+    unsigned DstSize = CI.getType()->getPrimitiveSizeInBits();
+    // If we're actually extending zero bits, then if
+    // SrcSize <  DstSize: zext(a & mask)
+    // SrcSize == DstSize: a & mask
+    // SrcSize  > DstSize: trunc(a) & mask
+    if (SrcSize < DstSize) {
+      APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+      Constant *AndConst = ConstantInt::get(AndValue);
+      Instruction *And =
+        BinaryOperator::CreateAnd(A, AndConst, CSrc->getName()+".mask");
+      InsertNewInstBefore(And, CI);
+      return new ZExtInst(And, CI.getType());
+    } else if (SrcSize == DstSize) {
+      APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+      return BinaryOperator::CreateAnd(A, ConstantInt::get(AndValue));
+    } else if (SrcSize > DstSize) {
+      Instruction *Trunc = new TruncInst(A, CI.getType(), "tmp");
+      InsertNewInstBefore(Trunc, CI);
+      APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+      return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(AndValue));
      }
    }
  
@@ -9123,15 +9245,23 @@ static unsigned EnforceKnownAlignment(Value *V,
      // If there is a large requested alignment and we can, bump up the alignment
      // of the global.
      if (!GV->isDeclaration()) {
-      GV->setAlignment(PrefAlign);
-      Align = PrefAlign;
+      if (GV->getAlignment() >= PrefAlign)
+        Align = GV->getAlignment();
+      else {
+        GV->setAlignment(PrefAlign);
+        Align = PrefAlign;
+      }
      }
    } else if (AllocationInst *AI = dyn_cast<AllocationInst>(V)) {
      // If there is a requested alignment and if this is an alloca, round up.  We
      // don't do this for malloc, because some systems can't respect the request.
      if (isa<AllocaInst>(AI)) {
-      AI->setAlignment(PrefAlign);
-      Align = PrefAlign;
+      if (AI->getAlignment() >= PrefAlign)
+        Align = AI->getAlignment();
+      else {
+        AI->setAlignment(PrefAlign);
+        Align = PrefAlign;
+      }
      }
    }
  
@@ -9383,8 +9513,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    case Intrinsic::x86_sse_cvttss2si: {
      // These intrinsics only demands the 0th element of its input vector.  If
      // we can simplify the input based on that, do so now.
-    uint64_t UndefElts;
-    if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1, 
+    unsigned VWidth =
+      cast<VectorType>(II->getOperand(1)->getType())->getNumElements();
+    APInt DemandedElts(VWidth, 1);
+    APInt UndefElts(VWidth, 0);
+    if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
                                                UndefElts)) {
        II->setOperand(1, V);
        return II;
@@ -10052,6 +10185,9 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
    
    SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(), 
                                          FirstInst->op_end());
+  // This is true if all GEP bases are allocas and if all indices into them are
+  // constants.
+  bool AllBasePointersAreAllocas = true;
    
    // Scan to see if all operands are the same opcode, all have one use, and all
    // kill their operands (i.e. the operands have one use).
@@ -10061,6 +10197,12 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
        GEP->getNumOperands() != FirstInst->getNumOperands())
        return 0;
  
+    // Keep track of whether or not all GEPs are of alloca pointers.
+    if (AllBasePointersAreAllocas &&
+        (!isa<AllocaInst>(GEP->getOperand(0)) ||
+         !GEP->hasAllConstantIndices()))
+      AllBasePointersAreAllocas = false;
+    
      // Compare the operand lists.
      for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
        if (FirstInst->getOperand(op) == GEP->getOperand(op))
@@ -10081,6 +10223,15 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
      }
    }
    
+  // If all of the base pointers of the PHI'd GEPs are from allocas, don't
+  // bother doing this transformation.  At best, this will just save a bit of
+  // offset calculation, but all the predecessors will have to materialize the
+  // stack address into a register anyway.  We'd actually rather *clone* the
+  // load up into the predecessors so that we have a load of a gep of an alloca,
+  // which can usually all be folded into the load.
+  if (AllBasePointersAreAllocas)
+    return 0;
+  
    // Otherwise, this is safe to transform.  Insert PHI nodes for each operand
    // that is variable.
    SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
@@ -10119,15 +10270,15 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
  }
  
  
-/// isSafeToSinkLoad - Return true if we know that it is safe sink the load out
-/// of the block that defines it.  This means that it must be obvious the value
-/// of the load is not changed from the point of the load to the end of the
-/// block it is in.
+/// isSafeAndProfitableToSinkLoad - Return true if we know that it is safe to
+/// sink the load out of the block that defines it.  This means that it must be
+/// obvious the value of the load is not changed from the point of the load to
+/// the end of the block it is in.
  ///
  /// Finally, it is safe, but not profitable, to sink a load targetting a
  /// non-address-taken alloca.  Doing so will cause us to not promote the alloca
  /// to a register.
-static bool isSafeToSinkLoad(LoadInst *L) {
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
    BasicBlock::iterator BBI = L, E = L->getParent()->end();
    
    for (++BBI; BBI != E; ++BBI)
@@ -10149,10 +10300,20 @@ static bool isSafeToSinkLoad(LoadInst *L) {
        break;
      }
      
-    if (!isAddressTaken)
+    if (!isAddressTaken && AI->isStaticAlloca())
        return false;
    }
    
+  // If this load is a load from a GEP with a constant offset from an alloca,
+  // then we don't want to sink it.  In its present form, it will be
+  // load [constant stack offset].  Sinking it will cause us to have to
+  // materialize the stack addresses in each predecessor in a register only to
+  // do a shared load from register in the successor.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
+      if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
+        return false;
+  
    return true;
  }
  
@@ -10183,7 +10344,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
      // We can't sink the load if the loaded value could be modified between the
      // load and the PHI.
      if (LI->getParent() != PN.getIncomingBlock(0) ||
-        !isSafeToSinkLoad(LI))
+        !isSafeAndProfitableToSinkLoad(LI))
        return 0;
      
      // If the PHI is of volatile loads and the load block has multiple
@@ -10213,7 +10374,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
        // the load and the PHI.
        if (LI->isVolatile() != isVolatile ||
            LI->getParent() != PN.getIncomingBlock(i) ||
-          !isSafeToSinkLoad(LI))
+          !isSafeAndProfitableToSinkLoad(LI))
          return 0;
        
        // If the PHI is of volatile loads and the load block has multiple
@@ -10222,7 +10383,6 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
        if (isVolatile &&
            LI->getParent()->getTerminator()->getNumSuccessors() != 1)
          return 0;
-
        
      } else if (I->getOperand(1) != ConstantOp) {
        return 0;
@@ -10873,12 +11033,12 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
          APInt SingleChar(numBits, 0);
          if (TD->isLittleEndian()) {
            for (signed i = len-1; i >= 0; i--) {
-            SingleChar = (uint64_t) Str[i];
+            SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
              StrVal = (StrVal << 8) | SingleChar;
            }
          } else {
            for (unsigned i = 0; i < len; i++) {
-            SingleChar = (uint64_t) Str[i];
+            SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
              StrVal = (StrVal << 8) | SingleChar;
            }
            // Append NULL at the end.
@@ -10891,8 +11051,14 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
      }
    }
  
-  const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+  const PointerType *DestTy = cast<PointerType>(CI->getType());
+  const Type *DestPTy = DestTy->getElementType();
    if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
+
+    // If the address spaces don't match, don't eliminate the cast.
+    if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
+      return 0;
+
      const Type *SrcPTy = SrcTy->getElementType();
  
      if (DestPTy->isInteger() || isa<PointerType>(DestPTy) || 
@@ -10974,7 +11140,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
    Value *Op = LI.getOperand(0);
  
    // Attempt to improve the alignment.
-  unsigned KnownAlign = GetOrEnforceKnownAlignment(Op);
+  unsigned KnownAlign =
+    GetOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()));
    if (KnownAlign >
        (LI.getAlignment() == 0 ? TD->getABITypeAlignment(LI.getType()) :
                                  LI.getAlignment()))
@@ -11253,7 +11420,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
    }
  
    // Attempt to improve the alignment.
-  unsigned KnownAlign = GetOrEnforceKnownAlignment(Ptr);
+  unsigned KnownAlign =
+    GetOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()));
    if (KnownAlign >
        (SI.getAlignment() == 0 ? TD->getABITypeAlignment(Val->getType()) :
                                  SI.getAlignment()))
@@ -11758,10 +11926,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
      // If the input vector has a single use, simplify it based on this use
      // property.
      if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) {
-      uint64_t UndefElts;
+      APInt UndefElts(VectorWidth, 0);
+      APInt DemandedMask(VectorWidth, 1 << IndexVal);
        if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
-                                                1 << IndexVal,
-                                                UndefElts)) {
+                                                DemandedMask, UndefElts)) {
          EI.setOperand(0, V);
          return &EI;
        }
@@ -12060,15 +12228,14 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
    if (isa<UndefValue>(SVI.getOperand(2)))
      return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
  
-  uint64_t UndefElts;
    unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements();
  
    if (VWidth != cast<VectorType>(LHS->getType())->getNumElements())
      return 0;
  
-  uint64_t AllOnesEltMask = ~0ULL >> (64-VWidth);
-  if (VWidth <= 64 &&
-      SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
+  APInt UndefElts(VWidth, 0);
+  APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+  if (SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
      LHS = SVI.getOperand(0);
      RHS = SVI.getOperand(1);
      MadeChange = true;
@@ -12257,6 +12424,8 @@ static void AddReachableCodeToWorklist(BasicBlock *BB,
            DBI_Prev->eraseFromParent();
          }
          DBI_Prev = DBI_Next;
+      } else {
+        DBI_Prev = 0;
        }
  
        IC.AddToWorkList(Inst);