Instruction *visitUDiv(BinaryOperator &I);
Instruction *visitSDiv(BinaryOperator &I);
Instruction *visitFDiv(BinaryOperator &I);
+ Instruction *FoldAndOfICmps(Instruction &I, ICmpInst *LHS, ICmpInst *RHS);
Instruction *visitAnd(BinaryOperator &I);
+ Instruction *FoldOrOfICmps(Instruction &I, ICmpInst *LHS, ICmpInst *RHS);
+ Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op,
+ Value *A, Value *B, Value *C);
Instruction *visitOr (BinaryOperator &I);
Instruction *visitXor(BinaryOperator &I);
Instruction *visitShl(BinaryOperator &I);
}
}
- // UpdateValueUsesWith - This method is to be used when an value is
- // found to be replacable with another preexisting expression or was
- // updated. Here we add all uses of I to the worklist, replace all uses of
- // I with the new value (unless the instruction was just updated), then
- // return true, so that the inst combiner will know that I was modified.
- //
- bool UpdateValueUsesWith(Value *Old, Value *New) {
- AddUsersToWorkList(*Old); // Add all modified instrs to worklist
- if (Old != New)
- Old->replaceAllUsesWith(New);
- if (Instruction *I = dyn_cast<Instruction>(Old))
- AddToWorkList(I);
- if (Instruction *I = dyn_cast<Instruction>(New))
- AddToWorkList(I);
- return true;
- }
-
// EraseInstFromFunction - When dealing with an instruction that has side
// effects or produces a void value, we can't rely on DCE to delete the
// instruction. Instead, visit methods should return the value returned by
}
private:
- /// InsertOperandCastBefore - This inserts a cast of V to DestTy before the
- /// InsertBefore instruction. This is specialized a bit to avoid inserting
- /// casts that are known to not do anything...
- ///
- Value *InsertOperandCastBefore(Instruction::CastOps opcode,
- Value *V, const Type *DestTy,
- Instruction *InsertBefore);
/// SimplifyCommutative - This performs a few simplifications for
/// commutative operators.
/// most-complex to least-complex order.
bool SimplifyCompare(CmpInst &I);
- /// SimplifyDemandedBits - Attempts to replace V with a simpler value based
- /// on the demanded bits.
- bool SimplifyDemandedBits(Value *V, APInt DemandedMask,
+ /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
+ /// based on the demanded bits.
+ Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+ APInt& KnownZero, APInt& KnownOne,
+ unsigned Depth);
+ bool SimplifyDemandedBits(Use &U, APInt DemandedMask,
APInt& KnownZero, APInt& KnownOne,
- unsigned Depth = 0);
-
- Value *SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
- uint64_t &UndefElts, unsigned Depth = 0);
+ unsigned Depth=0);
+
+ /// SimplifyDemandedInstructionBits - Inst is an integer instruction that
+ /// SimplifyDemandedBits knows about. See if the instruction has any
+ /// properties that allow us to simplify its operands.
+ bool SimplifyDemandedInstructionBits(Instruction &Inst);
+
+ Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+ APInt& UndefElts, unsigned Depth = 0);
// FoldOpIntoPhi - Given a binary operator or cast instruction which has a
// PHI node as operand #0, see if we can fold the instruction into the PHI
// inputs, and do the operation once, to the result of the PHI.
Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
-
+ Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
+
Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
ConstantInt *AndRHS, BinaryOperator &TheAnd);
Value *EvaluateInDifferentType(Value *V, const Type *Ty, bool isSigned);
bool CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
- unsigned CastOpc,
- int &NumCastsRemoved);
+ unsigned CastOpc, int &NumCastsRemoved);
unsigned GetOrEnforceKnownAlignment(Value *V,
unsigned PrefAlign = 0);
return true;
}
-/// InsertOperandCastBefore - This inserts a cast of V to DestTy before the
-/// InsertBefore instruction. This is specialized a bit to avoid inserting
-/// casts that are known to not do anything...
-///
-Value *InstCombiner::InsertOperandCastBefore(Instruction::CastOps opcode,
- Value *V, const Type *DestTy,
- Instruction *InsertBefore) {
- if (V->getType() == DestTy) return V;
- if (Constant *C = dyn_cast<Constant>(V))
- return ConstantExpr::getCast(opcode, C, DestTy);
-
- return InsertCastBefore(opcode, V, DestTy, *InsertBefore);
-}
-
// SimplifyCommutative - This performs a few simplifications for commutative
// operators:
//
Max = KnownOne|UnknownBits;
}
-/// SimplifyDemandedBits - This function attempts to replace V with a simpler
-/// value based on the demanded bits. When this function is called, it is known
+/// SimplifyDemandedInstructionBits - Inst is an integer instruction that
+/// SimplifyDemandedBits knows about. See if the instruction has any
+/// properties that allow us to simplify its operands.
+bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
+ unsigned BitWidth = cast<IntegerType>(Inst.getType())->getBitWidth();
+ APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
+
+ Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask,
+ KnownZero, KnownOne, 0);
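+ // A null return means no simplification was possible; Inst itself means the
+ // operands were simplified in place, so Inst needs no replacement; any other
+ // value should replace Inst everywhere.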
+ if (V == 0) return false;
+ if (V == &Inst) return true;
+ ReplaceInstUsesWith(Inst, V);
+ return true;
+}
+
+/// SimplifyDemandedBits - This form of SimplifyDemandedBits simplifies the
+/// specified instruction operand if possible, updating it in place. It returns
+/// true if it made any change and false otherwise.
+bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
+ APInt &KnownZero, APInt &KnownOne,
+ unsigned Depth) {
+ Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask,
+ KnownZero, KnownOne, Depth);
+ if (NewVal == 0) return false;
+ U.set(NewVal);
+ return true;
+}
+
+
+/// SimplifyDemandedUseBits - This function attempts to replace V with a simpler
+/// value based on the demanded bits. When this function is called, it is known
/// that only the bits set in DemandedMask of the result of V are ever used
/// downstream. Consequently, depending on the mask and V, it may be possible
/// to replace V with a constant or one of its operands. In such cases, this
/// function does the replacement and returns true. In all other cases, it
/// returns false after analyzing the expression and setting KnownOne to the bits that are known
-/// to be one in the expression. KnownZero contains all the bits that are known
+/// to be one in the expression. KnownZero contains all the bits that are known
/// to be zero in the expression. These are provided to potentially allow the
/// caller (which might recursively be SimplifyDemandedBits itself) to simplify
/// the expression. KnownOne and KnownZero always follow the invariant that
/// the bits in KnownOne and KnownZero may only be accurate for those bits set
/// in DemandedMask. Note also that the bitwidth of V, DemandedMask, KnownZero
/// and KnownOne must all be the same.
-bool InstCombiner::SimplifyDemandedBits(Value *V, APInt DemandedMask,
- APInt& KnownZero, APInt& KnownOne,
- unsigned Depth) {
+///
+/// This returns null if it did not change anything and no simplification was
+/// possible. This returns V itself if it did some simplification of V's
+/// operands based on the information about what bits are demanded. This returns
+/// some other non-null value if it found out that V is equal to another value
+/// in the context where the specified bits are demanded, but not for all users.
+Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+ APInt &KnownZero, APInt &KnownOne,
+ unsigned Depth) {
assert(V != 0 && "Null pointer of Value???");
assert(Depth <= 6 && "Limit Search Depth");
uint32_t BitWidth = DemandedMask.getBitWidth();
// We know all of the bits for a constant!
KnownOne = CI->getValue() & DemandedMask;
KnownZero = ~KnownOne & DemandedMask;
- return false;
+ return 0;
}
- KnownZero.clear();
+ KnownZero.clear();
KnownOne.clear();
- if (!V->hasOneUse()) { // Other users may use these bits.
- if (Depth != 0) { // Not at the root.
- // Just compute the KnownZero/KnownOne bits to simplify things downstream.
- ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth);
- return false;
- }
- // If this is the root being simplified, allow it to have multiple uses,
- // just set the DemandedMask to all bits.
- DemandedMask = APInt::getAllOnesValue(BitWidth);
- } else if (DemandedMask == 0) { // Not demanding any bits from V.
- if (V != UndefValue::get(VTy))
- return UpdateValueUsesWith(V, UndefValue::get(VTy));
- return false;
- } else if (Depth == 6) { // Limit search depth.
- return false;
+ if (DemandedMask == 0) { // Not demanding any bits from V.
+ if (isa<UndefValue>(V))
+ return 0;
+ return UndefValue::get(VTy);
}
+ if (Depth == 6) // Limit search depth.
+ return 0;
+
Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false; // Only analyze instructions.
-
+ if (!I) return 0; // Only analyze instructions.
+
APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
APInt &RHSKnownZero = KnownZero, &RHSKnownOne = KnownOne;
+
+ // If there are multiple uses of this value and we aren't at the root, then
+ // we can't do any simplifications of the operands, because DemandedMask
+ // only reflects the bits demanded by *one* of the users.
+ if (Depth != 0 && !I->hasOneUse()) {
+ // Even though we can't simplify this instruction in every user's context,
+ // we can at least compute the KnownZero/KnownOne bits, and we can do
+ // simplifications that apply to *just* the one user if we know that this
+ // instruction has a simpler value in that context.
+ if (I->getOpcode() == Instruction::And) {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ ComputeMaskedBits(I->getOperand(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1);
+ ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero,
+ LHSKnownZero, LHSKnownOne, Depth+1);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
+ (DemandedMask & ~LHSKnownZero))
+ return I->getOperand(0);
+ if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
+ (DemandedMask & ~RHSKnownZero))
+ return I->getOperand(1);
+
+ // If all of the demanded bits in the inputs are known zeros, return zero.
+ if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+ return Constant::getNullValue(VTy);
+
+ } else if (I->getOpcode() == Instruction::Or) {
+ // We can simplify (X|Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ // If either the LHS or the RHS are One, the result is One.
+ ComputeMaskedBits(I->getOperand(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1);
+ ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne,
+ LHSKnownZero, LHSKnownOne, Depth+1);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
+ (DemandedMask & ~LHSKnownOne))
+ return I->getOperand(0);
+ if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
+ (DemandedMask & ~RHSKnownOne))
+ return I->getOperand(1);
+
+ // If all of the potentially set bits on one side are known to be set on
+ // the other side, just use the 'other' side.
+ if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
+ (DemandedMask & (~RHSKnownZero)))
+ return I->getOperand(0);
+ if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
+ (DemandedMask & (~LHSKnownZero)))
+ return I->getOperand(1);
+ }
+
+ // Compute the KnownZero/KnownOne bits to simplify things downstream.
+ ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth);
+ return 0;
+ }
+
+ // If this is the root being simplified, allow it to have multiple uses and
+ // just set the DemandedMask to all bits so that we can try to simplify the
+ // operands. This allows visitTruncInst (for example) to simplify the
+ // operand of a trunc without duplicating all the logic below.
+ if (Depth == 0 && !V->hasOneUse())
+ DemandedMask = APInt::getAllOnesValue(BitWidth);
+
switch (I->getOpcode()) {
default:
- ComputeMaskedBits(V, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
+ ComputeMaskedBits(I, DemandedMask, RHSKnownZero, RHSKnownOne, Depth);
break;
case Instruction::And:
// If either the LHS or the RHS are Zero, the result is zero.
- if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
- RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
-
- // If something is known zero on the RHS, the bits aren't demanded on the
- // LHS.
- if (SimplifyDemandedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero,
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
- assert((LHSKnownZero & LHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
(DemandedMask & ~LHSKnownZero))
- return UpdateValueUsesWith(I, I->getOperand(0));
+ return I->getOperand(0);
if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
(DemandedMask & ~RHSKnownZero))
- return UpdateValueUsesWith(I, I->getOperand(1));
+ return I->getOperand(1);
// If all of the demanded bits in the inputs are known zeros, return zero.
if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
- return UpdateValueUsesWith(I, Constant::getNullValue(VTy));
+ return Constant::getNullValue(VTy);
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero))
- return UpdateValueUsesWith(I, I);
+ return I;
// Output known-1 bits are only known if set in both the LHS & RHS.
RHSKnownOne &= LHSKnownOne;
break;
case Instruction::Or:
// If either the LHS or the RHS are One, the result is One.
- if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
- RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
- // If something is known one on the RHS, the bits aren't demanded on the
- // LHS.
- if (SimplifyDemandedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne,
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
- assert((LHSKnownZero & LHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
(DemandedMask & ~LHSKnownOne))
- return UpdateValueUsesWith(I, I->getOperand(0));
+ return I->getOperand(0);
if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
(DemandedMask & ~RHSKnownOne))
- return UpdateValueUsesWith(I, I->getOperand(1));
+ return I->getOperand(1);
// If all of the potentially set bits on one side are known to be set on
// the other side, just use the 'other' side.
if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
(DemandedMask & (~RHSKnownZero)))
- return UpdateValueUsesWith(I, I->getOperand(0));
+ return I->getOperand(0);
if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
(DemandedMask & (~LHSKnownZero)))
- return UpdateValueUsesWith(I, I->getOperand(1));
+ return I->getOperand(1);
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
- return UpdateValueUsesWith(I, I);
+ return I;
// Output known-0 bits are only known if clear in both the LHS & RHS.
RHSKnownZero &= LHSKnownZero;
RHSKnownOne |= LHSKnownOne;
break;
case Instruction::Xor: {
- if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
- RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
- if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
- assert((LHSKnownZero & LHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
if ((DemandedMask & RHSKnownZero) == DemandedMask)
- return UpdateValueUsesWith(I, I->getOperand(0));
+ return I->getOperand(0);
if ((DemandedMask & LHSKnownZero) == DemandedMask)
- return UpdateValueUsesWith(I, I->getOperand(1));
+ return I->getOperand(1);
// Output known-0 bits are known if clear or set in both the LHS & RHS.
APInt KnownZeroOut = (RHSKnownZero & LHSKnownZero) |
Instruction *Or =
BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
I->getName());
- InsertNewInstBefore(Or, *I);
- return UpdateValueUsesWith(I, Or);
+ return InsertNewInstBefore(Or, *I);
}
// If all of the demanded bits on one side are known, and all of the set
Constant *AndC = ConstantInt::get(~RHSKnownOne & DemandedMask);
Instruction *And =
BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
- InsertNewInstBefore(And, *I);
- return UpdateValueUsesWith(I, And);
+ return InsertNewInstBefore(And, *I);
}
}
// If the RHS is a constant, see if we can simplify it.
// FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
- return UpdateValueUsesWith(I, I);
+ return I;
RHSKnownZero = KnownZeroOut;
RHSKnownOne = KnownOneOut;
break;
}
case Instruction::Select:
- if (SimplifyDemandedBits(I->getOperand(2), DemandedMask,
- RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
- if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
+ if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask,
+ RHSKnownZero, RHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
- assert((LHSKnownZero & LHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
- if (ShrinkDemandedConstant(I, 1, DemandedMask))
- return UpdateValueUsesWith(I, I);
- if (ShrinkDemandedConstant(I, 2, DemandedMask))
- return UpdateValueUsesWith(I, I);
+ if (ShrinkDemandedConstant(I, 1, DemandedMask) ||
+ ShrinkDemandedConstant(I, 2, DemandedMask))
+ return I;
// Only known if known in both the LHS and RHS.
RHSKnownOne &= LHSKnownOne;
RHSKnownZero &= LHSKnownZero;
break;
case Instruction::Trunc: {
- uint32_t truncBf =
- cast<IntegerType>(I->getOperand(0)->getType())->getBitWidth();
+ unsigned truncBf = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
DemandedMask.zext(truncBf);
RHSKnownZero.zext(truncBf);
RHSKnownOne.zext(truncBf);
- if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
+ return I;
DemandedMask.trunc(BitWidth);
RHSKnownZero.trunc(BitWidth);
RHSKnownOne.trunc(BitWidth);
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
break;
}
case Instruction::BitCast:
if (!I->getOperand(0)->getType()->isInteger())
- return false;
-
- if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+ return 0; // vector->int or fp->int?
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
break;
case Instruction::ZExt: {
// Compute the bits in the result that are not present in the input.
- const IntegerType *SrcTy = cast<IntegerType>(I->getOperand(0)->getType());
- uint32_t SrcBitWidth = SrcTy->getBitWidth();
+ unsigned SrcBitWidth =I->getOperand(0)->getType()->getPrimitiveSizeInBits();
DemandedMask.trunc(SrcBitWidth);
RHSKnownZero.trunc(SrcBitWidth);
RHSKnownOne.trunc(SrcBitWidth);
- if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
+ return I;
DemandedMask.zext(BitWidth);
RHSKnownZero.zext(BitWidth);
RHSKnownOne.zext(BitWidth);
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
// The top bits are known to be zero.
RHSKnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
break;
}
case Instruction::SExt: {
// Compute the bits in the result that are not present in the input.
- const IntegerType *SrcTy = cast<IntegerType>(I->getOperand(0)->getType());
- uint32_t SrcBitWidth = SrcTy->getBitWidth();
+ unsigned SrcBitWidth =I->getOperand(0)->getType()->getPrimitiveSizeInBits();
APInt InputDemandedBits = DemandedMask &
APInt::getLowBitsSet(BitWidth, SrcBitWidth);
InputDemandedBits.trunc(SrcBitWidth);
RHSKnownZero.trunc(SrcBitWidth);
RHSKnownOne.trunc(SrcBitWidth);
- if (SimplifyDemandedBits(I->getOperand(0), InputDemandedBits,
+ if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits,
RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
+ return I;
InputDemandedBits.zext(BitWidth);
RHSKnownZero.zext(BitWidth);
RHSKnownOne.zext(BitWidth);
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
// If the input sign bit is known zero, or if the NewBits are not demanded
// convert this into a zero extension.
- if (RHSKnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits)
- {
+ if (RHSKnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) {
// Convert to ZExt cast
- CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName(), I);
- return UpdateValueUsesWith(I, NewCast);
+ CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
+ return InsertNewInstBefore(NewCast, *I);
} else if (RHSKnownOne[SrcBitWidth-1]) { // Input sign bit known set
RHSKnownOne |= NewBits;
}
// Figure out what the input bits are. If the top bits of the and result
// are not demanded, then the add doesn't demand them from its input
// either.
- uint32_t NLZ = DemandedMask.countLeadingZeros();
+ unsigned NLZ = DemandedMask.countLeadingZeros();
// If there is a constant on the RHS, there are a variety of xformations
// we can do.
APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ));
// Find information about known zero/one bits in the input.
- if (SimplifyDemandedBits(I->getOperand(0), InDemandedBits,
+ if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits,
LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
+ return I;
// If the RHS of the add has bits set that can't affect the input, reduce
// the constant.
if (ShrinkDemandedConstant(I, 1, InDemandedBits))
- return UpdateValueUsesWith(I, I);
+ return I;
// Avoid excess work.
if (LHSKnownZero == 0 && LHSKnownOne == 0)
Instruction *Or =
BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
I->getName());
- InsertNewInstBefore(Or, *I);
- return UpdateValueUsesWith(I, Or);
+ return InsertNewInstBefore(Or, *I);
}
// We can say something about the output known-zero and known-one bits,
// To compute this, we first compute the potential carry bits. These are
// the bits which may be modified. I'm not aware of a better way to do
// this scan.
- const APInt& RHSVal = RHS->getValue();
+ const APInt &RHSVal = RHS->getValue();
APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal));
// Now that we know which bits have carries, compute the known-1/0 sets.
// Right fill the mask of bits for this ADD to demand the most
// significant bit and all those below it.
APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
- if (SimplifyDemandedBits(I->getOperand(0), DemandedFromOps,
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
+ LHSKnownZero, LHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
- if (SimplifyDemandedBits(I->getOperand(1), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
+ return I;
}
}
break;
// significant bit and all those below it.
uint32_t NLZ = DemandedMask.countLeadingZeros();
APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
- if (SimplifyDemandedBits(I->getOperand(0), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
- if (SimplifyDemandedBits(I->getOperand(1), DemandedFromOps,
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
+ LHSKnownZero, LHSKnownOne, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
+ return I;
}
// Otherwise just hand the sub off to ComputeMaskedBits to fill in
// the known zeros and ones.
if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
- if (SimplifyDemandedBits(I->getOperand(0), DemandedMaskIn,
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
RHSKnownZero <<= ShiftAmt;
RHSKnownOne <<= ShiftAmt;
// low bits known zero.
// Unsigned shift right.
APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
- if (SimplifyDemandedBits(I->getOperand(0), DemandedMaskIn,
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
RHSKnownZero = APIntOps::lshr(RHSKnownZero, ShiftAmt);
RHSKnownOne = APIntOps::lshr(RHSKnownOne, ShiftAmt);
if (ShiftAmt) {
// the shift amount is >= the size of the datatype, which is undefined.
if (DemandedMask == 1) {
// Perform the logical shift right.
- Value *NewVal = BinaryOperator::CreateLShr(
+ Instruction *NewVal = BinaryOperator::CreateLShr(
I->getOperand(0), I->getOperand(1), I->getName());
- InsertNewInstBefore(cast<Instruction>(NewVal), *I);
- return UpdateValueUsesWith(I, NewVal);
+ return InsertNewInstBefore(NewVal, *I);
}
// If the sign bit is the only bit demanded by this ashr, then there is no
// need to do it, the shift doesn't change the high bit.
if (DemandedMask.isSignBit())
- return UpdateValueUsesWith(I, I->getOperand(0));
+ return I->getOperand(0);
if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
uint32_t ShiftAmt = SA->getLimitedValue(BitWidth);
// demanded.
if (DemandedMask.countLeadingZeros() <= ShiftAmt)
DemandedMaskIn.set(BitWidth-1);
- if (SimplifyDemandedBits(I->getOperand(0),
- DemandedMaskIn,
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
RHSKnownZero, RHSKnownOne, Depth+1))
- return true;
- assert((RHSKnownZero & RHSKnownOne) == 0 &&
- "Bits known to be one AND zero?");
+ return I;
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
// Compute the new bits that are at the top now.
APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
RHSKnownZero = APIntOps::lshr(RHSKnownZero, ShiftAmt);
if (BitWidth <= ShiftAmt || RHSKnownZero[BitWidth-ShiftAmt-1] ||
(HighBits & ~DemandedMask) == HighBits) {
// Perform the logical shift right.
- Value *NewVal = BinaryOperator::CreateLShr(
+ Instruction *NewVal = BinaryOperator::CreateLShr(
I->getOperand(0), SA, I->getName());
- InsertNewInstBefore(cast<Instruction>(NewVal), *I);
- return UpdateValueUsesWith(I, NewVal);
+ return InsertNewInstBefore(NewVal, *I);
} else if ((RHSKnownOne & SignBit) != 0) { // New bits are known one.
RHSKnownOne |= HighBits;
}
APInt RA = Rem->getValue().abs();
if (RA.isPowerOf2()) {
if (DemandedMask.ule(RA)) // srem won't affect demanded bits
- return UpdateValueUsesWith(I, I->getOperand(0));
+ return I->getOperand(0);
APInt LowBits = RA - 1;
APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
- if (SimplifyDemandedBits(I->getOperand(0), Mask2,
+ if (SimplifyDemandedBits(I->getOperandUse(0), Mask2,
LHSKnownZero, LHSKnownOne, Depth+1))
- return true;
+ return I;
if (LHSKnownZero[BitWidth-1] || ((LHSKnownZero & LowBits) == LowBits))
LHSKnownZero |= ~LowBits;
KnownZero |= LHSKnownZero & DemandedMask;
- assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
+ assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
}
}
break;
case Instruction::URem: {
APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
APInt AllOnes = APInt::getAllOnesValue(BitWidth);
- if (SimplifyDemandedBits(I->getOperand(0), AllOnes,
+ if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes,
+ KnownZero2, KnownOne2, Depth+1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), AllOnes,
KnownZero2, KnownOne2, Depth+1))
- return true;
-
- uint32_t Leaders = KnownZero2.countLeadingOnes();
- if (SimplifyDemandedBits(I->getOperand(1), AllOnes,
- KnownZero2, KnownOne2, Depth+1))
- return true;
+ return I;
+ unsigned Leaders = KnownZero2.countLeadingOnes();
Leaders = std::max(Leaders,
KnownZero2.countLeadingOnes());
KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
NewVal = BinaryOperator::CreateShl(I->getOperand(1),
ConstantInt::get(I->getType(), ResultBit-InputBit));
NewVal->takeName(I);
- InsertNewInstBefore(NewVal, *I);
- return UpdateValueUsesWith(I, NewVal);
+ return InsertNewInstBefore(NewVal, *I);
}
// TODO: Could compute known zero/one bits based on the input.
// If the client is only demanding bits that we know, return the known
// constant.
if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask)
- return UpdateValueUsesWith(I, ConstantInt::get(RHSKnownOne));
+ return ConstantInt::get(RHSKnownOne);
- return false;
+ return 0;
}
/// SimplifyDemandedVectorElts - The specified value produces a vector with
-/// 64 or fewer elements. DemandedElts contains the set of elements that are
+/// any number of elements. DemandedElts contains the set of elements that are
/// actually used by the caller. This method analyzes which elements of the
/// operand are undef and returns that information in UndefElts.
///
/// If the information about demanded elements can be used to simplify the
/// operation, the operation is simplified, then the resultant value is
/// returned. This returns null if no change was made.
-Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
- uint64_t &UndefElts,
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+ APInt& UndefElts,
unsigned Depth) {
unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
- assert(VWidth <= 64 && "Vector too wide to analyze!");
- uint64_t EltMask = ~0ULL >> (64-VWidth);
+ APInt EltMask(APInt::getAllOnesValue(VWidth));
assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
if (isa<UndefValue>(V)) {
std::vector<Constant*> Elts;
for (unsigned i = 0; i != VWidth; ++i)
- if (!(DemandedElts & (1ULL << i))) { // If not demanded, set to undef.
+ if (!DemandedElts[i]) { // If not demanded, set to undef.
Elts.push_back(Undef);
- UndefElts |= (1ULL << i);
+ UndefElts.set(i);
} else if (isa<UndefValue>(CP->getOperand(i))) { // Already undef.
Elts.push_back(Undef);
- UndefElts |= (1ULL << i);
+ UndefElts.set(i);
} else { // Otherwise, defined.
Elts.push_back(CP->getOperand(i));
}
Constant *Zero = Constant::getNullValue(EltTy);
Constant *Undef = UndefValue::get(EltTy);
std::vector<Constant*> Elts;
- for (unsigned i = 0; i != VWidth; ++i)
- Elts.push_back((DemandedElts & (1ULL << i)) ? Zero : Undef);
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Constant *Elt = DemandedElts[i] ? Zero : Undef;
+ Elts.push_back(Elt);
+ }
UndefElts = DemandedElts ^ EltMask;
return ConstantVector::get(Elts);
}
if (!I) return false; // Only analyze instructions.
bool MadeChange = false;
- uint64_t UndefElts2;
+ APInt UndefElts2(VWidth, 0);
Value *TmpV;
switch (I->getOpcode()) {
default: break;
// If this is inserting an element that isn't demanded, remove this
// insertelement.
unsigned IdxNo = Idx->getZExtValue();
- if (IdxNo >= VWidth || (DemandedElts & (1ULL << IdxNo)) == 0)
+ if (IdxNo >= VWidth || !DemandedElts[IdxNo])
return AddSoonDeadInstToWorklist(*I, 0);
// Otherwise, the element inserted overwrites whatever was there, so the
// input demanded set is simpler than the output set.
- TmpV = SimplifyDemandedVectorElts(I->getOperand(0),
- DemandedElts & ~(1ULL << IdxNo),
+ APInt DemandedElts2 = DemandedElts;
+ DemandedElts2.clear(IdxNo);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2,
UndefElts, Depth+1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
// The inserted element is defined.
- UndefElts &= ~(1ULL << IdxNo);
+ UndefElts.clear(IdxNo);
break;
}
case Instruction::ShuffleVector: {
ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
uint64_t LHSVWidth =
cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements();
- uint64_t LeftDemanded = 0, RightDemanded = 0;
+ APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0);
for (unsigned i = 0; i < VWidth; i++) {
- if (DemandedElts & (1ULL << i)) {
+ if (DemandedElts[i]) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (MaskVal != -1u) {
assert(MaskVal < LHSVWidth * 2 &&
"shufflevector mask index out of range!");
if (MaskVal < LHSVWidth)
- LeftDemanded |= 1ULL << MaskVal;
+ LeftDemanded.set(MaskVal);
else
- RightDemanded |= 1ULL << (MaskVal - LHSVWidth);
+ RightDemanded.set(MaskVal - LHSVWidth);
}
}
}
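+ // The shuffle's operand vectors have LHSVWidth elements, which may differ
+ // from the result's VWidth, so track their undef elements in masks of that
+ // width.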
+ APInt UndefElts4(LHSVWidth, 0);
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded,
- UndefElts2, Depth+1);
+ UndefElts4, Depth+1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
- uint64_t UndefElts3;
+ APInt UndefElts3(LHSVWidth, 0);
TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded,
UndefElts3, Depth+1);
if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
for (unsigned i = 0; i < VWidth; i++) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (MaskVal == -1u) {
- uint64_t NewBit = 1ULL << i;
- UndefElts |= NewBit;
+ UndefElts.set(i);
} else if (MaskVal < LHSVWidth) {
- uint64_t NewBit = ((UndefElts2 >> MaskVal) & 1) << i;
- NewUndefElts |= NewBit;
- UndefElts |= NewBit;
+ if (UndefElts4[MaskVal]) {
+ NewUndefElts = true;
+ UndefElts.set(i);
+ }
} else {
- uint64_t NewBit = ((UndefElts3 >> (MaskVal - LHSVWidth)) & 1) << i;
- NewUndefElts |= NewBit;
- UndefElts |= NewBit;
+ if (UndefElts3[MaskVal - LHSVWidth]) {
+ NewUndefElts = true;
+ UndefElts.set(i);
+ }
}
}
// Add additional discovered undefs.
std::vector<Constant*> Elts;
for (unsigned i = 0; i < VWidth; ++i) {
- if (UndefElts & (1ULL << i))
+ if (UndefElts[i])
Elts.push_back(UndefValue::get(Type::Int32Ty));
else
Elts.push_back(ConstantInt::get(Type::Int32Ty,
const VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
if (!VTy) break;
unsigned InVWidth = VTy->getNumElements();
- uint64_t InputDemandedElts = 0;
+ APInt InputDemandedElts(InVWidth, 0);
unsigned Ratio;
if (VWidth == InVWidth) {
// elements are live.
Ratio = VWidth/InVWidth;
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
- if (DemandedElts & (1ULL << OutIdx))
- InputDemandedElts |= 1ULL << (OutIdx/Ratio);
+ if (DemandedElts[OutIdx])
+ InputDemandedElts.set(OutIdx/Ratio);
}
} else {
// Untested so far.
// live.
Ratio = InVWidth/VWidth;
for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
- if (DemandedElts & (1ULL << InIdx/Ratio))
- InputDemandedElts |= 1ULL << InIdx;
+ if (DemandedElts[InIdx/Ratio])
+ InputDemandedElts.set(InIdx);
}
// div/rem demand all inputs, because they don't want divide by zero.
// then an output element is undef if the corresponding input element is
// undef.
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
- if (UndefElts2 & (1ULL << (OutIdx/Ratio)))
- UndefElts |= 1ULL << OutIdx;
+ if (UndefElts2[OutIdx/Ratio])
+ UndefElts.set(OutIdx);
} else if (VWidth < InVWidth) {
assert(0 && "Unimp");
// If there are more elements in the source than there are in the result,
// elements are undef.
- UndefElts = ~0ULL >> (64-VWidth); // Start out all undef.
+ UndefElts = APInt::getAllOnesValue(VWidth); // Start out all undef.
for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
- if ((UndefElts2 & (1ULL << InIdx)) == 0) // Not undef?
- UndefElts &= ~(1ULL << (InIdx/Ratio)); // Clear undef bit.
+ if (!UndefElts2[InIdx]) // Not undef?
+ UndefElts.clear(InIdx/Ratio); // Clear undef bit.
}
break;
}
static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
InstCombiner *IC) {
if (CastInst *CI = dyn_cast<CastInst>(&I)) {
- if (Constant *SOC = dyn_cast<Constant>(SO))
- return ConstantExpr::getCast(CI->getOpcode(), SOC, I.getType());
-
- return IC->InsertNewInstBefore(CastInst::Create(
- CI->getOpcode(), SO, I.getType(), SO->getName() + ".cast"), I);
+ return IC->InsertCastBefore(CI->getOpcode(), SO, I.getType(), I);
}
// Figure out if the constant is the left or the right argument.
// See if SimplifyDemandedBits can simplify this. This handles stuff like
// (X & 254)+1 -> (X&254)|1
- if (!isa<VectorType>(I.getType())) {
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
- KnownZero, KnownOne))
- return &I;
- }
+ if (!isa<VectorType>(I.getType()) && SimplifyDemandedInstructionBits(I))
+ return &I;
// zext(i1) - 1 -> select i1, 0, -1
if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
// add (select X 0 (sub n A)) A --> select X A n
{
SelectInst *SI = dyn_cast<SelectInst>(LHS);
- Value *Other = RHS;
+ Value *A = RHS;
if (!SI) {
SI = dyn_cast<SelectInst>(RHS);
- Other = LHS;
+ A = LHS;
}
if (SI && SI->hasOneUse()) {
Value *TV = SI->getTrueValue();
Value *FV = SI->getFalseValue();
- Value *A, *N;
+ Value *N;
// Can we fold the add into the argument of the select?
// We check both true and false select arguments for a matching subtract.
- if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Value(A))) &&
- A == Other) // Fold the add into the true select value.
+ if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the true select value.
return SelectInst::Create(SI->getCondition(), N, A);
- if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Value(A))) &&
- A == Other) // Fold the add into the false select value.
+ if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the false select value.
return SelectInst::Create(SI->getCondition(), A, N);
}
}
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
if (Instruction *R = FoldOpIntoSelect(I, SI, this))
return R;
-
- if (isa<PHINode>(Op0))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
}
if (I.getType() == Type::Int1Ty)
Constant *CP1 = Subtract(ConstantInt::get(I.getType(), 1), C2);
return BinaryOperator::CreateMul(Op0, CP1);
}
-
- // X - ((X / Y) * Y) --> X % Y
- if (Op1I->getOpcode() == Instruction::Mul)
- if (Instruction *I = dyn_cast<Instruction>(Op1I->getOperand(0)))
- if (Op0 == I->getOperand(0) &&
- Op1I->getOperand(1) == I->getOperand(1)) {
- if (I->getOpcode() == Instruction::SDiv)
- return BinaryOperator::CreateSRem(Op0, Op1I->getOperand(1));
- if (I->getOpcode() == Instruction::UDiv)
- return BinaryOperator::CreateURem(Op0, Op1I->getOperand(1));
- }
}
}
} else if (isa<VectorType>(Op1->getType())) {
if (isa<ConstantAggregateZero>(Op1))
return ReplaceInstUsesWith(I, Op1);
-
- // As above, vector X*splat(1.0) -> X in all defined cases.
- if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1))
- if (ConstantFP *F = dyn_cast_or_null<ConstantFP>(Op1V->getSplatValue()))
- if (F->isExactlyValue(1.0))
- return ReplaceInstUsesWith(I, Op0);
+
+ if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1)) {
+ if (Op1V->isAllOnesValue()) // X * -1 == 0 - X
+ return BinaryOperator::CreateNeg(Op0, I.getName());
+
+ // As above, vector X*splat(1.0) -> X in all defined cases.
+ if (Constant *Splat = Op1V->getSplatValue()) {
+ if (ConstantFP *F = dyn_cast<ConstantFP>(Splat))
+ if (F->isExactlyValue(1.0))
+ return ReplaceInstUsesWith(I, Op0);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Splat))
+ if (CI->equalsInt(1))
+ return ReplaceInstUsesWith(I, Op0);
+ }
+ }
}
if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
if (Value *Op1v = dyn_castNegVal(I.getOperand(1)))
return BinaryOperator::CreateMul(Op0v, Op1v);
+ // (X / Y) * Y = X - (X % Y)
+ // (X / Y) * -Y = (X % Y) - X
+ {
+ Value *Op1 = I.getOperand(1);
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
+ if (!BO ||
+ (BO->getOpcode() != Instruction::UDiv &&
+ BO->getOpcode() != Instruction::SDiv)) {
+ Op1 = Op0;
+ BO = dyn_cast<BinaryOperator>(I.getOperand(1));
+ }
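+ // Op1 is now the multiplicand that must match BO's divisor (or its
+ // negation) for the fold below to apply.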
+ Value *Neg = dyn_castNegVal(Op1);
+ if (BO && BO->hasOneUse() &&
+ (BO->getOperand(1) == Op1 || BO->getOperand(1) == Neg) &&
+ (BO->getOpcode() == Instruction::UDiv ||
+ BO->getOpcode() == Instruction::SDiv)) {
+ Value *Op0BO = BO->getOperand(0), *Op1BO = BO->getOperand(1);
+
+ Instruction *Rem;
+ if (BO->getOpcode() == Instruction::UDiv)
+ Rem = BinaryOperator::CreateURem(Op0BO, Op1BO);
+ else
+ Rem = BinaryOperator::CreateSRem(Op0BO, Op1BO);
+
+ InsertNewInstBefore(Rem, I);
+ Rem->takeName(BO);
+
+ if (Op1BO == Op1)
+ return BinaryOperator::CreateSub(Op0BO, Rem);
+ else
+ return BinaryOperator::CreateSub(Rem, Op0BO);
+ }
+ }
+
if (I.getType() == Type::Int1Ty)
return BinaryOperator::CreateAnd(Op0, I.getOperand(1));
if (I.getType() == Type::Int1Ty)
return ReplaceInstUsesWith(I, Op0);
+ if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1)) {
+ if (ConstantInt *X = cast_or_null<ConstantInt>(Op1V->getSplatValue()))
+ // div X, 1 == X
+ if (X->isOne())
+ return ReplaceInstUsesWith(I, Op0);
+ }
+
return 0;
}
if (Instruction *Common = commonIDivTransforms(I))
return Common;
- // X udiv C^2 -> X >> C
- // Check to see if this is an unsigned division with an exact power of 2,
- // if so, convert to a right shift.
if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
+ // X udiv C^2 -> X >> C
+ // Check to see if this is an unsigned division with an exact power of 2,
+ // if so, convert to a right shift.
if (C->getValue().isPowerOf2()) // 0 not included in isPowerOf2
return BinaryOperator::CreateLShr(Op0,
ConstantInt::get(Op0->getType(), C->getValue().logBase2()));
+
+ // X udiv C, where C >= signbit
+ if (C->getValue().isNegative()) {
+ Value *IC = InsertNewInstBefore(new ICmpInst(ICmpInst::ICMP_ULT, Op0, C),
+ I);
+ return SelectInst::Create(IC, Constant::getNullValue(I.getType()),
+ ConstantInt::get(I.getType(), 1));
+ }
}
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
// sdiv X, -1 == -X
if (RHS->isAllOnesValue())
return BinaryOperator::CreateNeg(Op0);
-
- // -X/C -> X/-C
- if (Value *LHSNeg = dyn_castNegVal(Op0))
- return BinaryOperator::CreateSDiv(LHSNeg, ConstantExpr::getNeg(RHS));
}
// If the sign bits of both operands are zero (i.e. we can prove they are
Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- // 0 % X == 0 for integer, we don't need to preserve faults!
- if (Constant *LHS = dyn_cast<Constant>(Op0))
- if (LHS->isNullValue())
- return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-
if (isa<UndefValue>(Op0)) { // undef % X -> 0
if (I.getType()->isFPOrFPVector())
return ReplaceInstUsesWith(I, Op0); // X % undef -> undef (could be SNaN)
if (Instruction *common = commonRemTransforms(I))
return common;
+ // 0 % X == 0 for integer, we don't need to preserve faults!
+ if (Constant *LHS = dyn_cast<Constant>(Op0))
+ if (LHS->isNullValue())
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
// X % 0 == undef, we don't need to preserve faults!
if (RHS->equalsInt(0))
}
// See if we can fold away this rem instruction.
- uint32_t BitWidth = cast<IntegerType>(I.getType())->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
- KnownZero, KnownOne))
+ if (SimplifyDemandedInstructionBits(I))
return &I;
}
}
}
}
+ // If it's a constant vector, flip any negative values positive.
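+ // srem's result takes the sign of the dividend, so X srem -Y == X srem Y
+ // and negating the negative divisor elements doesn't change the result.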
+ if (ConstantVector *RHSV = dyn_cast<ConstantVector>(Op1)) {
+ unsigned VWidth = RHSV->getNumOperands();
+
+ bool hasNegative = false;
+ for (unsigned i = 0; !hasNegative && i != VWidth; ++i)
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i)))
+ if (RHS->getValue().isNegative())
+ hasNegative = true;
+
+ if (hasNegative) {
+ std::vector<Constant *> Elts(VWidth);
+ for (unsigned i = 0; i != VWidth; ++i) {
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i))) {
+ if (RHS->getValue().isNegative())
+ Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
+ else
+ Elts[i] = RHS;
+ }
+ }
+
+ Constant *NewRHSV = ConstantVector::get(Elts);
+ if (NewRHSV != RHSV) {
+ AddUsesToWorkList(I);
+ I.setOperand(1, NewRHSV);
+ return &I;
+ }
+ }
+ }
+
return 0;
}
}
}
-
+/// PredicatesFoldable - Return true if both predicates have the same
+/// signedness or if at least one of them is an equality comparison (which is
+/// signless).
static bool PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) {
return (ICmpInst::isSignedPredicate(p1) == ICmpInst::isSignedPredicate(p2)) ||
- (ICmpInst::isSignedPredicate(p1) &&
- (p2 == ICmpInst::ICMP_EQ || p2 == ICmpInst::ICMP_NE)) ||
- (ICmpInst::isSignedPredicate(p2) &&
- (p1 == ICmpInst::ICMP_EQ || p1 == ICmpInst::ICMP_NE));
+ (ICmpInst::isSignedPredicate(p1) && ICmpInst::isEquality(p2)) ||
+ (ICmpInst::isSignedPredicate(p2) && ICmpInst::isEquality(p1));
}
namespace {
return InsertNewInstBefore(New, I);
}
+/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible.
+Instruction *InstCombiner::FoldAndOfICmps(Instruction &I,
+ ICmpInst *LHS, ICmpInst *RHS) {
+ Value *Val, *Val2;
+ ConstantInt *LHSCst, *RHSCst;
+ ICmpInst::Predicate LHSCC, RHSCC;
+
+ // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
+ if (!match(LHS, m_ICmp(LHSCC, m_Value(Val), m_ConstantInt(LHSCst))) ||
+ !match(RHS, m_ICmp(RHSCC, m_Value(Val2), m_ConstantInt(RHSCst))))
+ return 0;
+
+ // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
+ // where C is a power of 2
+ if (LHSCst == RHSCst && LHSCC == RHSCC && LHSCC == ICmpInst::ICMP_ULT &&
+ LHSCst->getValue().isPowerOf2()) {
+ Instruction *NewOr = BinaryOperator::CreateOr(Val, Val2);
+ InsertNewInstBefore(NewOr, I);
+ return new ICmpInst(LHSCC, NewOr, LHSCst);
+ }
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
+ if (Val != Val2) return 0;
+
+ // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
+ if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
+ RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
+ LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
+ RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+ return 0;
+
+ // We can't fold (ugt x, C) & (sgt x, C2).
+ if (!PredicatesFoldable(LHSCC, RHSCC))
+ return 0;
+
+ // Ensure that the larger constant is on the RHS.
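+ // Use a signed comparison of the constants when either predicate is signed
+ // (an equality predicate is signless), and an unsigned comparison otherwise.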
+ bool ShouldSwap;
+ if (ICmpInst::isSignedPredicate(LHSCC) ||
+ (ICmpInst::isEquality(LHSCC) &&
+ ICmpInst::isSignedPredicate(RHSCC)))
+ ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+ else
+ ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSCst, RHSCst);
+ std::swap(LHSCC, RHSCC);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and and'ing the result
+ // together. Because of the above check, we know that we only have
+ // icmp eq, icmp ne, icmp [su]lt, and icmp [su]gt here. We also know
+ // (from the FoldICmpLogical check above) that the two constants are
+ // not equal and that the larger constant is on the RHS.
+ assert(LHSCst != RHSCst && "Compares not folded above?");
+
+ switch (LHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X == 13 & X == 15) -> false
+ case ICmpInst::ICMP_UGT: // (X == 13 & X > 15) -> false
+ case ICmpInst::ICMP_SGT: // (X == 13 & X > 15) -> false
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13
+ case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13
+ case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13
+ return ReplaceInstUsesWith(I, LHS);
+ }
+ case ICmpInst::ICMP_NE:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_ULT:
+ if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
+ return new ICmpInst(ICmpInst::ICMP_ULT, Val, LHSCst);
+ break; // (X != 13 & X u< 15) -> no change
+ case ICmpInst::ICMP_SLT:
+ if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
+ return new ICmpInst(ICmpInst::ICMP_SLT, Val, LHSCst);
+ break; // (X != 13 & X s< 15) -> no change
+ case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15
+ case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_NE:
+ if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
+ Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+ Instruction *Add = BinaryOperator::CreateAdd(Val, AddCST,
+ Val->getName()+".off");
+ InsertNewInstBefore(Add, I);
+ return new ICmpInst(ICmpInst::ICMP_UGT, Add,
+ ConstantInt::get(Add->getType(), 1));
+ }
+ break; // (X != 13 & X != 15) -> no change
+ }
+ break;
+ case ICmpInst::ICMP_ULT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 & X == 15) -> false
+ case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ case ICmpInst::ICMP_SGT: // (X u< 13 & X s> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13
+ case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_SLT: // (X u< 13 & X s< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_SLT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s< 13 & X == 15) -> false
+ case ICmpInst::ICMP_SGT: // (X s< 13 & X s> 15) -> false
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+ case ICmpInst::ICMP_UGT: // (X s< 13 & X u> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13
+ case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_ULT: // (X s< 13 & X u< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_UGT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_SGT: // (X u> 13 & X s> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE:
+ if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
+ return new ICmpInst(LHSCC, Val, RHSCst);
+ break; // (X u> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
+ return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true, I);
+ case ICmpInst::ICMP_SLT: // (X u> 13 & X s< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15
+ case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_UGT: // (X s> 13 & X u> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE:
+ if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
+ return new ICmpInst(LHSCC, Val, RHSCst);
+ break; // (X s> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
+ return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, true, true, I);
+ case ICmpInst::ICMP_ULT: // (X s> 13 & X u< 15) -> no change
+ break;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+
Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
bool Changed = SimplifyCommutative(I);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (!isa<VectorType>(I.getType())) {
- uint32_t BitWidth = cast<IntegerType>(I.getType())->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
- KnownZero, KnownOne))
+ if (SimplifyDemandedInstructionBits(I))
return &I;
} else {
if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
std::swap(Op0, Op1);
}
}
+
if (Op1->hasOneUse() &&
match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
if (B == Op0) { // B&(A^B) -> B&(B^A)
return BinaryOperator::CreateAnd(A, NotB);
}
}
- }
-
- { // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
- // where C is a power of 2
- Value *A, *B;
- ConstantInt *C1, *C2;
- ICmpInst::Predicate LHSCC = ICmpInst::BAD_ICMP_PREDICATE;
- ICmpInst::Predicate RHSCC = ICmpInst::BAD_ICMP_PREDICATE;
- if (match(&I, m_And(m_ICmp(LHSCC, m_Value(A), m_ConstantInt(C1)),
- m_ICmp(RHSCC, m_Value(B), m_ConstantInt(C2)))))
- if (C1 == C2 && LHSCC == RHSCC && LHSCC == ICmpInst::ICMP_ULT &&
- C1->getValue().isPowerOf2()) {
- Instruction *NewOr = BinaryOperator::CreateOr(A, B);
- InsertNewInstBefore(NewOr, I);
- return new ICmpInst(LHSCC, NewOr, C1);
- }
+
+ // (A&((~A)|B)) -> A&B
+ if (match(Op0, m_Or(m_Not(m_Specific(Op1)), m_Value(A))) ||
+ match(Op0, m_Or(m_Value(A), m_Not(m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(A, Op1);
+ if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) ||
+ match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0)))))
+ return BinaryOperator::CreateAnd(A, Op0);
}
if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1)) {
if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
return R;
- Value *LHSVal, *RHSVal;
- ConstantInt *LHSCst, *RHSCst;
- ICmpInst::Predicate LHSCC, RHSCC;
- if (match(Op0, m_ICmp(LHSCC, m_Value(LHSVal), m_ConstantInt(LHSCst))))
- if (match(RHS, m_ICmp(RHSCC, m_Value(RHSVal), m_ConstantInt(RHSCst))))
- if (LHSVal == RHSVal && // Found (X icmp C1) & (X icmp C2)
- // ICMP_[GL]E X, CST is folded to ICMP_[GL]T elsewhere.
- LHSCC != ICmpInst::ICMP_UGE && LHSCC != ICmpInst::ICMP_ULE &&
- RHSCC != ICmpInst::ICMP_UGE && RHSCC != ICmpInst::ICMP_ULE &&
- LHSCC != ICmpInst::ICMP_SGE && LHSCC != ICmpInst::ICMP_SLE &&
- RHSCC != ICmpInst::ICMP_SGE && RHSCC != ICmpInst::ICMP_SLE &&
-
- // Don't try to fold ICMP_SLT + ICMP_ULT.
- (ICmpInst::isEquality(LHSCC) || ICmpInst::isEquality(RHSCC) ||
- ICmpInst::isSignedPredicate(LHSCC) ==
- ICmpInst::isSignedPredicate(RHSCC))) {
- // Ensure that the larger constant is on the RHS.
- ICmpInst::Predicate GT;
- if (ICmpInst::isSignedPredicate(LHSCC) ||
- (ICmpInst::isEquality(LHSCC) &&
- ICmpInst::isSignedPredicate(RHSCC)))
- GT = ICmpInst::ICMP_SGT;
- else
- GT = ICmpInst::ICMP_UGT;
-
- Constant *Cmp = ConstantExpr::getICmp(GT, LHSCst, RHSCst);
- ICmpInst *LHS = cast<ICmpInst>(Op0);
- if (cast<ConstantInt>(Cmp)->getZExtValue()) {
- std::swap(LHS, RHS);
- std::swap(LHSCst, RHSCst);
- std::swap(LHSCC, RHSCC);
- }
-
- // At this point, we know we have have two icmp instructions
- // comparing a value against two constants and and'ing the result
- // together. Because of the above check, we know that we only have
- // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know
- // (from the FoldICmpLogical check above), that the two constants
- // are not equal and that the larger constant is on the RHS
- assert(LHSCst != RHSCst && "Compares not folded above?");
-
- switch (LHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X == 13 & X == 15) -> false
- case ICmpInst::ICMP_UGT: // (X == 13 & X > 15) -> false
- case ICmpInst::ICMP_SGT: // (X == 13 & X > 15) -> false
- return ReplaceInstUsesWith(I, ConstantInt::getFalse());
- case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13
- case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13
- case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13
- return ReplaceInstUsesWith(I, LHS);
- }
- case ICmpInst::ICMP_NE:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_ULT:
- if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
- return new ICmpInst(ICmpInst::ICMP_ULT, LHSVal, LHSCst);
- break; // (X != 13 & X u< 15) -> no change
- case ICmpInst::ICMP_SLT:
- if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
- return new ICmpInst(ICmpInst::ICMP_SLT, LHSVal, LHSCst);
- break; // (X != 13 & X s< 15) -> no change
- case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15
- case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15
- return ReplaceInstUsesWith(I, RHS);
- case ICmpInst::ICMP_NE:
- if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
- Constant *AddCST = ConstantExpr::getNeg(LHSCst);
- Instruction *Add = BinaryOperator::CreateAdd(LHSVal, AddCST,
- LHSVal->getName()+".off");
- InsertNewInstBefore(Add, I);
- return new ICmpInst(ICmpInst::ICMP_UGT, Add,
- ConstantInt::get(Add->getType(), 1));
- }
- break; // (X != 13 & X != 15) -> no change
- }
- break;
- case ICmpInst::ICMP_ULT:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 & X == 15) -> false
- case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false
- return ReplaceInstUsesWith(I, ConstantInt::getFalse());
- case ICmpInst::ICMP_SGT: // (X u< 13 & X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13
- case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13
- return ReplaceInstUsesWith(I, LHS);
- case ICmpInst::ICMP_SLT: // (X u< 13 & X s< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_SLT:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s< 13 & X == 15) -> false
- case ICmpInst::ICMP_SGT: // (X s< 13 & X s> 15) -> false
- return ReplaceInstUsesWith(I, ConstantInt::getFalse());
- case ICmpInst::ICMP_UGT: // (X s< 13 & X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13
- case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13
- return ReplaceInstUsesWith(I, LHS);
- case ICmpInst::ICMP_ULT: // (X s< 13 & X u< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_UGT:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15
- return ReplaceInstUsesWith(I, RHS);
- case ICmpInst::ICMP_SGT: // (X u> 13 & X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE:
- if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
- return new ICmpInst(LHSCC, LHSVal, RHSCst);
- break; // (X u> 13 & X != 15) -> no change
- case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) ->(X-14) <u 1
- return InsertRangeTest(LHSVal, AddOne(LHSCst), RHSCst, false,
- true, I);
- case ICmpInst::ICMP_SLT: // (X u> 13 & X s< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_SGT:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15
- return ReplaceInstUsesWith(I, RHS);
- case ICmpInst::ICMP_UGT: // (X s> 13 & X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE:
- if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
- return new ICmpInst(LHSCC, LHSVal, RHSCst);
- break; // (X s> 13 & X != 15) -> no change
- case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) ->(X-14) s< 1
- return InsertRangeTest(LHSVal, AddOne(LHSCst), RHSCst, true,
- true, I);
- case ICmpInst::ICMP_ULT: // (X s> 13 & X u< 15) -> no change
- break;
- }
- break;
- }
- }
+ if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0))
+ if (Instruction *Res = FoldAndOfICmps(I, LHS, RHS))
+ return Res;
}
// fold (and (cast A), (cast B)) -> (cast (and A, B))
return CallInst::Create(F, V);
}
+/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D). Check
+/// if A is (cond?-1:0) and either B or D is ~(cond?-1:0) or (cond?0:-1); if so,
+/// we can simplify this expression to "cond ? C : D" or "cond ? C : B".
+static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
+ Value *C, Value *D) {
+ // If A is not a select of -1/0, this cannot match.
+ Value *Cond = 0;
+ if (!match(A, m_SelectCst<-1, 0>(m_Value(Cond))))
+ return 0;
+
+ // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B.
+ if (match(D, m_SelectCst<0, -1>(m_Specific(Cond))))
+ return SelectInst::Create(Cond, C, B);
+ if (match(D, m_Not(m_SelectCst<-1, 0>(m_Specific(Cond)))))
+ return SelectInst::Create(Cond, C, B);
+ // ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D.
+ if (match(B, m_SelectCst<0, -1>(m_Specific(Cond))))
+ return SelectInst::Create(Cond, C, D);
+ if (match(B, m_Not(m_SelectCst<-1, 0>(m_Specific(Cond)))))
+ return SelectInst::Create(Cond, C, D);
+ return 0;
+}
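As a sanity check that is not part of the patch, the mask identity MatchSelectFromAndOr depends on can be verified directly; the payload constants C and D below are arbitrary and purely illustrative:

#include <cassert>
#include <cstdint>

int main() {
  const int32_t C = 0x12345678, D = 0x0BADF00D;  // arbitrary illustrative payloads
  for (int cond = 0; cond <= 1; ++cond) {
    int32_t A = cond ? -1 : 0;                   // cond ? all-ones : all-zeros
    int32_t B = cond ? 0 : -1;                   // the complementary mask
    assert(((A & C) | (B & D)) == (cond ? C : D));
  }
  return 0;
}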
+
+/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
+Instruction *InstCombiner::FoldOrOfICmps(Instruction &I,
+ ICmpInst *LHS, ICmpInst *RHS) {
+ Value *Val, *Val2;
+ ConstantInt *LHSCst, *RHSCst;
+ ICmpInst::Predicate LHSCC, RHSCC;
+
+ // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
+ if (!match(LHS, m_ICmp(LHSCC, m_Value(Val), m_ConstantInt(LHSCst))) ||
+ !match(RHS, m_ICmp(RHSCC, m_Value(Val2), m_ConstantInt(RHSCst))))
+ return 0;
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
+ if (Val != Val2) return 0;
+
+ // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
+ if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
+ RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
+ LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
+ RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+ return 0;
+
+ // We can't fold (ugt x, C) | (sgt x, C2).
+ if (!PredicatesFoldable(LHSCC, RHSCC))
+ return 0;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (ICmpInst::isSignedPredicate(LHSCC) ||
+ (ICmpInst::isEquality(LHSCC) &&
+ ICmpInst::isSignedPredicate(RHSCC)))
+ ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+ else
+ ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSCst, RHSCst);
+ std::swap(LHSCC, RHSCC);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and or'ing the result
+ // together. Because of the above check, we know that we only have
+ // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
+ // FoldICmpLogical check above) that the two constants are not
+ // equal.
+ assert(LHSCst != RHSCst && "Compares not folded above?");
+
+ switch (LHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ if (LHSCst == SubOne(RHSCst)) { // (X == 13 | X == 14) -> X-13 <u 2
+ Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+ Instruction *Add = BinaryOperator::CreateAdd(Val, AddCST,
+ Val->getName()+".off");
+ InsertNewInstBefore(Add, I);
+ AddCST = Subtract(AddOne(RHSCst), LHSCst);
+ return new ICmpInst(ICmpInst::ICMP_ULT, Add, AddCST);
+ }
+ break; // (X == 13 | X == 15) -> no change
+ case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
+ case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X == 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15
+ case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15
+ return ReplaceInstUsesWith(I, RHS);
+ }
+ break;
+ case ICmpInst::ICMP_NE:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X != 13 | X == 15) -> X != 13
+ case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13
+ case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true
+ case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true
+ case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ }
+ break;
+ case ICmpInst::ICMP_ULT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
+ break;
+ case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
+ // If RHSCst is [us]MAXINT, it is always false. Not handling
+ // this can cause overflow.
+ if (RHSCst->isMaxValue(false))
+ return ReplaceInstUsesWith(I, LHS);
+ return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), false, false, I);
+ case ICmpInst::ICMP_SGT: // (X u< 13 | X s> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_SLT: // (X u< 13 | X s< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_SLT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
+ break;
+ case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2
+ // If RHSCst is [us]MAXINT, it is always false. Not handling
+ // this can cause overflow.
+ if (RHSCst->isMaxValue(true))
+ return ReplaceInstUsesWith(I, LHS);
+ return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), true, false, I);
+ case ICmpInst::ICMP_UGT: // (X s< 13 | X u> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15
+ case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15
+ return ReplaceInstUsesWith(I, RHS);
+ case ICmpInst::ICMP_ULT: // (X s< 13 | X u< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_UGT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u> 13 | X == 15) -> X u> 13
+ case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_SGT: // (X u> 13 | X s> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true
+ case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ switch (RHSCC) {
+ default: assert(0 && "Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s> 13 | X == 15) -> X > 13
+ case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13
+ return ReplaceInstUsesWith(I, LHS);
+ case ICmpInst::ICMP_UGT: // (X s> 13 | X u> 15) -> no change
+ break;
+ case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true
+ case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change
+ break;
+ }
+ break;
+ }
+ return 0;
+}
+
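For reference (not part of the patch), the adjacent-constant case handled first in FoldOrOfICmps, (X == 13 | X == 14) -> (X-13) u< 2, can be checked exhaustively on a small unsigned type:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X < 256; ++X) {
    uint8_t x = static_cast<uint8_t>(X);
    bool orig   = (x == 13) || (x == 14);            // X == 13 | X == 14
    bool folded = static_cast<uint8_t>(x - 13) < 2;  // (X-13) u< 2
    assert(orig == folded);
  }
  return 0;
}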
+/// FoldOrWithConstants - This helper function folds:
+///
+/// ((A | B) & C1) | (B & C2)
+///
+/// into:
+///
+/// (A & C1) | B
+///
+/// when the XOR of the two constants is "all ones" (-1).
+Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op,
+ Value *A, Value *B, Value *C) {
+ ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
+ if (!CI1) return 0;
+
+ Value *V1 = 0;
+ ConstantInt *CI2 = 0;
+ if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return 0;
+
+ APInt Xor = CI1->getValue() ^ CI2->getValue();
+ if (!Xor.isAllOnesValue()) return 0;
+
+ if (V1 == A || V1 == B) {
+ Instruction *NewOp =
+ InsertNewInstBefore(BinaryOperator::CreateAnd((V1 == A) ? B : A, CI1), I);
+ return BinaryOperator::CreateOr(NewOp, V1);
+ }
+
+ return 0;
+}
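A standalone check, not part of the patch, of the algebra behind FoldOrWithConstants: with C2 == ~C1, ((A|B)&C1) | (B&C2) always equals (A&C1) | B. The 8-bit width and the concrete constants 1 and -2 are used only so the loop can cover all inputs:

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t C1 = 1, C2 = static_cast<uint8_t>(~1);  // C1 ^ C2 == 0xFF (all ones)
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t a = static_cast<uint8_t>(A), b = static_cast<uint8_t>(B);
      uint8_t orig   = static_cast<uint8_t>(((a | b) & C1) | (b & C2));
      uint8_t folded = static_cast<uint8_t>((a & C1) | b);
      assert(orig == folded);
    }
  return 0;
}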
Instruction *InstCombiner::visitOr(BinaryOperator &I) {
bool Changed = SimplifyCommutative(I);
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (!isa<VectorType>(I.getType())) {
- uint32_t BitWidth = cast<IntegerType>(I.getType())->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
- KnownZero, KnownOne))
+ if (SimplifyDemandedInstructionBits(I))
return &I;
} else if (isa<ConstantAggregateZero>(Op1)) {
return ReplaceInstUsesWith(I, Op0); // X | <0,0> -> X
}
}
-#define SELECT_MATCH(Val, I1, I2) \
- m_Select(m_Value(Val), m_ConstantInt(I1), m_ConstantInt(I2))
-
// (A & (C0?-1:0)) | (B & ~(C0?-1:0)) -> C0 ? A : B, and commuted variants
- Value *C0 = 0;
- if (match(A, m_Select(m_Value(C0), m_ConstantInt(-1), m_ConstantInt(0)))) {
- Value *C1 = 0;
- if (match(D, m_Not(SELECT_MATCH(C1, -1, 0))) && C1 == C0)
- return SelectInst::Create(C1, C, B);
- if (match(B, m_Not(SELECT_MATCH(C1, -1, 0))) && C1 == C0)
- return SelectInst::Create(C1, C, D);
- }
- if (match(B, m_Select(m_Value(C0), m_ConstantInt(-1), m_ConstantInt(0)))) {
- Value *C1 = 0;
- if (match(C, m_Not(SELECT_MATCH(C1, -1, 0))) && C1 == C0)
- return SelectInst::Create(C1, A, D);
- if (match(A, m_Not(SELECT_MATCH(C1, -1, 0))) && C1 == C0)
- return SelectInst::Create(C1, C, D);
- }
- if (match(C, m_Select(m_Value(C0), m_ConstantInt(-1), m_ConstantInt(0)))) {
- Value *C1 = 0;
- if (match(D, m_Not(SELECT_MATCH(C1, -1, 0))) && C1 == C0)
- return SelectInst::Create(C1, A, B);
- if (match(B, m_Not(SELECT_MATCH(C1, -1, 0))) && C1 == C0)
- return SelectInst::Create(C1, A, D);
- }
- if (match(D, m_Select(m_Value(C0), m_ConstantInt(-1), m_ConstantInt(0)))) {
- Value *C1 = 0;
- if (match(C, m_Not(SELECT_MATCH(C1, -1, 0))) && C1 == C0)
- return SelectInst::Create(C1, A, B);
- if (match(A, m_Not(SELECT_MATCH(C1, -1, 0))) && C1 == C0)
- return SelectInst::Create(C1, C, B);
- }
- if (match(A, m_Select(m_Value(C0), m_ConstantInt(0), m_ConstantInt(-1)))) {
- Value *C1 = 0;
- if (match(D, m_Not(SELECT_MATCH(C1, 0, -1))) && C1 == C0)
- return SelectInst::Create(C1, B, C);
- if (match(B, m_Not(SELECT_MATCH(C1, 0, -1))) && C1 == C0)
- return SelectInst::Create(C1, D, C);
- }
- if (match(B, m_Select(m_Value(C0), m_ConstantInt(0), m_ConstantInt(-1)))) {
- Value *C1 = 0;
- if (match(C, m_Not(SELECT_MATCH(C1, 0, -1))) && C1 == C0)
- return SelectInst::Create(C1, D, A);
- if (match(A, m_Not(SELECT_MATCH(C1, 0, -1))) && C1 == C0)
- return SelectInst::Create(C1, D, C);
- }
- if (match(C, m_Select(m_Value(C0), m_ConstantInt(0), m_ConstantInt(-1)))) {
- Value *C1 = 0;
- if (match(D, m_Not(SELECT_MATCH(C1, 0, -1))) && C1 == C0)
- return SelectInst::Create(C1, B, A);
- if (match(B, m_Not(SELECT_MATCH(C1, 0, -1))) && C1 == C0)
- return SelectInst::Create(C1, D, A);
- }
- if (match(D, m_Select(m_Value(C0), m_ConstantInt(0), m_ConstantInt(-1)))) {
- Value *C1 = 0;
- if (match(C, m_Not(SELECT_MATCH(C1, 0, -1))) && C1 == C0)
- return SelectInst::Create(C1, B, A);
- if (match(A, m_Not(SELECT_MATCH(C1, 0, -1))) && C1 == C0)
- return SelectInst::Create(C1, B, C);
- }
-#undef SELECT_MATCH
+ if (Instruction *Match = MatchSelectFromAndOr(A, B, C, D))
+ return Match;
+ if (Instruction *Match = MatchSelectFromAndOr(B, A, D, C))
+ return Match;
+ if (Instruction *Match = MatchSelectFromAndOr(C, B, A, D))
+ return Match;
+ if (Instruction *Match = MatchSelectFromAndOr(D, A, B, C))
+ return Match;
+
+ // ((A&~B)|(~A&B)) -> A^B
+ if ((match(C, m_Not(m_Specific(D))) &&
+ match(B, m_Not(m_Specific(A)))))
+ return BinaryOperator::CreateXor(A, D);
+ // ((~B&A)|(~A&B)) -> A^B
+ if ((match(A, m_Not(m_Specific(D))) &&
+ match(B, m_Not(m_Specific(C)))))
+ return BinaryOperator::CreateXor(C, D);
+ // ((A&~B)|(B&~A)) -> A^B
+ if ((match(C, m_Not(m_Specific(B))) &&
+ match(D, m_Not(m_Specific(A)))))
+ return BinaryOperator::CreateXor(A, B);
+ // ((~B&A)|(B&~A)) -> A^B
+ if ((match(A, m_Not(m_Specific(B))) &&
+ match(D, m_Not(m_Specific(C)))))
+ return BinaryOperator::CreateXor(C, B);
}
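For reference (not part of the patch), the xor-recognition folds above rest on the identity (A & ~B) | (~A & B) == A ^ B, which a small exhaustive loop confirms:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t a = static_cast<uint8_t>(A), b = static_cast<uint8_t>(B);
      uint8_t folded = static_cast<uint8_t>((a & ~b) | (~a & b));
      assert(folded == static_cast<uint8_t>(a ^ b));
    }
  return 0;
}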
// (X >> Z) | (Y >> Z) -> (X|Y) >> Z for all shifts.
}
}
+ // ((A|B)&1)|(B&-2) -> (A&1) | B
+ if (match(Op0, m_And(m_Or(m_Value(A), m_Value(B)), m_Value(C))) ||
+ match(Op0, m_And(m_Value(C), m_Or(m_Value(A), m_Value(B))))) {
+ Instruction *Ret = FoldOrWithConstants(I, Op1, A, B, C);
+ if (Ret) return Ret;
+ }
+ // (B&-2)|((A|B)&1) -> (A&1) | B
+ if (match(Op1, m_And(m_Or(m_Value(A), m_Value(B)), m_Value(C))) ||
+ match(Op1, m_And(m_Value(C), m_Or(m_Value(A), m_Value(B))))) {
+ Instruction *Ret = FoldOrWithConstants(I, Op0, A, B, C);
+ if (Ret) return Ret;
+ }
+
if (match(Op0, m_Not(m_Value(A)))) { // ~A | Op1
if (A == Op1) // ~A | A == -1
return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
return R;
- Value *LHSVal, *RHSVal;
- ConstantInt *LHSCst, *RHSCst;
- ICmpInst::Predicate LHSCC, RHSCC;
- if (match(Op0, m_ICmp(LHSCC, m_Value(LHSVal), m_ConstantInt(LHSCst))))
- if (match(RHS, m_ICmp(RHSCC, m_Value(RHSVal), m_ConstantInt(RHSCst))))
- if (LHSVal == RHSVal && // Found (X icmp C1) | (X icmp C2)
- // icmp [us][gl]e x, cst is folded to icmp [us][gl]t elsewhere.
- LHSCC != ICmpInst::ICMP_UGE && LHSCC != ICmpInst::ICMP_ULE &&
- RHSCC != ICmpInst::ICMP_UGE && RHSCC != ICmpInst::ICMP_ULE &&
- LHSCC != ICmpInst::ICMP_SGE && LHSCC != ICmpInst::ICMP_SLE &&
- RHSCC != ICmpInst::ICMP_SGE && RHSCC != ICmpInst::ICMP_SLE &&
- // We can't fold (ugt x, C) | (sgt x, C2).
- PredicatesFoldable(LHSCC, RHSCC)) {
- // Ensure that the larger constant is on the RHS.
- ICmpInst *LHS = cast<ICmpInst>(Op0);
- bool NeedsSwap;
- if (ICmpInst::isEquality(LHSCC) ? ICmpInst::isSignedPredicate(RHSCC)
- : ICmpInst::isSignedPredicate(LHSCC))
- NeedsSwap = LHSCst->getValue().sgt(RHSCst->getValue());
- else
- NeedsSwap = LHSCst->getValue().ugt(RHSCst->getValue());
-
- if (NeedsSwap) {
- std::swap(LHS, RHS);
- std::swap(LHSCst, RHSCst);
- std::swap(LHSCC, RHSCC);
- }
-
- // At this point, we know we have have two icmp instructions
- // comparing a value against two constants and or'ing the result
- // together. Because of the above check, we know that we only have
- // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
- // FoldICmpLogical check above), that the two constants are not
- // equal.
- assert(LHSCst != RHSCst && "Compares not folded above?");
-
- switch (LHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- if (LHSCst == SubOne(RHSCst)) {// (X == 13 | X == 14) -> X-13 <u 2
- Constant *AddCST = ConstantExpr::getNeg(LHSCst);
- Instruction *Add = BinaryOperator::CreateAdd(LHSVal, AddCST,
- LHSVal->getName()+".off");
- InsertNewInstBefore(Add, I);
- AddCST = Subtract(AddOne(RHSCst), LHSCst);
- return new ICmpInst(ICmpInst::ICMP_ULT, Add, AddCST);
- }
- break; // (X == 13 | X == 15) -> no change
- case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
- case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X == 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15
- case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15
- return ReplaceInstUsesWith(I, RHS);
- }
- break;
- case ICmpInst::ICMP_NE:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X != 13 | X == 15) -> X != 13
- case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13
- case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13
- return ReplaceInstUsesWith(I, LHS);
- case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true
- case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true
- case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true
- return ReplaceInstUsesWith(I, ConstantInt::getTrue());
- }
- break;
- case ICmpInst::ICMP_ULT:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
- break;
- case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) ->(X-13) u> 2
- // If RHSCst is [us]MAXINT, it is always false. Not handling
- // this can cause overflow.
- if (RHSCst->isMaxValue(false))
- return ReplaceInstUsesWith(I, LHS);
- return InsertRangeTest(LHSVal, LHSCst, AddOne(RHSCst), false,
- false, I);
- case ICmpInst::ICMP_SGT: // (X u< 13 | X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15
- return ReplaceInstUsesWith(I, RHS);
- case ICmpInst::ICMP_SLT: // (X u< 13 | X s< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_SLT:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
- break;
- case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) ->(X-13) s> 2
- // If RHSCst is [us]MAXINT, it is always false. Not handling
- // this can cause overflow.
- if (RHSCst->isMaxValue(true))
- return ReplaceInstUsesWith(I, LHS);
- return InsertRangeTest(LHSVal, LHSCst, AddOne(RHSCst), true,
- false, I);
- case ICmpInst::ICMP_UGT: // (X s< 13 | X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15
- return ReplaceInstUsesWith(I, RHS);
- case ICmpInst::ICMP_ULT: // (X s< 13 | X u< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_UGT:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u> 13 | X == 15) -> X u> 13
- case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13
- return ReplaceInstUsesWith(I, LHS);
- case ICmpInst::ICMP_SGT: // (X u> 13 | X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true
- case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true
- return ReplaceInstUsesWith(I, ConstantInt::getTrue());
- case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_SGT:
- switch (RHSCC) {
- default: assert(0 && "Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s> 13 | X == 15) -> X > 13
- case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13
- return ReplaceInstUsesWith(I, LHS);
- case ICmpInst::ICMP_UGT: // (X s> 13 | X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true
- case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true
- return ReplaceInstUsesWith(I, ConstantInt::getTrue());
- case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change
- break;
- }
- break;
- }
- }
+ if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+ if (Instruction *Res = FoldOrOfICmps(I, LHS, RHS))
+ return Res;
}
// fold (or (cast A), (cast B)) -> (cast (or A, B))
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (!isa<VectorType>(I.getType())) {
- uint32_t BitWidth = cast<IntegerType>(I.getType())->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
- KnownZero, KnownOne))
+ if (SimplifyDemandedInstructionBits(I))
return &I;
} else if (isa<ConstantAggregateZero>(Op1)) {
return ReplaceInstUsesWith(I, Op0); // X ^ <0,0> -> X
if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
- // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
if (RHS == ConstantInt::getTrue() && Op0->hasOneUse()) {
+ // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
if (ICmpInst *ICI = dyn_cast<ICmpInst>(Op0))
return new ICmpInst(ICI->getInversePredicate(),
ICI->getOperand(0), ICI->getOperand(1));
I.swapOperands(); // Simplified below.
std::swap(Op0, Op1);
}
- } else if (match(Op1I, m_Xor(m_Value(A), m_Value(B)))) {
- if (Op0 == A) // A^(A^B) == B
- return ReplaceInstUsesWith(I, B);
- else if (Op0 == B) // A^(B^A) == B
- return ReplaceInstUsesWith(I, A);
+ } else if (match(Op1I, m_Xor(m_Specific(Op0), m_Value(B)))) {
+ return ReplaceInstUsesWith(I, B); // A^(A^B) == B
+ } else if (match(Op1I, m_Xor(m_Value(A), m_Specific(Op0)))) {
+ return ReplaceInstUsesWith(I, A); // A^(B^A) == B
} else if (match(Op1I, m_And(m_Value(A), m_Value(B))) && Op1I->hasOneUse()){
if (A == Op0) { // A^(A&B) -> A^(B&A)
Op1I->swapOperands();
InsertNewInstBefore(BinaryOperator::CreateNot(Op1, "tmp"), I);
return BinaryOperator::CreateAnd(A, NotB);
}
- } else if (match(Op0I, m_Xor(m_Value(A), m_Value(B)))) {
- if (Op1 == A) // (A^B)^A == B
- return ReplaceInstUsesWith(I, B);
- else if (Op1 == B) // (B^A)^A == B
- return ReplaceInstUsesWith(I, A);
+ } else if (match(Op0I, m_Xor(m_Specific(Op1), m_Value(B)))) {
+ return ReplaceInstUsesWith(I, B); // (A^B)^A == B
+ } else if (match(Op0I, m_Xor(m_Value(A), m_Specific(Op1)))) {
+ return ReplaceInstUsesWith(I, A); // (B^A)^A == B
} else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && Op0I->hasOneUse()){
if (A == Op1) // (A&B)^A -> (B&A)^A
std::swap(A, B);
for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
++i, ++GTI) {
Value *Op = *i;
- uint64_t Size = TD.getABITypeSize(GTI.getIndexedType()) & PtrSizeMask;
+ uint64_t Size = TD.getTypePaddedSize(GTI.getIndexedType()) & PtrSizeMask;
if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
if (OpC->isZero()) continue;
if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
} else {
- uint64_t Size = TD.getABITypeSize(GTI.getIndexedType());
+ uint64_t Size = TD.getTypePaddedSize(GTI.getIndexedType());
Offset += Size*CI->getSExtValue();
}
} else {
Value *VariableIdx = GEP->getOperand(i);
// Determine the scale factor of the variable element. For example, this is
// 4 if the variable index is into an array of i32.
- uint64_t VariableScale = TD.getABITypeSize(GTI.getIndexedType());
+ uint64_t VariableScale = TD.getTypePaddedSize(GTI.getIndexedType());
// Verify that there are no other variable indices. If so, emit the hard way.
for (++i, ++GTI; i != e; ++i, ++GTI) {
if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
} else {
- uint64_t Size = TD.getABITypeSize(GTI.getIndexedType());
+ uint64_t Size = TD.getTypePaddedSize(GTI.getIndexedType());
Offset += Size*CI->getSExtValue();
}
}
Pred = ICmpInst::ICMP_NE;
break;
case FCmpInst::FCMP_ORD:
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
case FCmpInst::FCMP_UNO:
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
}
const IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
Pred == ICmpInst::ICMP_SLE)
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
}
} else {
// If the RHS value is > UnsignedMax, fold the comparison. This handles
if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
Pred == ICmpInst::ICMP_ULE)
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
}
}
if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
Pred == ICmpInst::ICMP_SGE)
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I,ConstantInt::getTrue());
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
}
}
switch (Pred) {
default: assert(0 && "Unexpected integer comparison!");
case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
case ICmpInst::ICMP_ULE:
// (float)int <= 4.4 --> int <= 4
// (float)int <= -4.4 --> false
if (RHS.isNegative())
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
break;
case ICmpInst::ICMP_SLE:
// (float)int <= 4.4 --> int <= 4
// (float)int < -4.4 --> false
// (float)int < 4.4 --> int <= 4
if (RHS.isNegative())
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
Pred = ICmpInst::ICMP_ULE;
break;
case ICmpInst::ICMP_SLT:
// (float)int > 4.4 --> int > 4
// (float)int > -4.4 --> true
if (RHS.isNegative())
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
break;
case ICmpInst::ICMP_SGT:
// (float)int > 4.4 --> int > 4
// (float)int >= -4.4 --> true
// (float)int >= 4.4 --> int > 4
if (!RHS.isNegative())
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
Pred = ICmpInst::ICMP_UGT;
break;
case ICmpInst::ICMP_SGE:
// Fold trivial predicates.
if (I.getPredicate() == FCmpInst::FCMP_FALSE)
- return ReplaceInstUsesWith(I, Constant::getNullValue(Type::Int1Ty));
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
if (I.getPredicate() == FCmpInst::FCMP_TRUE)
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
// Simplify 'fcmp pred X, X'
if (Op0 == Op1) {
case FCmpInst::FCMP_UEQ: // True if unordered or equal
case FCmpInst::FCMP_UGE: // True if unordered, greater than, or equal
case FCmpInst::FCMP_ULE: // True if unordered, less than, or equal
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
case FCmpInst::FCMP_OGT: // True if ordered and greater than
case FCmpInst::FCMP_OLT: // True if ordered and less than
case FCmpInst::FCMP_ONE: // True if ordered and operands are unequal
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
case FCmpInst::FCMP_ULT: // True if unordered or less than
if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
if (CFP->getValueAPF().isNaN()) {
if (FCmpInst::isOrdered(I.getPredicate())) // True if ordered and...
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+ return ReplaceInstUsesWith(I, ConstantInt::getFalse());
assert(FCmpInst::isUnordered(I.getPredicate()) &&
"Comparison must be either ordered or unordered!");
// True if unordered.
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+ return ReplaceInstUsesWith(I, ConstantInt::getTrue());
}
}
bool UnusedBit;
bool isSignBit = isSignBitCheck(I.getPredicate(), CI, UnusedBit);
- if (SimplifyDemandedBits(Op0,
+ if (SimplifyDemandedBits(I.getOperandUse(0),
isSignBit ? APInt::getSignBit(BitWidth)
: APInt::getAllOnesValue(BitWidth),
KnownZero, KnownOne, 0))
if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) {
if (Op0I->getOpcode() == Op1I->getOpcode() && Op0I->hasOneUse() &&
- Op1I->hasOneUse() && Op0I->getOperand(1) == Op1I->getOperand(1) &&
- I.isEquality()) {
+ Op1I->hasOneUse() && Op0I->getOperand(1) == Op1I->getOperand(1)) {
switch (Op0I->getOpcode()) {
default: break;
case Instruction::Add:
case Instruction::Sub:
case Instruction::Xor:
- // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(I.getPredicate(), Op0I->getOperand(0),
- Op1I->getOperand(0));
+ if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
+ return new ICmpInst(I.getPredicate(), Op0I->getOperand(0),
+ Op1I->getOperand(0));
+ // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+ if (CI->getValue().isSignBit()) {
+ ICmpInst::Predicate Pred = I.isSignedPredicate()
+ ? I.getUnsignedPredicate()
+ : I.getSignedPredicate();
+ return new ICmpInst(Pred, Op0I->getOperand(0),
+ Op1I->getOperand(0));
+ }
+
+ if (CI->getValue().isMaxSignedValue()) {
+ ICmpInst::Predicate Pred = I.isSignedPredicate()
+ ? I.getUnsignedPredicate()
+ : I.getSignedPredicate();
+ Pred = I.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, Op0I->getOperand(0),
+ Op1I->getOperand(0));
+ }
+ }
break;
case Instruction::Mul:
+ if (!I.isEquality())
+ break;
+
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
// a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
// Mask = -1 >> count-trailing-zeros(Cst).
if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
// A^c1 == C^c2 --> A == C^(c1^c2)
- if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
- if (ConstantInt *C2 = dyn_cast<ConstantInt>(D))
- if (Op1->hasOneUse()) {
- Constant *NC = ConstantInt::get(C1->getValue() ^ C2->getValue());
- Instruction *Xor = BinaryOperator::CreateXor(C, NC, "tmp");
- return new ICmpInst(I.getPredicate(), A,
- InsertNewInstBefore(Xor, I));
- }
+ ConstantInt *C1, *C2;
+ if (match(B, m_ConstantInt(C1)) &&
+ match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) {
+ Constant *NC = ConstantInt::get(C1->getValue() ^ C2->getValue());
+ Instruction *Xor = BinaryOperator::CreateXor(C, NC, "tmp");
+ return new ICmpInst(I.getPredicate(), A,
+ InsertNewInstBefore(Xor, I));
+ }
// A^B == A^D -> B == D
if (A == C) return new ICmpInst(I.getPredicate(), B, D);
return new ICmpInst(I.getPredicate(), OtherVal,
Constant::getNullValue(A->getType()));
}
- if (match(Op0, m_Sub(m_Value(A), m_Value(B))) && A == Op1) {
- // (A-B) == A -> B == 0
- return new ICmpInst(I.getPredicate(), B,
+
+ // (A-B) == A -> B == 0
+ if (match(Op0, m_Sub(m_Specific(Op1), m_Value(B))))
+ return new ICmpInst(I.getPredicate(), B,
Constant::getNullValue(B->getType()));
- }
- if (match(Op1, m_Sub(m_Value(A), m_Value(B))) && A == Op0) {
- // A == (A-B) -> B == 0
+
+ // A == (A-B) -> B == 0
+ if (match(Op1, m_Sub(m_Specific(Op0), m_Value(B))))
return new ICmpInst(I.getPredicate(), B,
Constant::getNullValue(B->getType()));
- }
// (X&Z) == (Y&Z) -> (X^Y) & Z == 0
if (Op0->hasOneUse() && Op1->hasOneUse() &&
const APInt &RHSV = RHS->getValue();
switch (LHSI->getOpcode()) {
+ case Instruction::Trunc:
+ if (ICI.isEquality() && LHSI->hasOneUse()) {
+ // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
+ // of the high bits truncated out of x are known.
+ unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(),
+ SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits));
+ APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
+ ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne);
+
+ // If all the high bits are known, we can do this xform.
+ if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
+ // Pull in the high bits from known-ones set.
+ APInt NewRHS(RHS->getValue());
+ NewRHS.zext(SrcBits);
+ NewRHS |= KnownOne;
+ return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+ ConstantInt::get(NewRHS));
+ }
+ }
+ break;
+
case Instruction::Xor: // (icmp pred (xor X, XorCST), CI)
if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
// If this is a comparison that tests the signbit (X < 0) or (x > -1),
else
return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal, AddOne(RHS));
}
+
+ if (LHSI->hasOneUse()) {
+ // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit))
+ if (!ICI.isEquality() && XorCST->getValue().isSignBit()) {
+ const APInt &SignBit = XorCST->getValue();
+ ICmpInst::Predicate Pred = ICI.isSignedPredicate()
+ ? ICI.getUnsignedPredicate()
+ : ICI.getSignedPredicate();
+ return new ICmpInst(Pred, LHSI->getOperand(0),
+ ConstantInt::get(RHSV ^ SignBit));
+ }
+
+ // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A)
+ if (!ICI.isEquality() && XorCST->getValue().isMaxSignedValue()) {
+ const APInt &NotSignBit = XorCST->getValue();
+ ICmpInst::Predicate Pred = ICI.isSignedPredicate()
+ ? ICI.getUnsignedPredicate()
+ : ICI.getSignedPredicate();
+ Pred = ICI.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, LHSI->getOperand(0),
+ ConstantInt::get(RHSV ^ NotSignBit));
+ }
+ }
}
break;
case Instruction::And: // (icmp pred (and X, AndCST), RHS)
return &ICI;
}
}
- } else { // Not a ICMP_EQ/ICMP_NE
- // If the LHS is a cast from an integral value of the same size,
- // then since we know the RHS is a constant, try to simlify.
- if (CastInst *Cast = dyn_cast<CastInst>(LHSI)) {
- Value *CastOp = Cast->getOperand(0);
- const Type *SrcTy = CastOp->getType();
- uint32_t SrcTySize = SrcTy->getPrimitiveSizeInBits();
- if (SrcTy->isInteger() &&
- SrcTySize == Cast->getType()->getPrimitiveSizeInBits()) {
- // If this is an unsigned comparison, try to make the comparison use
- // smaller constant values.
- if (ICI.getPredicate() == ICmpInst::ICMP_ULT && RHSV.isSignBit()) {
- // X u< 128 => X s> -1
- return new ICmpInst(ICmpInst::ICMP_SGT, CastOp,
- ConstantInt::get(APInt::getAllOnesValue(SrcTySize)));
- } else if (ICI.getPredicate() == ICmpInst::ICMP_UGT &&
- RHSV == APInt::getSignedMaxValue(SrcTySize)) {
- // X u> 127 => X s< 0
- return new ICmpInst(ICmpInst::ICMP_SLT, CastOp,
- Constant::getNullValue(SrcTy));
- }
- }
- }
}
return 0;
}
MaskedValueIsZero(Op0,
APInt::getSignBit(I.getType()->getPrimitiveSizeInBits())))
return BinaryOperator::CreateLShr(Op0, I.getOperand(1));
-
+
+ // Arithmetic shifting an all-sign-bit value is a no-op.
+ unsigned NumSignBits = ComputeNumSignBits(Op0);
+ if (NumSignBits == Op0->getType()->getPrimitiveSizeInBits())
+ return ReplaceInstUsesWith(I, Op0);
+
return 0;
}
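As an illustration outside the patch, the new ashr fold relies on the fact that a value whose bits are all copies of its sign bit is either 0 or -1, and an arithmetic shift right leaves both unchanged (this sketch assumes the usual arithmetic behavior of >> on signed values):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t vals[] = {0, -1};   // the only values whose bits all equal the sign bit
  for (int32_t v : vals)
    for (unsigned amt = 0; amt < 32; ++amt)
      assert((v >> amt) == v);      // ashr of 0 or -1 is a no-op
  return 0;
}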
Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
BinaryOperator &I) {
- bool isLeftShift = I.getOpcode() == Instruction::Shl;
+ bool isLeftShift = I.getOpcode() == Instruction::Shl;
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
uint32_t TypeBits = Op0->getType()->getPrimitiveSizeInBits();
- APInt KnownZero(TypeBits, 0), KnownOne(TypeBits, 0);
- if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(TypeBits),
- KnownZero, KnownOne))
+ if (SimplifyDemandedInstructionBits(I))
return &I;
// shl uint X, 32 = 0 and shr ubyte Y, 9 = 0, ... just don't eliminate shr
// These operators commute.
// Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C)
if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
- match(Op0BO->getOperand(1),
- m_Shr(m_Value(V1), m_ConstantInt(CC))) && CC == Op1) {
+ match(Op0BO->getOperand(1), m_Shr(m_Value(V1), m_Specific(Op1)))){
Instruction *YS = BinaryOperator::CreateShl(
Op0BO->getOperand(0), Op1,
Op0BO->getName());
Value *Op0BOOp1 = Op0BO->getOperand(1);
if (isLeftShift && Op0BOOp1->hasOneUse() &&
match(Op0BOOp1,
- m_And(m_Shr(m_Value(V1), m_Value(V2)),m_ConstantInt(CC))) &&
- cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse() &&
- V2 == Op1) {
+ m_And(m_Shr(m_Value(V1), m_Specific(Op1)),
+ m_ConstantInt(CC))) &&
+ cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse()) {
Instruction *YS = BinaryOperator::CreateShl(
Op0BO->getOperand(0), Op1,
Op0BO->getName());
case Instruction::Sub: {
// Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
- match(Op0BO->getOperand(0),
- m_Shr(m_Value(V1), m_ConstantInt(CC))) && CC == Op1) {
+ match(Op0BO->getOperand(0), m_Shr(m_Value(V1), m_Specific(Op1)))){
Instruction *YS = BinaryOperator::CreateShl(
Op0BO->getOperand(1), Op1,
Op0BO->getName());
// same, we open the door to infinite loops of various kinds.
if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return 0;
- uint64_t AllocElTySize = TD->getABITypeSize(AllocElTy);
- uint64_t CastElTySize = TD->getABITypeSize(CastElTy);
+ uint64_t AllocElTySize = TD->getTypePaddedSize(AllocElTy);
+ uint64_t CastElTySize = TD->getTypePaddedSize(CastElTy);
if (CastElTySize == 0 || AllocElTySize == 0) return 0;
// See if we can satisfy the modulus by pulling a scale out of the array
/// the final result.
bool InstCombiner::CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
unsigned CastOpc,
- int &NumCastsRemoved) {
+ int &NumCastsRemoved){
// We can always evaluate constants in another type.
if (isa<ConstantInt>(V))
return true;
// require duplicating the instruction in general, which isn't profitable.
if (!I->hasOneUse()) return false;
- switch (I->getOpcode()) {
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
// If this is the same kind of case as our original (e.g. zext+zext), we
// can safely replace it. Note that replacing it does not reduce the number
// of casts in the input.
- if (I->getOpcode() == CastOpc)
+ if (Opc == CastOpc)
+ return true;
+
+ // sext (zext ty1), ty2 -> zext ty2
+ if (CastOpc == Instruction::SExt && Opc == Instruction::ZExt)
return true;
break;
case Instruction::Select: {
// Otherwise, it must be an instruction.
Instruction *I = cast<Instruction>(V);
Instruction *Res = 0;
- switch (I->getOpcode()) {
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::Shl: {
Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
- Res = BinaryOperator::Create((Instruction::BinaryOps)I->getOpcode(),
- LHS, RHS);
+ Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
break;
}
case Instruction::Trunc:
return 0;
}
+/// FindElementAtOffset - Given a type and a constant offset, determine whether
+/// or not there is a sequence of GEP indices into the type that will land us at
+/// the specified offset. If so, fill them into NewIndices and return the
+/// resultant element type, otherwise return null.
+static const Type *FindElementAtOffset(const Type *Ty, int64_t Offset,
+ SmallVectorImpl<Value*> &NewIndices,
+ const TargetData *TD) {
+ if (!Ty->isSized()) return 0;
+
+ // Start with the index over the outer type. Note that the type size
+ // might be zero (even if the offset isn't zero) if the indexed type
+ // is something like [0 x {int, int}]
+ const Type *IntPtrTy = TD->getIntPtrType();
+ int64_t FirstIdx = 0;
+ if (int64_t TySize = TD->getTypePaddedSize(Ty)) {
+ FirstIdx = Offset/TySize;
+ Offset -= FirstIdx*TySize;
+
+ // Handle hosts where % returns a negative remainder rather than a value
+ // in [0..TySize).
+ if (Offset < 0) {
+ --FirstIdx;
+ Offset += TySize;
+ assert(Offset >= 0);
+ }
+ assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
+ }
+
+ NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
+
+ // Index into the types. If we fail, set OrigBase to null.
+ while (Offset) {
+ // Indexing into tail padding between struct/array elements.
+ if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty))
+ return 0;
+
+ if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = TD->getStructLayout(STy);
+ assert(Offset < (int64_t)SL->getSizeInBytes() &&
+ "Offset must stay within the indexed type");
+
+ unsigned Elt = SL->getElementContainingOffset(Offset);
+ NewIndices.push_back(ConstantInt::get(Type::Int32Ty, Elt));
+
+ Offset -= SL->getElementOffset(Elt);
+ Ty = STy->getElementType(Elt);
+ } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ uint64_t EltSize = TD->getTypePaddedSize(AT->getElementType());
+ assert(EltSize && "Cannot index into a zero-sized array");
+ NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
+ Offset %= EltSize;
+ Ty = AT->getElementType();
+ } else {
+ // Otherwise, we can't index into the middle of this atomic type, bail.
+ return 0;
+ }
+ }
+
+ return Ty;
+}
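For reference (not part of the patch), a simplified sketch of the index computation FindElementAtOffset performs, specialized to an array of two-field 8-byte structs { i32, i32 }; indexForOffset is a hypothetical helper used only to illustrate the divide, negative-remainder adjustment, and field-selection steps:

#include <cassert>
#include <cstdint>
#include <utility>

// Map a byte offset into an array of { i32, i32 } (8 bytes, fields at 0 and 4)
// to an (array index, field index) pair, mirroring FindElementAtOffset.
static std::pair<int64_t, unsigned> indexForOffset(int64_t Offset) {
  const int64_t StructSize = 8;
  int64_t FirstIdx = Offset / StructSize;
  Offset -= FirstIdx * StructSize;
  if (Offset < 0) {               // keep the remainder in [0, StructSize)
    --FirstIdx;
    Offset += StructSize;
  }
  return std::make_pair(FirstIdx, Offset < 4 ? 0u : 1u);
}

int main() {
  assert(indexForOffset(20) == std::make_pair(int64_t(2), 1u));   // element 2, field 1
  assert(indexForOffset(-3) == std::make_pair(int64_t(-1), 1u));  // negative offset case
  return 0;
}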
+
/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
Value *Src = CI.getOperand(0);
Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
const Type *GEPIdxTy =
cast<PointerType>(OrigBase->getType())->getElementType();
- if (GEPIdxTy->isSized()) {
- SmallVector<Value*, 8> NewIndices;
-
- // Start with the index over the outer type. Note that the type size
- // might be zero (even if the offset isn't zero) if the indexed type
- // is something like [0 x {int, int}]
- const Type *IntPtrTy = TD->getIntPtrType();
- int64_t FirstIdx = 0;
- if (int64_t TySize = TD->getABITypeSize(GEPIdxTy)) {
- FirstIdx = Offset/TySize;
- Offset %= TySize;
+ SmallVector<Value*, 8> NewIndices;
+ if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices, TD)) {
+ // If we were able to index down into an element, create the GEP
+ // and bitcast the result. This eliminates one bitcast, potentially
+ // two.
+ Instruction *NGEP = GetElementPtrInst::Create(OrigBase,
+ NewIndices.begin(),
+ NewIndices.end(), "");
+ InsertNewInstBefore(NGEP, CI);
+ NGEP->takeName(GEP);
- // Handle silly modulus not returning values values [0..TySize).
- if (Offset < 0) {
- --FirstIdx;
- Offset += TySize;
- assert(Offset >= 0);
- }
- assert((uint64_t)Offset < (uint64_t)TySize &&"Out of range offset");
- }
-
- NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
-
- // Index into the types. If we fail, set OrigBase to null.
- while (Offset) {
- if (const StructType *STy = dyn_cast<StructType>(GEPIdxTy)) {
- const StructLayout *SL = TD->getStructLayout(STy);
- if (Offset < (int64_t)SL->getSizeInBytes()) {
- unsigned Elt = SL->getElementContainingOffset(Offset);
- NewIndices.push_back(ConstantInt::get(Type::Int32Ty, Elt));
-
- Offset -= SL->getElementOffset(Elt);
- GEPIdxTy = STy->getElementType(Elt);
- } else {
- // Otherwise, we can't index into this, bail out.
- Offset = 0;
- OrigBase = 0;
- }
- } else if (isa<ArrayType>(GEPIdxTy) || isa<VectorType>(GEPIdxTy)) {
- const SequentialType *STy = cast<SequentialType>(GEPIdxTy);
- if (uint64_t EltSize = TD->getABITypeSize(STy->getElementType())){
- NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
- Offset %= EltSize;
- } else {
- NewIndices.push_back(ConstantInt::get(IntPtrTy, 0));
- }
- GEPIdxTy = STy->getElementType();
- } else {
- // Otherwise, we can't index into this, bail out.
- Offset = 0;
- OrigBase = 0;
- }
- }
- if (OrigBase) {
- // If we were able to index down into an element, create the GEP
- // and bitcast the result. This eliminates one bitcast, potentially
- // two.
- Instruction *NGEP = GetElementPtrInst::Create(OrigBase,
- NewIndices.begin(),
- NewIndices.end(), "");
- InsertNewInstBefore(NGEP, CI);
- NGEP->takeName(GEP);
-
- if (isa<BitCastInst>(CI))
- return new BitCastInst(NGEP, CI.getType());
- assert(isa<PtrToIntInst>(CI));
- return new PtrToIntInst(NGEP, CI.getType());
- }
+ if (isa<BitCastInst>(CI))
+ return new BitCastInst(NGEP, CI.getType());
+ assert(isa<PtrToIntInst>(CI));
+ return new PtrToIntInst(NGEP, CI.getType());
}
}
}
}
-
/// Only the TRUNC, ZEXT, SEXT, and BITCAST can both operand and result as
/// integer types. This function implements the common transforms for all those
/// cases.
// See if we can simplify any instructions used by the LHS whose sole
// purpose is to compute bits we don't care about.
- APInt KnownZero(DestBitSize, 0), KnownOne(DestBitSize, 0);
- if (SimplifyDemandedBits(&CI, APInt::getAllOnesValue(DestBitSize),
- KnownZero, KnownOne))
+ if (SimplifyDemandedInstructionBits(CI))
return &CI;
// If the source isn't an instruction or has more than one use then we
// so we require that the input have eliminated at least one cast. If this
// is a sign extension, we insert two new casts (to do the extension) so we
// require that two casts have been eliminated.
- bool DoXForm;
+ bool DoXForm = false;
+ bool JustReplace = false;
switch (CI.getOpcode()) {
default:
// All the others use floating point so we shouldn't actually
case Instruction::Trunc:
DoXForm = true;
break;
- case Instruction::ZExt:
+ case Instruction::ZExt: {
DoXForm = NumCastsRemoved >= 1;
+ if (!DoXForm && 0) {
+ // If it's unnecessary to issue an AND to clear the high bits, it's
+ // always profitable to do this xform.
+ Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, false);
+ APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
+ if (MaskedValueIsZero(TryRes, Mask))
+ return ReplaceInstUsesWith(CI, TryRes);
+
+ if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
+ if (TryI->use_empty())
+ EraseInstFromFunction(*TryI);
+ }
break;
- case Instruction::SExt:
+ }
+ case Instruction::SExt: {
DoXForm = NumCastsRemoved >= 2;
+ if (!DoXForm && !isa<TruncInst>(SrcI) && 0) {
+ // If we do not have to emit the truncate + sext pair, then it's always
+ // profitable to do this xform.
+ //
+ // It's not safe to eliminate the trunc + sext pair if one of the
+ // eliminated casts is a truncate. e.g.
+ // t2 = trunc i32 t1 to i16
+ // t3 = sext i16 t2 to i32
+ // !=
+ // i32 t1
+ Value *TryRes = EvaluateInDifferentType(SrcI, DestTy, true);
+ unsigned NumSignBits = ComputeNumSignBits(TryRes);
+ if (NumSignBits > (DestBitSize - SrcBitSize))
+ return ReplaceInstUsesWith(CI, TryRes);
+
+ if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
+ if (TryI->use_empty())
+ EraseInstFromFunction(*TryI);
+ }
break;
}
+ }
if (DoXForm) {
+ DOUT << "ICE: EvaluateInDifferentType converting expression type to avoid"
+ << " cast: " << CI;
Value *Res = EvaluateInDifferentType(SrcI, DestTy,
CI.getOpcode() == Instruction::SExt);
+ if (JustReplace)
+ // Just replace this cast with the result.
+ return ReplaceInstUsesWith(CI, Res);
+
assert(Res->getType() == DestTy);
switch (CI.getOpcode()) {
default: assert(0 && "Unknown cast type!");
// Just replace this cast with the result.
return ReplaceInstUsesWith(CI, Res);
case Instruction::ZExt: {
- // We need to emit an AND to clear the high bits.
assert(SrcBitSize < DestBitSize && "Not a zext?");
+
+ // If the high bits are already zero, just replace this cast with the
+ // result.
+ APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
+ if (MaskedValueIsZero(Res, Mask))
+ return ReplaceInstUsesWith(CI, Res);
+
+ // We need to emit an AND to clear the high bits.
Constant *C = ConstantInt::get(APInt::getLowBitsSet(DestBitSize,
SrcBitSize));
return BinaryOperator::CreateAnd(Res, C);
}
- case Instruction::SExt:
+ case Instruction::SExt: {
+ // If the high bits are already filled with sign bit, just replace this
+ // cast with the result.
+ unsigned NumSignBits = ComputeNumSignBits(Res);
+ if (NumSignBits > (DestBitSize - SrcBitSize))
+ return ReplaceInstUsesWith(CI, Res);
+
// We need to emit a cast to truncate, then a cast to sext.
return CastInst::Create(Instruction::SExt,
InsertCastBefore(Instruction::Trunc, Res, Src->getType(),
CI), DestTy);
}
+ }
}
}
!ValueRequiresCast(CI.getOpcode(), Op1, DestTy,TD) ||
!ValueRequiresCast(CI.getOpcode(), Op0, DestTy, TD)) {
Instruction::CastOps opcode = CI.getOpcode();
- Value *Op0c = InsertOperandCastBefore(opcode, Op0, DestTy, SrcI);
- Value *Op1c = InsertOperandCastBefore(opcode, Op1, DestTy, SrcI);
+ Value *Op0c = InsertCastBefore(opcode, Op0, DestTy, *SrcI);
+ Value *Op1c = InsertCastBefore(opcode, Op1, DestTy, *SrcI);
return BinaryOperator::Create(
cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
}
SrcI->getOpcode() == Instruction::Xor &&
Op1 == ConstantInt::getTrue() &&
(!Op0->hasOneUse() || !isa<CmpInst>(Op0))) {
- Value *New = InsertOperandCastBefore(Instruction::ZExt, Op0, DestTy, &CI);
+ Value *New = InsertCastBefore(Instruction::ZExt, Op0, DestTy, CI);
return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1));
}
break;
// only be converting signedness, which is a noop.
if (!ValueRequiresCast(CI.getOpcode(), Op1, DestTy, TD) ||
!ValueRequiresCast(CI.getOpcode(), Op0, DestTy, TD)) {
- Value *Op0c = InsertOperandCastBefore(Instruction::BitCast,
- Op0, DestTy, SrcI);
- Value *Op1c = InsertOperandCastBefore(Instruction::BitCast,
- Op1, DestTy, SrcI);
+ Value *Op0c = InsertCastBefore(Instruction::BitCast,
+ Op0, DestTy, *SrcI);
+ Value *Op1c = InsertCastBefore(Instruction::BitCast,
+ Op1, DestTy, *SrcI);
return BinaryOperator::Create(
cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
}
(DestBitSize < SrcBitSize && isa<Constant>(Op1))) {
Instruction::CastOps opcode = (DestBitSize == SrcBitSize ?
Instruction::BitCast : Instruction::Trunc);
- Value *Op0c = InsertOperandCastBefore(opcode, Op0, DestTy, SrcI);
- Value *Op1c = InsertOperandCastBefore(opcode, Op1, DestTy, SrcI);
+ Value *Op0c = InsertCastBefore(opcode, Op0, DestTy, *SrcI);
+ Value *Op1c = InsertCastBefore(opcode, Op1, DestTy, *SrcI);
return BinaryOperator::CreateShl(Op0c, Op1c);
}
break;
Value *Src = CI.getOperand(0);
- // If this is a cast of a cast
- if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
- // If this is a TRUNC followed by a ZEXT then we are dealing with integral
- // types and if the sizes are just right we can convert this into a logical
- // 'and' which will be much cheaper than the pair of casts.
- if (isa<TruncInst>(CSrc)) {
- // Get the sizes of the types involved
- Value *A = CSrc->getOperand(0);
- uint32_t SrcSize = A->getType()->getPrimitiveSizeInBits();
- uint32_t MidSize = CSrc->getType()->getPrimitiveSizeInBits();
- uint32_t DstSize = CI.getType()->getPrimitiveSizeInBits();
- // If we're actually extending zero bits and the trunc is a no-op
- if (MidSize < DstSize && SrcSize == DstSize) {
- // Replace both of the casts with an And of the type mask.
- APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
- Constant *AndConst = ConstantInt::get(AndValue);
- Instruction *And =
- BinaryOperator::CreateAnd(CSrc->getOperand(0), AndConst);
- // Unfortunately, if the type changed, we need to cast it back.
- if (And->getType() != CI.getType()) {
- And->setName(CSrc->getName()+".mask");
- InsertNewInstBefore(And, CI);
- And = CastInst::CreateIntegerCast(And, CI.getType(), false/*ZExt*/);
- }
- return And;
- }
+ // If this is a TRUNC followed by a ZEXT then we are dealing with integral
+ // types and if the sizes are just right we can convert this into a logical
+ // 'and' which will be much cheaper than the pair of casts.
+ if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast
+ // Get the sizes of the types involved. We know that the intermediate type
+ // will be smaller than A or C, but don't know the relation between A and C.
+ Value *A = CSrc->getOperand(0);
+ unsigned SrcSize = A->getType()->getPrimitiveSizeInBits();
+ unsigned MidSize = CSrc->getType()->getPrimitiveSizeInBits();
+ unsigned DstSize = CI.getType()->getPrimitiveSizeInBits();
+ // If we're actually extending zero bits, then if
+ // SrcSize < DstSize: zext(a & mask)
+ // SrcSize == DstSize: a & mask
+ // SrcSize > DstSize: trunc(a) & mask
+ if (SrcSize < DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ Constant *AndConst = ConstantInt::get(AndValue);
+ Instruction *And =
+ BinaryOperator::CreateAnd(A, AndConst, CSrc->getName()+".mask");
+ InsertNewInstBefore(And, CI);
+ return new ZExtInst(And, CI.getType());
+ } else if (SrcSize == DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ return BinaryOperator::CreateAnd(A, ConstantInt::get(AndValue));
+ } else if (SrcSize > DstSize) {
+ Instruction *Trunc = new TruncInst(A, CI.getType(), "tmp");
+ InsertNewInstBefore(Trunc, CI);
+ APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+ return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(AndValue));
}
}
// is a single-index GEP.
if (X->getType() == CI.getType()) {
// Get the size of the pointee type.
- uint64_t Size = TD->getABITypeSize(DestPointee);
+ uint64_t Size = TD->getTypePaddedSize(DestPointee);
// Convert the constant to intptr type.
APInt Offset = Cst->getValue();
// "inttoptr+GEP" instead of "add+intptr".
// Get the size of the pointee type.
- uint64_t Size = TD->getABITypeSize(DestPointee);
+ uint64_t Size = TD->getTypePaddedSize(DestPointee);
// Convert the constant to intptr type.
APInt Offset = Cst->getValue();
Tmp->getOperand(0)->getType() == DestTy) ||
((Tmp = dyn_cast<CastInst>(SVI->getOperand(1))) &&
Tmp->getOperand(0)->getType() == DestTy)) {
- Value *LHS = InsertOperandCastBefore(Instruction::BitCast,
- SVI->getOperand(0), DestTy, &CI);
- Value *RHS = InsertOperandCastBefore(Instruction::BitCast,
- SVI->getOperand(1), DestTy, &CI);
+ Value *LHS = InsertCastBefore(Instruction::BitCast,
+ SVI->getOperand(0), DestTy, CI);
+ Value *RHS = InsertCastBefore(Instruction::BitCast,
+ SVI->getOperand(1), DestTy, CI);
// Return a new shuffle vector. Use the same element ID's, as we
// know the vector types match #elts.
return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2));
// (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if signed
// (x >s -1) ? -1 : 0 -> ashr x, 31 -> all ones if not signed
- CmpInst::Predicate Pred = ICI->getPredicate();
- if (match(TrueVal, m_ConstantInt(0)) &&
- match(FalseVal, m_ConstantInt(-1)))
- Pred = CmpInst::getInversePredicate(Pred);
- else if (!match(TrueVal, m_ConstantInt(-1)) ||
- !match(FalseVal, m_ConstantInt(0)))
- Pred = CmpInst::BAD_ICMP_PREDICATE;
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (match(TrueVal, m_ConstantInt<-1>()) &&
+ match(FalseVal, m_ConstantInt<0>()))
+ Pred = ICI->getPredicate();
+ else if (match(TrueVal, m_ConstantInt<0>()) &&
+ match(FalseVal, m_ConstantInt<-1>()))
+ Pred = CmpInst::getInversePredicate(ICI->getPredicate());
+
if (Pred != CmpInst::BAD_ICMP_PREDICATE) {
// If we are just checking for a icmp eq of a single bit and zext'ing it
// to an integer, then shift the bit to the appropriate place and then
// sext (x <s 0) to i32 --> x>>s31 true if signbit set.
// sext (x >s -1) to i32 --> (x>>s31)^-1 true if signbit clear.
if ((Pred == ICmpInst::ICMP_SLT && Op1CV == 0) ||
- (Pred == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())) {
+ (Pred == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) {
Value *In = ICI->getOperand(0);
Value *Sh = ConstantInt::get(In->getType(),
In->getType()->getPrimitiveSizeInBits()-1);
"not."+CondVal->getName()), SI);
return CastInst::Create(Instruction::ZExt, NotCond, SI.getType());
}
-
- // FIXME: Turn select 0/-1 and -1/0 into sext from condition!
if (ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition())) {
Instruction *SRA = BinaryOperator::Create(Instruction::AShr, X,
ShAmt, "ones");
InsertNewInstBefore(SRA, SI);
-
- // Finally, convert to the type of the select RHS. We figure out
- // if this requires a SExt, Trunc or BitCast based on the sizes.
- Instruction::CastOps opc = Instruction::BitCast;
- uint32_t SRASize = SRA->getType()->getPrimitiveSizeInBits();
- uint32_t SISize = SI.getType()->getPrimitiveSizeInBits();
- if (SRASize < SISize)
- opc = Instruction::SExt;
- else if (SRASize > SISize)
- opc = Instruction::Trunc;
- return CastInst::Create(opc, SRA, SI.getType());
+
+ // Then cast to the appropriate width.
+ return CastInst::CreateIntegerCast(SRA, SI.getType(), true);
}
}
// If there is a large requested alignment and we can, bump up the alignment
// of the global.
if (!GV->isDeclaration()) {
- GV->setAlignment(PrefAlign);
- Align = PrefAlign;
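+      // Never reduce an existing alignment; e.g. (illustrative) a global that
+      // is already 16-byte aligned keeps that alignment even if PrefAlign is
+      // only 8.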
+ if (GV->getAlignment() >= PrefAlign)
+ Align = GV->getAlignment();
+ else {
+ GV->setAlignment(PrefAlign);
+ Align = PrefAlign;
+ }
}
} else if (AllocationInst *AI = dyn_cast<AllocationInst>(V)) {
// If there is a requested alignment and if this is an alloca, round up. We
// don't do this for malloc, because some systems can't respect the request.
if (isa<AllocaInst>(AI)) {
- AI->setAlignment(PrefAlign);
- Align = PrefAlign;
+ if (AI->getAlignment() >= PrefAlign)
+ Align = AI->getAlignment();
+ else {
+ AI->setAlignment(PrefAlign);
+ Align = PrefAlign;
+ }
}
}
if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
if (GVSrc->isConstant()) {
Module *M = CI.getParent()->getParent()->getParent();
- Intrinsic::ID MemCpyID;
- if (CI.getOperand(3)->getType() == Type::Int32Ty)
- MemCpyID = Intrinsic::memcpy_i32;
- else
- MemCpyID = Intrinsic::memcpy_i64;
- CI.setOperand(0, Intrinsic::getDeclaration(M, MemCpyID));
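+          // Use the type-overloaded memcpy intrinsic; its concrete name is
+          // mangled from the length operand's type (illustrative: an i64
+          // length operand selects llvm.memcpy.i64).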
+ Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+ const Type *Tys[1];
+ Tys[0] = CI.getOperand(3)->getType();
+ CI.setOperand(0,
+ Intrinsic::getDeclaration(M, MemCpyID, Tys, 1));
Changed = true;
}
case Intrinsic::x86_sse_cvttss2si: {
    // These intrinsics only demand the 0th element of their input vector.  If
// we can simplify the input based on that, do so now.
- uint64_t UndefElts;
- if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1,
+ unsigned VWidth =
+ cast<VectorType>(II->getOperand(1)->getType())->getNumElements();
+ APInt DemandedElts(VWidth, 1);
+ APInt UndefElts(VWidth, 0);
+ if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
UndefElts)) {
II->setOperand(1, V);
return II;
const Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
if (!SrcTy->isSized() || !DstTy->isSized())
return false;
- if (TD->getABITypeSize(SrcTy) != TD->getABITypeSize(DstTy))
+ if (TD->getTypePaddedSize(SrcTy) != TD->getTypePaddedSize(DstTy))
return false;
return true;
}
/// and a single binop.
Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
- assert(isa<BinaryOperator>(FirstInst) || isa<GetElementPtrInst>(FirstInst) ||
- isa<CmpInst>(FirstInst));
+ assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
unsigned Opc = FirstInst->getOpcode();
Value *LHSVal = FirstInst->getOperand(0);
Value *RHSVal = FirstInst->getOperand(1);
// Scan to see if all operands are the same opcode, all have one use, and all
// kill their operands (i.e. the operands have one use).
- for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
if (!I || I->getOpcode() != Opc || !I->hasOneUse() ||
// Verify type of the LHS matches so we don't fold cmp's of different
if (I->getOperand(1) != RHSVal) RHSVal = 0;
}
- // Otherwise, this is safe to transform, determine if it is profitable.
-
- // If this is a GEP, and if the index (not the pointer) needs a PHI, bail out.
- // Indexes are often folded into load/store instructions, so we don't want to
- // hide them behind a phi.
- if (isa<GetElementPtrInst>(FirstInst) && RHSVal == 0)
- return 0;
+ // Otherwise, this is safe to transform!
Value *InLHS = FirstInst->getOperand(0);
Value *InRHS = FirstInst->getOperand(1);
}
// Add all operands to the new PHIs.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- if (NewLHS) {
- Value *NewInLHS =cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
- NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
- }
- if (NewRHS) {
- Value *NewInRHS =cast<Instruction>(PN.getIncomingValue(i))->getOperand(1);
- NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
+ if (NewLHS || NewRHS) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
+ if (NewLHS) {
+ Value *NewInLHS = InInst->getOperand(0);
+ NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
+ }
+ if (NewRHS) {
+ Value *NewInRHS = InInst->getOperand(1);
+ NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
+ }
}
}
if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
return BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
- else if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
- return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), LHSVal,
- RHSVal);
- else {
- assert(isa<GetElementPtrInst>(FirstInst));
- return GetElementPtrInst::Create(LHSVal, RHSVal);
+ CmpInst *CIOp = cast<CmpInst>(FirstInst);
+ return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), LHSVal,
+ RHSVal);
+}
+
+Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
+ GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
+
+ SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
+ FirstInst->op_end());
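+  // E.g. (illustrative): a PHI of "gep %a, 0, %i" and "gep %b, 0, %j" can be
+  // rewritten as a single "gep phi(%a, %b), 0, phi(%i, %j)".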
+ // This is true if all GEP bases are allocas and if all indices into them are
+ // constants.
+ bool AllBasePointersAreAllocas = true;
+
+  // Scan to see if all incoming values are GEPs of the same shape (same type
+  // and operand count) and that each one has only this PHI as its user.
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() ||
+ GEP->getNumOperands() != FirstInst->getNumOperands())
+ return 0;
+
+ // Keep track of whether or not all GEPs are of alloca pointers.
+ if (AllBasePointersAreAllocas &&
+ (!isa<AllocaInst>(GEP->getOperand(0)) ||
+ !GEP->hasAllConstantIndices()))
+ AllBasePointersAreAllocas = false;
+
+ // Compare the operand lists.
+ for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
+ if (FirstInst->getOperand(op) == GEP->getOperand(op))
+ continue;
+
+ // Don't merge two GEPs when two operands differ (introducing phi nodes)
+      // if either GEP uses a constant for that index.  A constant index may be
+      // substantially cheaper to compute, so turning it into a variable index
+      // could pessimize that path.  This also handles the case
+ // for struct indices, which must always be constant.
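+      // E.g. (illustrative): merging "gep %P, i32 4" with "gep %Q, i32 %i"
+      // would force a phi'd, variable index onto the path that had the cheap
+      // constant 4.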
+ if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
+ isa<ConstantInt>(GEP->getOperand(op)))
+ return 0;
+
+ if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
+ return 0;
+ FixedOperands[op] = 0; // Needs a PHI.
+ }
+ }
+
+ // If all of the base pointers of the PHI'd GEPs are from allocas, don't
+ // bother doing this transformation. At best, this will just save a bit of
+ // offset calculation, but all the predecessors will have to materialize the
+ // stack address into a register anyway. We'd actually rather *clone* the
+ // load up into the predecessors so that we have a load of a gep of an alloca,
+ // which can usually all be folded into the load.
+ if (AllBasePointersAreAllocas)
+ return 0;
+
+ // Otherwise, this is safe to transform. Insert PHI nodes for each operand
+ // that is variable.
+ SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
+
+ bool HasAnyPHIs = false;
+ for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
+ if (FixedOperands[i]) continue; // operand doesn't need a phi.
+ Value *FirstOp = FirstInst->getOperand(i);
+ PHINode *NewPN = PHINode::Create(FirstOp->getType(),
+ FirstOp->getName()+".pn");
+ InsertNewInstBefore(NewPN, PN);
+
+ NewPN->reserveOperandSpace(e);
+ NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
+ OperandPhis[i] = NewPN;
+ FixedOperands[i] = NewPN;
+ HasAnyPHIs = true;
+ }
+
+ // Add all operands to the new PHIs.
+ if (HasAnyPHIs) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ BasicBlock *InBB = PN.getIncomingBlock(i);
+
+ for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
+ if (PHINode *OpPhi = OperandPhis[op])
+ OpPhi->addIncoming(InGEP->getOperand(op), InBB);
+ }
}
+
+ Value *Base = FixedOperands[0];
+ return GetElementPtrInst::Create(Base, FixedOperands.begin()+1,
+ FixedOperands.end());
}
-/// isSafeToSinkLoad - Return true if we know that it is safe sink the load out
-/// of the block that defines it. This means that it must be obvious the value
-/// of the load is not changed from the point of the load to the end of the
-/// block it is in.
+
+/// isSafeAndProfitableToSinkLoad - Return true if we know that it is safe to
+/// sink the load out of the block that defines it. This means that it must be
+/// obvious the value of the load is not changed from the point of the load to
+/// the end of the block it is in.
///
/// Finally, it is safe, but not profitable, to sink a load targeting a
/// non-address-taken alloca. Doing so will cause us to not promote the alloca
/// to a register.
-static bool isSafeToSinkLoad(LoadInst *L) {
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
BasicBlock::iterator BBI = L, E = L->getParent()->end();
for (++BBI; BBI != E; ++BBI)
break;
}
- if (!isAddressTaken)
+ if (!isAddressTaken && AI->isStaticAlloca())
return false;
}
+ // If this load is a load from a GEP with a constant offset from an alloca,
+ // then we don't want to sink it. In its present form, it will be
+  // load [constant stack offset].  Sinking it will cause us to have to
+  // materialize the stack address in a register in each predecessor, only to
+  // do a shared load from that register in the successor.
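+  // E.g. (illustrative):
+  //   %p = getelementptr [4 x i32]* %a, i32 0, i32 2   ; %a is a static alloca
+  //   %v = load i32* %p
+  // is cheaper left in place than re-materializing %p in every predecessor.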
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
+ if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
+ return false;
+
return true;
}
// We can't sink the load if the loaded value could be modified between the
// load and the PHI.
if (LI->getParent() != PN.getIncomingBlock(0) ||
- !isSafeToSinkLoad(LI))
+ !isSafeAndProfitableToSinkLoad(LI))
return 0;
// If the PHI is of volatile loads and the load block has multiple
return 0;
} else if (isa<GetElementPtrInst>(FirstInst)) {
- if (FirstInst->getNumOperands() == 2)
- return FoldPHIArgBinOpIntoPHI(PN);
- // Can't handle general GEPs yet.
- return 0;
+ return FoldPHIArgGEPIntoPHI(PN);
} else {
return 0; // Cannot fold this operation.
}
// the load and the PHI.
if (LI->isVolatile() != isVolatile ||
LI->getParent() != PN.getIncomingBlock(i) ||
- !isSafeToSinkLoad(LI))
+ !isSafeAndProfitableToSinkLoad(LI))
return 0;
// If the PHI is of volatile loads and the load block has multiple
if (isVolatile &&
LI->getParent()->getTerminator()->getNumSuccessors() != 1)
return 0;
-
} else if (I->getOperand(1) != ConstantOp) {
return 0;
// If all PHI operands are the same operation, pull them through the PHI,
// reducing code size.
if (isa<Instruction>(PN.getIncomingValue(0)) &&
+ isa<Instruction>(PN.getIncomingValue(1)) &&
+ cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
+ cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
+      // FIXME: The hasOneUse check will fail for PHIs that use the value more
+      // than once, e.g. when it appears as several of the incoming values.
PN.getIncomingValue(0)->hasOneUse())
if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
return Result;
}
if (MadeChange) return &GEP;
- // If this GEP instruction doesn't move the pointer, and if the input operand
- // is a bitcast of another pointer, just replace the GEP with a bitcast of the
- // real input to the dest type.
- if (GEP.hasAllZeroIndices()) {
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(GEP.getOperand(0))) {
- // If the bitcast is of an allocation, and the allocation will be
- // converted to match the type of the cast, don't touch this.
- if (isa<AllocationInst>(BCI->getOperand(0))) {
- // See if the bitcast simplifies, if so, don't nuke this GEP yet.
- if (Instruction *I = visitBitCast(*BCI)) {
- if (I != BCI) {
- I->takeName(BCI);
- BCI->getParent()->getInstList().insert(BCI, I);
- ReplaceInstUsesWith(*BCI, I);
- }
- return &GEP;
- }
- }
- return new BitCastInst(BCI->getOperand(0), GEP.getType());
- }
- }
-
// Combine Indices - If the source pointer to this getelementptr instruction
// is a getelementptr instruction, combine the indices of the two
// getelementptr instructions into a single instruction.
const Type *SrcElTy = cast<PointerType>(X->getType())->getElementType();
const Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
if (isa<ArrayType>(SrcElTy) &&
- TD->getABITypeSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
- TD->getABITypeSize(ResElTy)) {
+ TD->getTypePaddedSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
+ TD->getTypePaddedSize(ResElTy)) {
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::Int32Ty);
Idx[1] = GEP.getOperand(1);
if (isa<ArrayType>(SrcElTy) && ResElTy == Type::Int8Ty) {
uint64_t ArrayEltSize =
- TD->getABITypeSize(cast<ArrayType>(SrcElTy)->getElementType());
+ TD->getTypePaddedSize(cast<ArrayType>(SrcElTy)->getElementType());
// Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We
// allow either a mul, shift, or constant here.
}
}
}
-
+
+ /// See if we can simplify:
+ /// X = bitcast A to B*
+ /// Y = gep X, <...constant indices...>
+ /// into a gep of the original struct. This is important for SROA and alias
+ /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
+ if (!isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices()) {
+ // Determine how much the GEP moves the pointer. We are guaranteed to get
+ // a constant back from EmitGEPOffset.
+ ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(&GEP, GEP, *this));
+ int64_t Offset = OffsetV->getSExtValue();
+
+ // If this GEP instruction doesn't move the pointer, just replace the GEP
+ // with a bitcast of the real input to the dest type.
+ if (Offset == 0) {
+ // If the bitcast is of an allocation, and the allocation will be
+ // converted to match the type of the cast, don't touch this.
+ if (isa<AllocationInst>(BCI->getOperand(0))) {
+ // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+ if (Instruction *I = visitBitCast(*BCI)) {
+ if (I != BCI) {
+ I->takeName(BCI);
+ BCI->getParent()->getInstList().insert(BCI, I);
+ ReplaceInstUsesWith(*BCI, I);
+ }
+ return &GEP;
+ }
+ }
+ return new BitCastInst(BCI->getOperand(0), GEP.getType());
+ }
+
+ // Otherwise, if the offset is non-zero, we need to find out if there is a
+ // field at Offset in 'A's type. If so, we can pull the cast through the
+ // GEP.
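+      // E.g. (illustrative): if %A has type {i32, i32}* and the gep of the
+      // bitcast advances the pointer by 4 bytes, FindElementAtOffset yields
+      // the indices <0, 1>, so we can gep %A directly to its second field.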
+ SmallVector<Value*, 8> NewIndices;
+ const Type *InTy =
+ cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
+ if (FindElementAtOffset(InTy, Offset, NewIndices, TD)) {
+ Instruction *NGEP =
+ GetElementPtrInst::Create(BCI->getOperand(0), NewIndices.begin(),
+ NewIndices.end());
+ if (NGEP->getType() == GEP.getType()) return NGEP;
+ InsertNewInstBefore(NGEP, GEP);
+ NGEP->takeName(&GEP);
+ return new BitCastInst(NGEP, GEP.getType());
+ }
+ }
+ }
+
return 0;
}
}
}
- // If alloca'ing a zero byte object, replace the alloca with a null pointer.
- // Note that we only do this for alloca's, because malloc should allocate and
- // return a unique pointer, even for a zero byte allocation.
- if (isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized() &&
- TD->getABITypeSize(AI.getAllocatedType()) == 0)
- return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+ if (isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized()) {
+ // If alloca'ing a zero byte object, replace the alloca with a null pointer.
+ // Note that we only do this for alloca's, because malloc should allocate and
+ // return a unique pointer, even for a zero byte allocation.
+ if (TD->getTypePaddedSize(AI.getAllocatedType()) == 0)
+ return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+
+ // If the alignment is 0 (unspecified), assign it the preferred alignment.
+ if (AI.getAlignment() == 0)
+ AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
+ }
return 0;
}
APInt SingleChar(numBits, 0);
if (TD->isLittleEndian()) {
for (signed i = len-1; i >= 0; i--) {
- SingleChar = (uint64_t) Str[i];
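+      // Mask with UCHAR_MAX so a negative (signed) char is not sign-extended
+      // by the conversion to uint64_t (illustrative: '\x80' must become 0x80,
+      // not 0xFFFFFFFFFFFFFF80).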
+ SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
StrVal = (StrVal << 8) | SingleChar;
}
} else {
for (unsigned i = 0; i < len; i++) {
- SingleChar = (uint64_t) Str[i];
+ SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
StrVal = (StrVal << 8) | SingleChar;
}
// Append NULL at the end.
}
}
- const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+ const PointerType *DestTy = cast<PointerType>(CI->getType());
+ const Type *DestPTy = DestTy->getElementType();
if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
+
+ // If the address spaces don't match, don't eliminate the cast.
+ if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
+ return 0;
+
const Type *SrcPTy = SrcTy->getElementType();
if (DestPTy->isInteger() || isa<PointerType>(DestPTy) ||
return false;
}
-/// equivalentAddressValues - Test if A and B will obviously have the same
-/// value. This includes recognizing that %t0 and %t1 will have the same
-/// value in code like this:
-/// %t0 = getelementptr @a, 0, 3
-/// store i32 0, i32* %t0
-/// %t1 = getelementptr @a, 0, 3
-/// %t2 = load i32* %t1
-///
-static bool equivalentAddressValues(Value *A, Value *B) {
- // Test if the values are trivially equivalent.
- if (A == B) return true;
-
- // Test if the values come form identical arithmetic instructions.
- if (isa<BinaryOperator>(A) ||
- isa<CastInst>(A) ||
- isa<PHINode>(A) ||
- isa<GetElementPtrInst>(A))
- if (Instruction *BI = dyn_cast<Instruction>(B))
- if (cast<Instruction>(A)->isIdenticalTo(BI))
- return true;
-
- // Otherwise they may not be equivalent.
- return false;
-}
-
Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
Value *Op = LI.getOperand(0);
// Attempt to improve the alignment.
- unsigned KnownAlign = GetOrEnforceKnownAlignment(Op);
+ unsigned KnownAlign =
+ GetOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()));
if (KnownAlign >
(LI.getAlignment() == 0 ? TD->getABITypeAlignment(LI.getType()) :
LI.getAlignment()))
  // where there are several consecutive memory accesses to the same location,
// separated by a few arithmetic operations.
BasicBlock::iterator BBI = &LI;
- for (unsigned ScanInsts = 6; BBI != LI.getParent()->begin() && ScanInsts;
- --ScanInsts) {
- --BBI;
-
- if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
- if (equivalentAddressValues(SI->getOperand(1), LI.getOperand(0)))
- return ReplaceInstUsesWith(LI, SI->getOperand(0));
- } else if (LoadInst *LIB = dyn_cast<LoadInst>(BBI)) {
- if (equivalentAddressValues(LIB->getOperand(0), LI.getOperand(0)))
- return ReplaceInstUsesWith(LI, LIB);
- }
-
- // Don't skip over things that can modify memory.
- if (BBI->mayWriteToMemory())
- break;
- }
+ if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
+ return ReplaceInstUsesWith(LI, AvailableVal);
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
const Value *GEPI0 = GEPI->getOperand(0);
}
/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
-/// when possible.
+/// when possible. This makes it generally easy to do alias analysis and/or
+/// SROA/mem2reg of the memory object.
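+/// For example (illustrative, assuming 32-bit pointers):
+///   store i32 %x, (bitcast i8** %P to i32*)
+/// becomes
+///   store (inttoptr i32 %x to i8*), i8** %P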
static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
User *CI = cast<User>(SI.getOperand(1));
Value *CastOp = CI->getOperand(0);
const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
- if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
- const Type *SrcPTy = SrcTy->getElementType();
-
- if (DestPTy->isInteger() || isa<PointerType>(DestPTy)) {
- // If the source is an array, the code below will not succeed. Check to
- // see if a trivial 'gep P, 0, 0' will help matters. Only do this for
- // constants.
- if (const ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
- if (Constant *CSrc = dyn_cast<Constant>(CastOp))
- if (ASrcTy->getNumElements() != 0) {
- Value* Idxs[2];
- Idxs[0] = Idxs[1] = Constant::getNullValue(Type::Int32Ty);
- CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2);
- SrcTy = cast<PointerType>(CastOp->getType());
- SrcPTy = SrcTy->getElementType();
- }
-
- if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy)) &&
- IC.getTargetData().getTypeSizeInBits(SrcPTy) ==
- IC.getTargetData().getTypeSizeInBits(DestPTy)) {
+ const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
+ if (SrcTy == 0) return 0;
+
+ const Type *SrcPTy = SrcTy->getElementType();
- // Okay, we are casting from one integer or pointer type to another of
- // the same size. Instead of casting the pointer before
- // the store, cast the value to be stored.
- Value *NewCast;
- Value *SIOp0 = SI.getOperand(0);
- Instruction::CastOps opcode = Instruction::BitCast;
- const Type* CastSrcTy = SIOp0->getType();
- const Type* CastDstTy = SrcPTy;
- if (isa<PointerType>(CastDstTy)) {
- if (CastSrcTy->isInteger())
- opcode = Instruction::IntToPtr;
- } else if (isa<IntegerType>(CastDstTy)) {
- if (isa<PointerType>(SIOp0->getType()))
- opcode = Instruction::PtrToInt;
- }
- if (Constant *C = dyn_cast<Constant>(SIOp0))
- NewCast = ConstantExpr::getCast(opcode, C, CastDstTy);
- else
- NewCast = IC.InsertNewInstBefore(
- CastInst::Create(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"),
- SI);
- return new StoreInst(NewCast, CastOp);
+ if (!DestPTy->isInteger() && !isa<PointerType>(DestPTy))
+ return 0;
+
+ /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
+ /// to its first element. This allows us to handle things like:
+ /// store i32 xxx, (bitcast {foo*, float}* %P to i32*)
+ /// on 32-bit hosts.
+ SmallVector<Value*, 4> NewGEPIndices;
+
+ // If the source is an array, the code below will not succeed. Check to
+ // see if a trivial 'gep P, 0, 0' will help matters. Only do this for
+ // constants.
+ if (isa<ArrayType>(SrcPTy) || isa<StructType>(SrcPTy)) {
+ // Index through pointer.
+ Constant *Zero = Constant::getNullValue(Type::Int32Ty);
+ NewGEPIndices.push_back(Zero);
+
+ while (1) {
+ if (const StructType *STy = dyn_cast<StructType>(SrcPTy)) {
+ if (!STy->getNumElements()) /* Struct can be empty {} */
+ break;
+ NewGEPIndices.push_back(Zero);
+ SrcPTy = STy->getElementType(0);
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
+ NewGEPIndices.push_back(Zero);
+ SrcPTy = ATy->getElementType();
+ } else {
+ break;
}
}
+
+ SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace());
}
- return 0;
+
+ if (!SrcPTy->isInteger() && !isa<PointerType>(SrcPTy))
+ return 0;
+
+ // If the pointers point into different address spaces or if they point to
+ // values with different sizes, we can't do the transformation.
+ if (SrcTy->getAddressSpace() !=
+ cast<PointerType>(CI->getType())->getAddressSpace() ||
+ IC.getTargetData().getTypeSizeInBits(SrcPTy) !=
+ IC.getTargetData().getTypeSizeInBits(DestPTy))
+ return 0;
+
+ // Okay, we are casting from one integer or pointer type to another of
+ // the same size. Instead of casting the pointer before
+ // the store, cast the value to be stored.
+ Value *NewCast;
+ Value *SIOp0 = SI.getOperand(0);
+ Instruction::CastOps opcode = Instruction::BitCast;
+ const Type* CastSrcTy = SIOp0->getType();
+ const Type* CastDstTy = SrcPTy;
+ if (isa<PointerType>(CastDstTy)) {
+ if (CastSrcTy->isInteger())
+ opcode = Instruction::IntToPtr;
+ } else if (isa<IntegerType>(CastDstTy)) {
+ if (isa<PointerType>(SIOp0->getType()))
+ opcode = Instruction::PtrToInt;
+ }
+
+ // SIOp0 is a pointer to aggregate and this is a store to the first field,
+ // emit a GEP to index into its first field.
+ if (!NewGEPIndices.empty()) {
+ if (Constant *C = dyn_cast<Constant>(CastOp))
+ CastOp = ConstantExpr::getGetElementPtr(C, &NewGEPIndices[0],
+ NewGEPIndices.size());
+ else
+ CastOp = IC.InsertNewInstBefore(
+ GetElementPtrInst::Create(CastOp, NewGEPIndices.begin(),
+ NewGEPIndices.end()), SI);
+ }
+
+ if (Constant *C = dyn_cast<Constant>(SIOp0))
+ NewCast = ConstantExpr::getCast(opcode, C, CastDstTy);
+ else
+ NewCast = IC.InsertNewInstBefore(
+ CastInst::Create(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"),
+ SI);
+ return new StoreInst(NewCast, CastOp);
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+/// %t0 = getelementptr @a, 0, 3
+/// store i32 0, i32* %t0
+/// %t1 = getelementptr @a, 0, 3
+/// %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+ // Test if the values are trivially equivalent.
+ if (A == B) return true;
+
+  // Test if the values come from identical arithmetic instructions.
+ if (isa<BinaryOperator>(A) ||
+ isa<CastInst>(A) ||
+ isa<PHINode>(A) ||
+ isa<GetElementPtrInst>(A))
+ if (Instruction *BI = dyn_cast<Instruction>(B))
+ if (cast<Instruction>(A)->isIdenticalTo(BI))
+ return true;
+
+ // Otherwise they may not be equivalent.
+ return false;
}
Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
}
// Attempt to improve the alignment.
- unsigned KnownAlign = GetOrEnforceKnownAlignment(Ptr);
+ unsigned KnownAlign =
+ GetOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()));
if (KnownAlign >
(SI.getAlignment() == 0 ? TD->getABITypeAlignment(Val->getType()) :
SI.getAlignment()))
if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
// Prev store isn't volatile, and stores to the same location?
- if (!PrevSI->isVolatile() && equivalentAddressValues(PrevSI->getOperand(1),
- SI.getOperand(1))) {
+ if (!PrevSI->isVolatile() &&equivalentAddressValues(PrevSI->getOperand(1),
+ SI.getOperand(1))) {
++NumDeadStore;
++BBI;
EraseInstFromFunction(*PrevSI);
// If the input vector has a single use, simplify it based on this use
// property.
if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) {
- uint64_t UndefElts;
+ APInt UndefElts(VectorWidth, 0);
+ APInt DemandedMask(VectorWidth, 1 << IndexVal);
if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
- 1 << IndexVal,
- UndefElts)) {
+ DemandedMask, UndefElts)) {
EI.setOperand(0, V);
return &EI;
}
if (isa<UndefValue>(SVI.getOperand(2)))
return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
- uint64_t UndefElts;
unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements();
if (VWidth != cast<VectorType>(LHS->getType())->getNumElements())
return 0;
- uint64_t AllOnesEltMask = ~0ULL >> (64-VWidth);
- if (VWidth <= 64 &&
- SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
LHS = SVI.getOperand(0);
RHS = SVI.getOperand(1);
MadeChange = true;
// If the result mask is equal to the src shuffle or this shuffle mask, do
// the replacement.
if (NewMask == LHSMask || NewMask == Mask) {
+ unsigned LHSInNElts =
+ cast<VectorType>(LHSSVI->getOperand(0)->getType())->getNumElements();
std::vector<Constant*> Elts;
for (unsigned i = 0, e = NewMask.size(); i != e; ++i) {
- if (NewMask[i] >= e*2) {
+ if (NewMask[i] >= LHSInNElts*2) {
Elts.push_back(UndefValue::get(Type::Int32Ty));
} else {
Elts.push_back(ConstantInt::get(Type::Int32Ty, NewMask[i]));
// We have now visited this block! If we've already been here, ignore it.
if (!Visited.insert(BB)) continue;
-
+
+ DbgInfoIntrinsic *DBI_Prev = NULL;
for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
Instruction *Inst = BBI++;
continue;
}
+ // If there are two consecutive llvm.dbg.stoppoint calls then
+ // it is likely that the optimizer deleted code in between these
+ // two intrinsics.
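+      // E.g. (illustrative): of two back-to-back llvm.dbg.stoppoint calls,
+      // the earlier one is erased and the later location is kept.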
+ DbgInfoIntrinsic *DBI_Next = dyn_cast<DbgInfoIntrinsic>(Inst);
+ if (DBI_Next) {
+ if (DBI_Prev
+ && DBI_Prev->getIntrinsicID() == llvm::Intrinsic::dbg_stoppoint
+ && DBI_Next->getIntrinsicID() == llvm::Intrinsic::dbg_stoppoint) {
+ IC.RemoveFromWorkList(DBI_Prev);
+ DBI_Prev->eraseFromParent();
+ }
+ DBI_Prev = DBI_Next;
+ } else {
+ DBI_Prev = 0;
+ }
+
IC.AddToWorkList(Inst);
}
if (!I->use_empty())
I->replaceAllUsesWith(UndefValue::get(I->getType()));
I->eraseFromParent();
+ Changed = true;
}
}
}
I->eraseFromParent();
RemoveFromWorkList(I);
+ Changed = true;
continue;
}
++NumConstProp;
I->eraseFromParent();
RemoveFromWorkList(I);
+ Changed = true;
continue;
}
if (TD && I->getType()->getTypeID() == Type::VoidTyID) {
// See if we can constant fold its operands.
- for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(i)) {
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(i))
if (Constant *NewC = ConstantFoldConstantExpression(CE, TD))
- i->set(NewC);
- }
- }
+ if (NewC != CE) {
+ i->set(NewC);
+ Changed = true;
+ }
}
// See if we can trivially sink this instruction to a successor basic block.
FunctionPass *llvm::createInstructionCombiningPass() {
return new InstCombiner();
}
-
-