Minor cleanup related to my latest scheduler changes.

[oota-llvm.git] / lib / Transforms / InstCombine / InstCombineCasts.cpp
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp

index 52c36ddd3fe7009468061cd9b95e090f02db2d04..88ee1453faf0c81eb6e4a725a047107d9635782f 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -22,19 +22,18 @@ using namespace PatternMatch;
  /// X*Scale+Offset.
  ///
  static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
-                                        int &Offset) {
-  assert(Val->getType()->isInteger(32) && "Unexpected allocation size type!");
+                                        uint64_t &Offset) {
    if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
      Offset = CI->getZExtValue();
      Scale  = 0;
-    return ConstantInt::get(Type::getInt32Ty(Val->getContext()), 0);
+    return ConstantInt::get(Val->getType(), 0);
    }
    
    if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
      if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
        if (I->getOpcode() == Instruction::Shl) {
          // This is a value scaled by '1 << the shift amt'.
-        Scale = 1U << RHS->getZExtValue();
+        Scale = UINT64_C(1) << RHS->getZExtValue();
          Offset = 0;
          return I->getOperand(0);
        }
@@ -100,7 +99,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
    // See if we can satisfy the modulus by pulling a scale out of the array
    // size argument.
    unsigned ArraySizeScale;
-  int ArrayOffset;
+  uint64_t ArrayOffset;
    Value *NumElements = // See if the array size is a decomposable linear expr.
      DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
   
@@ -114,13 +113,13 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
    if (Scale == 1) {
      Amt = NumElements;
    } else {
-    Amt = ConstantInt::get(Type::getInt32Ty(CI.getContext()), Scale);
+    Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
      // Insert before the alloca, not before the cast.
      Amt = AllocaBuilder.CreateMul(Amt, NumElements, "tmp");
    }
    
-  if (int Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
-    Value *Off = ConstantInt::get(Type::getInt32Ty(CI.getContext()),
+  if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+    Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
                                    Offset, true);
      Amt = AllocaBuilder.CreateAdd(Amt, Off, "tmp");
    }
@@ -147,156 +146,19 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
  }
  
  
-/// CanEvaluateInDifferentType - Return true if we can take the specified value
-/// and return it as type Ty without inserting any new casts and without
-/// changing the computed value.  This is used by code that tries to decide
-/// whether promoting or shrinking integer operations to wider or smaller types
-/// will allow us to eliminate a truncate or extend.
-///
-/// This is a truncation operation if Ty is smaller than V->getType(), or an
-/// extension operation if Ty is larger.
-///
-/// If CastOpc is a truncation, then Ty will be a type smaller than V.  We
-/// should return true if trunc(V) can be computed by computing V in the smaller
-/// type.  If V is an instruction, then trunc(inst(x,y)) can be computed as
-/// inst(trunc(x),trunc(y)), which only makes sense if x and y can be
-/// efficiently truncated.
-///
-/// If CastOpc is a sext or zext, we are asking if the low bits of the value can
-/// bit computed in a larger type, which is then and'd or sext_in_reg'd to get
-/// the final result.
-bool InstCombiner::CanEvaluateInDifferentType(Value *V, const Type *Ty,
-                                              unsigned CastOpc,
-                                              int &NumCastsRemoved){
-  // We can always evaluate constants in another type.
-  if (isa<Constant>(V))
-    return true;
-  
-  Instruction *I = dyn_cast<Instruction>(V);
-  if (!I) return false;
-  
-  const Type *OrigTy = V->getType();
-  
-  // If this is an extension or truncate, we can often eliminate it.
-  if (isa<TruncInst>(I) || isa<ZExtInst>(I) || isa<SExtInst>(I)) {
-    // If this is a cast from the destination type, we can trivially eliminate
-    // it, and this will remove a cast overall.
-    if (I->getOperand(0)->getType() == Ty) {
-      // If the first operand is itself a cast, and is eliminable, do not count
-      // this as an eliminable cast.  We would prefer to eliminate those two
-      // casts first.
-      if (!isa<CastInst>(I->getOperand(0)) && I->hasOneUse())
-        ++NumCastsRemoved;
-      return true;
-    }
-  }
-
-  // We can't extend or shrink something that has multiple uses: doing so would
-  // require duplicating the instruction in general, which isn't profitable.
-  if (!I->hasOneUse()) return false;
-
-  unsigned Opc = I->getOpcode();
-  switch (Opc) {
-  case Instruction::Add:
-  case Instruction::Sub:
-  case Instruction::Mul:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor:
-    // These operators can all arbitrarily be extended or truncated.
-    return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
-                                      NumCastsRemoved) &&
-           CanEvaluateInDifferentType(I->getOperand(1), Ty, CastOpc,
-                                      NumCastsRemoved);
-
-  case Instruction::UDiv:
-  case Instruction::URem: {
-    // UDiv and URem can be truncated if all the truncated bits are zero.
-    uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
-    uint32_t BitWidth = Ty->getScalarSizeInBits();
-    if (BitWidth < OrigBitWidth) {
-      APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);
-      if (MaskedValueIsZero(I->getOperand(0), Mask) &&
-          MaskedValueIsZero(I->getOperand(1), Mask)) {
-        return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
-                                          NumCastsRemoved) &&
-               CanEvaluateInDifferentType(I->getOperand(1), Ty, CastOpc,
-                                          NumCastsRemoved);
-      }
-    }
-    break;
-  }
-  case Instruction::Shl:
-    // If we are truncating the result of this SHL, and if it's a shift of a
-    // constant amount, we can always perform a SHL in a smaller type.
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      uint32_t BitWidth = Ty->getScalarSizeInBits();
-      if (BitWidth < OrigTy->getScalarSizeInBits() &&
-          CI->getLimitedValue(BitWidth) < BitWidth)
-        return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
-                                          NumCastsRemoved);
-    }
-    break;
-  case Instruction::LShr:
-    // If this is a truncate of a logical shr, we can truncate it to a smaller
-    // lshr iff we know that the bits we would otherwise be shifting in are
-    // already zeros.
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
-      uint32_t BitWidth = Ty->getScalarSizeInBits();
-      if (BitWidth < OrigBitWidth &&
-          MaskedValueIsZero(I->getOperand(0),
-            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
-          CI->getLimitedValue(BitWidth) < BitWidth) {
-        return CanEvaluateInDifferentType(I->getOperand(0), Ty, CastOpc,
-                                          NumCastsRemoved);
-      }
-    }
-    break;
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::Trunc:
-    // If this is the same kind of case as our original (e.g. zext+zext), we
-    // can safely replace it.  Note that replacing it does not reduce the number
-    // of casts in the input.
-    if (Opc == CastOpc)
-      return true;
-
-    // sext (zext ty1), ty2 -> zext ty2
-    if (CastOpc == Instruction::SExt && Opc == Instruction::ZExt)
-      return true;
-    break;
-  case Instruction::Select: {
-    SelectInst *SI = cast<SelectInst>(I);
-    return CanEvaluateInDifferentType(SI->getTrueValue(), Ty, CastOpc,
-                                      NumCastsRemoved) &&
-           CanEvaluateInDifferentType(SI->getFalseValue(), Ty, CastOpc,
-                                      NumCastsRemoved);
-  }
-  case Instruction::PHI: {
-    // We can change a phi if we can change all operands.
-    PHINode *PN = cast<PHINode>(I);
-    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (!CanEvaluateInDifferentType(PN->getIncomingValue(i), Ty, CastOpc,
-                                      NumCastsRemoved))
-        return false;
-    return true;
-  }
-  default:
-    // TODO: Can handle more cases here.
-    break;
-  }
-  
-  return false;
-}
  
  /// EvaluateInDifferentType - Given an expression that 
-/// CanEvaluateInDifferentType returns true for, actually insert the code to
-/// evaluate the expression.
+/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually
+/// insert the code to evaluate the expression.
  Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty, 
                                               bool isSigned) {
-  if (Constant *C = dyn_cast<Constant>(V))
-    return ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
+    // If we got a constantexpr back, try to simplify it with TD info.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+      C = ConstantFoldConstantExpression(CE, TD);
+    return C;
+  }
  
    // Otherwise, it must be an instruction.
    Instruction *I = cast<Instruction>(V);
@@ -329,7 +191,9 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty,
        return I->getOperand(0);
      
      // Otherwise, must be the same type of cast, so just reinsert a new one.
-    Res = CastInst::Create(cast<CastInst>(I)->getOpcode(), I->getOperand(0),Ty);
+    // This also handles the case of zext(trunc(x)) -> zext(x).
+    Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
+                                      Opc == Instruction::SExt);
      break;
    case Instruction::Select: {
      Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
@@ -390,17 +254,26 @@ isEliminableCastPair(
    return Instruction::CastOps(Res);
  }
  
-/// ValueRequiresCast - Return true if the cast from "V to Ty" actually results
-/// in any code being generated.  It does not require codegen if V is simple
-/// enough or if the cast can be folded into other casts.
-bool InstCombiner::ValueRequiresCast(Instruction::CastOps opcode,const Value *V,
-                                     const Type *Ty) {
+/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
+/// results in any code being generated and is interesting to optimize out. If
+/// the cast can be eliminated by some other simple transformation, we prefer
+/// to do the simplification first.
+bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V,
+                                      const Type *Ty) {
+  // Noop casts and casts of constants should be eliminated trivially.
    if (V->getType() == Ty || isa<Constant>(V)) return false;
    
-  // If this is another cast that can be eliminated, it isn't codegen either.
+  // If this is another cast that can be eliminated, we prefer to have it
+  // eliminated.
    if (const CastInst *CI = dyn_cast<CastInst>(V))
-    if (isEliminableCastPair(CI, opcode, Ty, TD))
+    if (isEliminableCastPair(CI, opc, Ty, TD))
        return false;
+  
+  // If this is a vector sext from a compare, then we don't want to break the
+  // idiom where each element of the extended vector is either zero or all ones.
+  if (opc == Instruction::SExt && isa<CmpInst>(V) && Ty->isVectorTy())
+    return false;
+  
    return true;
  }
  
@@ -429,8 +302,8 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
    if (isa<PHINode>(Src)) {
      // We don't do this if this would create a PHI node with an illegal type if
      // it is currently legal.
-    if (!isa<IntegerType>(Src->getType()) ||
-        !isa<IntegerType>(CI.getType()) ||
+    if (!Src->getType()->isIntegerTy() ||
+        !CI.getType()->isIntegerTy() ||
          ShouldChangeType(CI.getType(), Src->getType()))
        if (Instruction *NV = FoldOpIntoPhi(CI))
          return NV;
@@ -439,147 +312,145 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
    return 0;
  }
  
-/// commonIntCastTransforms - This function implements the common transforms
-/// for trunc, zext, and sext.
-Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
-  if (Instruction *Result = commonCastTransforms(CI))
-    return Result;
-
-  // See if we can simplify any instructions used by the LHS whose sole 
-  // purpose is to compute bits we don't care about.
-  if (SimplifyDemandedInstructionBits(CI))
-    return &CI;
+/// CanEvaluateTruncated - Return true if we can evaluate the specified
+/// expression tree as type Ty instead of its larger type, and arrive with the
+/// same value.  This is used by code that tries to eliminate truncates.
+///
+/// Ty will always be a type smaller than V.  We should return true if trunc(V)
+/// can be computed by computing V in the smaller type.  If V is an instruction,
+/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only
+/// makes sense if x and y can be efficiently truncated.
+///
+/// This function works on both vectors and scalars.
+///
+static bool CanEvaluateTruncated(Value *V, const Type *Ty) {
+  // We can always evaluate constants in another type.
+  if (isa<Constant>(V))
+    return true;
    
-  // If the source isn't an instruction or has more than one use then we
-  // can't do anything more. 
-  Instruction *Src = dyn_cast<Instruction>(CI.getOperand(0));
-  if (!Src || !Src->hasOneUse())
-    return 0;
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
    
-  const Type *SrcTy = Src->getType();
-  const Type *DestTy = CI.getType();
-  uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
-  uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+  const Type *OrigTy = V->getType();
+  
+  // If this is an extension from the dest type, we can eliminate it, even if it
+  // has multiple uses.
+  if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && 
+      I->getOperand(0)->getType() == Ty)
+    return true;
  
-  // Attempt to propagate the cast into the instruction for int->int casts.
-  int NumCastsRemoved = 0;
-  // Only do this if the dest type is a simple type, don't convert the
-  // expression tree to something weird like i93 unless the source is also
-  // strange.
-  if ((isa<VectorType>(DestTy) ||
-       ShouldChangeType(Src->getType(), DestTy)) &&
-      CanEvaluateInDifferentType(Src, DestTy,
-                                 CI.getOpcode(), NumCastsRemoved)) {
-    // If this cast is a truncate, evaluting in a different type always
-    // eliminates the cast, so it is always a win.  If this is a zero-extension,
-    // we need to do an AND to maintain the clear top-part of the computation,
-    // so we require that the input have eliminated at least one cast.  If this
-    // is a sign extension, we insert two new casts (to do the extension) so we
-    // require that two casts have been eliminated.
-    bool DoXForm = false;
-    bool JustReplace = false;
-    switch (CI.getOpcode()) {
-    default:
-      // All the others use floating point so we shouldn't actually 
-      // get here because of the check above.
-      llvm_unreachable("Unknown cast type");
-    case Instruction::Trunc:
-      DoXForm = true;
-      break;
-    case Instruction::ZExt: {
-      DoXForm = NumCastsRemoved >= 1;
-      
-      if (!DoXForm && 0) {
-        // If it's unnecessary to issue an AND to clear the high bits, it's
-        // always profitable to do this xform.
-        Value *TryRes = EvaluateInDifferentType(Src, DestTy, false);
-        APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
-        if (MaskedValueIsZero(TryRes, Mask))
-          return ReplaceInstUsesWith(CI, TryRes);
-        
-        if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
-          if (TryI->use_empty())
-            EraseInstFromFunction(*TryI);
-      }
-      break;
-    }
-    case Instruction::SExt: {
-      DoXForm = NumCastsRemoved >= 2;
-      if (!DoXForm && !isa<TruncInst>(Src) && 0) {
-        // If we do not have to emit the truncate + sext pair, then it's always
-        // profitable to do this xform.
-        //
-        // It's not safe to eliminate the trunc + sext pair if one of the
-        // eliminated cast is a truncate. e.g.
-        // t2 = trunc i32 t1 to i16
-        // t3 = sext i16 t2 to i32
-        // !=
-        // i32 t1
-        Value *TryRes = EvaluateInDifferentType(Src, DestTy, true);
-        unsigned NumSignBits = ComputeNumSignBits(TryRes);
-        if (NumSignBits > (DestBitSize - SrcBitSize))
-          return ReplaceInstUsesWith(CI, TryRes);
-        
-        if (Instruction *TryI = dyn_cast<Instruction>(TryRes))
-          if (TryI->use_empty())
-            EraseInstFromFunction(*TryI);
-      }
-      break;
-    }
-    }
-    
-    if (DoXForm) {
-      DEBUG(errs() << "ICE: EvaluateInDifferentType converting expression type"
-            " to avoid cast: " << CI);
-      Value *Res = EvaluateInDifferentType(Src, DestTy, 
-                                           CI.getOpcode() == Instruction::SExt);
-      if (JustReplace)
-        // Just replace this cast with the result.
-        return ReplaceInstUsesWith(CI, Res);
-
-      assert(Res->getType() == DestTy);
-      switch (CI.getOpcode()) {
-      default: llvm_unreachable("Unknown cast type!");
-      case Instruction::Trunc:
-        // Just replace this cast with the result.
-        return ReplaceInstUsesWith(CI, Res);
-      case Instruction::ZExt: {
-        assert(SrcBitSize < DestBitSize && "Not a zext?");
-
-        // If the high bits are already zero, just replace this cast with the
-        // result.
-        APInt Mask(APInt::getBitsSet(DestBitSize, SrcBitSize, DestBitSize));
-        if (MaskedValueIsZero(Res, Mask))
-          return ReplaceInstUsesWith(CI, Res);
+  // We can't extend or shrink something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
  
-        // We need to emit an AND to clear the high bits.
-        Constant *C = ConstantInt::get(CI.getContext(), 
-                                 APInt::getLowBitsSet(DestBitSize, SrcBitSize));
-        return BinaryOperator::CreateAnd(Res, C);
-      }
-      case Instruction::SExt: {
-        // If the high bits are already filled with sign bit, just replace this
-        // cast with the result.
-        unsigned NumSignBits = ComputeNumSignBits(Res);
-        if (NumSignBits > (DestBitSize - SrcBitSize))
-          return ReplaceInstUsesWith(CI, Res);
+  unsigned Opc = I->getOpcode();
+  switch (Opc) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // These operators can all arbitrarily be extended or truncated.
+    return CanEvaluateTruncated(I->getOperand(0), Ty) &&
+           CanEvaluateTruncated(I->getOperand(1), Ty);
  
-        // We need to emit a cast to truncate, then a cast to sext.
-        return new SExtInst(Builder->CreateTrunc(Res, Src->getType()), DestTy);
+  case Instruction::UDiv:
+  case Instruction::URem: {
+    // UDiv and URem can be truncated if all the truncated bits are zero.
+    uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+    uint32_t BitWidth = Ty->getScalarSizeInBits();
+    if (BitWidth < OrigBitWidth) {
+      APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);
+      if (MaskedValueIsZero(I->getOperand(0), Mask) &&
+          MaskedValueIsZero(I->getOperand(1), Mask)) {
+        return CanEvaluateTruncated(I->getOperand(0), Ty) &&
+               CanEvaluateTruncated(I->getOperand(1), Ty);
        }
+    }
+    break;
+  }
+  case Instruction::Shl:
+    // If we are truncating the result of this SHL, and if it's a shift of a
+    // constant amount, we can always perform a SHL in a smaller type.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t BitWidth = Ty->getScalarSizeInBits();
+      if (CI->getLimitedValue(BitWidth) < BitWidth)
+        return CanEvaluateTruncated(I->getOperand(0), Ty);
+    }
+    break;
+  case Instruction::LShr:
+    // If this is a truncate of a logical shr, we can truncate it to a smaller
+    // lshr iff we know that the bits we would otherwise be shifting in are
+    // already zeros.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+      uint32_t BitWidth = Ty->getScalarSizeInBits();
+      if (MaskedValueIsZero(I->getOperand(0),
+            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+          CI->getLimitedValue(BitWidth) < BitWidth) {
+        return CanEvaluateTruncated(I->getOperand(0), Ty);
        }
      }
+    break;
+  case Instruction::Trunc:
+    // trunc(trunc(x)) -> trunc(x)
+    return true;
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+    // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest
+    return true;
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    return CanEvaluateTruncated(SI->getTrueValue(), Ty) &&
+           CanEvaluateTruncated(SI->getFalseValue(), Ty);
+  }
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateTruncated(PN->getIncomingValue(i), Ty))
+        return false;
+    return true;
+  }
+  default:
+    // TODO: Can handle more cases here.
+    break;
    }
    
-  return 0;
+  return false;
  }
  
  Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
-  if (Instruction *Result = commonIntCastTransforms(CI))
+  if (Instruction *Result = commonCastTransforms(CI))
      return Result;
    
+  // See if we can simplify any instructions used by the input whose sole 
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(CI))
+    return &CI;
+  
    Value *Src = CI.getOperand(0);
-  const Type *DestTy = CI.getType();
+  const Type *DestTy = CI.getType(), *SrcTy = Src->getType();
+  
+  // Attempt to truncate the entire input expression tree to the destination
+  // type.   Only do this if the dest type is a simple type, don't convert the
+  // expression tree to something weird like i93 unless the source is also
+  // strange.
+  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+      CanEvaluateTruncated(Src, DestTy)) {
+      
+    // If this cast is a truncate, evaluting in a different type always
+    // eliminates the cast, so it is always a win.
+    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+          " to avoid cast: " << CI << '\n');
+    Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+    assert(Res->getType() == DestTy);
+    return ReplaceInstUsesWith(CI, Res);
+  }
  
    // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
    if (DestTy->getScalarSizeInBits() == 1) {
@@ -588,6 +459,29 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
      Value *Zero = Constant::getNullValue(Src->getType());
      return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
    }
+  
+  // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
+  Value *A = 0; ConstantInt *Cst = 0;
+  if (match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst))) &&
+      Src->hasOneUse()) {
+    // We have three types to worry about here, the type of A, the source of
+    // the truncate (MidSize), and the destination of the truncate. We know that
+    // ASize < MidSize   and MidSize > ResultSize, but don't know the relation
+    // between ASize and ResultSize.
+    unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+    
+    // If the shift amount is larger than the size of A, then the result is
+    // known to be zero because all the input bits got shifted out.
+    if (Cst->getZExtValue() >= ASize)
+      return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType()));
+
+    // Since we're doing an lshr and a zero extend, and know that the shift
+    // amount is smaller than ASize, it is always safe to do the shift in A's
+    // type, then zero extend or truncate to the result.
+    Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue());
+    Shift->takeName(Src);
+    return CastInst::CreateIntegerCast(Shift, CI.getType(), false);
+  }
  
    return 0;
  }
@@ -672,8 +566,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
            
          if (CI.getType() == In->getType())
            return ReplaceInstUsesWith(CI, In);
-        else
-          return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
+        return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
        }
      }
    }
@@ -722,17 +615,177 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
    return 0;
  }
  
+/// CanEvaluateZExtd - Determine if the specified value can be computed in the
+/// specified wider type and produce the same low bits.  If not, return false.
+///
+/// If this function returns true, it can also return a non-zero number of bits
+/// (in BitsToClear) which indicates that the value it computes is correct for
+/// the zero extend, but that the additional BitsToClear bits need to be zero'd
+/// out.  For example, to promote something like:
+///
+///   %B = trunc i64 %A to i32
+///   %C = lshr i32 %B, 8
+///   %E = zext i32 %C to i64
+///
+/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
+/// set to 8 to indicate that the promoted value needs to have bits 24-31
+/// cleared in addition to bits 32-63.  Since an 'and' will be generated to
+/// clear the top bits anyway, doing this has no extra cost.
+///
+/// This function works on both vectors and scalars.
+static bool CanEvaluateZExtd(Value *V, const Type *Ty, unsigned &BitsToClear) {
+  BitsToClear = 0;
+  if (isa<Constant>(V))
+    return true;
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+  
+  // If the input is a truncate from the destination type, we can trivially
+  // eliminate it, even if it has multiple uses.
+  // FIXME: This is currently disabled until codegen can handle this without
+  // pessimizing code, PR5997.
+  if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+    return true;
+  
+  // We can't extend or shrink something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+  
+  unsigned Opc = I->getOpcode(), Tmp;
+  switch (Opc) {
+  case Instruction::ZExt:  // zext(zext(x)) -> zext(x).
+  case Instruction::SExt:  // zext(sext(x)) -> sext(x).
+  case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
+    return true;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::Shl:
+    if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear) ||
+        !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp))
+      return false;
+    // These can all be promoted if neither operand has 'bits to clear'.
+    if (BitsToClear == 0 && Tmp == 0)
+      return true;
+      
+    // If the operation is an AND/OR/XOR and the bits to clear are zero in the
+    // other side, BitsToClear is ok.
+    if (Tmp == 0 &&
+        (Opc == Instruction::And || Opc == Instruction::Or ||
+         Opc == Instruction::Xor)) {
+      // We use MaskedValueIsZero here for generality, but the case we care
+      // about the most is constant RHS.
+      unsigned VSize = V->getType()->getScalarSizeInBits();
+      if (MaskedValueIsZero(I->getOperand(1),
+                            APInt::getHighBitsSet(VSize, BitsToClear)))
+        return true;
+    }
+      
+    // Otherwise, we don't know how to analyze this BitsToClear case yet.
+    return false;
+      
+  case Instruction::LShr:
+    // We can promote lshr(x, cst) if we can promote x.  This requires the
+    // ultimate 'and' to clear out the high zero bits we're clearing out though.
+    if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear))
+        return false;
+      BitsToClear += Amt->getZExtValue();
+      if (BitsToClear > V->getType()->getScalarSizeInBits())
+        BitsToClear = V->getType()->getScalarSizeInBits();
+      return true;
+    }
+    // Cannot promote variable LSHR.
+    return false;
+  case Instruction::Select:
+    if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp) ||
+        !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear) ||
+        // TODO: If important, we could handle the case when the BitsToClear are
+        // known zero in the disagreeing side.
+        Tmp != BitsToClear)
+      return false;
+    return true;
+      
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear))
+      return false;
+    for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp) ||
+          // TODO: If important, we could handle the case when the BitsToClear
+          // are known zero in the disagreeing input.
+          Tmp != BitsToClear)
+        return false;
+    return true;
+  }
+  default:
+    // TODO: Can handle more cases here.
+    return false;
+  }
+}
+
  Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
+  // If this zero extend is only used by a truncate, let the truncate by
+  // eliminated before we try to optimize this zext.
+  if (CI.hasOneUse() && isa<TruncInst>(CI.use_back()))
+    return 0;
+  
    // If one of the common conversion will work, do it.
-  if (Instruction *Result = commonIntCastTransforms(CI))
+  if (Instruction *Result = commonCastTransforms(CI))
      return Result;
  
+  // See if we can simplify any instructions used by the input whose sole 
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(CI))
+    return &CI;
+  
    Value *Src = CI.getOperand(0);
+  const Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+  
+  // Attempt to extend the entire input expression tree to the destination
+  // type.   Only do this if the dest type is a simple type, don't convert the
+  // expression tree to something weird like i93 unless the source is also
+  // strange.
+  unsigned BitsToClear;
+  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+      CanEvaluateZExtd(Src, DestTy, BitsToClear)) { 
+    assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
+           "Unreasonable BitsToClear");
+    
+    // Okay, we can transform this!  Insert the new expression now.
+    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+          " to avoid zero extend: " << CI);
+    Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+    assert(Res->getType() == DestTy);
+    
+    uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
+    uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+    
+    // If the high bits are already filled with zeros, just replace this
+    // cast with the result.
+    if (MaskedValueIsZero(Res, APInt::getHighBitsSet(DestBitSize,
+                                                     DestBitSize-SrcBitsKept)))
+      return ReplaceInstUsesWith(CI, Res);
+    
+    // We need to emit an AND to clear the high bits.
+    Constant *C = ConstantInt::get(Res->getType(),
+                               APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
+    return BinaryOperator::CreateAnd(Res, C);
+  }
  
    // If this is a TRUNC followed by a ZEXT then we are dealing with integral
    // types and if the sizes are just right we can convert this into a logical
    // 'and' which will be much cheaper than the pair of casts.
    if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) {   // A->B->C cast
+    // TODO: Subsume this into EvaluateInDifferentType.
+    
      // Get the sizes of the types involved.  We know that the intermediate type
      // will be smaller than A or C, but don't know the relation between A and C.
      Value *A = CSrc->getOperand(0);
@@ -760,7 +813,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
        APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
        return BinaryOperator::CreateAnd(Trunc, 
                                         ConstantInt::get(Trunc->getType(),
-                                                               AndValue));
+                                                        AndValue));
      }
    }
  
@@ -810,7 +863,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
  
    // zext (xor i1 X, true) to i32  --> xor (zext i1 X to i32), 1
    Value *X;
-  if (SrcI && SrcI->hasOneUse() && SrcI->getType()->isInteger(1) &&
+  if (SrcI && SrcI->hasOneUse() && SrcI->getType()->isIntegerTy(1) &&
        match(SrcI, m_Not(m_Value(X))) &&
        (!X->hasOneUse() || !isa<CmpInst>(X))) {
      Value *New = Builder->CreateZExt(X, CI.getType());
@@ -820,42 +873,168 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
    return 0;
  }
  
+/// CanEvaluateSExtd - Return true if we can take the specified value
+/// and return it as type Ty without inserting any new casts and without
+/// changing the value of the common low bits.  This is used by code that tries
+/// to promote integer operations to a wider types will allow us to eliminate
+/// the extension.
+///
+/// This function works on both vectors and scalars.
+///
+static bool CanEvaluateSExtd(Value *V, const Type *Ty) {
+  assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
+         "Can't sign extend type to a smaller type");
+  // If this is a constant, it can be trivially promoted.
+  if (isa<Constant>(V))
+    return true;
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+  
+  // If this is a truncate from the dest type, we can trivially eliminate it,
+  // even if it has multiple uses.
+  // FIXME: This is currently disabled until codegen can handle this without
+  // pessimizing code, PR5997.
+  if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+    return true;
+  
+  // We can't extend or shrink something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+
+  switch (I->getOpcode()) {
+  case Instruction::SExt:  // sext(sext(x)) -> sext(x)
+  case Instruction::ZExt:  // sext(zext(x)) -> zext(x)
+  case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
+    return true;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // These operators can all arbitrarily be extended if their inputs can.
+    return CanEvaluateSExtd(I->getOperand(0), Ty) &&
+           CanEvaluateSExtd(I->getOperand(1), Ty);
+      
+  //case Instruction::Shl:   TODO
+  //case Instruction::LShr:  TODO
+      
+  case Instruction::Select:
+    return CanEvaluateSExtd(I->getOperand(1), Ty) &&
+           CanEvaluateSExtd(I->getOperand(2), Ty);
+      
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateSExtd(PN->getIncomingValue(i), Ty)) return false;
+    return true;
+  }
+  default:
+    // TODO: Can handle more cases here.
+    break;
+  }
+  
+  return false;
+}
+
  Instruction *InstCombiner::visitSExt(SExtInst &CI) {
-  if (Instruction *I = commonIntCastTransforms(CI))
+  // If this sign extend is only used by a truncate, let the truncate by
+  // eliminated before we try to optimize this zext.
+  if (CI.hasOneUse() && isa<TruncInst>(CI.use_back()))
+    return 0;
+  
+  if (Instruction *I = commonCastTransforms(CI))
      return I;
    
+  // See if we can simplify any instructions used by the input whose sole 
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(CI))
+    return &CI;
+  
    Value *Src = CI.getOperand(0);
+  const Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+  // Attempt to extend the entire input expression tree to the destination
+  // type.   Only do this if the dest type is a simple type, don't convert the
+  // expression tree to something weird like i93 unless the source is also
+  // strange.
+  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+      CanEvaluateSExtd(Src, DestTy)) {
+    // Okay, we can transform this!  Insert the new expression now.
+    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+          " to avoid sign extend: " << CI);
+    Value *Res = EvaluateInDifferentType(Src, DestTy, true);
+    assert(Res->getType() == DestTy);
+
+    uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
+    uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+    // If the high bits are already filled with sign bit, just replace this
+    // cast with the result.
+    if (ComputeNumSignBits(Res) > DestBitSize - SrcBitSize)
+      return ReplaceInstUsesWith(CI, Res);
+    
+    // We need to emit a shl + ashr to do the sign extend.
+    Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
+    return BinaryOperator::CreateAShr(Builder->CreateShl(Res, ShAmt, "sext"),
+                                      ShAmt);
+  }
+
+  // If this input is a trunc from our destination, then turn sext(trunc(x))
+  // into shifts.
+  if (TruncInst *TI = dyn_cast<TruncInst>(Src))
+    if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) {
+      uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
+      uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+      
+      // We need to emit a shl + ashr to do the sign extend.
+      Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
+      Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext");
+      return BinaryOperator::CreateAShr(Res, ShAmt);
+    }
    
-  // Canonicalize sign-extend from i1 to a select.
-  if (Src->getType()->isInteger(1))
-    return SelectInst::Create(Src,
-                              Constant::getAllOnesValue(CI.getType()),
-                              Constant::getNullValue(CI.getType()));
-
-  // See if the value being truncated is already sign extended.  If so, just
-  // eliminate the trunc/sext pair.
-  if (Operator::getOpcode(Src) == Instruction::Trunc) {
-    Value *Op = cast<User>(Src)->getOperand(0);
-    unsigned OpBits   = Op->getType()->getScalarSizeInBits();
-    unsigned MidBits  = Src->getType()->getScalarSizeInBits();
-    unsigned DestBits = CI.getType()->getScalarSizeInBits();
-    unsigned NumSignBits = ComputeNumSignBits(Op);
-
-    if (OpBits == DestBits) {
-      // Op is i32, Mid is i8, and Dest is i32.  If Op has more than 24 sign
-      // bits, it is already ready.
-      if (NumSignBits > DestBits-MidBits)
-        return ReplaceInstUsesWith(CI, Op);
-    } else if (OpBits < DestBits) {
-      // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
-      // bits, just sext from i32.
-      if (NumSignBits > OpBits-MidBits)
-        return new SExtInst(Op, CI.getType(), "tmp");
-    } else {
-      // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
-      // bits, just truncate to i32.
-      if (NumSignBits > OpBits-MidBits)
-        return new TruncInst(Op, CI.getType(), "tmp");
+  
+  // (x <s 0) ? -1 : 0 -> ashr x, 31   -> all ones if signed
+  // (x >s -1) ? -1 : 0 -> ashr x, 31  -> all ones if not signed
+  {
+  ICmpInst::Predicate Pred; Value *CmpLHS; ConstantInt *CmpRHS;
+  if (match(Src, m_ICmp(Pred, m_Value(CmpLHS), m_ConstantInt(CmpRHS)))) {
+    // sext (x <s  0) to i32 --> x>>s31       true if signbit set.
+    // sext (x >s -1) to i32 --> (x>>s31)^-1  true if signbit clear.
+    if ((Pred == ICmpInst::ICMP_SLT && CmpRHS->isZero()) ||
+        (Pred == ICmpInst::ICMP_SGT && CmpRHS->isAllOnesValue())) {
+      Value *Sh = ConstantInt::get(CmpLHS->getType(),
+                                   CmpLHS->getType()->getScalarSizeInBits()-1);
+      Value *In = Builder->CreateAShr(CmpLHS, Sh, CmpLHS->getName()+".lobit");
+      if (In->getType() != CI.getType())
+        In = Builder->CreateIntCast(In, CI.getType(), true/*SExt*/, "tmp");
+      
+      if (Pred == ICmpInst::ICMP_SGT)
+        In = Builder->CreateNot(In, In->getName()+".not");
+      return ReplaceInstUsesWith(CI, In);
+    }
+  }
+  }
+
+  // vector (x <s 0) ? -1 : 0 -> ashr x, 31   -> all ones if signed.
+  if (const VectorType *VTy = dyn_cast<VectorType>(DestTy)) {
+    ICmpInst::Predicate Pred; Value *CmpLHS;
+    if (match(Src, m_ICmp(Pred, m_Value(CmpLHS), m_Zero()))) {
+      if (Pred == ICmpInst::ICMP_SLT && CmpLHS->getType() == DestTy) {
+        const Type *EltTy = VTy->getElementType();
+
+        // splat the shift constant to a cosntant vector
+        Constant *Sh = ConstantInt::get(EltTy, EltTy->getScalarSizeInBits()-1);
+        std::vector<Constant *> Elts(VTy->getNumElements(), Sh);
+        Constant *VSh = ConstantVector::get(Elts);
+
+        Value *In = Builder->CreateAShr(CmpLHS, VSh,CmpLHS->getName()+".lobit");
+        return ReplaceInstUsesWith(CI, In);
+      }
      }
    }
  
@@ -873,19 +1052,17 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
    //   %a = shl i32 %i, 30
    //   %d = ashr i32 %a, 30
    Value *A = 0;
+  // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
    ConstantInt *BA = 0, *CA = 0;
-  if (match(Src, m_AShr(m_Shl(m_Value(A), m_ConstantInt(BA)),
+  if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)),
                          m_ConstantInt(CA))) &&
-      BA == CA && isa<TruncInst>(A)) {
-    Value *I = cast<TruncInst>(A)->getOperand(0);
-    if (I->getType() == CI.getType()) {
-      unsigned MidSize = Src->getType()->getScalarSizeInBits();
-      unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
-      unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
-      Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
-      I = Builder->CreateShl(I, ShAmtV, CI.getName());
-      return BinaryOperator::CreateAShr(I, ShAmtV);
-    }
+      BA == CA && A->getType() == CI.getType()) {
+    unsigned MidSize = Src->getType()->getScalarSizeInBits();
+    unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
+    unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
+    Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
+    A = Builder->CreateShl(A, ShAmtV, CI.getName());
+    return BinaryOperator::CreateAShr(A, ShAmtV);
    }
    
    return 0;
@@ -964,6 +1141,38 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
        break;  
      }
    }
+  
+  // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
+  // NOTE: This should be disabled by -fno-builtin-sqrt if we ever support it.
+  CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0));
+  if (Call && Call->getCalledFunction() &&
+      Call->getCalledFunction()->getName() == "sqrt" &&
+      Call->getNumArgOperands() == 1) {
+    CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0));
+    if (Arg && Arg->getOpcode() == Instruction::FPExt &&
+        CI.getType()->isFloatTy() &&
+        Call->getType()->isDoubleTy() &&
+        Arg->getType()->isDoubleTy() &&
+        Arg->getOperand(0)->getType()->isFloatTy()) {
+      Function *Callee = Call->getCalledFunction();
+      Module *M = CI.getParent()->getParent()->getParent();
+      Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", 
+                                                   Callee->getAttributes(),
+                                                   Builder->getFloatTy(),
+                                                   Builder->getFloatTy(),
+                                                   NULL);
+      CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0),
+                                       "sqrtfcall");
+      ret->setAttributes(Callee->getAttributes());
+      
+      
+      // Remove the old Call.  With -fmath-errno, it won't get marked readnone.
+      Call->replaceAllUsesWith(UndefValue::get(Call->getType()));
+      EraseInstFromFunction(*Call);
+      return ret;
+    }
+  }
+  
    return 0;
  }
  
@@ -1020,16 +1229,22 @@ Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
  }
  
  Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
-  // If the source integer type is larger than the intptr_t type for
-  // this target, do a trunc to the intptr_t type, then inttoptr of it.  This
-  // allows the trunc to be exposed to other transforms.  Don't do this for
-  // extending inttoptr's, because we don't know if the target sign or zero
-  // extends to pointers.
-  if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() >
-      TD->getPointerSizeInBits()) {
-    Value *P = Builder->CreateTrunc(CI.getOperand(0),
-                                    TD->getIntPtrType(CI.getContext()), "tmp");
-    return new IntToPtrInst(P, CI.getType());
+  // If the source integer type is not the intptr_t type for this target, do a
+  // trunc or zext to the intptr_t type, then inttoptr of it.  This allows the
+  // cast to be exposed to other transforms.
+  if (TD) {
+    if (CI.getOperand(0)->getType()->getScalarSizeInBits() >
+        TD->getPointerSizeInBits()) {
+      Value *P = Builder->CreateTrunc(CI.getOperand(0),
+                                      TD->getIntPtrType(CI.getContext()), "tmp");
+      return new IntToPtrInst(P, CI.getType());
+    }
+    if (CI.getOperand(0)->getType()->getScalarSizeInBits() <
+        TD->getPointerSizeInBits()) {
+      Value *P = Builder->CreateZExt(CI.getOperand(0),
+                                     TD->getIntPtrType(CI.getContext()), "tmp");
+      return new IntToPtrInst(P, CI.getType());
+    }
    }
    
    if (Instruction *I = commonCastTransforms(CI))
@@ -1091,22 +1306,278 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
  }
  
  Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
-  // If the destination integer type is smaller than the intptr_t type for
-  // this target, do a ptrtoint to intptr_t then do a trunc.  This allows the
-  // trunc to be exposed to other transforms.  Don't do this for extending
-  // ptrtoint's, because we don't know if the target sign or zero extends its
-  // pointers.
-  if (TD &&
-      CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) {
-    Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
-                                       TD->getIntPtrType(CI.getContext()),
-                                       "tmp");
-    return new TruncInst(P, CI.getType());
+  // If the destination integer type is not the intptr_t type for this target,
+  // do a ptrtoint to intptr_t then do a trunc or zext.  This allows the cast
+  // to be exposed to other transforms.
+  if (TD) {
+    if (CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) {
+      Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
+                                         TD->getIntPtrType(CI.getContext()),
+                                         "tmp");
+      return new TruncInst(P, CI.getType());
+    }
+    if (CI.getType()->getScalarSizeInBits() > TD->getPointerSizeInBits()) {
+      Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
+                                         TD->getIntPtrType(CI.getContext()),
+                                         "tmp");
+      return new ZExtInst(P, CI.getType());
+    }
    }
    
    return commonPointerCastTransforms(CI);
  }
  
+/// OptimizeVectorResize - This input value (which is known to have vector type)
+/// is being zero extended or truncated to the specified vector type.  Try to
+/// replace it with a shuffle (and vector/vector bitcast) if possible.
+///
+/// The source and destination vector types may have different element types.
+static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy,
+                                         InstCombiner &IC) {
+  // We can only do this optimization if the output is a multiple of the input
+  // element size, or the input is a multiple of the output element size.
+  // Convert the input type to have the same element type as the output.
+  const VectorType *SrcTy = cast<VectorType>(InVal->getType());
+  
+  if (SrcTy->getElementType() != DestTy->getElementType()) {
+    // The input types don't need to be identical, but for now they must be the
+    // same size.  There is no specific reason we couldn't handle things like
+    // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
+    // there yet. 
+    if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
+        DestTy->getElementType()->getPrimitiveSizeInBits())
+      return 0;
+    
+    SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
+    InVal = IC.Builder->CreateBitCast(InVal, SrcTy);
+  }
+  
+  // Now that the element types match, get the shuffle mask and RHS of the
+  // shuffle to use, which depends on whether we're increasing or decreasing the
+  // size of the input.
+  SmallVector<Constant*, 16> ShuffleMask;
+  Value *V2;
+  const IntegerType *Int32Ty = Type::getInt32Ty(SrcTy->getContext());
+  
+  if (SrcTy->getNumElements() > DestTy->getNumElements()) {
+    // If we're shrinking the number of elements, just shuffle in the low
+    // elements from the input and use undef as the second shuffle input.
+    V2 = UndefValue::get(SrcTy);
+    for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i)
+      ShuffleMask.push_back(ConstantInt::get(Int32Ty, i));
+    
+  } else {
+    // If we're increasing the number of elements, shuffle in all of the
+    // elements from InVal and fill the rest of the result elements with zeros
+    // from a constant zero.
+    V2 = Constant::getNullValue(SrcTy);
+    unsigned SrcElts = SrcTy->getNumElements();
+    for (unsigned i = 0, e = SrcElts; i != e; ++i)
+      ShuffleMask.push_back(ConstantInt::get(Int32Ty, i));
+
+    // The excess elements reference the first element of the zero input.
+    ShuffleMask.append(DestTy->getNumElements()-SrcElts,
+                       ConstantInt::get(Int32Ty, SrcElts));
+  }
+  
+  Constant *Mask = ConstantVector::get(ShuffleMask.data(), ShuffleMask.size());
+  return new ShuffleVectorInst(InVal, V2, Mask);
+}
+
+static bool isMultipleOfTypeSize(unsigned Value, const Type *Ty) {
+  return Value % Ty->getPrimitiveSizeInBits() == 0;
+}
+
+static unsigned getTypeSizeIndex(unsigned Value, const Type *Ty) {
+  return Value / Ty->getPrimitiveSizeInBits();
+}
+
+/// CollectInsertionElements - V is a value which is inserted into a vector of
+/// VecEltTy.  Look through the value to see if we can decompose it into
+/// insertions into the vector.  See the example in the comment for
+/// OptimizeIntegerToVectorInsertions for the pattern this handles.
+/// The type of V is always a non-zero multiple of VecEltTy's size.
+///
+/// This returns false if the pattern can't be matched or true if it can,
+/// filling in Elements with the elements found here.
+static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
+                                     SmallVectorImpl<Value*> &Elements,
+                                     const Type *VecEltTy) {
+  // Undef values never contribute useful bits to the result.
+  if (isa<UndefValue>(V)) return true;
+  
+  // If we got down to a value of the right type, we win, try inserting into the
+  // right element.
+  if (V->getType() == VecEltTy) {
+    // Inserting null doesn't actually insert any elements.
+    if (Constant *C = dyn_cast<Constant>(V))
+      if (C->isNullValue())
+        return true;
+    
+    // Fail if multiple elements are inserted into this slot.
+    if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0)
+      return false;
+    
+    Elements[ElementIndex] = V;
+    return true;
+  }
+  
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    // Figure out the # elements this provides, and bitcast it or slice it up
+    // as required.
+    unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(),
+                                        VecEltTy);
+    // If the constant is the size of a vector element, we just need to bitcast
+    // it to the right type so it gets properly inserted.
+    if (NumElts == 1)
+      return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
+                                      ElementIndex, Elements, VecEltTy);
+    
+    // Okay, this is a constant that covers multiple elements.  Slice it up into
+    // pieces and insert each element-sized piece into the vector.
+    if (!isa<IntegerType>(C->getType()))
+      C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
+                                       C->getType()->getPrimitiveSizeInBits()));
+    unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
+    const Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
+    
+    for (unsigned i = 0; i != NumElts; ++i) {
+      Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
+                                                               i*ElementSize));
+      Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
+      if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy))
+        return false;
+    }
+    return true;
+  }
+  
+  if (!V->hasOneUse()) return false;
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return false;
+  switch (I->getOpcode()) {
+  default: return false; // Unhandled case.
+  case Instruction::BitCast:
+    return CollectInsertionElements(I->getOperand(0), ElementIndex,
+                                    Elements, VecEltTy);  
+  case Instruction::ZExt:
+    if (!isMultipleOfTypeSize(
+                          I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
+                              VecEltTy))
+      return false;
+    return CollectInsertionElements(I->getOperand(0), ElementIndex,
+                                    Elements, VecEltTy);  
+  case Instruction::Or:
+    return CollectInsertionElements(I->getOperand(0), ElementIndex,
+                                    Elements, VecEltTy) &&
+           CollectInsertionElements(I->getOperand(1), ElementIndex,
+                                    Elements, VecEltTy);
+  case Instruction::Shl: {
+    // Must be shifting by a constant that is a multiple of the element size.
+    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (CI == 0) return false;
+    if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false;
+    unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy);
+    
+    return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift,
+                                    Elements, VecEltTy);
+  }
+      
+  }
+}
+
+
+/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we
+/// may be doing shifts and ors to assemble the elements of the vector manually.
+/// Try to rip the code out and replace it with insertelements.  This is to
+/// optimize code like this:
+///
+///    %tmp37 = bitcast float %inc to i32
+///    %tmp38 = zext i32 %tmp37 to i64
+///    %tmp31 = bitcast float %inc5 to i32
+///    %tmp32 = zext i32 %tmp31 to i64
+///    %tmp33 = shl i64 %tmp32, 32
+///    %ins35 = or i64 %tmp33, %tmp38
+///    %tmp43 = bitcast i64 %ins35 to <2 x float>
+///
+/// Into two insertelements that do "buildvector{%inc, %inc5}".
+static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
+                                                InstCombiner &IC) {
+  const VectorType *DestVecTy = cast<VectorType>(CI.getType());
+  Value *IntInput = CI.getOperand(0);
+
+  SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
+  if (!CollectInsertionElements(IntInput, 0, Elements,
+                                DestVecTy->getElementType()))
+    return 0;
+
+  // If we succeeded, we know that all of the element are specified by Elements
+  // or are zero if Elements has a null entry.  Recast this as a set of
+  // insertions.
+  Value *Result = Constant::getNullValue(CI.getType());
+  for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+    if (Elements[i] == 0) continue;  // Unset element.
+    
+    Result = IC.Builder->CreateInsertElement(Result, Elements[i],
+                                             IC.Builder->getInt32(i));
+  }
+  
+  return Result;
+}
+
+
+/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double
+/// bitcast.  The various long double bitcasts can't get in here.
+static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
+  Value *Src = CI.getOperand(0);
+  const Type *DestTy = CI.getType();
+
+  // If this is a bitcast from int to float, check to see if the int is an
+  // extraction from a vector.
+  Value *VecInput = 0;
+  // bitcast(trunc(bitcast(somevector)))
+  if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) &&
+      isa<VectorType>(VecInput->getType())) {
+    const VectorType *VecTy = cast<VectorType>(VecInput->getType());
+    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+
+    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) {
+      // If the element type of the vector doesn't match the result type,
+      // bitcast it to be a vector type we can extract from.
+      if (VecTy->getElementType() != DestTy) {
+        VecTy = VectorType::get(DestTy,
+                                VecTy->getPrimitiveSizeInBits() / DestWidth);
+        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
+      }
+    
+      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0));
+    }
+  }
+  
+  // bitcast(trunc(lshr(bitcast(somevector), cst))
+  ConstantInt *ShAmt = 0;
+  if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)),
+                                m_ConstantInt(ShAmt)))) &&
+      isa<VectorType>(VecInput->getType())) {
+    const VectorType *VecTy = cast<VectorType>(VecInput->getType());
+    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 &&
+        ShAmt->getZExtValue() % DestWidth == 0) {
+      // If the element type of the vector doesn't match the result type,
+      // bitcast it to be a vector type we can extract from.
+      if (VecTy->getElementType() != DestTy) {
+        VecTy = VectorType::get(DestTy,
+                                VecTy->getPrimitiveSizeInBits() / DestWidth);
+        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
+      }
+      
+      unsigned Elt = ShAmt->getZExtValue() / DestWidth;
+      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
+    }
+  }
+  return 0;
+}
+
  Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
    // If the operands are integer typed then apply the integer transforms,
    // otherwise just apply the common ones.
@@ -1144,7 +1615,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
        Constant::getNullValue(Type::getInt32Ty(CI.getContext()));
      unsigned NumZeros = 0;
      while (SrcElTy != DstElTy && 
-           isa<CompositeType>(SrcElTy) && !isa<PointerType>(SrcElTy) &&
+           isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() &&
             SrcElTy->getNumContainedTypes() /* not "{}" */) {
        SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt);
        ++NumZeros;
@@ -1157,18 +1628,43 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
                                                 ((Instruction*)NULL));
      }
    }
+  
+  // Try to optimize int -> float bitcasts.
+  if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy))
+    if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this))
+      return I;
  
    if (const VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
-    if (DestVTy->getNumElements() == 1 && !isa<VectorType>(SrcTy)) {
+    if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) {
        Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType());
        return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
                       Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
        // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast)
      }
+    
+    if (isa<IntegerType>(SrcTy)) {
+      // If this is a cast from an integer to vector, check to see if the input
+      // is a trunc or zext of a bitcast from vector.  If so, we can replace all
+      // the casts with a shuffle and (potentially) a bitcast.
+      if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) {
+        CastInst *SrcCast = cast<CastInst>(Src);
+        if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))
+          if (isa<VectorType>(BCIn->getOperand(0)->getType()))
+            if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0),
+                                               cast<VectorType>(DestTy), *this))
+              return I;
+      }
+      
+      // If the input is an 'or' instruction, we may be doing shifts and ors to
+      // assemble the elements of the vector manually.  Try to rip the code out
+      // and replace it with insertelements.
+      if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this))
+        return ReplaceInstUsesWith(CI, V);
+    }
    }
  
    if (const VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
-    if (SrcVTy->getNumElements() == 1 && !isa<VectorType>(DestTy)) {
+    if (SrcVTy->getNumElements() == 1 && !DestTy->isVectorTy()) {
        Value *Elem = 
          Builder->CreateExtractElement(Src,
                     Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
@@ -1178,8 +1674,8 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
  
    if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) {
      // Okay, we have (bitcast (shuffle ..)).  Check to see if this is
-    // a bitconvert to a vector with the same # elts.
-    if (SVI->hasOneUse() && isa<VectorType>(DestTy) && 
+    // a bitcast to a vector with the same # elts.
+    if (SVI->hasOneUse() && DestTy->isVectorTy() && 
          cast<VectorType>(DestTy)->getNumElements() ==
                SVI->getType()->getNumElements() &&
          SVI->getType()->getNumElements() ==
@@ -1201,7 +1697,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
      }
    }
    
-  if (isa<PointerType>(SrcTy))
+  if (SrcTy->isPointerTy())
      return commonPointerCastTransforms(CI);
    return commonCastTransforms(CI);
  }