#include "llvm/Analysis/Loads.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
-/// Hidden option to stress test load slicing, i.e., when this option
-/// is enabled, load slicing bypasses most of its profitability guards.
-/// It will also generate, uncanonalized form of slicing.
-static cl::opt<bool>
-StressLoadSlicing("instcombine-stress-load-slicing", cl::Hidden,
- cl::desc("Bypass the profitability model of load "
- "slicing"),
- cl::init(false));
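+// The STATISTIC and DEBUG macros used below expand in terms of DEBUG_TYPE,
+// so it must be defined here, after the includes.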
+#define DEBUG_TYPE "instcombine"
STATISTIC(NumDeadStore, "Number of dead stores eliminated");
STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
static bool pointsToConstantGlobal(Value *V) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
return GV->isConstant();
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
if (CE->getOpcode() == Instruction::BitCast ||
+ CE->getOpcode() == Instruction::AddrSpaceCast ||
CE->getOpcode() == Instruction::GetElementPtr)
return pointsToConstantGlobal(CE->getOperand(0));
+ }
return false;
}
/// can optimize this.
static bool
isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
- SmallVectorImpl<Instruction *> &ToDelete,
- bool IsOffset = false) {
+ SmallVectorImpl<Instruction *> &ToDelete) {
// We track lifetime intrinsics as we encounter them. If we decide to go
// ahead and replace the value with the global, this lets the caller quickly
// eliminate the markers.
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
- User *U = cast<Instruction>(*UI);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- // Ignore non-volatile loads, they are always ok.
- if (!LI->isSimple()) return false;
- continue;
- }
-
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
- // If uses of the bitcast are ok, we are ok.
- if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, ToDelete, IsOffset))
- return false;
- continue;
- }
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
- // If the GEP has all zero indices, it doesn't offset the pointer. If it
- // doesn't, it does.
- if (!isOnlyCopiedFromConstantGlobal(
- GEP, TheCopy, ToDelete, IsOffset || !GEP->hasAllZeroIndices()))
- return false;
- continue;
- }
-
- if (CallSite CS = U) {
- // If this is the function being called then we treat it like a load and
- // ignore it.
- if (CS.isCallee(UI))
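+ // Walk the uses with an explicit worklist instead of recursion. Each entry
+ // pairs a value to inspect with a flag recording whether the pointer has
+ // already been offset from the start of the alloca.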
+ SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
+ ValuesToInspect.push_back(std::make_pair(V, false));
+ while (!ValuesToInspect.empty()) {
+ auto ValuePair = ValuesToInspect.pop_back_val();
+ const bool IsOffset = ValuePair.second;
+ for (auto &U : ValuePair.first->uses()) {
+ Instruction *I = cast<Instruction>(U.getUser());
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ // Ignore non-volatile loads; they are always ok.
+ if (!LI->isSimple()) return false;
continue;
+ }
- // If this is a readonly/readnone call site, then we know it is just a
- // load (but one that potentially returns the value itself), so we can
- // ignore it if we know that the value isn't captured.
- unsigned ArgNo = CS.getArgumentNo(UI);
- if (CS.onlyReadsMemory() &&
- (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
+ if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
+ // If uses of the cast are ok, we are ok.
+ ValuesToInspect.push_back(std::make_pair(I, IsOffset));
continue;
-
- // If this is being passed as a byval argument, the caller is making a
- // copy, so it is only a read of the alloca.
- if (CS.isByValArgument(ArgNo))
+ }
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ // If the GEP has all zero indices, it doesn't offset the pointer;
+ // otherwise it does.
+ ValuesToInspect.push_back(
+ std::make_pair(I, IsOffset || !GEP->hasAllZeroIndices()));
continue;
- }
+ }
- // Lifetime intrinsics can be handled by the caller.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end) {
- assert(II->use_empty() && "Lifetime markers have no result to use!");
- ToDelete.push_back(II);
- continue;
+ if (CallSite CS = I) {
+ // If this is the function being called then we treat it like a load and
+ // ignore it.
+ if (CS.isCallee(&U))
+ continue;
+
+ // Inalloca arguments are clobbered by the call.
+ unsigned ArgNo = CS.getArgumentNo(&U);
+ if (CS.isInAllocaArgument(ArgNo))
+ return false;
+
+ // If this is a readonly/readnone call site, then we know it is just a
+ // load (but one that potentially returns the value itself), so we can
+ // ignore it if we know that the value isn't captured.
+ if (CS.onlyReadsMemory() &&
+ (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
+ continue;
+
+ // If this is being passed as a byval argument, the caller is making a
+ // copy, so it is only a read of the alloca.
+ if (CS.isByValArgument(ArgNo))
+ continue;
}
- }
- // If this is isn't our memcpy/memmove, reject it as something we can't
- // handle.
- MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
- if (MI == 0)
- return false;
+ // Lifetime intrinsics can be handled by the caller.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ assert(II->use_empty() && "Lifetime markers have no result to use!");
+ ToDelete.push_back(II);
+ continue;
+ }
+ }
- // If the transfer is using the alloca as a source of the transfer, then
- // ignore it since it is a load (unless the transfer is volatile).
- if (UI.getOperandNo() == 1) {
- if (MI->isVolatile()) return false;
- continue;
- }
+ // If this isn't our memcpy/memmove, reject it as something we can't
+ // handle.
+ MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
+ if (!MI)
+ return false;
- // If we already have seen a copy, reject the second one.
- if (TheCopy) return false;
+ // If the transfer is using the alloca as a source of the transfer, then
+ // ignore it since it is a load (unless the transfer is volatile).
+ if (U.getOperandNo() == 1) {
+ if (MI->isVolatile()) return false;
+ continue;
+ }
- // If the pointer has been offset from the start of the alloca, we can't
- // safely handle this.
- if (IsOffset) return false;
+ // If we already have seen a copy, reject the second one.
+ if (TheCopy) return false;
- // If the memintrinsic isn't using the alloca as the dest, reject it.
- if (UI.getOperandNo() != 0) return false;
+ // If the pointer has been offset from the start of the alloca, we can't
+ // safely handle this.
+ if (IsOffset) return false;
- // If the source of the memcpy/move is not a constant global, reject it.
- if (!pointsToConstantGlobal(MI->getSource()))
- return false;
+ // If the memintrinsic isn't using the alloca as the dest, reject it.
+ if (U.getOperandNo() != 0) return false;
+
+ // If the source of the memcpy/move is not a constant global, reject it.
+ if (!pointsToConstantGlobal(MI->getSource()))
+ return false;
- // Otherwise, the transform is safe. Remember the copy instruction.
- TheCopy = MI;
+ // Otherwise, the transform is safe. Remember the copy instruction.
+ TheCopy = MI;
+ }
}
return true;
}
static MemTransferInst *
isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
SmallVectorImpl<Instruction *> &ToDelete) {
- MemTransferInst *TheCopy = 0;
+ MemTransferInst *TheCopy = nullptr;
if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
return TheCopy;
- return 0;
+ return nullptr;
}
Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Ensure that the alloca array size argument has type intptr_t, so that
// any casting is exposed early.
- if (TD) {
- Type *IntPtrTy = TD->getIntPtrType(AI.getType());
+ if (DL) {
+ Type *IntPtrTy = DL->getIntPtrType(AI.getType());
if (AI.getArraySize()->getType() != IntPtrTy) {
Value *V = Builder->CreateIntCast(AI.getArraySize(),
IntPtrTy, false);
if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
Type *NewTy =
ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
- AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName());
+ AllocaInst *New = Builder->CreateAlloca(NewTy, nullptr, AI.getName());
New->setAlignment(AI.getAlignment());
// Scan to the end of the allocation instructions, to skip over a block of
// Now that I is pointing to the first non-allocation-inst in the block,
// insert our getelementptr instruction...
//
- Type *IdxTy = TD
- ? TD->getIntPtrType(AI.getType())
+ Type *IdxTy = DL
+ ? DL->getIntPtrType(AI.getType())
: Type::getInt64Ty(AI.getContext());
Value *NullIdx = Constant::getNullValue(IdxTy);
Value *Idx[2] = { NullIdx, NullIdx };
}
}
- if (TD && AI.getAllocatedType()->isSized()) {
+ if (DL && AI.getAllocatedType()->isSized()) {
// If the alignment is 0 (unspecified), assign it the preferred alignment.
if (AI.getAlignment() == 0)
- AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
+ AI.setAlignment(DL->getPrefTypeAlignment(AI.getAllocatedType()));
// Move all alloca's of zero byte objects to the entry block and merge them
// together. Note that we only do this for alloca's, because malloc should
// allocate and return a unique pointer, even for a zero byte allocation.
- if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) {
+ if (DL->getTypeAllocSize(AI.getAllocatedType()) == 0) {
// For a zero sized alloca there is no point in doing an array allocation.
// This is helpful if the array size is a complicated expression not used
// elsewhere.
// dominance as the array size was forced to a constant earlier already.
AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
- TD->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
+ DL->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
AI.moveBefore(FirstInst);
return &AI;
}
// assign it the preferred alignment.
if (EntryAI->getAlignment() == 0)
EntryAI->setAlignment(
- TD->getPrefTypeAlignment(EntryAI->getAllocatedType()));
+ DL->getPrefTypeAlignment(EntryAI->getAllocatedType()));
// Replace this zero-sized alloca with the one at the start of the entry
// block after ensuring that the address will be aligned enough for both
// types.
SmallVector<Instruction *, 4> ToDelete;
if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(),
- AI.getAlignment(), TD);
+ AI.getAlignment(), DL);
if (AI.getAlignment() <= SourceAlign) {
DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
EraseInstFromFunction(*ToDelete[i]);
Constant *TheSrc = cast<Constant>(Copy->getSource());
- Instruction *NewI
- = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc,
- AI.getType()));
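+ // The copied-from global may live in a different address space than the
+ // alloca, so use a cast that can also produce an addrspacecast when needed.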
+ Constant *Cast
+ = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
+ Instruction *NewI = ReplaceInstUsesWith(AI, Cast);
EraseInstFromFunction(*Copy);
++NumGlobalCopies;
return NewI;
/// InstCombineLoadCast - Fold 'load (cast P)' -> 'cast (load P)' when possible.
static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
- const DataLayout *TD) {
+ const DataLayout *DL) {
User *CI = cast<User>(LI.getOperand(0));
Value *CastOp = CI->getOperand(0);
// If the address spaces don't match, don't eliminate the cast.
if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
- return 0;
+ return nullptr;
Type *SrcPTy = SrcTy->getElementType();
if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
if (Constant *CSrc = dyn_cast<Constant>(CastOp))
if (ASrcTy->getNumElements() != 0) {
- Type *IdxTy = TD
- ? TD->getIntPtrType(SrcTy)
+ Type *IdxTy = DL
+ ? DL->getIntPtrType(SrcTy)
: Type::getInt64Ty(SrcTy->getContext());
Value *Idx = Constant::getNullValue(IdxTy);
Value *Idxs[2] = { Idx, Idx };
SrcPTy->isVectorTy()) &&
// Do not allow turning this into a load of an integer, which is then
// casted to a pointer, this pessimizes pointer analysis a lot.
- (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) &&
+ (SrcPTy->isPtrOrPtrVectorTy() ==
+ LI.getType()->isPtrOrPtrVectorTy()) &&
IC.getDataLayout()->getTypeSizeInBits(SrcPTy) ==
IC.getDataLayout()->getTypeSizeInBits(DestPTy)) {
NewLoad->setAlignment(LI.getAlignment());
NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope());
// Now cast the result of the load.
- return new BitCastInst(NewLoad, LI.getType());
- }
- }
- }
- return 0;
-}
-
-namespace {
- /// \brief Helper structure used to slice a load in smaller loads.
- struct LoadedSlice {
- // The last instruction that represent the slice. This should be a
- // truncate instruction.
- Instruction *Inst;
- // The original load instruction.
- LoadInst *Origin;
- // The right shift amount in bits from the original load.
- unsigned Shift;
-
- LoadedSlice(Instruction *Inst = NULL, LoadInst *Origin = NULL,
- unsigned Shift = 0)
- : Inst(Inst), Origin(Origin), Shift(Shift) {}
-
- LoadedSlice(const LoadedSlice& LS) : Inst(LS.Inst), Origin(LS.Origin),
- Shift(LS.Shift) {}
-
- /// \brief Get the bits used in a chunk of bits \p BitWidth large.
- /// \return Result is \p BitWidth and has used bits set to 1 and
- /// not used bits set to 0.
- APInt getUsedBits() const {
- // Reproduce the trunc(lshr) sequence:
- // - Start from the truncated value.
- // - Zero extend to the desired bit width.
- // - Shift left.
- assert(Origin && "No original load to compare against.");
- unsigned BitWidth = Origin->getType()->getPrimitiveSizeInBits();
- assert(Inst && "This slice is not bound to an instruction");
- assert(Inst->getType()->getPrimitiveSizeInBits() <= BitWidth &&
- "Extracted slice is smaller than the whole type!");
- APInt UsedBits(Inst->getType()->getPrimitiveSizeInBits(), 0);
- UsedBits.setAllBits();
- UsedBits = UsedBits.zext(BitWidth);
- UsedBits <<= Shift;
- return UsedBits;
- }
-
- /// \brief Get the size of the slice to be loaded in bytes.
- unsigned getLoadedSize() const {
- unsigned SliceSize = getUsedBits().countPopulation();
- assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
- return SliceSize / 8;
- }
-
- /// \brief Get the offset in bytes of this slice in the original chunk of
- /// bits, whose layout is defined by \p IsBigEndian.
- uint64_t getOffsetFromBase(bool IsBigEndian) const {
- assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not support.");
- uint64_t Offset = Shift / 8;
- unsigned TySizeInBytes = Origin->getType()->getPrimitiveSizeInBits() / 8;
- assert(!(Origin->getType()->getPrimitiveSizeInBits() & 0x7) &&
- "The size of the original loaded type is not a multiple of a"
- " byte.");
- // If Offset is bigger than TySizeInBytes, it means we are loading all
- // zeros. This should have been optimized before in the process.
- assert(TySizeInBytes > Offset &&
- "Invalid shift amount for given loaded size");
- if (IsBigEndian)
- Offset = TySizeInBytes - Offset - getLoadedSize();
- return Offset;
- }
-
- /// \brief Generate the sequence of instructions to load the slice
- /// represented by this object and redirect the uses of this slice to
- /// this new sequence of instructions.
- /// \pre this->Inst && this->Origin are valid Instructions.
- /// \return The last instruction of the sequence used to load the slice.
- Instruction *loadSlice(InstCombiner::BuilderTy &Builder,
- bool IsBigEndian) const {
- assert(Inst && Origin && "Unable to replace a non-existing slice.");
- Value *BaseAddr = Origin->getOperand(0);
- unsigned Alignment = Origin->getAlignment();
- Builder.SetInsertPoint(Origin);
- // Assume we are looking at a chunk of bytes.
- // BaseAddr = (i8*)BaseAddr.
- BaseAddr = Builder.CreateBitCast(BaseAddr, Builder.getInt8PtrTy(),
- "raw_cast");
- // Get the offset in that chunk of bytes w.r.t. the endianess.
- uint64_t Offset = getOffsetFromBase(IsBigEndian);
- if (Offset) {
- APInt APOffset(64, Offset);
- // BaseAddr = BaseAddr + Offset.
- BaseAddr = Builder.CreateInBoundsGEP(BaseAddr, Builder.getInt(APOffset),
- "raw_idx");
- }
-
- // Create the type of the loaded slice according to its size.
- Type *SliceType =
- Type::getIntNTy(Origin->getContext(), getLoadedSize() * 8);
-
- // Bit cast the raw pointer to the pointer type of the slice.
- BaseAddr = Builder.CreateBitCast(BaseAddr, SliceType->getPointerTo(),
- "cast");
-
- // Compute the new alignment.
- if (Offset != 0)
- Alignment = MinAlign(Alignment, Alignment + Offset);
-
- // Create the load for the slice.
- Instruction *LastInst = Builder.CreateAlignedLoad(BaseAddr, Alignment,
- Inst->getName()+".val");
- // If the final type is not the same as the loaded type, this means that
- // we have to pad with zero. Create a zero extend for that.
- Type * FinalType = Inst->getType();
- if (SliceType != FinalType)
- LastInst = cast<Instruction>(Builder.CreateZExt(LastInst, FinalType));
-
- // Update the IR to reflect the new access to the slice.
- Inst->replaceAllUsesWith(LastInst);
-
- return LastInst;
- }
-
- /// \brief Check if it would be profitable to expand this slice as an
- /// independant load.
- bool isProfitable() const {
- // Slicing is assumed to be profitable iff the chains leads to arithmetic
- // operations.
- SmallVector<const Instruction *, 8> Uses;
- Uses.push_back(Inst);
- do {
- const Instruction *Use = Uses.pop_back_val();
- for (Value::const_use_iterator UseIt = Use->use_begin(),
- UseItEnd = Use->use_end(); UseIt != UseItEnd; ++UseIt) {
- const Instruction *UseOfUse = cast<Instruction>(*UseIt);
- // Consider these instructions as arithmetic operations.
- if (isa<BinaryOperator>(UseOfUse) ||
- isa<CastInst>(UseOfUse) ||
- isa<PHINode>(UseOfUse) ||
- isa<GetElementPtrInst>(UseOfUse))
- return true;
- // No need to check if the Use has already been checked as we do not
- // insert any PHINode.
- Uses.push_back(UseOfUse);
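+ // A bitcast cannot change address spaces; if the new load produces a
+ // pointer in a different address space than the original result type,
+ // use an addrspacecast instead.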
+ PointerType *OldTy = dyn_cast<PointerType>(NewLoad->getType());
+ PointerType *NewTy = dyn_cast<PointerType>(LI.getType());
+ if (OldTy && NewTy &&
+ OldTy->getAddressSpace() != NewTy->getAddressSpace()) {
+ return new AddrSpaceCastInst(NewLoad, LI.getType());
}
- } while (!Uses.empty());
- DEBUG(dbgs() << "IC: Not a profitable slice " << *Inst << '\n');
- return false;
- }
- };
-}
-
-/// \brief Check the profitability of all involved LoadedSlice.
-/// Unless StressLoadSlicing is specified, this also returns false
-/// when slicing is not in the canonical form.
-/// The canonical form of sliced load is (1) two loads,
-/// which are (2) next to each other in memory.
-///
-/// FIXME: We may want to allow more slices to be created but
-/// this means other passes should know how to deal with all those
-/// slices.
-/// FIXME: We may want to split loads to different types, e.g.,
-/// int vs. float.
-static bool
-isSlicingProfitable(const SmallVectorImpl<LoadedSlice> &LoadedSlices,
- const APInt &UsedBits) {
- unsigned NbOfSlices = LoadedSlices.size();
- // Check (1).
- if (!StressLoadSlicing && NbOfSlices != 2)
- return false;
-
- // Check (2).
- if (!StressLoadSlicing && !UsedBits.isAllOnesValue()) {
- // Get rid of the unused bits on the right.
- APInt MemoryLayout = UsedBits.lshr(UsedBits.countTrailingZeros());
- // Get rid of the unused bits on the left.
- if (MemoryLayout.countLeadingZeros())
- MemoryLayout = MemoryLayout.trunc(MemoryLayout.getActiveBits());
- // Check that the chunk of memory is completely used.
- if (!MemoryLayout.isAllOnesValue())
- return false;
- }
- unsigned NbOfProfitableSlices = 0;
- for (unsigned CurrSlice = 0; CurrSlice < NbOfSlices; ++CurrSlice) {
- if (LoadedSlices[CurrSlice].isProfitable())
- ++NbOfProfitableSlices;
- else if (!StressLoadSlicing)
- return false;
- }
- // In Stress mode, we may have 0 profitable slice.
- // Check that here.
- // In non-Stress mode, all the slices are profitable at this point.
- return NbOfProfitableSlices > 0;
-}
-
-/// \brief If the given load, \p LI, is used only by trunc or trunc(lshr)
-/// operations, split it in the various pieces being extracted.
-///
-/// This sort of thing is introduced by SROA.
-/// This slicing takes care not to insert overlapping loads.
-/// \pre LI is a simple load (i.e., not an atomic or volatile load).
-static Instruction *sliceUpLoadInst(LoadInst &LI,
- InstCombiner::BuilderTy &Builder,
- DataLayout &TD) {
- assert(LI.isSimple() && "We are trying to transform a non-simple load!");
-
- // FIXME: If we want to support floating point and vector types, we should
- // support bitcast and extract/insert element instructions.
- Type *LITy = LI.getType();
- if (!LITy->isIntegerTy()) return 0;
-
- // Keep track of already used bits to detect overlapping values.
- // In that case, we will just abort the transformation.
- APInt UsedBits(LITy->getPrimitiveSizeInBits(), 0);
-
- SmallVector<LoadedSlice, 4> LoadedSlices;
-
- // Check if this load is used as several smaller chunks of bits.
- // Basically, look for uses in trunc or trunc(lshr) and record a new chain
- // of computation for each trunc.
- for (Value::use_iterator UI = LI.use_begin(), UIEnd = LI.use_end();
- UI != UIEnd; ++UI) {
- Instruction *User = cast<Instruction>(*UI);
- unsigned Shift = 0;
-
- // Check if this is a trunc(lshr).
- if (User->getOpcode() == Instruction::LShr && User->hasOneUse() &&
- isa<ConstantInt>(User->getOperand(1))) {
- Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue();
- User = User->use_back();
+ return new BitCastInst(NewLoad, LI.getType());
+ }
}
-
- // At this point, User is a TruncInst, iff we encountered, trunc or
- // trunc(lshr).
- if (!isa<TruncInst>(User))
- return 0;
-
- // The width of the type must be a power of 2 and greater than 8-bits.
- // Otherwise the load cannot be represented in LLVM IR.
- // Moreover, if we shifted with a non 8-bits multiple, the slice
- // will be accross several bytes. We do not support that.
- unsigned Width = User->getType()->getPrimitiveSizeInBits();
- if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
- return 0;
-
- // Build the slice for this chain of computations.
- LoadedSlice LS(User, &LI, Shift);
- APInt CurrentUsedBits = LS.getUsedBits();
-
- // Check if this slice overlaps with another.
- if ((CurrentUsedBits & UsedBits) != 0)
- return 0;
- // Update the bits used globally.
- UsedBits |= CurrentUsedBits;
-
- // Record the slice.
- LoadedSlices.push_back(LS);
- }
-
- // Abort slicing if it does not seem to be profitable.
- if (!isSlicingProfitable(LoadedSlices, UsedBits))
- return 0;
-
- // Rewrite each chain to use an independent load.
- // By construction, each chain can be represented by a unique load.
- bool IsBigEndian = TD.isBigEndian();
- for (SmallVectorImpl<LoadedSlice>::const_iterator LSIt = LoadedSlices.begin(),
- LSItEnd = LoadedSlices.end(); LSIt != LSItEnd; ++LSIt) {
- Instruction *SliceInst = LSIt->loadSlice(Builder, IsBigEndian);
- (void)SliceInst;
- DEBUG(dbgs() << "IC: Replacing " << *LSIt->Inst << "\n"
- " with " << *SliceInst << '\n');
}
- return 0; // Don't do anything with LI.
+ return nullptr;
}
Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
Value *Op = LI.getOperand(0);
// Attempt to improve the alignment.
- if (TD) {
+ if (DL) {
unsigned KnownAlign =
- getOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()),TD);
+ getOrEnforceKnownAlignment(Op, DL->getPrefTypeAlignment(LI.getType()),DL);
unsigned LoadAlign = LI.getAlignment();
unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign :
- TD->getABITypeAlignment(LI.getType());
+ DL->getABITypeAlignment(LI.getType());
if (KnownAlign > EffectiveLoadAlign)
LI.setAlignment(KnownAlign);
// load (cast X) --> cast (load X) iff safe.
if (isa<CastInst>(Op))
- if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+ if (Instruction *Res = InstCombineLoadCast(*this, LI, DL))
return Res;
// None of the following transforms are legal for volatile/atomic loads.
// FIXME: Some of it is okay for atomic loads; needs refactoring.
- if (!LI.isSimple()) return 0;
+ if (!LI.isSimple()) return nullptr;
// Do really simple store-to-load forwarding and load CSE, to catch cases
// where there are several consecutive memory accesses to the same location,
// Instcombine load (constantexpr_cast global) -> cast (load global)
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op))
if (CE->isCast())
- if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+ if (Instruction *Res = InstCombineLoadCast(*this, LI, DL))
return Res;
if (Op->hasOneUse()) {
if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
// load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
unsigned Align = LI.getAlignment();
- if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, TD) &&
- isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, TD)) {
+ if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, DL) &&
+ isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, DL)) {
LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1),
SI->getOperand(1)->getName()+".val");
LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2),
}
}
}
-
- // Try to split a load in smaller non-overlapping loads to expose independant
- // chain of computations and get rid of trunc/lshr sequence of code.
- // The data layout is required for that operation, as code generation will
- // change with respect to endianess.
- if (TD)
- return sliceUpLoadInst(LI, *Builder, *TD);
- return 0;
+ return nullptr;
}
/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
User *CI = cast<User>(SI.getOperand(1));
Value *CastOp = CI->getOperand(0);
- Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+ Type *DestPTy = CI->getType()->getPointerElementType();
PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
- if (SrcTy == 0) return 0;
+ if (!SrcTy) return nullptr;
Type *SrcPTy = SrcTy->getElementType();
if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy())
- return 0;
+ return nullptr;
/// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
/// to its first element. This allows us to handle things like:
}
if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy())
- return 0;
+ return nullptr;
+
+ // If the pointers point into different address spaces, don't do the
+ // transformation.
+ if (SrcTy->getAddressSpace() != CI->getType()->getPointerAddressSpace())
+ return nullptr;
- // If the pointers point into different address spaces or if they point to
- // values with different sizes, we can't do the transformation.
+ // If the pointers point to values of different sizes, don't do the
+ // transformation.
if (!IC.getDataLayout() ||
- SrcTy->getAddressSpace() !=
- cast<PointerType>(CI->getType())->getAddressSpace() ||
IC.getDataLayout()->getTypeSizeInBits(SrcPTy) !=
IC.getDataLayout()->getTypeSizeInBits(DestPTy))
- return 0;
+ return nullptr;
+
+ // If the pointers point to pointers to different address spaces, don't do the
+ // transformation. It is not safe to introduce an addrspacecast instruction in
+ // this case since, depending on the target, addrspacecast may not be a no-op
+ // cast.
+ if (SrcPTy->isPointerTy() && DestPTy->isPointerTy() &&
+ SrcPTy->getPointerAddressSpace() != DestPTy->getPointerAddressSpace())
+ return nullptr;
// Okay, we are casting from one integer or pointer type to another of
// the same size. Instead of casting the pointer before
// the store, cast the value to be stored.
Value *NewCast;
- Value *SIOp0 = SI.getOperand(0);
Instruction::CastOps opcode = Instruction::BitCast;
- Type* CastSrcTy = SIOp0->getType();
+ Type* CastSrcTy = DestPTy;
Type* CastDstTy = SrcPTy;
if (CastDstTy->isPointerTy()) {
if (CastSrcTy->isIntegerTy())
opcode = Instruction::IntToPtr;
} else if (CastDstTy->isIntegerTy()) {
- if (SIOp0->getType()->isPointerTy())
+ if (CastSrcTy->isPointerTy())
opcode = Instruction::PtrToInt;
}
if (!NewGEPIndices.empty())
CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices);
+ Value *SIOp0 = SI.getOperand(0);
NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy,
SIOp0->getName()+".c");
SI.setOperand(0, NewCast);
Value *Ptr = SI.getOperand(1);
// Attempt to improve the alignment.
- if (TD) {
+ if (DL) {
unsigned KnownAlign =
- getOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()),
- TD);
+ getOrEnforceKnownAlignment(Ptr, DL->getPrefTypeAlignment(Val->getType()),
+ DL);
unsigned StoreAlign = SI.getAlignment();
unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign :
- TD->getABITypeAlignment(Val->getType());
+ DL->getABITypeAlignment(Val->getType());
if (KnownAlign > EffectiveStoreAlign)
SI.setAlignment(KnownAlign);
// Don't hack volatile/atomic stores.
// FIXME: Some bits are legal for atomic stores; needs refactoring.
- if (!SI.isSimple()) return 0;
+ if (!SI.isSimple()) return nullptr;
// If the RHS is an alloca with a single use, zapify the store, making the
// alloca dead.
if (Instruction *U = dyn_cast<Instruction>(Val))
Worklist.Add(U); // Dropped a use.
}
- return 0; // Do not modify these!
+ return nullptr; // Do not modify these!
}
// store undef, Ptr -> noop
if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
if (BI->isUnconditional())
if (SimplifyStoreAtEndOfBlock(SI))
- return 0; // xform done!
+ return nullptr; // xform done!
- return 0;
+ return nullptr;
}
/// SimplifyStoreAtEndOfBlock - Turn things like:
// the other predecessor.
pred_iterator PI = pred_begin(DestBB);
BasicBlock *P = *PI;
- BasicBlock *OtherBB = 0;
+ BasicBlock *OtherBB = nullptr;
if (P != StoreBB)
OtherBB = P;
// If the other block ends in an unconditional branch, check for the 'if then
// else' case. There is an instruction before the branch.
- StoreInst *OtherStore = 0;
+ StoreInst *OtherStore = nullptr;
if (OtherBr->isUnconditional()) {
--BBI;
// Skip over debugging info.
InsertNewInstBefore(NewSI, *BBI);
NewSI->setDebugLoc(OtherStore->getDebugLoc());
- // If the two stores had the same TBAA tag, preserve it.
- if (MDNode *TBAATag = SI.getMetadata(LLVMContext::MD_tbaa))
- if ((TBAATag = MDNode::getMostGenericTBAA(TBAATag,
- OtherStore->getMetadata(LLVMContext::MD_tbaa))))
- NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
-
+ // If the two stores had AA tags, merge them.
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ if (AATags) {
+ OtherStore->getAAMetadata(AATags, /* Merge = */ true);
+ NewSI->setAAMetadata(AATags);
+ }
// Nuke the old stores.
EraseInstFromFunction(SI);