AArch64/ARM64: move ARM64 into AArch64's place

[oota-llvm.git] / lib / Transforms / InstCombine / InstCombineCalls.cpp
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp

index fe8c1b0baf6e1626ce738f8babbaf9711155fa4e..be1b5aa50b187981b4619a9f9614e771e40362e9 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -14,14 +14,16 @@
  #include "InstCombine.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/CallSite.h"
  #include "llvm/IR/DataLayout.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/PatternMatch.h"
+#include "llvm/IR/PatternMatch.h"
  #include "llvm/Transforms/Utils/BuildLibCalls.h"
  #include "llvm/Transforms/Utils/Local.h"
  using namespace llvm;
  using namespace PatternMatch;
  
+#define DEBUG_TYPE "instcombine"
+
  STATISTIC(NumSimplified, "Number of library calls simplified");
  
  /// getPromotedType - Return the specified type promoted as it would be to pass
@@ -56,8 +58,8 @@ static Type *reduceToSingleValueType(Type *T) {
  }
  
  Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
-  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), TD);
-  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), TD);
+  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL);
+  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL);
    unsigned MinAlign = std::min(DstAlign, SrcAlign);
    unsigned CopyAlign = MI->getAlignment();
  
@@ -70,7 +72,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
    // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
    // load/store.
    ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
-  if (MemOpLength == 0) return 0;
+  if (!MemOpLength) return nullptr;
  
    // Source and destination pointer types are always "i8*" for intrinsic.  See
    // if the size is something we can handle with a single primitive load/store.
@@ -80,7 +82,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
    assert(Size && "0-sized memory transferring should be removed already.");
  
    if (Size > 8 || (Size&(Size-1)))
-    return 0;  // If not 1/2/4/8 bytes, exit.
+    return nullptr;  // If not 1/2/4/8 bytes, exit.
  
    // Use an integer load+store unless we can find something better.
    unsigned SrcAddrSp =
@@ -99,11 +101,11 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
    // dest address will be promotable.  See if we can find a better type than the
    // integer datatype.
    Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts();
-  MDNode *CopyMD = 0;
+  MDNode *CopyMD = nullptr;
    if (StrippedDest != MI->getArgOperand(0)) {
      Type *SrcETy = cast<PointerType>(StrippedDest->getType())
                                      ->getElementType();
-    if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
+    if (DL && SrcETy->isSized() && DL->getTypeStoreSize(SrcETy) == Size) {
        // The SrcETy might be something like {{{double}}} or [1 x double].  Rip
        // down through these levels if so.
        SrcETy = reduceToSingleValueType(SrcETy);
@@ -152,7 +154,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
  }
  
  Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
-  unsigned Alignment = getKnownAlignment(MI->getDest(), TD);
+  unsigned Alignment = getKnownAlignment(MI->getDest(), DL);
    if (MI->getAlignment() < Alignment) {
      MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
                                               Alignment, false));
@@ -163,7 +165,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
    ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
    ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
    if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
-    return 0;
+    return nullptr;
    uint64_t Len = LenC->getLimitedValue();
    Alignment = MI->getAlignment();
    assert(Len && "0-sized memory setting should be removed already.");
@@ -191,7 +193,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
      return MI;
    }
  
-  return 0;
+  return nullptr;
  }
  
  /// visitCallInst - CallInst simplification.  This mostly only handles folding
@@ -233,7 +235,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
  
      // No other transformations apply to volatile transfers.
      if (MI->isVolatile())
-      return 0;
+      return nullptr;
  
      // If we have a memmove and the source operation is a constant global,
      // then the source and dest pointers can't alias, so we can change this
@@ -274,13 +276,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    default: break;
    case Intrinsic::objectsize: {
      uint64_t Size;
-    if (getObjectSize(II->getArgOperand(0), Size, TD, TLI))
+    if (getObjectSize(II->getArgOperand(0), Size, DL, TLI))
        return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size));
-    return 0;
+    return nullptr;
    }
    case Intrinsic::bswap: {
      Value *IIOperand = II->getArgOperand(0);
-    Value *X = 0;
+    Value *X = nullptr;
  
      // bswap(bswap(x)) -> x
      if (match(IIOperand, m_BSwap(m_Value(X))))
@@ -320,7 +322,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      uint32_t BitWidth = IT->getBitWidth();
      APInt KnownZero(BitWidth, 0);
      APInt KnownOne(BitWidth, 0);
-    ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne);
+    computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne);
      unsigned TrailingZeros = KnownOne.countTrailingZeros();
      APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros));
      if ((Mask & KnownZero) == Mask)
@@ -338,7 +340,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      uint32_t BitWidth = IT->getBitWidth();
      APInt KnownZero(BitWidth, 0);
      APInt KnownOne(BitWidth, 0);
-    ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne);
+    computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne);
      unsigned LeadingZeros = KnownOne.countLeadingZeros();
      APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros));
      if ((Mask & KnownZero) == Mask)
@@ -353,14 +355,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      uint32_t BitWidth = IT->getBitWidth();
      APInt LHSKnownZero(BitWidth, 0);
      APInt LHSKnownOne(BitWidth, 0);
-    ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne);
+    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
      bool LHSKnownNegative = LHSKnownOne[BitWidth - 1];
      bool LHSKnownPositive = LHSKnownZero[BitWidth - 1];
  
      if (LHSKnownNegative || LHSKnownPositive) {
        APInt RHSKnownZero(BitWidth, 0);
        APInt RHSKnownOne(BitWidth, 0);
-      ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne);
+      computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
        bool RHSKnownNegative = RHSKnownOne[BitWidth - 1];
        bool RHSKnownPositive = RHSKnownZero[BitWidth - 1];
        if (LHSKnownNegative && RHSKnownNegative) {
@@ -447,10 +449,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
  
      APInt LHSKnownZero(BitWidth, 0);
      APInt LHSKnownOne(BitWidth, 0);
-    ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne);
+    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
      APInt RHSKnownZero(BitWidth, 0);
      APInt RHSKnownOne(BitWidth, 0);
-    ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne);
+    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
  
      // Get the largest possible values for each operand.
      APInt LHSMax = ~LHSKnownZero;
@@ -504,7 +506,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    case Intrinsic::ppc_altivec_lvx:
    case Intrinsic::ppc_altivec_lvxl:
      // Turn PPC lvx -> load if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL) >= 16) {
        Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
                                           PointerType::getUnqual(II->getType()));
        return new LoadInst(Ptr);
@@ -513,7 +515,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    case Intrinsic::ppc_altivec_stvx:
    case Intrinsic::ppc_altivec_stvxl:
      // Turn stvx -> store if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, TD) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL) >= 16) {
        Type *OpPtrTy =
          PointerType::getUnqual(II->getArgOperand(0)->getType());
        Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
@@ -524,7 +526,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    case Intrinsic::x86_sse2_storeu_pd:
    case Intrinsic::x86_sse2_storeu_dq:
      // Turn X86 storeu -> store if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL) >= 16) {
        Type *OpPtrTy =
          PointerType::getUnqual(II->getArgOperand(1)->getType());
        Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
@@ -554,6 +556,79 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      break;
    }
  
+  // Constant fold <A x Bi> << Ci.
+  // FIXME: We don't handle _dq because it's a shift of an i128, but is
+  // represented in the IR as <2 x i64>. A per element shift is wrong.
+  case Intrinsic::x86_sse2_psll_d:
+  case Intrinsic::x86_sse2_psll_q:
+  case Intrinsic::x86_sse2_psll_w:
+  case Intrinsic::x86_sse2_pslli_d:
+  case Intrinsic::x86_sse2_pslli_q:
+  case Intrinsic::x86_sse2_pslli_w:
+  case Intrinsic::x86_avx2_psll_d:
+  case Intrinsic::x86_avx2_psll_q:
+  case Intrinsic::x86_avx2_psll_w:
+  case Intrinsic::x86_avx2_pslli_d:
+  case Intrinsic::x86_avx2_pslli_q:
+  case Intrinsic::x86_avx2_pslli_w:
+  case Intrinsic::x86_sse2_psrl_d:
+  case Intrinsic::x86_sse2_psrl_q:
+  case Intrinsic::x86_sse2_psrl_w:
+  case Intrinsic::x86_sse2_psrli_d:
+  case Intrinsic::x86_sse2_psrli_q:
+  case Intrinsic::x86_sse2_psrli_w:
+  case Intrinsic::x86_avx2_psrl_d:
+  case Intrinsic::x86_avx2_psrl_q:
+  case Intrinsic::x86_avx2_psrl_w:
+  case Intrinsic::x86_avx2_psrli_d:
+  case Intrinsic::x86_avx2_psrli_q:
+  case Intrinsic::x86_avx2_psrli_w: {
+    // Simplify if count is constant. To 0 if >= BitWidth,
+    // otherwise to shl/lshr.
+    auto CDV = dyn_cast<ConstantDataVector>(II->getArgOperand(1));
+    auto CInt = dyn_cast<ConstantInt>(II->getArgOperand(1));
+    if (!CDV && !CInt)
+      break;
+    ConstantInt *Count;
+    if (CDV)
+      Count = cast<ConstantInt>(CDV->getElementAsConstant(0));
+    else
+      Count = CInt;
+
+    auto Vec = II->getArgOperand(0);
+    auto VT = cast<VectorType>(Vec->getType());
+    if (Count->getZExtValue() >
+        VT->getElementType()->getPrimitiveSizeInBits() - 1)
+      return ReplaceInstUsesWith(
+          CI, ConstantAggregateZero::get(Vec->getType()));
+
+    bool isPackedShiftLeft = true;
+    switch (II->getIntrinsicID()) {
+    default : break;
+    case Intrinsic::x86_sse2_psrl_d:
+    case Intrinsic::x86_sse2_psrl_q:
+    case Intrinsic::x86_sse2_psrl_w:
+    case Intrinsic::x86_sse2_psrli_d:
+    case Intrinsic::x86_sse2_psrli_q:
+    case Intrinsic::x86_sse2_psrli_w:
+    case Intrinsic::x86_avx2_psrl_d:
+    case Intrinsic::x86_avx2_psrl_q:
+    case Intrinsic::x86_avx2_psrl_w:
+    case Intrinsic::x86_avx2_psrli_d:
+    case Intrinsic::x86_avx2_psrli_q:
+    case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break;
+    }
+
+    unsigned VWidth = VT->getNumElements();
+    // Get a constant vector of the same type as the first operand.
+    auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue());
+    if (isPackedShiftLeft)
+      return BinaryOperator::CreateShl(Vec,
+          Builder->CreateVectorSplat(VWidth, VTCI));
+
+    return BinaryOperator::CreateLShr(Vec,
+        Builder->CreateVectorSplat(VWidth, VTCI));
+  }
  
    case Intrinsic::x86_sse41_pmovsxbw:
    case Intrinsic::x86_sse41_pmovsxwd:
@@ -576,6 +651,113 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      break;
    }
  
+  case Intrinsic::x86_sse4a_insertqi: {
+    // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
+    // ones undef
+    // TODO: eventually we should lower this intrinsic to IR
+    if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+      if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
+        if (CIWidth->equalsInt(64) && CIStart->isZero()) {
+          Value *Vec = II->getArgOperand(1);
+          Value *Undef = UndefValue::get(Vec->getType());
+          const uint32_t Mask[] = { 0, 2 };
+          return ReplaceInstUsesWith(
+              CI,
+              Builder->CreateShuffleVector(
+                  Vec, Undef, ConstantDataVector::get(
+                                  II->getContext(), ArrayRef<uint32_t>(Mask))));
+
+        } else if (auto Source =
+                       dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+          if (Source->hasOneUse() &&
+              Source->getArgOperand(1) == II->getArgOperand(1)) {
+            // If the source of the insert has only one use and it's another
+            // insert (and they're both inserting from the same vector), try to
+            // bundle both together.
+            auto CISourceWidth =
+                dyn_cast<ConstantInt>(Source->getArgOperand(2));
+            auto CISourceStart =
+                dyn_cast<ConstantInt>(Source->getArgOperand(3));
+            if (CISourceStart && CISourceWidth) {
+              unsigned Start = CIStart->getZExtValue();
+              unsigned Width = CIWidth->getZExtValue();
+              unsigned End = Start + Width;
+              unsigned SourceStart = CISourceStart->getZExtValue();
+              unsigned SourceWidth = CISourceWidth->getZExtValue();
+              unsigned SourceEnd = SourceStart + SourceWidth;
+              unsigned NewStart, NewWidth;
+              bool ShouldReplace = false;
+              if (Start <= SourceStart && SourceStart <= End) {
+                NewStart = Start;
+                NewWidth = std::max(End, SourceEnd) - NewStart;
+                ShouldReplace = true;
+              } else if (SourceStart <= Start && Start <= SourceEnd) {
+                NewStart = SourceStart;
+                NewWidth = std::max(SourceEnd, End) - NewStart;
+                ShouldReplace = true;
+              }
+
+              if (ShouldReplace) {
+                Constant *ConstantWidth = ConstantInt::get(
+                    II->getArgOperand(2)->getType(), NewWidth, false);
+                Constant *ConstantStart = ConstantInt::get(
+                    II->getArgOperand(3)->getType(), NewStart, false);
+                Value *Args[4] = { Source->getArgOperand(0),
+                                   II->getArgOperand(1), ConstantWidth,
+                                   ConstantStart };
+                Module *M = CI.getParent()->getParent()->getParent();
+                Value *F =
+                    Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
+                return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
+              }
+            }
+          }
+        }
+      }
+    }
+    break;
+  }
+
+  case Intrinsic::x86_avx_vpermilvar_ps:
+  case Intrinsic::x86_avx_vpermilvar_ps_256:
+  case Intrinsic::x86_avx_vpermilvar_pd:
+  case Intrinsic::x86_avx_vpermilvar_pd_256: {
+    // Convert vpermil* to shufflevector if the mask is constant.
+    Value *V = II->getArgOperand(1);
+    unsigned Size = cast<VectorType>(V->getType())->getNumElements();
+    assert(Size == 8 || Size == 4 || Size == 2);
+    uint32_t Indexes[8];
+    if (auto C = dyn_cast<ConstantDataVector>(V)) {
+      // The intrinsics only read one or two bits, clear the rest.
+      for (unsigned I = 0; I < Size; ++I) {
+        uint32_t Index = C->getElementAsInteger(I) & 0x3;
+        if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd ||
+            II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256)
+          Index >>= 1;
+        Indexes[I] = Index;
+      }
+    } else if (isa<ConstantAggregateZero>(V)) {
+      for (unsigned I = 0; I < Size; ++I)
+        Indexes[I] = 0;
+    } else {
+      break;
+    }
+    // The _256 variants are a bit trickier since the mask bits always index
+    // into the corresponding 128 half. In order to convert to a generic
+    // shuffle, we have to make that explicit.
+    if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 ||
+        II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) {
+      for (unsigned I = Size / 2; I < Size; ++I)
+        Indexes[I] += Size / 2;
+    }
+    auto NewC =
+        ConstantDataVector::get(V->getContext(), makeArrayRef(Indexes, Size));
+    auto V1 = II->getArgOperand(0);
+    auto V2 = UndefValue::get(V1->getType());
+    auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC);
+    return ReplaceInstUsesWith(CI, Shuffle);
+  }
+
    case Intrinsic::ppc_altivec_vperm:
      // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
      if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
@@ -586,8 +768,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
        bool AllEltsOk = true;
        for (unsigned i = 0; i != 16; ++i) {
          Constant *Elt = Mask->getAggregateElement(i);
-        if (Elt == 0 ||
-            !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
+        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
            AllEltsOk = false;
            break;
          }
@@ -612,7 +793,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
              cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
            Idx &= 31;  // Match the hardware behavior.
  
-          if (ExtractedElts[Idx] == 0) {
+          if (!ExtractedElts[Idx]) {
              ExtractedElts[Idx] =
                Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1,
                                              Builder->getInt32(Idx&15));
@@ -641,7 +822,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane: {
-    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), TD);
+    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL);
      unsigned AlignArg = II->getNumArgOperands() - 1;
      ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
      if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
@@ -654,7 +835,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    }
  
    case Intrinsic::arm_neon_vmulls:
-  case Intrinsic::arm_neon_vmullu: {
+  case Intrinsic::arm_neon_vmullu:
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull: {
      Value *Arg0 = II->getArgOperand(0);
      Value *Arg1 = II->getArgOperand(1);
  
@@ -664,7 +847,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      }
  
      // Check for constant LHS & RHS - in this case we just simplify.
-    bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu);
+    bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu ||
+                 II->getIntrinsicID() == Intrinsic::aarch64_neon_umull);
      VectorType *NewVT = cast<VectorType>(II->getType());
      if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
        if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
@@ -747,7 +931,7 @@ Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
  /// passed through the varargs area, we can eliminate the use of the cast.
  static bool isSafeToEliminateVarargsCast(const CallSite CS,
                                           const CastInst * const CI,
-                                         const DataLayout * const TD,
+                                         const DataLayout * const DL,
                                           const int ix) {
    if (!CI->isLosslessCast())
      return false;
@@ -763,7 +947,7 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
    Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
    if (!SrcTy->isSized() || !DstTy->isSized())
      return false;
-  if (!TD || TD->getTypeAllocSize(SrcTy) != TD->getTypeAllocSize(DstTy))
+  if (!DL || DL->getTypeAllocSize(SrcTy) != DL->getTypeAllocSize(DstTy))
      return false;
    return true;
  }
@@ -772,15 +956,15 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
  // Currently we're only working with the checking functions, memcpy_chk,
  // mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk,
  // strcat_chk and strncat_chk.
-Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *TD) {
-  if (CI->getCalledFunction() == 0) return 0;
+Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *DL) {
+  if (!CI->getCalledFunction()) return nullptr;
  
    if (Value *With = Simplifier->optimizeCall(CI)) {
      ++NumSimplified;
      return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With);
    }
  
-  return 0;
+  return nullptr;
  }
  
  static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) {
@@ -788,37 +972,36 @@ static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) {
    // is good enough in practice and simpler than handling any number of casts.
    Value *Underlying = TrampMem->stripPointerCasts();
    if (Underlying != TrampMem &&
-      (!Underlying->hasOneUse() || *Underlying->use_begin() != TrampMem))
-    return 0;
+      (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
+    return nullptr;
    if (!isa<AllocaInst>(Underlying))
-    return 0;
+    return nullptr;
  
-  IntrinsicInst *InitTrampoline = 0;
-  for (Value::use_iterator I = TrampMem->use_begin(), E = TrampMem->use_end();
-       I != E; I++) {
-    IntrinsicInst *II = dyn_cast<IntrinsicInst>(*I);
+  IntrinsicInst *InitTrampoline = nullptr;
+  for (User *U : TrampMem->users()) {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
      if (!II)
-      return 0;
+      return nullptr;
      if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
        if (InitTrampoline)
          // More than one init_trampoline writes to this value.  Give up.
-        return 0;
+        return nullptr;
        InitTrampoline = II;
        continue;
      }
      if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
        // Allow any number of calls to adjust.trampoline.
        continue;
-    return 0;
+    return nullptr;
    }
  
    // No call to init.trampoline found.
    if (!InitTrampoline)
-    return 0;
+    return nullptr;
  
    // Check that the alloca is being used in the expected way.
    if (InitTrampoline->getOperand(0) != TrampMem)
-    return 0;
+    return nullptr;
  
    return InitTrampoline;
  }
@@ -835,9 +1018,9 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
            II->getOperand(0) == TrampMem)
          return II;
      if (Inst->mayWriteToMemory())
            II->getOperand(0) == TrampMem)
          return II;
      if (Inst->mayWriteToMemory())
-      return 0;
+      return nullptr;
    }
    }
-  return 0;
+  return nullptr;
  }
  
  // Given a call to llvm.adjust.trampoline, find and return the corresponding
  }
  
  // Given a call to llvm.adjust.trampoline, find and return the corresponding
@@ -849,7 +1032,7 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) {
    IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
    if (!AdjustTramp ||
        AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
    IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
    if (!AdjustTramp ||
        AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
-    return 0;
+    return nullptr;
  
    Value *TrampMem = AdjustTramp->getOperand(0);
  
  
    Value *TrampMem = AdjustTramp->getOperand(0);
  
@@ -857,7 +1040,7 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) {
      return IT;
    if (IntrinsicInst *IT = FindInitTrampolineFromBB(AdjustTramp, TrampMem))
      return IT;
      return IT;
    if (IntrinsicInst *IT = FindInitTrampolineFromBB(AdjustTramp, TrampMem))
      return IT;
-  return 0;
+  return nullptr;
  }
  
  // visitCallSite - Improvements for call and invoke instructions.
  }
  
  // visitCallSite - Improvements for call and invoke instructions.
@@ -872,7 +1055,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
    // arguments of the call/invoke.
    Value *Callee = CS.getCalledValue();
    if (!isa<Function>(Callee) && transformConstExprCastCall(CS))
    // arguments of the call/invoke.
    Value *Callee = CS.getCalledValue();
    if (!isa<Function>(Callee) && transformConstExprCastCall(CS))
-    return 0;
+    return nullptr;
  
    if (Function *CalleeF = dyn_cast<Function>(Callee))
      // If the call and callee calling conventions don't match, this call must
  
    if (Function *CalleeF = dyn_cast<Function>(Callee))
      // If the call and callee calling conventions don't match, this call must
@@ -897,7 +1080,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
        // change the callee to a null pointer.
        cast<InvokeInst>(OldCall)->setCalledFunction(
                                      Constant::getNullValue(CalleeF->getType()));
        // change the callee to a null pointer.
        cast<InvokeInst>(OldCall)->setCalledFunction(
                                      Constant::getNullValue(CalleeF->getType()));
-      return 0;
+      return nullptr;
      }
  
    if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
      }
  
    if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
@@ -909,7 +1092,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
  
      if (isa<InvokeInst>(CS.getInstruction())) {
        // Can't remove an invoke because we cannot change the CFG.
  
      if (isa<InvokeInst>(CS.getInstruction())) {
        // Can't remove an invoke because we cannot change the CFG.
-      return 0;
+      return nullptr;
      }
  
      // This instruction is not reachable, just remove it.  We insert a store to
      }
  
      // This instruction is not reachable, just remove it.  We insert a store to
@@ -934,7 +1117,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
      for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(),
             E = CS.arg_end(); I != E; ++I, ++ix) {
        CastInst *CI = dyn_cast<CastInst>(*I);
      for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(),
             E = CS.arg_end(); I != E; ++I, ++ix) {
        CastInst *CI = dyn_cast<CastInst>(*I);
-      if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) {
+      if (CI && isSafeToEliminateVarargsCast(CS, CI, DL, ix)) {
          *I = CI->getOperand(0);
          Changed = true;
        }
          *I = CI->getOperand(0);
          Changed = true;
        }
@@ -951,13 +1134,13 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
    // this.  None of these calls are seen as possibly dead so go ahead and
    // delete the instruction now.
    if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
    // this.  None of these calls are seen as possibly dead so go ahead and
    // delete the instruction now.
    if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
-    Instruction *I = tryOptimizeCall(CI, TD);
+    Instruction *I = tryOptimizeCall(CI, DL);
      // If we changed something return the result, etc. Otherwise let
      // the fallthrough check.
      if (I) return EraseInstFromFunction(*I);
    }
  
      // If we changed something return the result, etc. Otherwise let
      // the fallthrough check.
      if (I) return EraseInstFromFunction(*I);
    }
  
-  return Changed ? CS.getInstruction() : 0;
+  return Changed ? CS.getInstruction() : nullptr;
  }
  
  // transformConstExprCastCall - If the callee is a constexpr cast of a function,
  }
  
  // transformConstExprCastCall - If the callee is a constexpr cast of a function,
@@ -966,7 +1149,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
  bool InstCombiner::transformConstExprCastCall(CallSite CS) {
    Function *Callee =
      dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
  bool InstCombiner::transformConstExprCastCall(CallSite CS) {
    Function *Callee =
      dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
-  if (Callee == 0)
+  if (!Callee)
      return false;
    Instruction *Caller = CS.getInstruction();
    const AttributeSet &CallerPAL = CS.getAttributes();
      return false;
    Instruction *Caller = CS.getInstruction();
    const AttributeSet &CallerPAL = CS.getAttributes();
@@ -1010,9 +1193,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
      // the critical edge).  Bail out in this case.
      if (!Caller->use_empty())
        if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
      // the critical edge).  Bail out in this case.
      if (!Caller->use_empty())
        if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
-        for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
-             UI != E; ++UI)
-          if (PHINode *PN = dyn_cast<PHINode>(*UI))
+        for (User *U : II->users())
+          if (PHINode *PN = dyn_cast<PHINode>(U))
              if (PN->getParent() == II->getNormalDest() ||
                  PN->getParent() == II->getUnwindDest())
                return false;
              if (PN->getParent() == II->getNormalDest() ||
                  PN->getParent() == II->getUnwindDest())
                return false;
@@ -1043,12 +1225,12 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
          CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
                                                           Attribute::ByVal)) {
        PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
          CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
                                                           Attribute::ByVal)) {
        PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
-      if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
+      if (!ParamPTy || !ParamPTy->getElementType()->isSized() || !DL)
          return false;
  
        Type *CurElTy = ActTy->getPointerElementType();
          return false;
  
        Type *CurElTy = ActTy->getPointerElementType();
-      if (TD->getTypeAllocSize(CurElTy) !=
-          TD->getTypeAllocSize(ParamPTy->getElementType()))
+      if (DL->getTypeAllocSize(CurElTy) !=
+          DL->getTypeAllocSize(ParamPTy->getElementType()))
          return false;
      }
    }
          return false;
      }
    }
@@ -1234,7 +1416,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
    // If the call already has the 'nest' attribute somewhere then give up -
    // otherwise 'nest' would occur twice after splicing in the chain.
    if (Attrs.hasAttrSomewhere(Attribute::Nest))
    // If the call already has the 'nest' attribute somewhere then give up -
    // otherwise 'nest' would occur twice after splicing in the chain.
    if (Attrs.hasAttrSomewhere(Attribute::Nest))
-    return 0;
+    return nullptr;
  
    assert(Tramp &&
           "transformCallThroughTrampoline called with incorrect CallSite.");
  
    assert(Tramp &&
           "transformCallThroughTrampoline called with incorrect CallSite.");
@@ -1246,7 +1428,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
    const AttributeSet &NestAttrs = NestF->getAttributes();
    if (!NestAttrs.isEmpty()) {
      unsigned NestIdx = 1;
    const AttributeSet &NestAttrs = NestF->getAttributes();
    if (!NestAttrs.isEmpty()) {
      unsigned NestIdx = 1;
-    Type *NestTy = 0;
+    Type *NestTy = nullptr;
      AttributeSet NestAttr;
  
      // Look for a parameter marked with the 'nest' attribute.
      AttributeSet NestAttr;
  
      // Look for a parameter marked with the 'nest' attribute.