From 442f620296b2a6d425dbd8b2da906488cf89efc2 Mon Sep 17 00:00:00 2001
From: Hao Liu
Date: Thu, 11 Jun 2015 09:05:02 +0000
Subject: [PATCH] [AArch64] Match interleaved memory accesses into ldN/stN instructions.

Add a pass AArch64InterleavedAccess to identify and match interleaved memory
accesses. This pass transforms an interleaved load/store into a ldN/stN
intrinsic.

As the Loop Vectorizer disables its optimization on interleaved accesses by
default, this optimization is also disabled by default. It can be enabled with
"-aarch64-interleaved-access-opt=true".

E.g. Transform an interleaved load (Factor = 2):
       %wide.vec = load <8 x i32>, <8 x i32>* %ptr
       %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
       %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
 Into:
       %ld2 = { <4 x i32>, <4 x i32> } call aarch64.neon.ld2(%ptr)
       %v0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, i32 0
       %v1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, i32 1

E.g. Transform an interleaved store (Factor = 2):
       %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>  ; Interleaved vec
       store <8 x i32> %i.vec, <8 x i32>* %ptr
 Into:
       %v0 = shuffle %i.vec, undef, <0, 1, 2, 3>
       %v1 = shuffle %i.vec, undef, <4, 5, 6, 7>
       call void aarch64.neon.st2(%v0, %v1, %ptr)
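
For reference, the pass can be exercised directly with llc in the same way as
the RUN line of the added test (the input file name below is only a
placeholder):

       llc -march=aarch64 -aarch64-interleaved-access-opt=true interleaved.ll
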
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239514 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64.h                  |   1 +
 .../AArch64/AArch64InterleavedAccess.cpp      | 391 ++++++++++++++++++
 lib/Target/AArch64/AArch64TargetMachine.cpp   |   8 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |  12 +
 .../AArch64/AArch64TargetTransformInfo.h      |   5 +
 lib/Target/AArch64/CMakeLists.txt             |   1 +
 lib/Transforms/Vectorize/LoopVectorize.cpp    |   2 +-
 .../AArch64/aarch64-interleaved-accesses.ll   | 197 +++++
 8 files changed, 616 insertions(+), 1 deletion(-)
 create mode 100644 lib/Target/AArch64/AArch64InterleavedAccess.cpp
 create mode 100644 test/CodeGen/AArch64/aarch64-interleaved-accesses.ll

diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 21106c9ad29..8c966c4af9a 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -38,6 +38,7 @@ FunctionPass *createAArch64LoadStoreOptimizationPass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
+FunctionPass *createAArch64InterleavedAccessPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
 FunctionPass *createAArch64A53Fix835769();
 
diff --git a/lib/Target/AArch64/AArch64InterleavedAccess.cpp b/lib/Target/AArch64/AArch64InterleavedAccess.cpp
new file mode 100644
index 00000000000..2d7f58e5847
--- /dev/null
+++ b/lib/Target/AArch64/AArch64InterleavedAccess.cpp
@@ -0,0 +1,391 @@
+//=--------------------- AArch64InterleavedAccess.cpp ----------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64InterleavedAccess pass, which identifies
+// interleaved memory accesses and transforms them into AArch64 ldN/stN
+// intrinsics (N = 2, 3, 4).
+//
+// An interleaved load reads data from memory into several vectors,
+// DE-interleaving the data by the interleave factor. An interleaved store
+// RE-interleaves the data from several vectors and writes it to memory. The
+// interleave factor is equal to the number of vectors. The AArch64 backend
+// supports interleave factors of 2, 3 and 4.
+//
+// E.g. Transform an interleaved load (Factor = 2):
+//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+//        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+//        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+//      Into:
+//        %ld2 = { <4 x i32>, <4 x i32> } call aarch64.neon.ld2(%ptr)
+//        %v0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, i32 0
+//        %v1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, i32 1
+//
+// E.g. Transform an interleaved store (Factor = 2):
+//        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>  ; Interleaved vec
+//        store <8 x i32> %i.vec, <8 x i32>* %ptr
+//      Into:
+//        %v0 = shuffle %i.vec, undef, <0, 1, 2, 3>
+//        %v1 = shuffle %i.vec, undef, <4, 5, 6, 7>
+//        call void aarch64.neon.st2(%v0, %v1, %ptr)
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-interleaved-access"
+
+static const unsigned MIN_FACTOR = 2;
+static const unsigned MAX_FACTOR = 4;
+
+namespace llvm {
+static void initializeAArch64InterleavedAccessPass(PassRegistry &);
+}
+
+namespace {
+
+class AArch64InterleavedAccess : public FunctionPass {
+
+public:
+  static char ID;
+  AArch64InterleavedAccess() : FunctionPass(ID) {
+    initializeAArch64InterleavedAccessPass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override {
+    return "AArch64 Interleaved Access Pass";
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  const DataLayout *DL;
+  Module *M;
+
+  /// \brief Transform an interleaved load into a ldN intrinsic.
+  bool matchInterleavedLoad(ShuffleVectorInst *SVI,
+                            SmallSetVector<Instruction *, 32> &DeadInsts);
+
+  /// \brief Transform an interleaved store into a stN intrinsic.
+  bool matchInterleavedStore(ShuffleVectorInst *SVI,
+                             SmallSetVector<Instruction *, 32> &DeadInsts);
+};
+} // end anonymous namespace.
+
+char AArch64InterleavedAccess::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64InterleavedAccess, DEBUG_TYPE,
+                      "AArch64 interleaved access Pass", false, false)
+INITIALIZE_PASS_END(AArch64InterleavedAccess, DEBUG_TYPE,
+                    "AArch64 interleaved access Pass", false, false)
+
+FunctionPass *llvm::createAArch64InterleavedAccessPass() {
+  return new AArch64InterleavedAccess();
+}
+
+/// \brief Get a ldN/stN intrinsic according to the Factor (2, 3, or 4).
+static Intrinsic::ID getLdNStNIntrinsic(unsigned Factor, bool IsLoad) {
+  static const Intrinsic::ID LoadInt[3] = {Intrinsic::aarch64_neon_ld2,
+                                           Intrinsic::aarch64_neon_ld3,
+                                           Intrinsic::aarch64_neon_ld4};
+  static const Intrinsic::ID StoreInt[3] = {Intrinsic::aarch64_neon_st2,
+                                            Intrinsic::aarch64_neon_st3,
+                                            Intrinsic::aarch64_neon_st4};
+
+  assert(Factor >= MIN_FACTOR && Factor <= MAX_FACTOR &&
+         "Invalid interleave factor");
+
+  if (IsLoad)
+    return LoadInt[Factor - 2];
+  else
+    return StoreInt[Factor - 2];
+}
+
+/// \brief Check if the mask is a DE-interleave mask of the given factor
+/// \p Factor like:
+///     <Index, Index + Factor, ..., Index + (NumElts - 1) * Factor>
+static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
+                                       unsigned &Index) {
+  // Check all potential start indices from 0 to (Factor - 1).
+  for (Index = 0; Index < Factor; Index++) {
+    unsigned i = 0;
+
+    // Check that elements are in ascending order by Factor.
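+    // I.e. each defined element must satisfy Mask[i] == Index + i * Factor;
+    // negative (undef) elements are skipped, so masks containing undef
+    // indices can still match.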
+    for (; i < Mask.size(); i++)
+      if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
+        break;
+
+    if (i == Mask.size())
+      return true;
+  }
+
+  return false;
+}
+
+/// \brief Check if the mask is a DE-interleave mask for an interleaved load.
+///
+/// E.g. DE-interleave masks (Factor = 2) could be:
+///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
+///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
+static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
+                               unsigned &Index) {
+  unsigned NumElts = Mask.size();
+  if (NumElts < 2)
+    return false;
+
+  for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
+    if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
+      return true;
+
+  return false;
+}
+
+/// \brief Check if the given mask \p Mask is the RE-interleave mask of the
+/// given factor \p Factor.
+///
+/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
+static bool isReInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor) {
+  unsigned NumElts = Mask.size();
+  if (NumElts % Factor)
+    return false;
+
+  unsigned NumSubElts = NumElts / Factor;
+  if (!isPowerOf2_32(NumSubElts))
+    return false;
+
+  for (unsigned i = 0; i < NumSubElts; i++)
+    for (unsigned j = 0; j < Factor; j++)
+      if (Mask[i * Factor + j] >= 0 &&
+          static_cast<unsigned>(Mask[i * Factor + j]) != j * NumSubElts + i)
+        return false;
+
+  return true;
+}
+
+/// \brief Check if the mask is the RE-interleave mask for an interleaved
+/// store.
+///
+/// E.g. The RE-interleave mask (Factor = 2) could be:
+///     <0, 4, 1, 5, 2, 6, 3, 7>
+static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
+  if (Mask.size() < 4)
+    return false;
+
+  // Check potential Factors and return true if a factor is found for the mask.
+  for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
+    if (isReInterleaveMaskOfFactor(Mask, Factor))
+      return true;
+
+  return false;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                                   unsigned NumElts) {
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < NumElts; i++)
+    Mask.push_back(Builder.getInt32(Start + i));
+
+  return ConstantVector::get(Mask);
+}
+
+bool AArch64InterleavedAccess::matchInterleavedLoad(
+    ShuffleVectorInst *SVI, SmallSetVector<Instruction *, 32> &DeadInsts) {
+  if (DeadInsts.count(SVI))
+    return false;
+
+  LoadInst *LI = dyn_cast<LoadInst>(SVI->getOperand(0));
+  if (!LI || !LI->isSimple() || !isa<UndefValue>(SVI->getOperand(1)))
+    return false;
+
+  SmallVector<ShuffleVectorInst *, 4> Shuffles;
+
+  // Check if all users of this load are shufflevectors.
+  for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
+    ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(*UI);
+    if (!SV)
+      return false;
+
+    Shuffles.push_back(SV);
+  }
+
+  // Check if the type of the first shuffle is legal.
+  VectorType *VecTy = Shuffles[0]->getType();
+  unsigned TypeSize = DL->getTypeAllocSizeInBits(VecTy);
+  if (TypeSize != 64 && TypeSize != 128)
+    return false;
+
+  // Check if the mask of the first shuffle is strided and get the start index.
+  unsigned Factor, Index;
+  if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
+    return false;
+
+  // Holds the corresponding index for each strided shuffle.
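+  // E.g. for a factor-2 load de-interleaved by the masks <0, 2, 4, 6> and
+  // <1, 3, 5, 7>, the recorded indices are 0 and 1 respectively.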
+  SmallVector<unsigned, 4> Indices;
+  Indices.push_back(Index);
+
+  // Check if other shufflevectors are of the same type and factor.
+  for (unsigned i = 1; i < Shuffles.size(); i++) {
+    if (Shuffles[i]->getType() != VecTy)
+      return false;
+
+    unsigned Index;
+    if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
+                                    Index))
+      return false;
+
+    Indices.push_back(Index);
+  }
+
+  DEBUG(dbgs() << "Found an interleaved load:" << *LI << "\n");
+
+  // A pointer vector cannot be the return type of the ldN intrinsics. Need to
+  // load integer vectors first and then convert to pointer vectors.
+  Type *EltTy = VecTy->getVectorElementType();
+  if (EltTy->isPointerTy())
+    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
+                            VecTy->getVectorNumElements());
+
+  Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
+  Type *Tys[2] = {VecTy, PtrTy};
+  Function *LdNFunc =
+      Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, true), Tys);
+
+  IRBuilder<> Builder(LI);
+  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+
+  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+  DEBUG(dbgs() << "  Created:" << *LdN << "\n");
+
+  // Replace each strided shufflevector with the corresponding vector loaded
+  // by ldN.
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    ShuffleVectorInst *SV = Shuffles[i];
+    unsigned Index = Indices[i];
+
+    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+    // Convert the integer vector to pointer vector if the element is pointer.
+    if (EltTy->isPointerTy())
+      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+    SV->replaceAllUsesWith(SubVec);
+
+    DEBUG(dbgs() << "  Replaced:" << *SV << "\n"
+                 << "  With:" << *SubVec << "\n");
+
+    // Avoid analyzing it twice.
+    DeadInsts.insert(SV);
+  }
+
+  // Mark this load as dead.
+  DeadInsts.insert(LI);
+  return true;
+}
+
+bool AArch64InterleavedAccess::matchInterleavedStore(
+    ShuffleVectorInst *SVI, SmallSetVector<Instruction *, 32> &DeadInsts) {
+  if (DeadInsts.count(SVI) || !SVI->hasOneUse())
+    return false;
+
+  StoreInst *SI = dyn_cast<StoreInst>(SVI->user_back());
+  if (!SI || !SI->isSimple())
+    return false;
+
+  // Check if the mask is interleaved and get the interleave factor.
+  unsigned Factor;
+  if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
+    return false;
+
+  VectorType *VecTy = SVI->getType();
+  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+  Type *EltTy = VecTy->getVectorElementType();
+  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+
+  // Skip illegal vector types.
+  unsigned TypeSize = DL->getTypeAllocSizeInBits(SubVecTy);
+  if (TypeSize != 64 && TypeSize != 128)
+    return false;
+
+  DEBUG(dbgs() << "Found an interleaved store:" << *SI << "\n");
+
+  Value *Op0 = SVI->getOperand(0);
+  Value *Op1 = SVI->getOperand(1);
+  IRBuilder<> Builder(SI);
+
+  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+  // vectors to integer vectors.
+  if (EltTy->isPointerTy()) {
+    Type *IntTy = DL->getIntPtrType(EltTy);
+    unsigned NumOpElts =
+        dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
+
+    // The corresponding integer vector type of the same element size.
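+    // E.g. on AArch64, where pointers are 64-bit, a <2 x i32*> operand is
+    // converted to <2 x i64> before being passed to the stN intrinsic.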
+    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+
+    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+    SubVecTy = VectorType::get(IntTy, NumSubElts);
+  }
+
+  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
+  Type *Tys[2] = {SubVecTy, PtrTy};
+  Function *StNFunc =
+      Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, false), Tys);
+
+  SmallVector<Value *, 6> Ops;
+
+  // Split the shufflevector operands into sub vectors for the new stN call.
+  for (unsigned i = 0; i < Factor; i++)
+    Ops.push_back(Builder.CreateShuffleVector(
+        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+
+  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
+  CallInst *StN = Builder.CreateCall(StNFunc, Ops);
+
+  (void)StN; // silence warning.
+  DEBUG(dbgs() << "  Replaced:" << *SI << "\n");
+  DEBUG(dbgs() << "  with:" << *StN << "\n");
+
+  // Mark this shufflevector and store as dead.
+  DeadInsts.insert(SI);
+  DeadInsts.insert(SVI);
+  return true;
+}
+
+bool AArch64InterleavedAccess::runOnFunction(Function &F) {
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+
+  M = F.getParent();
+  DL = &M->getDataLayout();
+
+  // Holds dead instructions that will be erased later.
+  SmallSetVector<Instruction *, 32> DeadInsts;
+  bool Changed = false;
+  for (auto &I : inst_range(F)) {
+    if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
+      Changed |= matchInterleavedLoad(SVI, DeadInsts);
+      Changed |= matchInterleavedStore(SVI, DeadInsts);
+    }
+  }
+
+  for (auto I : DeadInsts)
+    I->eraseFromParent();
+
+  return Changed;
+}
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index a8f13da3435..2b5625dc09e 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -67,6 +67,11 @@ EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
                           " to make use of cmpxchg flow-based information"),
                  cl::init(true));
 
+static cl::opt<bool> AArch64InterleavedAccessOpt(
+    "aarch64-interleaved-access-opt",
+    cl::desc("Optimize interleaved memory accesses in the AArch64 backend"),
+    cl::init(false), cl::Hidden);
+
 static cl::opt<bool>
 EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
                         cl::desc("Run early if-conversion"),
@@ -226,6 +231,9 @@ void AArch64PassConfig::addIRPasses() {
   if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
     addPass(createCFGSimplificationPass());
 
+  if (TM->getOptLevel() != CodeGenOpt::None && AArch64InterleavedAccessOpt)
+    addPass(createAArch64InterleavedAccessPass());
+
   TargetPassConfig::addIRPasses();
 
   if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ed27cf84bbb..a51a0674c8f 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -407,6 +407,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   return LT.first;
 }
 
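+// Interleaved accesses of a legal vector type with a factor of 2, 3 or 4 are
+// expected to be matched into a single ldN/stN instruction by the
+// AArch64InterleavedAccess pass, so their cost is modelled as roughly one
+// unit per member vector (i.e. the factor); other cases fall back to the base
+// implementation.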
+unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+    unsigned Alignment, unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) && "Expect vector types");
+
+  if (Factor > 1 && Factor < 5 && isTypeLegal(VecTy))
+    return Factor;
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
+
 unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
   unsigned Cost = 0;
   for (auto *I : Tys) {
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 25c22bcd58e..4dabdadd8ee 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -139,6 +139,11 @@ public:
 
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
 
+  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                      unsigned Factor,
+                                      ArrayRef<unsigned> Indices,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace);
   /// @}
 };
 
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index f26327ff84a..676a2482ed1 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -38,6 +38,7 @@ add_llvm_target(AArch64CodeGen
   AArch64PBQPRegAlloc.cpp
   AArch64RegisterInfo.cpp
   AArch64SelectionDAGInfo.cpp
+  AArch64InterleavedAccess.cpp
   AArch64StorePairSuppress.cpp
   AArch64Subtarget.cpp
   AArch64TargetMachine.cpp
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 95c9381985a..be6c542abde 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -139,7 +139,7 @@ static cl::opt<bool> EnableMemAccessVersioning(
     cl::desc("Enable symblic stride memory access versioning"));
 
 static cl::opt<bool> EnableInterleavedMemAccesses(
-    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+    "enable-interleaved-mem-accesses", cl::init(true), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
 /// Maximum factor for an interleaved memory access.
diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
new file mode 100644
index 00000000000..e651be97569
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
@@ -0,0 +1,197 @@
+; RUN: llc -march=aarch64 -aarch64-interleaved-access-opt=true < %s | FileCheck %s
+
+; CHECK-LABEL: load_factor2:
+; CHECK: ld2 { v0.8b, v1.8b }, [x0]
+define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
+  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
+  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = add nsw <8 x i8> %strided.v0, %strided.v1
+  ret <8 x i8> %add
+}
+
+; CHECK-LABEL: load_factor3:
+; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+define <4 x i32> @load_factor3(i32* %ptr) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
+  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %add = add nsw <4 x i32> %strided.v2, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_factor4:
+; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+define <4 x i32> @load_factor4(i32* %ptr) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
+  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v2
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: store_factor2:
+; CHECK: st2 { v0.8b, v1.8b }, [x0]
+define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
+  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
+  ret void
+}
+
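+; In the factor-3 and factor-4 store tests below, the narrow input vectors are
+; first concatenated with shufflevector, and the RE-interleave mask is then
+; applied to the resulting wide vector.
+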
+; CHECK-LABEL: store_factor3:
+; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
+define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_factor4:
+; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
+  ret void
+}
+
+; The following cases test that interleaved accesses of pointer vectors can be
+; matched to ldN/stN instructions.
+
+; CHECK-LABEL: load_ptrvec_factor2:
+; CHECK: ld2 { v0.2d, v1.2d }, [x0]
+define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
+  %base = bitcast i32** %ptr to <4 x i32*>*
+  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
+  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32*> %strided.v0
+}
+
+; CHECK-LABEL: load_ptrvec_factor3:
+; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
+define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
+  %base = bitcast i32** %ptr to <6 x i32*>*
+  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
+  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
+  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
+  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
+  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
+  ret void
+}
+
+; CHECK-LABEL: load_ptrvec_factor4:
+; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
+  %base = bitcast i32** %ptr to <8 x i32*>*
+  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
+  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
+  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
+  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
+  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor2:
+; CHECK: st2 { v0.2d, v1.2d }, [x0]
+define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
+  %base = bitcast i32** %ptr to <4 x i32*>*
+  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_ptrvec_factor3:
+; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
+define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
+  %base = bitcast i32** %ptr to <6 x i32*>*
+  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
+  ret void
+}
+
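+; In the pointer-vector stores above and below, the pass converts the
+; <2 x i32*> operands to <2 x i64> with ptrtoint before calling the stN
+; intrinsic, hence the .2d arrangement in the CHECK lines.
+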
+; CHECK-LABEL: store_ptrvec_factor4:
+; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
+  %base = bitcast i32* %ptr to <8 x i32*>*
+  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
+  ret void
+}
+
+; The following cases check that shuffle masks with undef indices can be
+; matched into ldN/stN instructions.
+
+; CHECK-LABEL: load_undef_mask_factor2:
+; CHECK: ld2 { v0.4s, v1.4s }, [x0]
+define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
+  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
+  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_undef_mask_factor3:
+; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
+  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+  %add = add nsw <4 x i32> %strided.v2, %strided.v1
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: load_undef_mask_factor4:
+; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
+  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32>
+  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32>
+  %add = add nsw <4 x i32> %strided.v0, %strided.v2
+  ret <4 x i32> %add
+}
+
+; CHECK-LABEL: store_undef_mask_factor2:
+; CHECK: st2 { v0.4s, v1.4s }, [x0]
+define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
+  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_undef_mask_factor3:
+; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
+  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32>
+  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; CHECK-LABEL: store_undef_mask_factor4:
+; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %base = bitcast i32* %ptr to <16 x i32>*
+  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32>
+  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32>
+  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
+  ret void
+}
-- 
2.34.1