From 688e7b3049dad23ee7fd0127e2d78ea4210d2f57 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Thu, 11 Jun 2015 17:30:33 +0000 Subject: [PATCH] This reverts commit r239529 and r239514. Revert "[AArch64] Match interleaved memory accesses into ldN/stN instructions." Revert "Fixing MSVC 2013 build error." The test/CodeGen/AArch64/aarch64-interleaved-accesses.ll test was failing on OS X. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239544 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64.h | 1 - .../AArch64/AArch64InterleavedAccess.cpp | 392 ------------------ lib/Target/AArch64/AArch64TargetMachine.cpp | 8 - .../AArch64/AArch64TargetTransformInfo.cpp | 12 - .../AArch64/AArch64TargetTransformInfo.h | 5 - lib/Target/AArch64/CMakeLists.txt | 1 - .../AArch64/aarch64-interleaved-accesses.ll | 197 --------- 7 files changed, 616 deletions(-) delete mode 100644 lib/Target/AArch64/AArch64InterleavedAccess.cpp delete mode 100644 test/CodeGen/AArch64/aarch64-interleaved-accesses.ll diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 8c966c4af9a..21106c9ad29 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -38,7 +38,6 @@ FunctionPass *createAArch64LoadStoreOptimizationPass(); ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); -FunctionPass *createAArch64InterleavedAccessPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); diff --git a/lib/Target/AArch64/AArch64InterleavedAccess.cpp b/lib/Target/AArch64/AArch64InterleavedAccess.cpp deleted file mode 100644 index 4219abbf7ce..00000000000 --- a/lib/Target/AArch64/AArch64InterleavedAccess.cpp +++ /dev/null @@ -1,392 +0,0 @@ -//=--------------------- AArch64InterleavedAccess.cpp ----------------------==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the AArch64InterleavedAccess pass, which identifies -// interleaved memory accesses and Transforms them into an AArch64 ldN/stN -// intrinsics (N = 2, 3, 4). -// -// An interleaved load reads data from memory into several vectors, with -// DE-interleaving the data on factor. An interleaved store writes several -// vectors to memory with RE-interleaving the data on factor. The interleave -// factor is equal to the number of vectors. AArch64 backend supports interleave -// factor of 2, 3 and 4. -// -// E.g. Transform an interleaved load (Factor = 2): -// %wide.vec = load <8 x i32>, <8 x i32>* %ptr -// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements -// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements -// Into: -// %ld2 = { <4 x i32>, <4 x i32> } call aarch64.neon.ld2(%ptr) -// %v0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 -// %v1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 -// -// E.g. Transform an interleaved store (Factor = 2): -// %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> ; Interleaved vec -// store <8 x i32> %i.vec, <8 x i32>* %ptr -// Into: -// %v0 = shuffle %i.vec, undef, <0, 1, 2, 3> -// %v1 = shuffle %i.vec, undef, <4, 5, 6, 7> -// call void aarch64.neon.st2(%v0, %v1, %ptr) -// -//===----------------------------------------------------------------------===// - -#include "AArch64.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "aarch64-interleaved-access" - -static const unsigned MIN_FACTOR = 2; -static const unsigned MAX_FACTOR = 4; - -namespace llvm { -static void initializeAArch64InterleavedAccessPass(PassRegistry &); -} - -namespace { - -class AArch64InterleavedAccess : public FunctionPass { - -public: - static char ID; - AArch64InterleavedAccess() : FunctionPass(ID) { - initializeAArch64InterleavedAccessPass(*PassRegistry::getPassRegistry()); - } - - const char *getPassName() const override { - return "AArch64 Interleaved Access Pass"; - } - - bool runOnFunction(Function &F) override; - -private: - const DataLayout *DL; - Module *M; - - /// \brief Transform an interleaved load into ldN intrinsic. - bool matchInterleavedLoad(ShuffleVectorInst *SVI, - SmallSetVector &DeadInsts); - - /// \brief Transform an interleaved store into stN intrinsic. - bool matchInterleavedStore(ShuffleVectorInst *SVI, - SmallSetVector &DeadInsts); -}; -} // end anonymous namespace. - -char AArch64InterleavedAccess::ID = 0; - -INITIALIZE_PASS_BEGIN(AArch64InterleavedAccess, DEBUG_TYPE, - "AArch64 interleaved access Pass", false, false) -INITIALIZE_PASS_END(AArch64InterleavedAccess, DEBUG_TYPE, - "AArch64 interleaved access Pass", false, false) - -FunctionPass *llvm::createAArch64InterleavedAccessPass() { - return new AArch64InterleavedAccess(); -} - -/// \brief Get a ldN/stN intrinsic according to the Factor (2, 3, or 4). -static Intrinsic::ID getLdNStNIntrinsic(unsigned Factor, bool IsLoad) { - static const Intrinsic::ID LoadInt[3] = {Intrinsic::aarch64_neon_ld2, - Intrinsic::aarch64_neon_ld3, - Intrinsic::aarch64_neon_ld4}; - static const Intrinsic::ID StoreInt[3] = {Intrinsic::aarch64_neon_st2, - Intrinsic::aarch64_neon_st3, - Intrinsic::aarch64_neon_st4}; - - assert(Factor >= MIN_FACTOR && Factor <= MAX_FACTOR && - "Invalid interleave factor"); - - if (IsLoad) - return LoadInt[Factor - 2]; - else - return StoreInt[Factor - 2]; -} - -/// \brief Check if the mask is a DE-interleave mask of the given factor -/// \p Factor like: -/// -static bool isDeInterleaveMaskOfFactor(ArrayRef Mask, unsigned Factor, - unsigned &Index) { - // Check all potential start indices from 0 to (Factor - 1). - for (Index = 0; Index < Factor; Index++) { - unsigned i = 0; - - // Check that elements are in ascending order by Factor. - for (; i < Mask.size(); i++) - if (Mask[i] >= 0 && static_cast(Mask[i]) != Index + i * Factor) - break; - - if (i == Mask.size()) - return true; - } - - return false; -} - -/// \brief Check if the mask is a DE-interleave mask for an interleaved load. -/// -/// E.g. DE-interleave masks (Factor = 2) could be: -/// <0, 2, 4, 6> (mask of index 0 to extract even elements) -/// <1, 3, 5, 7> (mask of index 1 to extract odd elements) -static bool isDeInterleaveMask(ArrayRef Mask, unsigned &Factor, - unsigned &Index) { - unsigned NumElts = Mask.size(); - if (NumElts < 2) - return false; - - for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++) - if (isDeInterleaveMaskOfFactor(Mask, Factor, Index)) - return true; - - return false; -} - -/// \brief Check if the given mask \p Mask is RE-interleaved mask of the given -/// factor \p Factor. -/// -/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...> -static bool isReInterleaveMaskOfFactor(ArrayRef Mask, unsigned Factor) { - unsigned NumElts = Mask.size(); - if (NumElts % Factor) - return false; - - unsigned NumSubElts = NumElts / Factor; - if (!isPowerOf2_32(NumSubElts)) - return false; - - for (unsigned i = 0; i < NumSubElts; i++) - for (unsigned j = 0; j < Factor; j++) - if (Mask[i * Factor + j] >= 0 && - static_cast(Mask[i * Factor + j]) != j * NumSubElts + i) - return false; - - return true; -} - -/// \brief Check if the mask is RE-interleave mask for an interleaved store. -/// -/// E.g. The RE-interleave mask (Factor = 2) could be: -/// <0, 4, 1, 5, 2, 6, 3, 7> -static bool isReInterleaveMask(ArrayRef Mask, unsigned &Factor) { - if (Mask.size() < 4) - return false; - - // Check potential Factors and return true if find a factor for the mask. - for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++) - if (isReInterleaveMaskOfFactor(Mask, Factor)) - return true; - - return false; -} - -/// \brief Get a mask consisting of sequential integers starting from \p Start. -/// -/// I.e. -static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, - unsigned NumElts) { - SmallVector Mask; - for (unsigned i = 0; i < NumElts; i++) - Mask.push_back(Builder.getInt32(Start + i)); - - return ConstantVector::get(Mask); -} - -bool AArch64InterleavedAccess::matchInterleavedLoad( - ShuffleVectorInst *SVI, SmallSetVector &DeadInsts) { - if (DeadInsts.count(SVI)) - return false; - - LoadInst *LI = dyn_cast(SVI->getOperand(0)); - if (!LI || !LI->isSimple() || !isa(SVI->getOperand(1))) - return false; - - SmallVector Shuffles; - - // Check if all users of this load are shufflevectors. - for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) { - ShuffleVectorInst *SV = dyn_cast(*UI); - if (!SV) - return false; - - Shuffles.push_back(SV); - } - - // Check if the type of the first shuffle is legal. - VectorType *VecTy = Shuffles[0]->getType(); - unsigned TypeSize = DL->getTypeAllocSizeInBits(VecTy); - if (TypeSize != 64 && TypeSize != 128) - return false; - - // Check if the mask of the first shuffle is strided and get the start index. - unsigned Factor, Index; - if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index)) - return false; - - // Holds the corresponding index for each strided shuffle. - SmallVector Indices; - Indices.push_back(Index); - - // Check if other shufflevectors are of the same type and factor - for (unsigned i = 1; i < Shuffles.size(); i++) { - if (Shuffles[i]->getType() != VecTy) - return false; - - unsigned Index; - if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor, - Index)) - return false; - - Indices.push_back(Index); - } - - DEBUG(dbgs() << "Found an interleaved load:" << *LI << "\n"); - - // A pointer vector can not be the return type of the ldN intrinsics. Need to - // load integer vectors first and then convert to pointer vectors. - Type *EltTy = VecTy->getVectorElementType(); - if (EltTy->isPointerTy()) - VecTy = VectorType::get(DL->getIntPtrType(EltTy), - VecTy->getVectorNumElements()); - - Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); - Type *Tys[2] = {VecTy, PtrTy}; - Function *LdNFunc = - Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, true), Tys); - - IRBuilder<> Builder(LI); - Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); - - CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); - DEBUG(dbgs() << " Created:" << *LdN << "\n"); - - // Replace each strided shufflevector with the corresponding vector loaded - // by ldN. - for (unsigned i = 0; i < Shuffles.size(); i++) { - ShuffleVectorInst *SV = Shuffles[i]; - unsigned Index = Indices[i]; - - Value *SubVec = Builder.CreateExtractValue(LdN, Index); - - // Convert the integer vector to pointer vector if the element is pointer. - if (EltTy->isPointerTy()) - SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); - - SV->replaceAllUsesWith(SubVec); - - DEBUG(dbgs() << " Replaced:" << *SV << "\n" - << " With:" << *SubVec << "\n"); - - // Avoid analyzing it twice. - DeadInsts.insert(SV); - } - - // Mark this load as dead. - DeadInsts.insert(LI); - return true; -} - -bool AArch64InterleavedAccess::matchInterleavedStore( - ShuffleVectorInst *SVI, SmallSetVector &DeadInsts) { - if (DeadInsts.count(SVI) || !SVI->hasOneUse()) - return false; - - StoreInst *SI = dyn_cast(SVI->user_back()); - if (!SI || !SI->isSimple()) - return false; - - // Check if the mask is interleaved and get the interleave factor. - unsigned Factor; - if (!isReInterleaveMask(SVI->getShuffleMask(), Factor)) - return false; - - VectorType *VecTy = SVI->getType(); - unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; - Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); - - // Skip illegal vector types. - unsigned TypeSize = DL->getTypeAllocSizeInBits(SubVecTy); - if (TypeSize != 64 && TypeSize != 128) - return false; - - DEBUG(dbgs() << "Found an interleaved store:" << *SI << "\n"); - - Value *Op0 = SVI->getOperand(0); - Value *Op1 = SVI->getOperand(1); - IRBuilder<> Builder(SI); - - // StN intrinsics don't support pointer vectors as arguments. Convert pointer - // vectors to integer vectors. - if (EltTy->isPointerTy()) { - Type *IntTy = DL->getIntPtrType(EltTy); - unsigned NumOpElts = - dyn_cast(Op0->getType())->getVectorNumElements(); - - // The corresponding integer vector type of the same element size. - Type *IntVecTy = VectorType::get(IntTy, NumOpElts); - - Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); - Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = VectorType::get(IntTy, NumSubElts); - } - - Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); - Type *Tys[2] = {SubVecTy, PtrTy}; - Function *StNFunc = - Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, false), Tys); - - SmallVector Ops; - - // Split the shufflevector operands into sub vectors for the new stN call. - for (unsigned i = 0; i < Factor; i++) - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); - - Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); - CallInst *StN = Builder.CreateCall(StNFunc, Ops); - - (void)StN; // silence warning. - DEBUG(dbgs() << " Replaced:" << *SI << "'\n"); - DEBUG(dbgs() << " with:" << *StN << "\n"); - - // Mark this shufflevector and store as dead. - DeadInsts.insert(SI); - DeadInsts.insert(SVI); - return true; -} - -bool AArch64InterleavedAccess::runOnFunction(Function &F) { - DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); - - M = F.getParent(); - DL = &M->getDataLayout(); - - // Holds dead instructions that will be erased later. - SmallSetVector DeadInsts; - bool Changed = false; - for (auto &I : inst_range(F)) { - if (ShuffleVectorInst *SVI = dyn_cast(&I)) { - Changed |= matchInterleavedLoad(SVI, DeadInsts); - Changed |= matchInterleavedStore(SVI, DeadInsts); - } - } - - for (auto I : DeadInsts) - I->eraseFromParent(); - - return Changed; -} diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index f0ee6649676..0502c1b9f50 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -67,11 +67,6 @@ EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden, " to make use of cmpxchg flow-based information"), cl::init(true)); -static cl::opt AArch64InterleavedAccessOpt( - "aarch64-interleaved-access-opt", - cl::desc("Optimize interleaved memory accesses in the AArch64 backend"), - cl::init(false), cl::Hidden); - static cl::opt EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), @@ -230,9 +225,6 @@ void AArch64PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass()); - if (TM->getOptLevel() != CodeGenOpt::None && AArch64InterleavedAccessOpt) - addPass(createAArch64InterleavedAccessPass()); - TargetPassConfig::addIRPasses(); if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a51a0674c8f..ed27cf84bbb 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -407,18 +407,6 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( - unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace) { - assert(isa(VecTy) && "Expect vector types"); - - if (Factor > 1 && Factor < 5 && isTypeLegal(VecTy)) - return Factor; - - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); -} - unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { unsigned Cost = 0; for (auto *I : Tys) { diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 4dabdadd8ee..25c22bcd58e 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,11 +139,6 @@ public: bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace); /// @} }; diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 676a2482ed1..f26327ff84a 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -38,7 +38,6 @@ add_llvm_target(AArch64CodeGen AArch64PBQPRegAlloc.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp - AArch64InterleavedAccess.cpp AArch64StorePairSuppress.cpp AArch64Subtarget.cpp AArch64TargetMachine.cpp diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll deleted file mode 100644 index e651be97569..00000000000 --- a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll +++ /dev/null @@ -1,197 +0,0 @@ -; RUN: llc -march=aarch64 -aarch64-interleaved-access-opt=true < %s | FileCheck %s - -; CHECK-LABEL: load_factor2: -; CHECK: ld2 { v0.8b, v1.8b }, [x0] -define <8 x i8> @load_factor2(<16 x i8>* %ptr) { - %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4 - %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> - %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> - %add = add nsw <8 x i8> %strided.v0, %strided.v1 - ret <8 x i8> %add -} - -; CHECK-LABEL: load_delat3: -; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] -define <4 x i32> @load_delat3(i32* %ptr) { - %base = bitcast i32* %ptr to <12 x i32>* - %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 - %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> - %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> - %add = add nsw <4 x i32> %strided.v2, %strided.v1 - ret <4 x i32> %add -} - -; CHECK-LABEL: load_factor4: -; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] -define <4 x i32> @load_factor4(i32* %ptr) { - %base = bitcast i32* %ptr to <16 x i32>* - %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 - %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> - %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> - %add = add nsw <4 x i32> %strided.v0, %strided.v2 - ret <4 x i32> %add -} - -; CHECK-LABEL: store_factor2: -; CHECK: st2 { v0.8b, v1.8b }, [x0] -define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) { - %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> - store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4 - ret void -} - -; CHECK-LABEL: store_factor3: -; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0] -define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { - %base = bitcast i32* %ptr to <12 x i32>* - %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> - %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> - %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> - store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4 - ret void -} - -; CHECK-LABEL: store_factor4: -; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] -define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { - %base = bitcast i32* %ptr to <16 x i32>* - %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> - %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> - %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> - store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4 - ret void -} - -; The following cases test that interleaved access of pointer vectors can be -; matched to ldN/stN instruction. - -; CHECK-LABEL: load_ptrvec_factor2: -; CHECK: ld2 { v0.2d, v1.2d }, [x0] -define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) { - %base = bitcast i32** %ptr to <4 x i32*>* - %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4 - %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> - ret <2 x i32*> %strided.v0 -} - -; CHECK-LABEL: load_ptrvec_factor3: -; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0] -define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) { - %base = bitcast i32** %ptr to <6 x i32*>* - %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4 - %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> - store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1 - %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> - store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2 - ret void -} - -; CHECK-LABEL: load_ptrvec_factor4: -; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] -define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) { - %base = bitcast i32** %ptr to <8 x i32*>* - %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4 - %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> - %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> - store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1 - store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2 - ret void -} - -; CHECK-LABEL: store_ptrvec_factor2: -; CHECK: st2 { v0.2d, v1.2d }, [x0] -define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) { - %base = bitcast i32** %ptr to <4 x i32*>* - %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> - store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4 - ret void -} - -; CHECK-LABEL: store_ptrvec_factor3: -; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0] -define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) { - %base = bitcast i32** %ptr to <6 x i32*>* - %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> - %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> - %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> - store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4 - ret void -} - -; CHECK-LABEL: store_ptrvec_factor4: -; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] -define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) { - %base = bitcast i32* %ptr to <8 x i32*>* - %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> - %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> - %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> - store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4 - ret void -} - -; Following cases check that shuffle maskes with undef indices can be matched -; into ldN/stN instruction. - -; CHECK-LABEL: load_undef_mask_factor2: -; CHECK: ld2 { v0.4s, v1.4s }, [x0] -define <4 x i32> @load_undef_mask_factor2(i32* %ptr) { - %base = bitcast i32* %ptr to <8 x i32>* - %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4 - %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> - %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> - %add = add nsw <4 x i32> %strided.v0, %strided.v1 - ret <4 x i32> %add -} - -; CHECK-LABEL: load_undef_mask_factor3: -; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] -define <4 x i32> @load_undef_mask_factor3(i32* %ptr) { - %base = bitcast i32* %ptr to <12 x i32>* - %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 - %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> - %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> - %add = add nsw <4 x i32> %strided.v2, %strided.v1 - ret <4 x i32> %add -} - -; CHECK-LABEL: load_undef_mask_factor4: -; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] -define <4 x i32> @load_undef_mask_factor4(i32* %ptr) { - %base = bitcast i32* %ptr to <16 x i32>* - %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 - %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> - %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> - %add = add nsw <4 x i32> %strided.v0, %strided.v2 - ret <4 x i32> %add -} - -; CHECK-LABEL: store_undef_mask_factor2: -; CHECK: st2 { v0.4s, v1.4s }, [x0] -define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) { - %base = bitcast i32* %ptr to <8 x i32>* - %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> - store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4 - ret void -} - -; CHECK-LABEL: store_undef_mask_factor3: -; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0] -define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { - %base = bitcast i32* %ptr to <12 x i32>* - %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> - %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> - %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> - store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4 - ret void -} - -; CHECK-LABEL: store_undef_mask_factor4: -; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] -define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { - %base = bitcast i32* %ptr to <16 x i32>* - %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> - %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> - %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> - store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4 - ret void -} -- 2.34.1