From: Suyog Sarda
Date: Fri, 12 Dec 2014 12:53:44 +0000 (+0000)
Subject: This patch recognizes (+ (+ v0, v1) (+ v2, v3)), reorders them for bundling into...
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=1dea0dc279b7c22e24599bbccbcf1131fe7f591d;p=oota-llvm.git

This patch recognizes (+ (+ v0, v1) (+ v2, v3)), reorders them for bundling
into a vector of loads, and vectorizes it.

Test case:

  float hadd(float* a) {
    return (a[0] + a[1]) + (a[2] + a[3]);
  }

AArch64 assembly before patch:

  ldp s0, s1, [x0]
  ldp s2, s3, [x0, #8]
  fadd s0, s0, s1
  fadd s1, s2, s3
  fadd s0, s0, s1
  ret

AArch64 assembly after patch:

  ldp d0, d1, [x0]
  fadd v0.2s, v0.2s, v1.2s
  faddp s0, v0.2s
  ret

Reviewed Link: http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20141208/248531.html

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224119 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 44bfea14670..dafda21da8d 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -439,6 +439,13 @@ public:
   /// \returns true if the memory operations A and B are consecutive.
   bool isConsecutiveAccess(Value *A, Value *B);
 
+  /// For consecutive loads (+(+ v0, v1)(+ v2, v3)), Left would hold v0 and v2
+  /// while Right would hold v1 and v3, which prevents bundling them into
+  /// a vector of loads. Reorder them so that Left holds v0 and v1
+  /// while Right holds v2 and v3, enabling their bundling into a vector.
+  void reorderIfConsecutiveLoads(SmallVectorImpl<Value *> &Left,
+                                 SmallVectorImpl<Value *> &Right);
+
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
@@ -1234,6 +1241,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
       ValueList Left, Right;
       reorderInputsAccordingToOpcode(VL, Left, Right);
+      reorderIfConsecutiveLoads(Left, Right);
       buildTree_rec(Left, Depth + 1);
       buildTree_rec(Right, Depth + 1);
       return;
@@ -1818,6 +1826,19 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
   return X == PtrSCEVB;
 }
 
+void BoUpSLP::reorderIfConsecutiveLoads(SmallVectorImpl<Value *> &Left,
+                                        SmallVectorImpl<Value *> &Right) {
+  for (unsigned i = 0, e = Left.size(); i < e - 1; ++i) {
+    if (!isa<LoadInst>(Left[i]) || !isa<LoadInst>(Right[i]))
+      return;
+    if (!(isConsecutiveAccess(Left[i], Right[i]) &&
+          isConsecutiveAccess(Right[i], Left[i + 1])))
+      continue;
+    else
+      std::swap(Left[i + 1], Right[i]);
+  }
+}
+
 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
   Instruction *VL0 = cast<Instruction>(VL[0]);
   BasicBlock::iterator NextInst = VL0;
@@ -2048,9 +2069,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   case Instruction::Or:
   case Instruction::Xor: {
     ValueList LHSVL, RHSVL;
-    if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
+    if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
       reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
-    else
+      reorderIfConsecutiveLoads(LHSVL, RHSVL);
+    } else
       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
         LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
         RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
diff --git a/test/Transforms/SLPVectorizer/AArch64/horizontaladd.ll b/test/Transforms/SLPVectorizer/AArch64/horizontaladd.ll
new file mode 100644
index 00000000000..98de5e7ede4
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/AArch64/horizontaladd.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; float hadd (float *a) {
+;   return (a[0] + a[1]) + (a[2] + a[3]);
+; }
+
+; CHECK-LABEL: @hadd
+; CHECK: load <2 x float>*
+; CHECK: fadd <2 x float>
+; CHECK: extractelement <2 x float>
+
+define float @hadd(float* nocapture readonly %a) {
+entry:
+  %0 = load float* %a, align 4
+  %arrayidx1 = getelementptr inbounds float* %a, i64 1
+  %1 = load float* %arrayidx1, align 4
+  %add = fadd float %0, %1
+  %arrayidx2 = getelementptr inbounds float* %a, i64 2
+  %2 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %a, i64 3
+  %3 = load float* %arrayidx3, align 4
+  %add4 = fadd float %2, %3
+  %add5 = fadd float %add, %add4
+  ret float %add5
+}