From: Nadav Rotem Date: Sun, 14 Apr 2013 03:22:20 +0000 (+0000) Subject: SLPVectorizer: add initial support for reduction variable vectorization. X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=f7eaf29cf70a545f5b717c638db83ba6e8b6b3c5;p=oota-llvm.git SLPVectorizer: add initial support for reduction variable vectorization. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179470 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 209d2877433..2f55a007f24 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/Verifier.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -38,7 +39,7 @@ using namespace llvm; static cl::opt -SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden, +SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize trees if the gain is above this " "number. (gain = -cost of vectorization)")); namespace { @@ -63,7 +64,7 @@ struct SLPVectorizer : public BasicBlockPass { /// object. We sort the stores to their base objects to reduce the cost of the /// quadratic search on the stores. TODO: We can further reduce this cost /// if we flush the chain creation every time we run into a memory barrier. - bool CollectStores(BasicBlock *BB, BoUpSLP &R) { + bool collectStores(BasicBlock *BB, BoUpSLP &R) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { StoreInst *SI = dyn_cast(it); if (!SI) @@ -84,7 +85,79 @@ struct SLPVectorizer : public BasicBlockPass { return true; } - bool RollStoreChains(BoUpSLP &R) { + bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B, BoUpSLP &R) { + if (!A || !B) return false; + BoUpSLP::ValueList VL; + VL.push_back(A); + VL.push_back(B); + int Cost = R.getTreeCost(VL); + DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n"); + if (Cost >= -SLPCostThreshold) return false; + DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); + R.vectorizeArith(VL); + return true; + } + + bool tryToVectorizeCandidate(BinaryOperator *V, BoUpSLP &R) { + if (!V) return false; + BinaryOperator *A = dyn_cast(V->getOperand(0)); + BinaryOperator *B = dyn_cast(V->getOperand(1)); + // Try to vectorize V. + if (tryToVectorizePair(A, B, R)) return true; + + // Try to skip B. + if (B && B->hasOneUse()) { + BinaryOperator *B0 = dyn_cast(B->getOperand(0)); + BinaryOperator *B1 = dyn_cast(B->getOperand(1)); + if (tryToVectorizePair(A, B0, R)) { + B->moveBefore(V); + return true; + } + if (tryToVectorizePair(A, B1, R)) { + B->moveBefore(V); + return true; + } + } + + // Try to slip A. + if (A && A->hasOneUse()) { + BinaryOperator *A0 = dyn_cast(A->getOperand(0)); + BinaryOperator *A1 = dyn_cast(A->getOperand(1)); + if (tryToVectorizePair(A0, B, R)) { + A->moveBefore(V); + return true; + } + if (tryToVectorizePair(A1, B, R)) { + A->moveBefore(V); + return true; + } + } + return 0; + } + + bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { + bool Changed = false; + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + if (isa(it)) continue; + PHINode *P = dyn_cast(it); + if (!P) return Changed; + // Check that the PHI is a reduction PHI. + if (P->getNumIncomingValues() != 2) return Changed; + Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) : + (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0)); + // Check if this is a Binary Operator. + BinaryOperator *BI = dyn_cast_or_null(Rdx); + if (!BI) continue; + + Value *Inst = BI->getOperand(0); + if (Inst == P) Inst = BI->getOperand(1); + Changed |= tryToVectorizeCandidate(dyn_cast(Inst), R); + } + + return Changed; + } + + bool rollStoreChains(BoUpSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); @@ -116,13 +189,15 @@ struct SLPVectorizer : public BasicBlockPass { // he store instructions. BoUpSLP R(&BB, SE, DL, TTI, AA); - if (!CollectStores(&BB, R)) - return false; + bool Changed = vectorizeReductions(&BB, R); + + if (!collectStores(&BB, R)) + return Changed; - bool Changed = RollStoreChains(R); - if (Changed) { + if (rollStoreChains(R)) { DEBUG(dbgs()<<"SLP: vectorized in \""<getName()<<"\"\n"); DEBUG(verifyFunction(*BB.getParent())); + Changed |= true; } return Changed; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index d8be0ae721c..4d075c505d2 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -208,6 +208,16 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { return 0; } +void BoUpSLP::vectorizeArith(ValueList &Operands) { + Value *Vec = vectorizeTree(Operands, Operands.size()); + BasicBlock::iterator Loc = cast(Vec); + IRBuilder<> Builder(++Loc); + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); + Operands[i]->replaceAllUsesWith(S); + } +} + int BoUpSLP::getTreeCost(ValueList &VL) { // Get rid of the list of stores that were removed, and from the // lists of instructions with multiple users. diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index 808fb10f9f4..f865236ff88 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -66,6 +66,9 @@ struct BoUpSLP { /// \returns true if the basic block was modified. bool vectorizeStores(StoreList &Stores, int costThreshold); + /// \brief Vectorize a group of scalars into a vector tree. + void vectorizeArith(ValueList &Operands); + private: /// \returns This method contains the recursive part of getTreeCost. int getTreeCost_rec(ValueList &VL, unsigned Depth); diff --git a/test/Transforms/SLPVectorizer/X86/reduction.ll b/test/Transforms/SLPVectorizer/X86/reduction.ll new file mode 100644 index 00000000000..ced9f157832 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/reduction.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.8.0" + +; int foo(double *A, int n, int m) { +; double sum = 0, v1 = 2, v0 = 3; +; for (int i=0; i < n; ++i) +; sum += 7*A[i*2] + 7*A[i*2+1]; +; return sum; +; } + +;CHECK: reduce +;CHECK: load <2 x double> +;CHECK: fmul <2 x double> +;CHECK: ret +define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) #0 { +entry: + %cmp13 = icmp sgt i32 %n, 0 + br i1 %cmp13, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ] + %mul = shl nsw i32 %i.015, 1 + %arrayidx = getelementptr inbounds double* %A, i32 %mul + %0 = load double* %arrayidx, align 4, !tbaa !0 + %mul1 = fmul double %0, 7.000000e+00 + %add12 = or i32 %mul, 1 + %arrayidx3 = getelementptr inbounds double* %A, i32 %add12 + %1 = load double* %arrayidx3, align 4, !tbaa !0 + %mul4 = fmul double %1, 7.000000e+00 + %add5 = fadd double %mul1, %mul4 + %add6 = fadd double %sum.014, %add5 + %inc = add nsw i32 %i.015, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: ; preds = %for.body + %phitmp = fptosi double %add6 to i32 + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +attributes #0 = { nounwind readonly ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"double", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"}