1 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file implements a basic-block vectorization pass. The algorithm was
11 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
12 // et al. It works by looking for chains of pairable operations and then
15 //===----------------------------------------------------------------------===//
17 #define BBV_NAME "bb-vectorize"
18 #define DEBUG_TYPE BBV_NAME
19 #include "llvm/Transforms/Vectorize.h"
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/DenseSet.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallSet.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/Statistic.h"
26 #include "llvm/ADT/StringExtras.h"
27 #include "llvm/Analysis/AliasAnalysis.h"
28 #include "llvm/Analysis/AliasSetTracker.h"
29 #include "llvm/Analysis/Dominators.h"
30 #include "llvm/Analysis/ScalarEvolution.h"
31 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
32 #include "llvm/Analysis/ValueTracking.h"
33 #include "llvm/Constants.h"
34 #include "llvm/DataLayout.h"
35 #include "llvm/DerivedTypes.h"
36 #include "llvm/Function.h"
37 #include "llvm/Instructions.h"
38 #include "llvm/IntrinsicInst.h"
39 #include "llvm/Intrinsics.h"
40 #include "llvm/LLVMContext.h"
41 #include "llvm/Metadata.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/ValueHandle.h"
46 #include "llvm/Support/raw_ostream.h"
47 #include "llvm/TargetTransformInfo.h"
48 #include "llvm/Transforms/Utils/Local.h"
49 #include "llvm/Type.h"
55 IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
56 cl::Hidden, cl::desc("Ignore target information"));
58 static cl::opt<unsigned>
59 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
60 cl::desc("The required chain depth for vectorization"));
63 UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
64 cl::Hidden, cl::desc("Use the chain depth requirement with"
65 " target information"));
67 static cl::opt<unsigned>
68 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
69 cl::desc("The maximum search distance for instruction pairs"));
72 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
73 cl::desc("Replicating one element to a pair breaks the chain"));
75 static cl::opt<unsigned>
76 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
77 cl::desc("The size of the native vector registers"));
79 static cl::opt<unsigned>
80 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
81 cl::desc("The maximum number of pairing iterations"));
84 Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
85 cl::desc("Don't try to form non-2^n-length vectors"));
87 static cl::opt<unsigned>
88 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
89 cl::desc("The maximum number of pairable instructions per group"));
91 static cl::opt<unsigned>
92 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
93 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
94 " a full cycle check"));
97 NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
98 cl::desc("Don't try to vectorize boolean (i1) values"));
101 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
102 cl::desc("Don't try to vectorize integer values"));
105 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
106 cl::desc("Don't try to vectorize floating-point values"));
108 // FIXME: This should default to false once pointer vector support works.
110 NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
111 cl::desc("Don't try to vectorize pointer values"));
114 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
115 cl::desc("Don't try to vectorize casting (conversion) operations"));
118 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
119 cl::desc("Don't try to vectorize floating-point math intrinsics"));
122 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
123 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
126 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
127 cl::desc("Don't try to vectorize select instructions"));
130 NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
131 cl::desc("Don't try to vectorize comparison instructions"));
134 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
135 cl::desc("Don't try to vectorize getelementptr instructions"));
138 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
139 cl::desc("Don't try to vectorize loads and stores"));
142 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
143 cl::desc("Only generate aligned loads and stores"));
146 NoMemOpBoost("bb-vectorize-no-mem-op-boost",
147 cl::init(false), cl::Hidden,
148 cl::desc("Don't boost the chain-depth contribution of loads and stores"));
151 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
152 cl::desc("Use a fast instruction dependency analysis"));
156 DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
157 cl::init(false), cl::Hidden,
158 cl::desc("When debugging is enabled, output information on the"
159 " instruction-examination process"));
161 DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
162 cl::init(false), cl::Hidden,
163 cl::desc("When debugging is enabled, output information on the"
164 " candidate-selection process"));
166 DebugPairSelection("bb-vectorize-debug-pair-selection",
167 cl::init(false), cl::Hidden,
168 cl::desc("When debugging is enabled, output information on the"
169 " pair-selection process"));
171 DebugCycleCheck("bb-vectorize-debug-cycle-check",
172 cl::init(false), cl::Hidden,
173 cl::desc("When debugging is enabled, output information on the"
174 " cycle-checking process"));
177 PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
178 cl::init(false), cl::Hidden,
179 cl::desc("When debugging is enabled, dump the basic block after"
180 " every pair is fused"));
183 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
186 struct BBVectorize : public BasicBlockPass {
187 static char ID; // Pass identification, replacement for typeid
189 const VectorizeConfig Config;
191 BBVectorize(const VectorizeConfig &C = VectorizeConfig())
192 : BasicBlockPass(ID), Config(C) {
193 initializeBBVectorizePass(*PassRegistry::getPassRegistry());
196 BBVectorize(Pass *P, const VectorizeConfig &C)
197 : BasicBlockPass(ID), Config(C) {
198 AA = &P->getAnalysis<AliasAnalysis>();
199 DT = &P->getAnalysis<DominatorTree>();
200 SE = &P->getAnalysis<ScalarEvolution>();
201 TD = P->getAnalysisIfAvailable<DataLayout>();
202 TTI = IgnoreTargetInfo ? 0 :
203 P->getAnalysisIfAvailable<TargetTransformInfo>();
204 VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
207 typedef std::pair<Value *, Value *> ValuePair;
208 typedef std::pair<ValuePair, int> ValuePairWithCost;
209 typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
210 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
211 typedef std::pair<VPPair, unsigned> VPPairWithType;
212 typedef std::pair<std::multimap<Value *, Value *>::iterator,
213 std::multimap<Value *, Value *>::iterator> VPIteratorPair;
214 typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
215 std::multimap<ValuePair, ValuePair>::iterator>
222 TargetTransformInfo *TTI;
223 const VectorTargetTransformInfo *VTTI;
225 // FIXME: const correct?
227 bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
229 bool getCandidatePairs(BasicBlock &BB,
230 BasicBlock::iterator &Start,
231 std::multimap<Value *, Value *> &CandidatePairs,
232 DenseSet<ValuePair> &FixedOrderPairs,
233 DenseMap<ValuePair, int> &CandidatePairCostSavings,
234 std::vector<Value *> &PairableInsts, bool NonPow2Len);
236 // FIXME: The current implementation does not account for pairs that
237 // are connected in multiple ways. For example:
238 // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
239 enum PairConnectionType {
240 PairConnectionDirect,
245 void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
246 std::vector<Value *> &PairableInsts,
247 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
248 DenseMap<VPPair, unsigned> &PairConnectionTypes);
250 void buildDepMap(BasicBlock &BB,
251 std::multimap<Value *, Value *> &CandidatePairs,
252 std::vector<Value *> &PairableInsts,
253 DenseSet<ValuePair> &PairableInstUsers);
255 void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
256 DenseMap<ValuePair, int> &CandidatePairCostSavings,
257 std::vector<Value *> &PairableInsts,
258 DenseSet<ValuePair> &FixedOrderPairs,
259 DenseMap<VPPair, unsigned> &PairConnectionTypes,
260 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
261 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
262 DenseSet<ValuePair> &PairableInstUsers,
263 DenseMap<Value *, Value *>& ChosenPairs);
265 void fuseChosenPairs(BasicBlock &BB,
266 std::vector<Value *> &PairableInsts,
267 DenseMap<Value *, Value *>& ChosenPairs,
268 DenseSet<ValuePair> &FixedOrderPairs,
269 DenseMap<VPPair, unsigned> &PairConnectionTypes,
270 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
271 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps);
274 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
276 bool areInstsCompatible(Instruction *I, Instruction *J,
277 bool IsSimpleLoadStore, bool NonPow2Len,
278 int &CostSavings, int &FixedOrder);
280 bool trackUsesOfI(DenseSet<Value *> &Users,
281 AliasSetTracker &WriteSet, Instruction *I,
282 Instruction *J, bool UpdateUsers = true,
283 std::multimap<Value *, Value *> *LoadMoveSet = 0);
285 void computePairsConnectedTo(
286 std::multimap<Value *, Value *> &CandidatePairs,
287 std::vector<Value *> &PairableInsts,
288 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
289 DenseMap<VPPair, unsigned> &PairConnectionTypes,
292 bool pairsConflict(ValuePair P, ValuePair Q,
293 DenseSet<ValuePair> &PairableInstUsers,
294 std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0);
296 bool pairWillFormCycle(ValuePair P,
297 std::multimap<ValuePair, ValuePair> &PairableInstUsers,
298 DenseSet<ValuePair> &CurrentPairs);
301 std::multimap<Value *, Value *> &CandidatePairs,
302 std::vector<Value *> &PairableInsts,
303 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
304 DenseSet<ValuePair> &PairableInstUsers,
305 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
306 DenseMap<Value *, Value *> &ChosenPairs,
307 DenseMap<ValuePair, size_t> &Tree,
308 DenseSet<ValuePair> &PrunedTree, ValuePair J,
311 void buildInitialTreeFor(
312 std::multimap<Value *, Value *> &CandidatePairs,
313 std::vector<Value *> &PairableInsts,
314 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
315 DenseSet<ValuePair> &PairableInstUsers,
316 DenseMap<Value *, Value *> &ChosenPairs,
317 DenseMap<ValuePair, size_t> &Tree, ValuePair J);
319 void findBestTreeFor(
320 std::multimap<Value *, Value *> &CandidatePairs,
321 DenseMap<ValuePair, int> &CandidatePairCostSavings,
322 std::vector<Value *> &PairableInsts,
323 DenseSet<ValuePair> &FixedOrderPairs,
324 DenseMap<VPPair, unsigned> &PairConnectionTypes,
325 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
326 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
327 DenseSet<ValuePair> &PairableInstUsers,
328 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
329 DenseMap<Value *, Value *> &ChosenPairs,
330 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
331 int &BestEffSize, VPIteratorPair ChoiceRange,
334 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
335 Instruction *J, unsigned o);
337 void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
338 unsigned MaskOffset, unsigned NumInElem,
339 unsigned NumInElem1, unsigned IdxOffset,
340 std::vector<Constant*> &Mask);
342 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
345 bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
346 unsigned o, Value *&LOp, unsigned numElemL,
347 Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
348 unsigned IdxOff = 0);
350 Value *getReplacementInput(LLVMContext& Context, Instruction *I,
351 Instruction *J, unsigned o, bool IBeforeJ);
353 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
354 Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
357 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
358 Instruction *J, Instruction *K,
359 Instruction *&InsertionPt, Instruction *&K1,
362 void collectPairLoadMoveSet(BasicBlock &BB,
363 DenseMap<Value *, Value *> &ChosenPairs,
364 std::multimap<Value *, Value *> &LoadMoveSet,
367 void collectLoadMoveSet(BasicBlock &BB,
368 std::vector<Value *> &PairableInsts,
369 DenseMap<Value *, Value *> &ChosenPairs,
370 std::multimap<Value *, Value *> &LoadMoveSet);
372 bool canMoveUsesOfIAfterJ(BasicBlock &BB,
373 std::multimap<Value *, Value *> &LoadMoveSet,
374 Instruction *I, Instruction *J);
376 void moveUsesOfIAfterJ(BasicBlock &BB,
377 std::multimap<Value *, Value *> &LoadMoveSet,
378 Instruction *&InsertionPt,
379 Instruction *I, Instruction *J);
381 void combineMetadata(Instruction *K, const Instruction *J);
383 bool vectorizeBB(BasicBlock &BB) {
384 if (!DT->isReachableFromEntry(&BB)) {
385 DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
386 " in " << BB.getParent()->getName() << "\n");
390 DEBUG(if (VTTI) dbgs() << "BBV: using target information\n");
392 bool changed = false;
393 // Iterate a sufficient number of times to merge types of size 1 bit,
394 // then 2 bits, then 4, etc. up to half of the target vector width of the
395 // target vector register.
398 (VTTI || v <= Config.VectorBits) &&
399 (!Config.MaxIter || n <= Config.MaxIter);
401 DEBUG(dbgs() << "BBV: fusing loop #" << n <<
402 " for " << BB.getName() << " in " <<
403 BB.getParent()->getName() << "...\n");
404 if (vectorizePairs(BB))
410 if (changed && !Pow2LenOnly) {
412 for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
413 DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
414 n << " for " << BB.getName() << " in " <<
415 BB.getParent()->getName() << "...\n");
416 if (!vectorizePairs(BB, true)) break;
420 DEBUG(dbgs() << "BBV: done!\n");
424 virtual bool runOnBasicBlock(BasicBlock &BB) {
425 AA = &getAnalysis<AliasAnalysis>();
426 DT = &getAnalysis<DominatorTree>();
427 SE = &getAnalysis<ScalarEvolution>();
428 TD = getAnalysisIfAvailable<DataLayout>();
429 TTI = IgnoreTargetInfo ? 0 :
430 getAnalysisIfAvailable<TargetTransformInfo>();
431 VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
433 return vectorizeBB(BB);
436 virtual void getAnalysisUsage(AnalysisUsage &AU) const {
437 BasicBlockPass::getAnalysisUsage(AU);
438 AU.addRequired<AliasAnalysis>();
439 AU.addRequired<DominatorTree>();
440 AU.addRequired<ScalarEvolution>();
441 AU.addPreserved<AliasAnalysis>();
442 AU.addPreserved<DominatorTree>();
443 AU.addPreserved<ScalarEvolution>();
444 AU.setPreservesCFG();
447 static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
448 assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
449 "Cannot form vector from incompatible scalar types");
450 Type *STy = ElemTy->getScalarType();
453 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
454 numElem = VTy->getNumElements();
459 if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
460 numElem += VTy->getNumElements();
465 return VectorType::get(STy, numElem);
468 static inline void getInstructionTypes(Instruction *I,
469 Type *&T1, Type *&T2) {
470 if (isa<StoreInst>(I)) {
471 // For stores, it is the value type, not the pointer type that matters
472 // because the value is what will come from a vector register.
474 Value *IVal = cast<StoreInst>(I)->getValueOperand();
475 T1 = IVal->getType();
481 T2 = cast<CastInst>(I)->getSrcTy();
485 if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
486 T2 = SI->getCondition()->getType();
487 } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
488 T2 = SI->getOperand(0)->getType();
489 } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
490 T2 = CI->getOperand(0)->getType();
494 // Returns the weight associated with the provided value. A chain of
495 // candidate pairs has a length given by the sum of the weights of its
496 // members (one weight per pair; the weight of each member of the pair
497 // is assumed to be the same). This length is then compared to the
498 // chain-length threshold to determine if a given chain is significant
499 // enough to be vectorized. The length is also used in comparing
500 // candidate chains where longer chains are considered to be better.
501 // Note: when this function returns 0, the resulting instructions are
502 // not actually fused.
503 inline size_t getDepthFactor(Value *V) {
504 // InsertElement and ExtractElement have a depth factor of zero. This is
505 // for two reasons: First, they cannot be usefully fused. Second, because
506 // the pass generates a lot of these, they can confuse the simple metric
507 // used to compare the trees in the next iteration. Thus, giving them a
508 // weight of zero allows the pass to essentially ignore them in
509 // subsequent iterations when looking for vectorization opportunities
510 // while still tracking dependency chains that flow through those
512 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
515 // Give a load or store half of the required depth so that load/store
516 // pairs will vectorize.
517 if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
518 return Config.ReqChainDepth/2;
523 // Returns the cost of the provided instruction using VTTI.
524 // This does not handle loads and stores.
525 unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
528 case Instruction::GetElementPtr:
529 // We mark this instruction as zero-cost because scalar GEPs are usually
530 // lowered to the intruction addressing mode. At the moment we don't
531 // generate vector GEPs.
533 case Instruction::Br:
534 return VTTI->getCFInstrCost(Opcode);
535 case Instruction::PHI:
537 case Instruction::Add:
538 case Instruction::FAdd:
539 case Instruction::Sub:
540 case Instruction::FSub:
541 case Instruction::Mul:
542 case Instruction::FMul:
543 case Instruction::UDiv:
544 case Instruction::SDiv:
545 case Instruction::FDiv:
546 case Instruction::URem:
547 case Instruction::SRem:
548 case Instruction::FRem:
549 case Instruction::Shl:
550 case Instruction::LShr:
551 case Instruction::AShr:
552 case Instruction::And:
553 case Instruction::Or:
554 case Instruction::Xor:
555 return VTTI->getArithmeticInstrCost(Opcode, T1);
556 case Instruction::Select:
557 case Instruction::ICmp:
558 case Instruction::FCmp:
559 return VTTI->getCmpSelInstrCost(Opcode, T1, T2);
560 case Instruction::ZExt:
561 case Instruction::SExt:
562 case Instruction::FPToUI:
563 case Instruction::FPToSI:
564 case Instruction::FPExt:
565 case Instruction::PtrToInt:
566 case Instruction::IntToPtr:
567 case Instruction::SIToFP:
568 case Instruction::UIToFP:
569 case Instruction::Trunc:
570 case Instruction::FPTrunc:
571 case Instruction::BitCast:
572 case Instruction::ShuffleVector:
573 return VTTI->getCastInstrCost(Opcode, T1, T2);
579 // This determines the relative offset of two loads or stores, returning
580 // true if the offset could be determined to be some constant value.
581 // For example, if OffsetInElmts == 1, then J accesses the memory directly
582 // after I; if OffsetInElmts == -1 then I accesses the memory
584 bool getPairPtrInfo(Instruction *I, Instruction *J,
585 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
586 unsigned &IAddressSpace, unsigned &JAddressSpace,
587 int64_t &OffsetInElmts, bool ComputeOffset = true) {
589 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
590 LoadInst *LJ = cast<LoadInst>(J);
591 IPtr = LI->getPointerOperand();
592 JPtr = LJ->getPointerOperand();
593 IAlignment = LI->getAlignment();
594 JAlignment = LJ->getAlignment();
595 IAddressSpace = LI->getPointerAddressSpace();
596 JAddressSpace = LJ->getPointerAddressSpace();
598 StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
599 IPtr = SI->getPointerOperand();
600 JPtr = SJ->getPointerOperand();
601 IAlignment = SI->getAlignment();
602 JAlignment = SJ->getAlignment();
603 IAddressSpace = SI->getPointerAddressSpace();
604 JAddressSpace = SJ->getPointerAddressSpace();
610 const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
611 const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
613 // If this is a trivial offset, then we'll get something like
614 // 1*sizeof(type). With target data, which we need anyway, this will get
615 // constant folded into a number.
616 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
617 if (const SCEVConstant *ConstOffSCEV =
618 dyn_cast<SCEVConstant>(OffsetSCEV)) {
619 ConstantInt *IntOff = ConstOffSCEV->getValue();
620 int64_t Offset = IntOff->getSExtValue();
622 Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
623 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
625 Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
626 if (VTy != VTy2 && Offset < 0) {
627 int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
628 OffsetInElmts = Offset/VTy2TSS;
629 return (abs64(Offset) % VTy2TSS) == 0;
632 OffsetInElmts = Offset/VTyTSS;
633 return (abs64(Offset) % VTyTSS) == 0;
639 // Returns true if the provided CallInst represents an intrinsic that can
641 bool isVectorizableIntrinsic(CallInst* I) {
642 Function *F = I->getCalledFunction();
643 if (!F) return false;
645 unsigned IID = F->getIntrinsicID();
646 if (!IID) return false;
651 case Intrinsic::sqrt:
652 case Intrinsic::powi:
656 case Intrinsic::log2:
657 case Intrinsic::log10:
659 case Intrinsic::exp2:
661 return Config.VectorizeMath;
663 return Config.VectorizeFMA;
667 // Returns true if J is the second element in some pair referenced by
668 // some multimap pair iterator pair.
669 template <typename V>
670 bool isSecondInIteratorPair(V J, std::pair<
671 typename std::multimap<V, V>::iterator,
672 typename std::multimap<V, V>::iterator> PairRange) {
673 for (typename std::multimap<V, V>::iterator K = PairRange.first;
674 K != PairRange.second; ++K)
675 if (K->second == J) return true;
680 bool isPureIEChain(InsertElementInst *IE) {
681 InsertElementInst *IENext = IE;
683 if (!isa<UndefValue>(IENext->getOperand(0)) &&
684 !isa<InsertElementInst>(IENext->getOperand(0))) {
688 dyn_cast<InsertElementInst>(IENext->getOperand(0))));
694 // This function implements one vectorization iteration on the provided
695 // basic block. It returns true if the block is changed.
696 bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
698 BasicBlock::iterator Start = BB.getFirstInsertionPt();
700 std::vector<Value *> AllPairableInsts;
701 DenseMap<Value *, Value *> AllChosenPairs;
702 DenseSet<ValuePair> AllFixedOrderPairs;
703 DenseMap<VPPair, unsigned> AllPairConnectionTypes;
704 std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps;
707 std::vector<Value *> PairableInsts;
708 std::multimap<Value *, Value *> CandidatePairs;
709 DenseSet<ValuePair> FixedOrderPairs;
710 DenseMap<ValuePair, int> CandidatePairCostSavings;
711 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
713 CandidatePairCostSavings,
714 PairableInsts, NonPow2Len);
715 if (PairableInsts.empty()) continue;
717 // Now we have a map of all of the pairable instructions and we need to
718 // select the best possible pairing. A good pairing is one such that the
719 // users of the pair are also paired. This defines a (directed) forest
720 // over the pairs such that two pairs are connected iff the second pair
723 // Note that it only matters that both members of the second pair use some
724 // element of the first pair (to allow for splatting).
726 std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps;
727 DenseMap<VPPair, unsigned> PairConnectionTypes;
728 computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs,
729 PairConnectionTypes);
730 if (ConnectedPairs.empty()) continue;
732 for (std::multimap<ValuePair, ValuePair>::iterator
733 I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
735 ConnectedPairDeps.insert(VPPair(I->second, I->first));
738 // Build the pairable-instruction dependency map
739 DenseSet<ValuePair> PairableInstUsers;
740 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
742 // There is now a graph of the connected pairs. For each variable, pick
743 // the pairing with the largest tree meeting the depth requirement on at
744 // least one branch. Then select all pairings that are part of that tree
745 // and remove them from the list of available pairings and pairable
748 DenseMap<Value *, Value *> ChosenPairs;
749 choosePairs(CandidatePairs, CandidatePairCostSavings,
750 PairableInsts, FixedOrderPairs, PairConnectionTypes,
751 ConnectedPairs, ConnectedPairDeps,
752 PairableInstUsers, ChosenPairs);
754 if (ChosenPairs.empty()) continue;
755 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
756 PairableInsts.end());
757 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
759 // Only for the chosen pairs, propagate information on fixed-order pairs,
760 // pair connections, and their types to the data structures used by the
761 // pair fusion procedures.
762 for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
763 IE = ChosenPairs.end(); I != IE; ++I) {
764 if (FixedOrderPairs.count(*I))
765 AllFixedOrderPairs.insert(*I);
766 else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
767 AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
769 for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
771 DenseMap<VPPair, unsigned>::iterator K =
772 PairConnectionTypes.find(VPPair(*I, *J));
773 if (K != PairConnectionTypes.end()) {
774 AllPairConnectionTypes.insert(*K);
776 K = PairConnectionTypes.find(VPPair(*J, *I));
777 if (K != PairConnectionTypes.end())
778 AllPairConnectionTypes.insert(*K);
783 for (std::multimap<ValuePair, ValuePair>::iterator
784 I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
786 if (AllPairConnectionTypes.count(*I)) {
787 AllConnectedPairs.insert(*I);
788 AllConnectedPairDeps.insert(VPPair(I->second, I->first));
791 } while (ShouldContinue);
793 if (AllChosenPairs.empty()) return false;
794 NumFusedOps += AllChosenPairs.size();
796 // A set of pairs has now been selected. It is now necessary to replace the
797 // paired instructions with vector instructions. For this procedure each
798 // operand must be replaced with a vector operand. This vector is formed
799 // by using build_vector on the old operands. The replaced values are then
800 // replaced with a vector_extract on the result. Subsequent optimization
801 // passes should coalesce the build/extract combinations.
803 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
804 AllPairConnectionTypes,
805 AllConnectedPairs, AllConnectedPairDeps);
807 // It is important to cleanup here so that future iterations of this
808 // function have less work to do.
809 (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
813 // This function returns true if the provided instruction is capable of being
814 // fused into a vector instruction. This determination is based only on the
815 // type and other attributes of the instruction.
816 bool BBVectorize::isInstVectorizable(Instruction *I,
817 bool &IsSimpleLoadStore) {
818 IsSimpleLoadStore = false;
820 if (CallInst *C = dyn_cast<CallInst>(I)) {
821 if (!isVectorizableIntrinsic(C))
823 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
824 // Vectorize simple loads if possbile:
825 IsSimpleLoadStore = L->isSimple();
826 if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
828 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
829 // Vectorize simple stores if possbile:
830 IsSimpleLoadStore = S->isSimple();
831 if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
833 } else if (CastInst *C = dyn_cast<CastInst>(I)) {
834 // We can vectorize casts, but not casts of pointer types, etc.
835 if (!Config.VectorizeCasts)
838 Type *SrcTy = C->getSrcTy();
839 if (!SrcTy->isSingleValueType())
842 Type *DestTy = C->getDestTy();
843 if (!DestTy->isSingleValueType())
845 } else if (isa<SelectInst>(I)) {
846 if (!Config.VectorizeSelect)
848 } else if (isa<CmpInst>(I)) {
849 if (!Config.VectorizeCmp)
851 } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
852 if (!Config.VectorizeGEP)
855 // Currently, vector GEPs exist only with one index.
856 if (G->getNumIndices() != 1)
858 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
859 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
863 // We can't vectorize memory operations without target data
864 if (TD == 0 && IsSimpleLoadStore)
868 getInstructionTypes(I, T1, T2);
870 // Not every type can be vectorized...
871 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
872 !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
875 if (T1->getScalarSizeInBits() == 1) {
876 if (!Config.VectorizeBools)
879 if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
883 if (T2->getScalarSizeInBits() == 1) {
884 if (!Config.VectorizeBools)
887 if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
891 if (!Config.VectorizeFloats
892 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
895 // Don't vectorize target-specific types.
896 if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
898 if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
901 if ((!Config.VectorizePointers || TD == 0) &&
902 (T1->getScalarType()->isPointerTy() ||
903 T2->getScalarType()->isPointerTy()))
906 if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
907 T2->getPrimitiveSizeInBits() >= Config.VectorBits))
913 // This function returns true if the two provided instructions are compatible
914 // (meaning that they can be fused into a vector instruction). This assumes
915 // that I has already been determined to be vectorizable and that J is not
916 // in the use tree of I.
917 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
918 bool IsSimpleLoadStore, bool NonPow2Len,
919 int &CostSavings, int &FixedOrder) {
920 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
921 " <-> " << *J << "\n");
926 // Loads and stores can be merged if they have different alignments,
927 // but are otherwise the same.
928 if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
929 (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
932 Type *IT1, *IT2, *JT1, *JT2;
933 getInstructionTypes(I, IT1, IT2);
934 getInstructionTypes(J, JT1, JT2);
935 unsigned MaxTypeBits = std::max(
936 IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
937 IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
938 if (!VTTI && MaxTypeBits > Config.VectorBits)
941 // FIXME: handle addsub-type operations!
943 if (IsSimpleLoadStore) {
945 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
946 int64_t OffsetInElmts = 0;
947 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
948 IAddressSpace, JAddressSpace,
949 OffsetInElmts) && abs64(OffsetInElmts) == 1) {
950 FixedOrder = (int) OffsetInElmts;
951 unsigned BottomAlignment = IAlignment;
952 if (OffsetInElmts < 0) BottomAlignment = JAlignment;
954 Type *aTypeI = isa<StoreInst>(I) ?
955 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
956 Type *aTypeJ = isa<StoreInst>(J) ?
957 cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
958 Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
960 if (Config.AlignedOnly) {
961 // An aligned load or store is possible only if the instruction
962 // with the lower offset has an alignment suitable for the
965 unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
966 if (BottomAlignment < VecAlignment)
971 unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), aTypeI,
972 IAlignment, IAddressSpace);
973 unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
974 JAlignment, JAddressSpace);
975 unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType,
978 if (VCost > ICost + JCost)
981 // We don't want to fuse to a type that will be split, even
982 // if the two input types will also be split and there is no other
984 unsigned VParts = VTTI->getNumberOfParts(VType);
987 else if (!VParts && VCost == ICost + JCost)
990 CostSavings = ICost + JCost - VCost;
996 unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
997 unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
998 Type *VT1 = getVecTypeForPair(IT1, JT1),
999 *VT2 = getVecTypeForPair(IT2, JT2);
1000 unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
1002 if (VCost > ICost + JCost)
1005 // We don't want to fuse to a type that will be split, even
1006 // if the two input types will also be split and there is no other
1008 unsigned VParts1 = VTTI->getNumberOfParts(VT1),
1009 VParts2 = VTTI->getNumberOfParts(VT2);
1010 if (VParts1 > 1 || VParts2 > 1)
1012 else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
1015 CostSavings = ICost + JCost - VCost;
1018 // The powi intrinsic is special because only the first argument is
1019 // vectorized, the second arguments must be equal.
1020 CallInst *CI = dyn_cast<CallInst>(I);
1022 if (CI && (FI = CI->getCalledFunction()) &&
1023 FI->getIntrinsicID() == Intrinsic::powi) {
1025 Value *A1I = CI->getArgOperand(1),
1026 *A1J = cast<CallInst>(J)->getArgOperand(1);
1027 const SCEV *A1ISCEV = SE->getSCEV(A1I),
1028 *A1JSCEV = SE->getSCEV(A1J);
1029 return (A1ISCEV == A1JSCEV);
1035 // Figure out whether or not J uses I and update the users and write-set
1036 // structures associated with I. Specifically, Users represents the set of
1037 // instructions that depend on I. WriteSet represents the set
1038 // of memory locations that are dependent on I. If UpdateUsers is true,
1039 // and J uses I, then Users is updated to contain J and WriteSet is updated
1040 // to contain any memory locations to which J writes. The function returns
1041 // true if J uses I. By default, alias analysis is used to determine
1042 // whether J reads from memory that overlaps with a location in WriteSet.
1043 // If LoadMoveSet is not null, then it is a previously-computed multimap
1044 // where the key is the memory-based user instruction and the value is
1045 // the instruction to be compared with I. So, if LoadMoveSet is provided,
1046 // then the alias analysis is not used. This is necessary because this
1047 // function is called during the process of moving instructions during
1048 // vectorization and the results of the alias analysis are not stable during
1050 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
1051 AliasSetTracker &WriteSet, Instruction *I,
1052 Instruction *J, bool UpdateUsers,
1053 std::multimap<Value *, Value *> *LoadMoveSet) {
1056 // This instruction may already be marked as a user due, for example, to
1057 // being a member of a selected pair.
1062 for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
1065 if (I == V || Users.count(V)) {
1070 if (!UsesI && J->mayReadFromMemory()) {
1072 VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
1073 UsesI = isSecondInIteratorPair<Value*>(I, JPairRange);
1075 for (AliasSetTracker::iterator W = WriteSet.begin(),
1076 WE = WriteSet.end(); W != WE; ++W) {
1077 if (W->aliasesUnknownInst(J, *AA)) {
1085 if (UsesI && UpdateUsers) {
1086 if (J->mayWriteToMemory()) WriteSet.add(J);
1093 // This function iterates over all instruction pairs in the provided
1094 // basic block and collects all candidate pairs for vectorization.
1095 bool BBVectorize::getCandidatePairs(BasicBlock &BB,
1096 BasicBlock::iterator &Start,
1097 std::multimap<Value *, Value *> &CandidatePairs,
1098 DenseSet<ValuePair> &FixedOrderPairs,
1099 DenseMap<ValuePair, int> &CandidatePairCostSavings,
1100 std::vector<Value *> &PairableInsts, bool NonPow2Len) {
1101 BasicBlock::iterator E = BB.end();
1102 if (Start == E) return false;
1104 bool ShouldContinue = false, IAfterStart = false;
1105 for (BasicBlock::iterator I = Start++; I != E; ++I) {
1106 if (I == Start) IAfterStart = true;
1108 bool IsSimpleLoadStore;
1109 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
1111 // Look for an instruction with which to pair instruction *I...
1112 DenseSet<Value *> Users;
1113 AliasSetTracker WriteSet(*AA);
1114 bool JAfterStart = IAfterStart;
1115 BasicBlock::iterator J = llvm::next(I);
1116 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
1117 if (J == Start) JAfterStart = true;
1119 // Determine if J uses I, if so, exit the loop.
1120 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
1121 if (Config.FastDep) {
1122 // Note: For this heuristic to be effective, independent operations
1123 // must tend to be intermixed. This is likely to be true from some
1124 // kinds of grouped loop unrolling (but not the generic LLVM pass),
1125 // but otherwise may require some kind of reordering pass.
1127 // When using fast dependency analysis,
1128 // stop searching after first use:
1131 if (UsesI) continue;
1134 // J does not use I, and comes before the first use of I, so it can be
1135 // merged with I if the instructions are compatible.
1136 int CostSavings, FixedOrder;
1137 if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
1138 CostSavings, FixedOrder)) continue;
1140 // J is a candidate for merging with I.
1141 if (!PairableInsts.size() ||
1142 PairableInsts[PairableInsts.size()-1] != I) {
1143 PairableInsts.push_back(I);
1146 CandidatePairs.insert(ValuePair(I, J));
1148 CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
1151 if (FixedOrder == 1)
1152 FixedOrderPairs.insert(ValuePair(I, J));
1153 else if (FixedOrder == -1)
1154 FixedOrderPairs.insert(ValuePair(J, I));
1156 // The next call to this function must start after the last instruction
1157 // selected during this invocation.
1159 Start = llvm::next(J);
1160 IAfterStart = JAfterStart = false;
1163 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
1164 << *I << " <-> " << *J << " (cost savings: " <<
1165 CostSavings << ")\n");
1167 // If we have already found too many pairs, break here and this function
1168 // will be called again starting after the last instruction selected
1169 // during this invocation.
1170 if (PairableInsts.size() >= Config.MaxInsts) {
1171 ShouldContinue = true;
1180 DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
1181 << " instructions with candidate pairs\n");
1183 return ShouldContinue;
1186 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
1187 // it looks for pairs such that both members have an input which is an
1188 // output of PI or PJ.
1189 void BBVectorize::computePairsConnectedTo(
1190 std::multimap<Value *, Value *> &CandidatePairs,
1191 std::vector<Value *> &PairableInsts,
1192 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1193 DenseMap<VPPair, unsigned> &PairConnectionTypes,
1197 // For each possible pairing for this variable, look at the uses of
1198 // the first value...
1199 for (Value::use_iterator I = P.first->use_begin(),
1200 E = P.first->use_end(); I != E; ++I) {
1201 if (isa<LoadInst>(*I)) {
1202 // A pair cannot be connected to a load because the load only takes one
1203 // operand (the address) and it is a scalar even after vectorization.
1205 } else if ((SI = dyn_cast<StoreInst>(*I)) &&
1206 P.first == SI->getPointerOperand()) {
1207 // Similarly, a pair cannot be connected to a store through its
1212 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
1214 // For each use of the first variable, look for uses of the second
1216 for (Value::use_iterator J = P.second->use_begin(),
1217 E2 = P.second->use_end(); J != E2; ++J) {
1218 if ((SJ = dyn_cast<StoreInst>(*J)) &&
1219 P.second == SJ->getPointerOperand())
1222 VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
1225 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
1226 VPPair VP(P, ValuePair(*I, *J));
1227 ConnectedPairs.insert(VP);
1228 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
1232 if (isSecondInIteratorPair<Value*>(*I, JPairRange)) {
1233 VPPair VP(P, ValuePair(*J, *I));
1234 ConnectedPairs.insert(VP);
1235 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
1239 if (Config.SplatBreaksChain) continue;
1240 // Look for cases where just the first value in the pair is used by
1241 // both members of another pair (splatting).
1242 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
1243 if ((SJ = dyn_cast<StoreInst>(*J)) &&
1244 P.first == SJ->getPointerOperand())
1247 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
1248 VPPair VP(P, ValuePair(*I, *J));
1249 ConnectedPairs.insert(VP);
1250 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1255 if (Config.SplatBreaksChain) return;
1256 // Look for cases where just the second value in the pair is used by
1257 // both members of another pair (splatting).
1258 for (Value::use_iterator I = P.second->use_begin(),
1259 E = P.second->use_end(); I != E; ++I) {
1260 if (isa<LoadInst>(*I))
1262 else if ((SI = dyn_cast<StoreInst>(*I)) &&
1263 P.second == SI->getPointerOperand())
1266 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
1268 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
1269 if ((SJ = dyn_cast<StoreInst>(*J)) &&
1270 P.second == SJ->getPointerOperand())
1273 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
1274 VPPair VP(P, ValuePair(*I, *J));
1275 ConnectedPairs.insert(VP);
1276 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1282 // This function figures out which pairs are connected. Two pairs are
1283 // connected if some output of the first pair forms an input to both members
1284 // of the second pair.
1285 void BBVectorize::computeConnectedPairs(
1286 std::multimap<Value *, Value *> &CandidatePairs,
1287 std::vector<Value *> &PairableInsts,
1288 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1289 DenseMap<VPPair, unsigned> &PairConnectionTypes) {
1291 for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
1292 PE = PairableInsts.end(); PI != PE; ++PI) {
1293 VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI);
1295 for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
1296 P != choiceRange.second; ++P)
1297 computePairsConnectedTo(CandidatePairs, PairableInsts,
1298 ConnectedPairs, PairConnectionTypes, *P);
1301 DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
1302 << " pair connections.\n");
1305 // This function builds a set of use tuples such that <A, B> is in the set
1306 // if B is in the use tree of A. If B is in the use tree of A, then B
1307 // depends on the output of A.
1308 void BBVectorize::buildDepMap(
1310 std::multimap<Value *, Value *> &CandidatePairs,
1311 std::vector<Value *> &PairableInsts,
1312 DenseSet<ValuePair> &PairableInstUsers) {
1313 DenseSet<Value *> IsInPair;
1314 for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(),
1315 E = CandidatePairs.end(); C != E; ++C) {
1316 IsInPair.insert(C->first);
1317 IsInPair.insert(C->second);
1320 // Iterate through the basic block, recording all Users of each
1321 // pairable instruction.
1323 BasicBlock::iterator E = BB.end();
1324 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
1325 if (IsInPair.find(I) == IsInPair.end()) continue;
1327 DenseSet<Value *> Users;
1328 AliasSetTracker WriteSet(*AA);
1329 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J)
1330 (void) trackUsesOfI(Users, WriteSet, I, J);
1332 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
1334 PairableInstUsers.insert(ValuePair(I, *U));
1338 // Returns true if an input to pair P is an output of pair Q and also an
1339 // input of pair Q is an output of pair P. If this is the case, then these
1340 // two pairs cannot be simultaneously fused.
1341 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
1342 DenseSet<ValuePair> &PairableInstUsers,
1343 std::multimap<ValuePair, ValuePair> *PairableInstUserMap) {
1344 // Two pairs are in conflict if they are mutual Users of eachother.
1345 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
1346 PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
1347 PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
1348 PairableInstUsers.count(ValuePair(P.second, Q.second));
1349 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
1350 PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
1351 PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
1352 PairableInstUsers.count(ValuePair(Q.second, P.second));
1353 if (PairableInstUserMap) {
1354 // FIXME: The expensive part of the cycle check is not so much the cycle
1355 // check itself but this edge insertion procedure. This needs some
1356 // profiling and probably a different data structure (same is true of
1357 // most uses of std::multimap).
1359 VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q);
1360 if (!isSecondInIteratorPair(P, QPairRange))
1361 PairableInstUserMap->insert(VPPair(Q, P));
1364 VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P);
1365 if (!isSecondInIteratorPair(Q, PPairRange))
1366 PairableInstUserMap->insert(VPPair(P, Q));
1370 return (QUsesP && PUsesQ);
1373 // This function walks the use graph of current pairs to see if, starting
1374 // from P, the walk returns to P.
1375 bool BBVectorize::pairWillFormCycle(ValuePair P,
1376 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
1377 DenseSet<ValuePair> &CurrentPairs) {
1378 DEBUG(if (DebugCycleCheck)
1379 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
1380 << *P.second << "\n");
1381 // A lookup table of visisted pairs is kept because the PairableInstUserMap
1382 // contains non-direct associations.
1383 DenseSet<ValuePair> Visited;
1384 SmallVector<ValuePair, 32> Q;
1385 // General depth-first post-order traversal:
1388 ValuePair QTop = Q.pop_back_val();
1389 Visited.insert(QTop);
1391 DEBUG(if (DebugCycleCheck)
1392 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
1393 << *QTop.second << "\n");
1394 VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop);
1395 for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first;
1396 C != QPairRange.second; ++C) {
1397 if (C->second == P) {
1399 << "BBV: rejected to prevent non-trivial cycle formation: "
1400 << *C->first.first << " <-> " << *C->first.second << "\n");
1404 if (CurrentPairs.count(C->second) && !Visited.count(C->second))
1405 Q.push_back(C->second);
1407 } while (!Q.empty());
1412 // This function builds the initial tree of connected pairs with the
1413 // pair J at the root.
1414 void BBVectorize::buildInitialTreeFor(
1415 std::multimap<Value *, Value *> &CandidatePairs,
1416 std::vector<Value *> &PairableInsts,
1417 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1418 DenseSet<ValuePair> &PairableInstUsers,
1419 DenseMap<Value *, Value *> &ChosenPairs,
1420 DenseMap<ValuePair, size_t> &Tree, ValuePair J) {
1421 // Each of these pairs is viewed as the root node of a Tree. The Tree
1422 // is then walked (depth-first). As this happens, we keep track of
1423 // the pairs that compose the Tree and the maximum depth of the Tree.
1424 SmallVector<ValuePairWithDepth, 32> Q;
1425 // General depth-first post-order traversal:
1426 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1428 ValuePairWithDepth QTop = Q.back();
1430 // Push each child onto the queue:
1431 bool MoreChildren = false;
1432 size_t MaxChildDepth = QTop.second;
1433 VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first);
1434 for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first;
1435 k != qtRange.second; ++k) {
1436 // Make sure that this child pair is still a candidate:
1437 bool IsStillCand = false;
1438 VPIteratorPair checkRange =
1439 CandidatePairs.equal_range(k->second.first);
1440 for (std::multimap<Value *, Value *>::iterator m = checkRange.first;
1441 m != checkRange.second; ++m) {
1442 if (m->second == k->second.second) {
1449 DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second);
1450 if (C == Tree.end()) {
1451 size_t d = getDepthFactor(k->second.first);
1452 Q.push_back(ValuePairWithDepth(k->second, QTop.second+d));
1453 MoreChildren = true;
1455 MaxChildDepth = std::max(MaxChildDepth, C->second);
1460 if (!MoreChildren) {
1461 // Record the current pair as part of the Tree:
1462 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
1465 } while (!Q.empty());
1468 // Given some initial tree, prune it by removing conflicting pairs (pairs
1469 // that cannot be simultaneously chosen for vectorization).
1470 void BBVectorize::pruneTreeFor(
1471 std::multimap<Value *, Value *> &CandidatePairs,
1472 std::vector<Value *> &PairableInsts,
1473 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1474 DenseSet<ValuePair> &PairableInstUsers,
1475 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
1476 DenseMap<Value *, Value *> &ChosenPairs,
1477 DenseMap<ValuePair, size_t> &Tree,
1478 DenseSet<ValuePair> &PrunedTree, ValuePair J,
1479 bool UseCycleCheck) {
1480 SmallVector<ValuePairWithDepth, 32> Q;
1481 // General depth-first post-order traversal:
1482 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1484 ValuePairWithDepth QTop = Q.pop_back_val();
1485 PrunedTree.insert(QTop.first);
1487 // Visit each child, pruning as necessary...
1488 SmallVector<ValuePairWithDepth, 8> BestChildren;
1489 VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
1490 for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
1491 K != QTopRange.second; ++K) {
1492 DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second);
1493 if (C == Tree.end()) continue;
1495 // This child is in the Tree, now we need to make sure it is the
1496 // best of any conflicting children. There could be multiple
1497 // conflicting children, so first, determine if we're keeping
1498 // this child, then delete conflicting children as necessary.
1500 // It is also necessary to guard against pairing-induced
1501 // dependencies. Consider instructions a .. x .. y .. b
1502 // such that (a,b) are to be fused and (x,y) are to be fused
1503 // but a is an input to x and b is an output from y. This
1504 // means that y cannot be moved after b but x must be moved
1505 // after b for (a,b) to be fused. In other words, after
1506 // fusing (a,b) we have y .. a/b .. x where y is an input
1507 // to a/b and x is an output to a/b: x and y can no longer
1508 // be legally fused. To prevent this condition, we must
1509 // make sure that a child pair added to the Tree is not
1510 // both an input and output of an already-selected pair.
1512 // Pairing-induced dependencies can also form from more complicated
1513 // cycles. The pair vs. pair conflicts are easy to check, and so
1514 // that is done explicitly for "fast rejection", and because for
1515 // child vs. child conflicts, we may prefer to keep the current
1516 // pair in preference to the already-selected child.
1517 DenseSet<ValuePair> CurrentPairs;
1520 for (SmallVector<ValuePairWithDepth, 8>::iterator C2
1521 = BestChildren.begin(), E2 = BestChildren.end();
1523 if (C2->first.first == C->first.first ||
1524 C2->first.first == C->first.second ||
1525 C2->first.second == C->first.first ||
1526 C2->first.second == C->first.second ||
1527 pairsConflict(C2->first, C->first, PairableInstUsers,
1528 UseCycleCheck ? &PairableInstUserMap : 0)) {
1529 if (C2->second >= C->second) {
1534 CurrentPairs.insert(C2->first);
1537 if (!CanAdd) continue;
1539 // Even worse, this child could conflict with another node already
1540 // selected for the Tree. If that is the case, ignore this child.
1541 for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(),
1542 E2 = PrunedTree.end(); T != E2; ++T) {
1543 if (T->first == C->first.first ||
1544 T->first == C->first.second ||
1545 T->second == C->first.first ||
1546 T->second == C->first.second ||
1547 pairsConflict(*T, C->first, PairableInstUsers,
1548 UseCycleCheck ? &PairableInstUserMap : 0)) {
1553 CurrentPairs.insert(*T);
1555 if (!CanAdd) continue;
1557 // And check the queue too...
1558 for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
1559 E2 = Q.end(); C2 != E2; ++C2) {
1560 if (C2->first.first == C->first.first ||
1561 C2->first.first == C->first.second ||
1562 C2->first.second == C->first.first ||
1563 C2->first.second == C->first.second ||
1564 pairsConflict(C2->first, C->first, PairableInstUsers,
1565 UseCycleCheck ? &PairableInstUserMap : 0)) {
1570 CurrentPairs.insert(C2->first);
1572 if (!CanAdd) continue;
1574 // Last but not least, check for a conflict with any of the
1575 // already-chosen pairs.
1576 for (DenseMap<Value *, Value *>::iterator C2 =
1577 ChosenPairs.begin(), E2 = ChosenPairs.end();
1579 if (pairsConflict(*C2, C->first, PairableInstUsers,
1580 UseCycleCheck ? &PairableInstUserMap : 0)) {
1585 CurrentPairs.insert(*C2);
1587 if (!CanAdd) continue;
1589 // To check for non-trivial cycles formed by the addition of the
1590 // current pair we've formed a list of all relevant pairs, now use a
1591 // graph walk to check for a cycle. We start from the current pair and
1592 // walk the use tree to see if we again reach the current pair. If we
1593 // do, then the current pair is rejected.
1595 // FIXME: It may be more efficient to use a topological-ordering
1596 // algorithm to improve the cycle check. This should be investigated.
1597 if (UseCycleCheck &&
1598 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1601 // This child can be added, but we may have chosen it in preference
1602 // to an already-selected child. Check for this here, and if a
1603 // conflict is found, then remove the previously-selected child
1604 // before adding this one in its place.
1605 for (SmallVector<ValuePairWithDepth, 8>::iterator C2
1606 = BestChildren.begin(); C2 != BestChildren.end();) {
1607 if (C2->first.first == C->first.first ||
1608 C2->first.first == C->first.second ||
1609 C2->first.second == C->first.first ||
1610 C2->first.second == C->first.second ||
1611 pairsConflict(C2->first, C->first, PairableInstUsers))
1612 C2 = BestChildren.erase(C2);
1617 BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
1620 for (SmallVector<ValuePairWithDepth, 8>::iterator C
1621 = BestChildren.begin(), E2 = BestChildren.end();
1623 size_t DepthF = getDepthFactor(C->first.first);
1624 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1626 } while (!Q.empty());
1629 // This function finds the best tree of mututally-compatible connected
1630 // pairs, given the choice of root pairs as an iterator range.
1631 void BBVectorize::findBestTreeFor(
1632 std::multimap<Value *, Value *> &CandidatePairs,
1633 DenseMap<ValuePair, int> &CandidatePairCostSavings,
1634 std::vector<Value *> &PairableInsts,
1635 DenseSet<ValuePair> &FixedOrderPairs,
1636 DenseMap<VPPair, unsigned> &PairConnectionTypes,
1637 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1638 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
1639 DenseSet<ValuePair> &PairableInstUsers,
1640 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
1641 DenseMap<Value *, Value *> &ChosenPairs,
1642 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
1643 int &BestEffSize, VPIteratorPair ChoiceRange,
1644 bool UseCycleCheck) {
1645 for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
1646 J != ChoiceRange.second; ++J) {
1648 // Before going any further, make sure that this pair does not
1649 // conflict with any already-selected pairs (see comment below
1650 // near the Tree pruning for more details).
1651 DenseSet<ValuePair> ChosenPairSet;
1652 bool DoesConflict = false;
1653 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
1654 E = ChosenPairs.end(); C != E; ++C) {
1655 if (pairsConflict(*C, *J, PairableInstUsers,
1656 UseCycleCheck ? &PairableInstUserMap : 0)) {
1657 DoesConflict = true;
1661 ChosenPairSet.insert(*C);
1663 if (DoesConflict) continue;
1665 if (UseCycleCheck &&
1666 pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet))
1669 DenseMap<ValuePair, size_t> Tree;
1670 buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
1671 PairableInstUsers, ChosenPairs, Tree, *J);
1673 // Because we'll keep the child with the largest depth, the largest
1674 // depth is still the same in the unpruned Tree.
1675 size_t MaxDepth = Tree.lookup(*J);
1677 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {"
1678 << *J->first << " <-> " << *J->second << "} of depth " <<
1679 MaxDepth << " and size " << Tree.size() << "\n");
1681 // At this point the Tree has been constructed, but, may contain
1682 // contradictory children (meaning that different children of
1683 // some tree node may be attempting to fuse the same instruction).
1684 // So now we walk the tree again, in the case of a conflict,
1685 // keep only the child with the largest depth. To break a tie,
1686 // favor the first child.
1688 DenseSet<ValuePair> PrunedTree;
1689 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
1690 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
1691 PrunedTree, *J, UseCycleCheck);
1695 DenseSet<Value *> PrunedTreeInstrs;
1696 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
1697 E = PrunedTree.end(); S != E; ++S) {
1698 PrunedTreeInstrs.insert(S->first);
1699 PrunedTreeInstrs.insert(S->second);
1702 // The set of pairs that have already contributed to the total cost.
1703 DenseSet<ValuePair> IncomingPairs;
1705 // If the cost model were perfect, this might not be necessary; but we
1706 // need to make sure that we don't get stuck vectorizing our own
1708 bool HasNontrivialInsts = false;
1710 // The node weights represent the cost savings associated with
1711 // fusing the pair of instructions.
1712 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
1713 E = PrunedTree.end(); S != E; ++S) {
1714 if (!isa<ShuffleVectorInst>(S->first) &&
1715 !isa<InsertElementInst>(S->first) &&
1716 !isa<ExtractElementInst>(S->first))
1717 HasNontrivialInsts = true;
1719 bool FlipOrder = false;
1721 if (getDepthFactor(S->first)) {
1722 int ESContrib = CandidatePairCostSavings.find(*S)->second;
1723 DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
1724 << *S->first << " <-> " << *S->second << "} = " <<
1726 EffSize += ESContrib;
1729 // The edge weights contribute in a negative sense: they represent
1730 // the cost of shuffles.
1731 VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S);
1732 if (IP.first != ConnectedPairDeps.end()) {
1733 unsigned NumDepsDirect = 0, NumDepsSwap = 0;
1734 for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
1735 Q != IP.second; ++Q) {
1736 if (!PrunedTree.count(Q->second))
1738 DenseMap<VPPair, unsigned>::iterator R =
1739 PairConnectionTypes.find(VPPair(Q->second, Q->first));
1740 assert(R != PairConnectionTypes.end() &&
1741 "Cannot find pair connection type");
1742 if (R->second == PairConnectionDirect)
1744 else if (R->second == PairConnectionSwap)
1748 // If there are more swaps than direct connections, then
1749 // the pair order will be flipped during fusion. So the real
1750 // number of swaps is the minimum number.
1751 FlipOrder = !FixedOrderPairs.count(*S) &&
1752 ((NumDepsSwap > NumDepsDirect) ||
1753 FixedOrderPairs.count(ValuePair(S->second, S->first)));
1755 for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
1756 Q != IP.second; ++Q) {
1757 if (!PrunedTree.count(Q->second))
1759 DenseMap<VPPair, unsigned>::iterator R =
1760 PairConnectionTypes.find(VPPair(Q->second, Q->first));
1761 assert(R != PairConnectionTypes.end() &&
1762 "Cannot find pair connection type");
1763 Type *Ty1 = Q->second.first->getType(),
1764 *Ty2 = Q->second.second->getType();
1765 Type *VTy = getVecTypeForPair(Ty1, Ty2);
1766 if ((R->second == PairConnectionDirect && FlipOrder) ||
1767 (R->second == PairConnectionSwap && !FlipOrder) ||
1768 R->second == PairConnectionSplat) {
1769 int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1771 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1772 *Q->second.first << " <-> " << *Q->second.second <<
1774 *S->first << " <-> " << *S->second << "} = " <<
1776 EffSize -= ESContrib;
1781 // Compute the cost of outgoing edges. We assume that edges outgoing
1782 // to shuffles, inserts or extracts can be merged, and so contribute
1783 // no additional cost.
1784 if (!S->first->getType()->isVoidTy()) {
1785 Type *Ty1 = S->first->getType(),
1786 *Ty2 = S->second->getType();
1787 Type *VTy = getVecTypeForPair(Ty1, Ty2);
1789 bool NeedsExtraction = false;
1790 for (Value::use_iterator I = S->first->use_begin(),
1791 IE = S->first->use_end(); I != IE; ++I) {
1792 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
1793 // Shuffle can be folded if it has no other input
1794 if (isa<UndefValue>(SI->getOperand(1)))
1797 if (isa<ExtractElementInst>(*I))
1799 if (PrunedTreeInstrs.count(*I))
1801 NeedsExtraction = true;
1805 if (NeedsExtraction) {
1807 if (Ty1->isVectorTy())
1808 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1811 ESContrib = (int) VTTI->getVectorInstrCost(
1812 Instruction::ExtractElement, VTy, 0);
1814 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1815 *S->first << "} = " << ESContrib << "\n");
1816 EffSize -= ESContrib;
1819 NeedsExtraction = false;
1820 for (Value::use_iterator I = S->second->use_begin(),
1821 IE = S->second->use_end(); I != IE; ++I) {
1822 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
1823 // Shuffle can be folded if it has no other input
1824 if (isa<UndefValue>(SI->getOperand(1)))
1827 if (isa<ExtractElementInst>(*I))
1829 if (PrunedTreeInstrs.count(*I))
1831 NeedsExtraction = true;
1835 if (NeedsExtraction) {
1837 if (Ty2->isVectorTy())
1838 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1841 ESContrib = (int) VTTI->getVectorInstrCost(
1842 Instruction::ExtractElement, VTy, 1);
1843 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1844 *S->second << "} = " << ESContrib << "\n");
1845 EffSize -= ESContrib;
1849 // Compute the cost of incoming edges.
1850 if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
1851 Instruction *S1 = cast<Instruction>(S->first),
1852 *S2 = cast<Instruction>(S->second);
1853 for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
1854 Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
1856 // Combining constants into vector constants (or small vector
1857 // constants into larger ones are assumed free).
1858 if (isa<Constant>(O1) && isa<Constant>(O2))
1864 ValuePair VP = ValuePair(O1, O2);
1865 ValuePair VPR = ValuePair(O2, O1);
1867 // Internal edges are not handled here.
1868 if (PrunedTree.count(VP) || PrunedTree.count(VPR))
1871 Type *Ty1 = O1->getType(),
1872 *Ty2 = O2->getType();
1873 Type *VTy = getVecTypeForPair(Ty1, Ty2);
1875 // Combining vector operations of the same type is also assumed
1876 // folded with other operations.
1878 // If both are insert elements, then both can be widened.
1879 InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
1880 *IEO2 = dyn_cast<InsertElementInst>(O2);
1881 if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
1883 // If both are extract elements, and both have the same input
1884 // type, then they can be replaced with a shuffle
1885 ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
1886 *EIO2 = dyn_cast<ExtractElementInst>(O2);
1888 EIO1->getOperand(0)->getType() ==
1889 EIO2->getOperand(0)->getType())
1891 // If both are a shuffle with equal operand types and only two
1892 // unqiue operands, then they can be replaced with a single
1894 ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
1895 *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
1897 SIO1->getOperand(0)->getType() ==
1898 SIO2->getOperand(0)->getType()) {
1899 SmallSet<Value *, 4> SIOps;
1900 SIOps.insert(SIO1->getOperand(0));
1901 SIOps.insert(SIO1->getOperand(1));
1902 SIOps.insert(SIO2->getOperand(0));
1903 SIOps.insert(SIO2->getOperand(1));
1904 if (SIOps.size() <= 2)
1910 // This pair has already been formed.
1911 if (IncomingPairs.count(VP)) {
1913 } else if (IncomingPairs.count(VPR)) {
1914 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1916 } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
1917 ESContrib = (int) VTTI->getVectorInstrCost(
1918 Instruction::InsertElement, VTy, 0);
1919 ESContrib += (int) VTTI->getVectorInstrCost(
1920 Instruction::InsertElement, VTy, 1);
1921 } else if (!Ty1->isVectorTy()) {
1922 // O1 needs to be inserted into a vector of size O2, and then
1923 // both need to be shuffled together.
1924 ESContrib = (int) VTTI->getVectorInstrCost(
1925 Instruction::InsertElement, Ty2, 0);
1926 ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
1928 } else if (!Ty2->isVectorTy()) {
1929 // O2 needs to be inserted into a vector of size O1, and then
1930 // both need to be shuffled together.
1931 ESContrib = (int) VTTI->getVectorInstrCost(
1932 Instruction::InsertElement, Ty1, 0);
1933 ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
1936 Type *TyBig = Ty1, *TySmall = Ty2;
1937 if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
1938 std::swap(TyBig, TySmall);
1940 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1942 if (TyBig != TySmall)
1943 ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
1947 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
1948 << *O1 << " <-> " << *O2 << "} = " <<
1950 EffSize -= ESContrib;
1951 IncomingPairs.insert(VP);
1956 if (!HasNontrivialInsts) {
1957 DEBUG(if (DebugPairSelection) dbgs() <<
1958 "\tNo non-trivial instructions in tree;"
1959 " override to zero effective size\n");
1963 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
1964 E = PrunedTree.end(); S != E; ++S)
1965 EffSize += (int) getDepthFactor(S->first);
1968 DEBUG(if (DebugPairSelection)
1969 dbgs() << "BBV: found pruned Tree for pair {"
1970 << *J->first << " <-> " << *J->second << "} of depth " <<
1971 MaxDepth << " and size " << PrunedTree.size() <<
1972 " (effective size: " << EffSize << ")\n");
1973 if (((VTTI && !UseChainDepthWithTI) ||
1974 MaxDepth >= Config.ReqChainDepth) &&
1975 EffSize > 0 && EffSize > BestEffSize) {
1976 BestMaxDepth = MaxDepth;
1977 BestEffSize = EffSize;
1978 BestTree = PrunedTree;
1983 // Given the list of candidate pairs, this function selects those
1984 // that will be fused into vector instructions.
1985 void BBVectorize::choosePairs(
1986 std::multimap<Value *, Value *> &CandidatePairs,
1987 DenseMap<ValuePair, int> &CandidatePairCostSavings,
1988 std::vector<Value *> &PairableInsts,
1989 DenseSet<ValuePair> &FixedOrderPairs,
1990 DenseMap<VPPair, unsigned> &PairConnectionTypes,
1991 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1992 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
1993 DenseSet<ValuePair> &PairableInstUsers,
1994 DenseMap<Value *, Value *>& ChosenPairs) {
1995 bool UseCycleCheck =
1996 CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck;
1997 std::multimap<ValuePair, ValuePair> PairableInstUserMap;
1998 for (std::vector<Value *>::iterator I = PairableInsts.begin(),
1999 E = PairableInsts.end(); I != E; ++I) {
2000 // The number of possible pairings for this variable:
2001 size_t NumChoices = CandidatePairs.count(*I);
2002 if (!NumChoices) continue;
2004 VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
2006 // The best pair to choose and its tree:
2007 size_t BestMaxDepth = 0;
2008 int BestEffSize = 0;
2009 DenseSet<ValuePair> BestTree;
2010 findBestTreeFor(CandidatePairs, CandidatePairCostSavings,
2011 PairableInsts, FixedOrderPairs, PairConnectionTypes,
2012 ConnectedPairs, ConnectedPairDeps,
2013 PairableInstUsers, PairableInstUserMap, ChosenPairs,
2014 BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
2017 // A tree has been chosen (or not) at this point. If no tree was
2018 // chosen, then this instruction, I, cannot be paired (and is no longer
2021 DEBUG(if (BestTree.size() > 0)
2022 dbgs() << "BBV: selected pairs in the best tree for: "
2023 << *cast<Instruction>(*I) << "\n");
2025 for (DenseSet<ValuePair>::iterator S = BestTree.begin(),
2026 SE2 = BestTree.end(); S != SE2; ++S) {
2027 // Insert the members of this tree into the list of chosen pairs.
2028 ChosenPairs.insert(ValuePair(S->first, S->second));
2029 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
2030 *S->second << "\n");
2032 // Remove all candidate pairs that have values in the chosen tree.
2033 for (std::multimap<Value *, Value *>::iterator K =
2034 CandidatePairs.begin(); K != CandidatePairs.end();) {
2035 if (K->first == S->first || K->second == S->first ||
2036 K->second == S->second || K->first == S->second) {
2037 // Don't remove the actual pair chosen so that it can be used
2038 // in subsequent tree selections.
2039 if (!(K->first == S->first && K->second == S->second))
2040 CandidatePairs.erase(K++);
2050 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
2053 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
2058 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
2059 (n > 0 ? "." + utostr(n) : "")).str();
2062 // Returns the value that is to be used as the pointer input to the vector
2063 // instruction that fuses I with J.
2064 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
2065 Instruction *I, Instruction *J, unsigned o) {
2067 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
2068 int64_t OffsetInElmts;
2070 // Note: the analysis might fail here, that is why the pair order has
2071 // been precomputed (OffsetInElmts must be unused here).
2072 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
2073 IAddressSpace, JAddressSpace,
2074 OffsetInElmts, false);
2076 // The pointer value is taken to be the one with the lowest offset.
2079 Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
2080 Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
2081 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2082 Type *VArgPtrType = PointerType::get(VArgType,
2083 cast<PointerType>(IPtr->getType())->getAddressSpace());
2084 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
2085 /* insert before */ I);
2088 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
2089 unsigned MaskOffset, unsigned NumInElem,
2090 unsigned NumInElem1, unsigned IdxOffset,
2091 std::vector<Constant*> &Mask) {
2092 unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements();
2093 for (unsigned v = 0; v < NumElem1; ++v) {
2094 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
2096 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
2098 unsigned mm = m + (int) IdxOffset;
2099 if (m >= (int) NumInElem1)
2100 mm += (int) NumInElem;
2102 Mask[v+MaskOffset] =
2103 ConstantInt::get(Type::getInt32Ty(Context), mm);
2108 // Returns the value that is to be used as the vector-shuffle mask to the
2109 // vector instruction that fuses I with J.
2110 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
2111 Instruction *I, Instruction *J) {
2112 // This is the shuffle mask. We need to append the second
2113 // mask to the first, and the numbers need to be adjusted.
2115 Type *ArgTypeI = I->getType();
2116 Type *ArgTypeJ = J->getType();
2117 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2119 unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements();
2121 // Get the total number of elements in the fused vector type.
2122 // By definition, this must equal the number of elements in
2124 unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
2125 std::vector<Constant*> Mask(NumElem);
2127 Type *OpTypeI = I->getOperand(0)->getType();
2128 unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements();
2129 Type *OpTypeJ = J->getOperand(0)->getType();
2130 unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements();
2132 // The fused vector will be:
2133 // -----------------------------------------------------
2134 // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
2135 // -----------------------------------------------------
2136 // from which we'll extract NumElem total elements (where the first NumElemI
2137 // of them come from the mask in I and the remainder come from the mask
2140 // For the mask from the first pair...
2141 fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
2144 // For the mask from the second pair...
2145 fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
2148 return ConstantVector::get(Mask);
2151 bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
2152 Instruction *J, unsigned o, Value *&LOp,
2154 Type *ArgTypeL, Type *ArgTypeH,
2155 bool IBeforeJ, unsigned IdxOff) {
2156 bool ExpandedIEChain = false;
2157 if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
2158 // If we have a pure insertelement chain, then this can be rewritten
2159 // into a chain that directly builds the larger type.
2160 if (isPureIEChain(LIE)) {
2161 SmallVector<Value *, 8> VectElemts(numElemL,
2162 UndefValue::get(ArgTypeL->getScalarType()));
2163 InsertElementInst *LIENext = LIE;
2166 cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
2167 VectElemts[Idx] = LIENext->getOperand(1);
2169 dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
2172 Value *LIEPrev = UndefValue::get(ArgTypeH);
2173 for (unsigned i = 0; i < numElemL; ++i) {
2174 if (isa<UndefValue>(VectElemts[i])) continue;
2175 LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
2176 ConstantInt::get(Type::getInt32Ty(Context),
2178 getReplacementName(IBeforeJ ? I : J,
2180 LIENext->insertBefore(IBeforeJ ? J : I);
2184 LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
2185 ExpandedIEChain = true;
2189 return ExpandedIEChain;
2192 // Returns the value to be used as the specified operand of the vector
2193 // instruction that fuses I with J.
2194 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
2195 Instruction *J, unsigned o, bool IBeforeJ) {
2196 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2197 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
2199 // Compute the fused vector type for this operand
2200 Type *ArgTypeI = I->getOperand(o)->getType();
2201 Type *ArgTypeJ = J->getOperand(o)->getType();
2202 VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2204 Instruction *L = I, *H = J;
2205 Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
2208 if (ArgTypeL->isVectorTy())
2209 numElemL = cast<VectorType>(ArgTypeL)->getNumElements();
2214 if (ArgTypeH->isVectorTy())
2215 numElemH = cast<VectorType>(ArgTypeH)->getNumElements();
2219 Value *LOp = L->getOperand(o);
2220 Value *HOp = H->getOperand(o);
2221 unsigned numElem = VArgType->getNumElements();
2223 // First, we check if we can reuse the "original" vector outputs (if these
2224 // exist). We might need a shuffle.
2225 ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
2226 ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
2227 ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
2228 ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
2230 // FIXME: If we're fusing shuffle instructions, then we can't apply this
2231 // optimization. The input vectors to the shuffle might be a different
2232 // length from the shuffle outputs. Unfortunately, the replacement
2233 // shuffle mask has already been formed, and the mask entries are sensitive
2234 // to the sizes of the inputs.
2235 bool IsSizeChangeShuffle =
2236 isa<ShuffleVectorInst>(L) &&
2237 (LOp->getType() != L->getType() || HOp->getType() != H->getType());
2239 if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
2240 // We can have at most two unique vector inputs.
2241 bool CanUseInputs = true;
2244 I1 = LEE->getOperand(0);
2246 I1 = LSV->getOperand(0);
2247 I2 = LSV->getOperand(1);
2248 if (I2 == I1 || isa<UndefValue>(I2))
2253 Value *I3 = HEE->getOperand(0);
2254 if (!I2 && I3 != I1)
2256 else if (I3 != I1 && I3 != I2)
2257 CanUseInputs = false;
2259 Value *I3 = HSV->getOperand(0);
2260 if (!I2 && I3 != I1)
2262 else if (I3 != I1 && I3 != I2)
2263 CanUseInputs = false;
2266 Value *I4 = HSV->getOperand(1);
2267 if (!isa<UndefValue>(I4)) {
2268 if (!I2 && I4 != I1)
2270 else if (I4 != I1 && I4 != I2)
2271 CanUseInputs = false;
2278 cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType())
2281 cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType())
2284 // We have one or two input vectors. We need to map each index of the
2285 // operands to the index of the original vector.
2286 SmallVector<std::pair<int, int>, 8> II(numElem);
2287 for (unsigned i = 0; i < numElemL; ++i) {
2291 cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
2292 INum = LEE->getOperand(0) == I1 ? 0 : 1;
2294 Idx = LSV->getMaskValue(i);
2295 if (Idx < (int) LOpElem) {
2296 INum = LSV->getOperand(0) == I1 ? 0 : 1;
2299 INum = LSV->getOperand(1) == I1 ? 0 : 1;
2303 II[i] = std::pair<int, int>(Idx, INum);
2305 for (unsigned i = 0; i < numElemH; ++i) {
2309 cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
2310 INum = HEE->getOperand(0) == I1 ? 0 : 1;
2312 Idx = HSV->getMaskValue(i);
2313 if (Idx < (int) HOpElem) {
2314 INum = HSV->getOperand(0) == I1 ? 0 : 1;
2317 INum = HSV->getOperand(1) == I1 ? 0 : 1;
2321 II[i + numElemL] = std::pair<int, int>(Idx, INum);
2324 // We now have an array which tells us from which index of which
2325 // input vector each element of the operand comes.
2326 VectorType *I1T = cast<VectorType>(I1->getType());
2327 unsigned I1Elem = I1T->getNumElements();
2330 // In this case there is only one underlying vector input. Check for
2331 // the trivial case where we can use the input directly.
2332 if (I1Elem == numElem) {
2333 bool ElemInOrder = true;
2334 for (unsigned i = 0; i < numElem; ++i) {
2335 if (II[i].first != (int) i && II[i].first != -1) {
2336 ElemInOrder = false;
2345 // A shuffle is needed.
2346 std::vector<Constant *> Mask(numElem);
2347 for (unsigned i = 0; i < numElem; ++i) {
2348 int Idx = II[i].first;
2350 Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
2352 Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2356 new ShuffleVectorInst(I1, UndefValue::get(I1T),
2357 ConstantVector::get(Mask),
2358 getReplacementName(IBeforeJ ? I : J,
2360 S->insertBefore(IBeforeJ ? J : I);
2364 VectorType *I2T = cast<VectorType>(I2->getType());
2365 unsigned I2Elem = I2T->getNumElements();
2367 // This input comes from two distinct vectors. The first step is to
2368 // make sure that both vectors are the same length. If not, the
2369 // smaller one will need to grow before they can be shuffled together.
2370 if (I1Elem < I2Elem) {
2371 std::vector<Constant *> Mask(I2Elem);
2373 for (; v < I1Elem; ++v)
2374 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2375 for (; v < I2Elem; ++v)
2376 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2378 Instruction *NewI1 =
2379 new ShuffleVectorInst(I1, UndefValue::get(I1T),
2380 ConstantVector::get(Mask),
2381 getReplacementName(IBeforeJ ? I : J,
2383 NewI1->insertBefore(IBeforeJ ? J : I);
2387 } else if (I1Elem > I2Elem) {
2388 std::vector<Constant *> Mask(I1Elem);
2390 for (; v < I2Elem; ++v)
2391 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2392 for (; v < I1Elem; ++v)
2393 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2395 Instruction *NewI2 =
2396 new ShuffleVectorInst(I2, UndefValue::get(I2T),
2397 ConstantVector::get(Mask),
2398 getReplacementName(IBeforeJ ? I : J,
2400 NewI2->insertBefore(IBeforeJ ? J : I);
2406 // Now that both I1 and I2 are the same length we can shuffle them
2407 // together (and use the result).
2408 std::vector<Constant *> Mask(numElem);
2409 for (unsigned v = 0; v < numElem; ++v) {
2410 if (II[v].first == -1) {
2411 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2413 int Idx = II[v].first + II[v].second * I1Elem;
2414 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2418 Instruction *NewOp =
2419 new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
2420 getReplacementName(IBeforeJ ? I : J, true, o));
2421 NewOp->insertBefore(IBeforeJ ? J : I);
2426 Type *ArgType = ArgTypeL;
2427 if (numElemL < numElemH) {
2428 if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
2429 ArgTypeL, VArgType, IBeforeJ, 1)) {
2430 // This is another short-circuit case: we're combining a scalar into
2431 // a vector that is formed by an IE chain. We've just expanded the IE
2432 // chain, now insert the scalar and we're done.
2434 Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
2435 getReplacementName(IBeforeJ ? I : J, true, o));
2436 S->insertBefore(IBeforeJ ? J : I);
2438 } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
2439 ArgTypeH, IBeforeJ)) {
2440 // The two vector inputs to the shuffle must be the same length,
2441 // so extend the smaller vector to be the same length as the larger one.
2445 std::vector<Constant *> Mask(numElemH);
2447 for (; v < numElemL; ++v)
2448 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2449 for (; v < numElemH; ++v)
2450 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2452 NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
2453 ConstantVector::get(Mask),
2454 getReplacementName(IBeforeJ ? I : J,
2457 NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
2458 getReplacementName(IBeforeJ ? I : J,
2462 NLOp->insertBefore(IBeforeJ ? J : I);
2467 } else if (numElemL > numElemH) {
2468 if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
2469 ArgTypeH, VArgType, IBeforeJ)) {
2471 InsertElementInst::Create(LOp, HOp,
2472 ConstantInt::get(Type::getInt32Ty(Context),
2474 getReplacementName(IBeforeJ ? I : J,
2476 S->insertBefore(IBeforeJ ? J : I);
2478 } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
2479 ArgTypeL, IBeforeJ)) {
2482 std::vector<Constant *> Mask(numElemL);
2484 for (; v < numElemH; ++v)
2485 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2486 for (; v < numElemL; ++v)
2487 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2489 NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
2490 ConstantVector::get(Mask),
2491 getReplacementName(IBeforeJ ? I : J,
2494 NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
2495 getReplacementName(IBeforeJ ? I : J,
2499 NHOp->insertBefore(IBeforeJ ? J : I);
2504 if (ArgType->isVectorTy()) {
2505 unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
2506 std::vector<Constant*> Mask(numElem);
2507 for (unsigned v = 0; v < numElem; ++v) {
2509 // If the low vector was expanded, we need to skip the extra
2510 // undefined entries.
2511 if (v >= numElemL && numElemH > numElemL)
2512 Idx += (numElemH - numElemL);
2513 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2516 Instruction *BV = new ShuffleVectorInst(LOp, HOp,
2517 ConstantVector::get(Mask),
2518 getReplacementName(IBeforeJ ? I : J, true, o));
2519 BV->insertBefore(IBeforeJ ? J : I);
2523 Instruction *BV1 = InsertElementInst::Create(
2524 UndefValue::get(VArgType), LOp, CV0,
2525 getReplacementName(IBeforeJ ? I : J,
2527 BV1->insertBefore(IBeforeJ ? J : I);
2528 Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
2529 getReplacementName(IBeforeJ ? I : J,
2531 BV2->insertBefore(IBeforeJ ? J : I);
2535 // This function creates an array of values that will be used as the inputs
2536 // to the vector instruction that fuses I with J.
2537 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
2538 Instruction *I, Instruction *J,
2539 SmallVector<Value *, 3> &ReplacedOperands,
2541 unsigned NumOperands = I->getNumOperands();
2543 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
2544 // Iterate backward so that we look at the store pointer
2545 // first and know whether or not we need to flip the inputs.
2547 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
2548 // This is the pointer for a load/store instruction.
2549 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
2551 } else if (isa<CallInst>(I)) {
2552 Function *F = cast<CallInst>(I)->getCalledFunction();
2553 unsigned IID = F->getIntrinsicID();
2554 if (o == NumOperands-1) {
2555 BasicBlock &BB = *I->getParent();
2557 Module *M = BB.getParent()->getParent();
2558 Type *ArgTypeI = I->getType();
2559 Type *ArgTypeJ = J->getType();
2560 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2562 ReplacedOperands[o] = Intrinsic::getDeclaration(M,
2563 (Intrinsic::ID) IID, VArgType);
2565 } else if (IID == Intrinsic::powi && o == 1) {
2566 // The second argument of powi is a single integer and we've already
2567 // checked that both arguments are equal. As a result, we just keep
2568 // I's second argument.
2569 ReplacedOperands[o] = I->getOperand(o);
2572 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
2573 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
2577 ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
2581 // This function creates two values that represent the outputs of the
2582 // original I and J instructions. These are generally vector shuffles
2583 // or extracts. In many cases, these will end up being unused and, thus,
2584 // eliminated by later passes.
2585 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
2586 Instruction *J, Instruction *K,
2587 Instruction *&InsertionPt,
2588 Instruction *&K1, Instruction *&K2) {
2589 if (isa<StoreInst>(I)) {
2590 AA->replaceWithNewValue(I, K);
2591 AA->replaceWithNewValue(J, K);
2593 Type *IType = I->getType();
2594 Type *JType = J->getType();
2596 VectorType *VType = getVecTypeForPair(IType, JType);
2597 unsigned numElem = VType->getNumElements();
2599 unsigned numElemI, numElemJ;
2600 if (IType->isVectorTy())
2601 numElemI = cast<VectorType>(IType)->getNumElements();
2605 if (JType->isVectorTy())
2606 numElemJ = cast<VectorType>(JType)->getNumElements();
2610 if (IType->isVectorTy()) {
2611 std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
2612 for (unsigned v = 0; v < numElemI; ++v) {
2613 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2614 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
2617 K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
2618 ConstantVector::get( Mask1),
2619 getReplacementName(K, false, 1));
2621 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2622 K1 = ExtractElementInst::Create(K, CV0,
2623 getReplacementName(K, false, 1));
2626 if (JType->isVectorTy()) {
2627 std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
2628 for (unsigned v = 0; v < numElemJ; ++v) {
2629 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2630 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
2633 K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
2634 ConstantVector::get( Mask2),
2635 getReplacementName(K, false, 2));
2637 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
2638 K2 = ExtractElementInst::Create(K, CV1,
2639 getReplacementName(K, false, 2));
2643 K2->insertAfter(K1);
2648 // Move all uses of the function I (including pairing-induced uses) after J.
2649 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
2650 std::multimap<Value *, Value *> &LoadMoveSet,
2651 Instruction *I, Instruction *J) {
2652 // Skip to the first instruction past I.
2653 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
2655 DenseSet<Value *> Users;
2656 AliasSetTracker WriteSet(*AA);
2657 for (; cast<Instruction>(L) != J; ++L)
2658 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet);
2660 assert(cast<Instruction>(L) == J &&
2661 "Tracking has not proceeded far enough to check for dependencies");
2662 // If J is now in the use set of I, then trackUsesOfI will return true
2663 // and we have a dependency cycle (and the fusing operation must abort).
2664 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet);
2667 // Move all uses of the function I (including pairing-induced uses) after J.
2668 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
2669 std::multimap<Value *, Value *> &LoadMoveSet,
2670 Instruction *&InsertionPt,
2671 Instruction *I, Instruction *J) {
2672 // Skip to the first instruction past I.
2673 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
2675 DenseSet<Value *> Users;
2676 AliasSetTracker WriteSet(*AA);
2677 for (; cast<Instruction>(L) != J;) {
2678 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) {
2679 // Move this instruction
2680 Instruction *InstToMove = L; ++L;
2682 DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
2683 " to after " << *InsertionPt << "\n");
2684 InstToMove->removeFromParent();
2685 InstToMove->insertAfter(InsertionPt);
2686 InsertionPt = InstToMove;
2693 // Collect all load instruction that are in the move set of a given first
2694 // pair member. These loads depend on the first instruction, I, and so need
2695 // to be moved after J (the second instruction) when the pair is fused.
2696 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
2697 DenseMap<Value *, Value *> &ChosenPairs,
2698 std::multimap<Value *, Value *> &LoadMoveSet,
2700 // Skip to the first instruction past I.
2701 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
2703 DenseSet<Value *> Users;
2704 AliasSetTracker WriteSet(*AA);
2706 // Note: We cannot end the loop when we reach J because J could be moved
2707 // farther down the use chain by another instruction pairing. Also, J
2708 // could be before I if this is an inverted input.
2709 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
2710 if (trackUsesOfI(Users, WriteSet, I, L)) {
2711 if (L->mayReadFromMemory())
2712 LoadMoveSet.insert(ValuePair(L, I));
2717 // In cases where both load/stores and the computation of their pointers
2718 // are chosen for vectorization, we can end up in a situation where the
2719 // aliasing analysis starts returning different query results as the
2720 // process of fusing instruction pairs continues. Because the algorithm
2721 // relies on finding the same use trees here as were found earlier, we'll
2722 // need to precompute the necessary aliasing information here and then
2723 // manually update it during the fusion process.
2724 void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
2725 std::vector<Value *> &PairableInsts,
2726 DenseMap<Value *, Value *> &ChosenPairs,
2727 std::multimap<Value *, Value *> &LoadMoveSet) {
2728 for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
2729 PIE = PairableInsts.end(); PI != PIE; ++PI) {
2730 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
2731 if (P == ChosenPairs.end()) continue;
2733 Instruction *I = cast<Instruction>(P->first);
2734 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I);
2738 // When the first instruction in each pair is cloned, it will inherit its
2739 // parent's metadata. This metadata must be combined with that of the other
2740 // instruction in a safe way.
2741 void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
2742 SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
2743 K->getAllMetadataOtherThanDebugLoc(Metadata);
2744 for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
2745 unsigned Kind = Metadata[i].first;
2746 MDNode *JMD = J->getMetadata(Kind);
2747 MDNode *KMD = Metadata[i].second;
2751 K->setMetadata(Kind, 0); // Remove unknown metadata
2753 case LLVMContext::MD_tbaa:
2754 K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
2756 case LLVMContext::MD_fpmath:
2757 K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
2763 // This function fuses the chosen instruction pairs into vector instructions,
2764 // taking care preserve any needed scalar outputs and, then, it reorders the
2765 // remaining instructions as needed (users of the first member of the pair
2766 // need to be moved to after the location of the second member of the pair
2767 // because the vector instruction is inserted in the location of the pair's
2769 void BBVectorize::fuseChosenPairs(BasicBlock &BB,
2770 std::vector<Value *> &PairableInsts,
2771 DenseMap<Value *, Value *> &ChosenPairs,
2772 DenseSet<ValuePair> &FixedOrderPairs,
2773 DenseMap<VPPair, unsigned> &PairConnectionTypes,
2774 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
2775 std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) {
2776 LLVMContext& Context = BB.getContext();
2778 // During the vectorization process, the order of the pairs to be fused
2779 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
2780 // list. After a pair is fused, the flipped pair is removed from the list.
2781 DenseSet<ValuePair> FlippedPairs;
2782 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
2783 E = ChosenPairs.end(); P != E; ++P)
2784 FlippedPairs.insert(ValuePair(P->second, P->first));
2785 for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
2786 E = FlippedPairs.end(); P != E; ++P)
2787 ChosenPairs.insert(*P);
2789 std::multimap<Value *, Value *> LoadMoveSet;
2790 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
2792 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
2794 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
2795 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
2796 if (P == ChosenPairs.end()) {
2801 if (getDepthFactor(P->first) == 0) {
2802 // These instructions are not really fused, but are tracked as though
2803 // they are. Any case in which it would be interesting to fuse them
2804 // will be taken care of by InstCombine.
2810 Instruction *I = cast<Instruction>(P->first),
2811 *J = cast<Instruction>(P->second);
2813 DEBUG(dbgs() << "BBV: fusing: " << *I <<
2814 " <-> " << *J << "\n");
2816 // Remove the pair and flipped pair from the list.
2817 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
2818 assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
2819 ChosenPairs.erase(FP);
2820 ChosenPairs.erase(P);
2822 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) {
2823 DEBUG(dbgs() << "BBV: fusion of: " << *I <<
2825 " aborted because of non-trivial dependency cycle\n");
2831 // If the pair must have the other order, then flip it.
2832 bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
2833 if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
2834 // This pair does not have a fixed order, and so we might want to
2835 // flip it if that will yield fewer shuffles. We count the number
2836 // of dependencies connected via swaps, and those directly connected,
2837 // and flip the order if the number of swaps is greater.
2838 bool OrigOrder = true;
2839 VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J));
2840 if (IP.first == ConnectedPairDeps.end()) {
2841 IP = ConnectedPairDeps.equal_range(ValuePair(J, I));
2845 if (IP.first != ConnectedPairDeps.end()) {
2846 unsigned NumDepsDirect = 0, NumDepsSwap = 0;
2847 for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
2848 Q != IP.second; ++Q) {
2849 DenseMap<VPPair, unsigned>::iterator R =
2850 PairConnectionTypes.find(VPPair(Q->second, Q->first));
2851 assert(R != PairConnectionTypes.end() &&
2852 "Cannot find pair connection type");
2853 if (R->second == PairConnectionDirect)
2855 else if (R->second == PairConnectionSwap)
2860 std::swap(NumDepsDirect, NumDepsSwap);
2862 if (NumDepsSwap > NumDepsDirect) {
2863 FlipPairOrder = true;
2864 DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
2865 " <-> " << *J << "\n");
2870 Instruction *L = I, *H = J;
2874 // If the pair being fused uses the opposite order from that in the pair
2875 // connection map, then we need to flip the types.
2876 VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L));
2877 for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
2878 Q != IP.second; ++Q) {
2879 DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q);
2880 assert(R != PairConnectionTypes.end() &&
2881 "Cannot find pair connection type");
2882 if (R->second == PairConnectionDirect)
2883 R->second = PairConnectionSwap;
2884 else if (R->second == PairConnectionSwap)
2885 R->second = PairConnectionDirect;
2888 bool LBeforeH = !FlipPairOrder;
2889 unsigned NumOperands = I->getNumOperands();
2890 SmallVector<Value *, 3> ReplacedOperands(NumOperands);
2891 getReplacementInputsForPair(Context, L, H, ReplacedOperands,
2894 // Make a copy of the original operation, change its type to the vector
2895 // type and replace its operands with the vector operands.
2896 Instruction *K = L->clone();
2899 else if (H->hasName())
2902 if (!isa<StoreInst>(K))
2903 K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
2905 combineMetadata(K, H);
2906 K->intersectOptionalDataWith(H);
2908 for (unsigned o = 0; o < NumOperands; ++o)
2909 K->setOperand(o, ReplacedOperands[o]);
2913 // Instruction insertion point:
2914 Instruction *InsertionPt = K;
2915 Instruction *K1 = 0, *K2 = 0;
2916 replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
2918 // The use tree of the first original instruction must be moved to after
2919 // the location of the second instruction. The entire use tree of the
2920 // first instruction is disjoint from the input tree of the second
2921 // (by definition), and so commutes with it.
2923 moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
2925 if (!isa<StoreInst>(I)) {
2926 L->replaceAllUsesWith(K1);
2927 H->replaceAllUsesWith(K2);
2928 AA->replaceWithNewValue(L, K1);
2929 AA->replaceWithNewValue(H, K2);
2932 // Instructions that may read from memory may be in the load move set.
2933 // Once an instruction is fused, we no longer need its move set, and so
2934 // the values of the map never need to be updated. However, when a load
2935 // is fused, we need to merge the entries from both instructions in the
2936 // pair in case those instructions were in the move set of some other
2937 // yet-to-be-fused pair. The loads in question are the keys of the map.
2938 if (I->mayReadFromMemory()) {
2939 std::vector<ValuePair> NewSetMembers;
2940 VPIteratorPair IPairRange = LoadMoveSet.equal_range(I);
2941 VPIteratorPair JPairRange = LoadMoveSet.equal_range(J);
2942 for (std::multimap<Value *, Value *>::iterator N = IPairRange.first;
2943 N != IPairRange.second; ++N)
2944 NewSetMembers.push_back(ValuePair(K, N->second));
2945 for (std::multimap<Value *, Value *>::iterator N = JPairRange.first;
2946 N != JPairRange.second; ++N)
2947 NewSetMembers.push_back(ValuePair(K, N->second));
2948 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
2949 AE = NewSetMembers.end(); A != AE; ++A)
2950 LoadMoveSet.insert(*A);
2953 // Before removing I, set the iterator to the next instruction.
2954 PI = llvm::next(BasicBlock::iterator(I));
2955 if (cast<Instruction>(PI) == J)
2960 I->eraseFromParent();
2961 J->eraseFromParent();
2963 DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
2967 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
2971 char BBVectorize::ID = 0;
2972 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
2973 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
2974 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
2975 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
2976 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
2977 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
2979 BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
2980 return new BBVectorize(C);
2984 llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
2985 BBVectorize BBVectorizer(P, C);
2986 return BBVectorizer.vectorizeBB(BB);
2989 //===----------------------------------------------------------------------===//
2990 VectorizeConfig::VectorizeConfig() {
2991 VectorBits = ::VectorBits;
2992 VectorizeBools = !::NoBools;
2993 VectorizeInts = !::NoInts;
2994 VectorizeFloats = !::NoFloats;
2995 VectorizePointers = !::NoPointers;
2996 VectorizeCasts = !::NoCasts;
2997 VectorizeMath = !::NoMath;
2998 VectorizeFMA = !::NoFMA;
2999 VectorizeSelect = !::NoSelect;
3000 VectorizeCmp = !::NoCmp;
3001 VectorizeGEP = !::NoGEP;
3002 VectorizeMemOps = !::NoMemOps;
3003 AlignedOnly = ::AlignedOnly;
3004 ReqChainDepth= ::ReqChainDepth;
3005 SearchLimit = ::SearchLimit;
3006 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
3007 SplatBreaksChain = ::SplatBreaksChain;
3008 MaxInsts = ::MaxInsts;
3009 MaxIter = ::MaxIter;
3010 Pow2LenOnly = ::Pow2LenOnly;
3011 NoMemOpBoost = ::NoMemOpBoost;
3012 FastDep = ::FastDep;