1 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file implements a basic-block vectorization pass. The algorithm was
11 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
12 // et al. It works by looking for chains of pairable operations and then
15 //===----------------------------------------------------------------------===//
17 #define BBV_NAME "bb-vectorize"
18 #define DEBUG_TYPE BBV_NAME
19 #include "llvm/Constants.h"
20 #include "llvm/DerivedTypes.h"
21 #include "llvm/Function.h"
22 #include "llvm/Instructions.h"
23 #include "llvm/IntrinsicInst.h"
24 #include "llvm/Intrinsics.h"
25 #include "llvm/LLVMContext.h"
26 #include "llvm/Pass.h"
27 #include "llvm/Type.h"
28 #include "llvm/ADT/DenseMap.h"
29 #include "llvm/ADT/DenseSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/ADT/STLExtras.h"
33 #include "llvm/ADT/StringExtras.h"
34 #include "llvm/Analysis/AliasAnalysis.h"
35 #include "llvm/Analysis/AliasSetTracker.h"
36 #include "llvm/Analysis/ScalarEvolution.h"
37 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
38 #include "llvm/Analysis/ValueTracking.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Debug.h"
41 #include "llvm/Support/raw_ostream.h"
42 #include "llvm/Support/ValueHandle.h"
43 #include "llvm/Target/TargetData.h"
44 #include "llvm/Transforms/Vectorize.h"
49 static cl::opt<unsigned>
50 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
51 cl::desc("The required chain depth for vectorization"));
53 static cl::opt<unsigned>
54 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
55 cl::desc("The maximum search distance for instruction pairs"));
58 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
59 cl::desc("Replicating one element to a pair breaks the chain"));
61 static cl::opt<unsigned>
62 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
63 cl::desc("The size of the native vector registers"));
65 static cl::opt<unsigned>
66 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
67 cl::desc("The maximum number of pairing iterations"));
69 static cl::opt<unsigned>
70 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
71 cl::desc("The maximum number of pairable instructions per group"));
73 static cl::opt<unsigned>
74 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
75 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
76 " a full cycle check"));
79 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
80 cl::desc("Don't try to vectorize integer values"));
83 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
84 cl::desc("Don't try to vectorize floating-point values"));
87 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
88 cl::desc("Don't try to vectorize casting (conversion) operations"));
91 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
92 cl::desc("Don't try to vectorize floating-point math intrinsics"));
95 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
96 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
99 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
100 cl::desc("Don't try to vectorize loads and stores"));
103 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
104 cl::desc("Only generate aligned loads and stores"));
107 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
108 cl::desc("Use a fast instruction dependency analysis"));
112 DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
113 cl::init(false), cl::Hidden,
114 cl::desc("When debugging is enabled, output information on the"
115 " instruction-examination process"));
117 DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
118 cl::init(false), cl::Hidden,
119 cl::desc("When debugging is enabled, output information on the"
120 " candidate-selection process"));
122 DebugPairSelection("bb-vectorize-debug-pair-selection",
123 cl::init(false), cl::Hidden,
124 cl::desc("When debugging is enabled, output information on the"
125 " pair-selection process"));
127 DebugCycleCheck("bb-vectorize-debug-cycle-check",
128 cl::init(false), cl::Hidden,
129 cl::desc("When debugging is enabled, output information on the"
130 " cycle-checking process"));
133 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
136 struct BBVectorize : public BasicBlockPass {
137 static char ID; // Pass identification, replacement for typeid
138 BBVectorize() : BasicBlockPass(ID) {
139 initializeBBVectorizePass(*PassRegistry::getPassRegistry());
142 typedef std::pair<Value *, Value *> ValuePair;
143 typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
144 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
145 typedef std::pair<std::multimap<Value *, Value *>::iterator,
146 std::multimap<Value *, Value *>::iterator> VPIteratorPair;
147 typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
148 std::multimap<ValuePair, ValuePair>::iterator>
155 // FIXME: const correct?
157 bool vectorizePairs(BasicBlock &BB);
159 bool getCandidatePairs(BasicBlock &BB,
160 BasicBlock::iterator &Start,
161 std::multimap<Value *, Value *> &CandidatePairs,
162 std::vector<Value *> &PairableInsts);
164 void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
165 std::vector<Value *> &PairableInsts,
166 std::multimap<ValuePair, ValuePair> &ConnectedPairs);
168 void buildDepMap(BasicBlock &BB,
169 std::multimap<Value *, Value *> &CandidatePairs,
170 std::vector<Value *> &PairableInsts,
171 DenseSet<ValuePair> &PairableInstUsers);
173 void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
174 std::vector<Value *> &PairableInsts,
175 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
176 DenseSet<ValuePair> &PairableInstUsers,
177 DenseMap<Value *, Value *>& ChosenPairs);
179 void fuseChosenPairs(BasicBlock &BB,
180 std::vector<Value *> &PairableInsts,
181 DenseMap<Value *, Value *>& ChosenPairs);
183 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
185 bool areInstsCompatible(Instruction *I, Instruction *J,
186 bool IsSimpleLoadStore);
188 bool trackUsesOfI(DenseSet<Value *> &Users,
189 AliasSetTracker &WriteSet, Instruction *I,
190 Instruction *J, bool UpdateUsers = true,
191 std::multimap<Value *, Value *> *LoadMoveSet = 0);
193 void computePairsConnectedTo(
194 std::multimap<Value *, Value *> &CandidatePairs,
195 std::vector<Value *> &PairableInsts,
196 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
199 bool pairsConflict(ValuePair P, ValuePair Q,
200 DenseSet<ValuePair> &PairableInstUsers,
201 std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0);
203 bool pairWillFormCycle(ValuePair P,
204 std::multimap<ValuePair, ValuePair> &PairableInstUsers,
205 DenseSet<ValuePair> &CurrentPairs);
208 std::multimap<Value *, Value *> &CandidatePairs,
209 std::vector<Value *> &PairableInsts,
210 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
211 DenseSet<ValuePair> &PairableInstUsers,
212 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
213 DenseMap<Value *, Value *> &ChosenPairs,
214 DenseMap<ValuePair, size_t> &Tree,
215 DenseSet<ValuePair> &PrunedTree, ValuePair J,
218 void buildInitialTreeFor(
219 std::multimap<Value *, Value *> &CandidatePairs,
220 std::vector<Value *> &PairableInsts,
221 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
222 DenseSet<ValuePair> &PairableInstUsers,
223 DenseMap<Value *, Value *> &ChosenPairs,
224 DenseMap<ValuePair, size_t> &Tree, ValuePair J);
226 void findBestTreeFor(
227 std::multimap<Value *, Value *> &CandidatePairs,
228 std::vector<Value *> &PairableInsts,
229 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
230 DenseSet<ValuePair> &PairableInstUsers,
231 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
232 DenseMap<Value *, Value *> &ChosenPairs,
233 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
234 size_t &BestEffSize, VPIteratorPair ChoiceRange,
237 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
238 Instruction *J, unsigned o, bool &FlipMemInputs);
240 void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
241 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
242 unsigned IdxOffset, std::vector<Constant*> &Mask);
244 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
247 Value *getReplacementInput(LLVMContext& Context, Instruction *I,
248 Instruction *J, unsigned o, bool FlipMemInputs);
250 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
251 Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
252 bool &FlipMemInputs);
254 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
255 Instruction *J, Instruction *K,
256 Instruction *&InsertionPt, Instruction *&K1,
257 Instruction *&K2, bool &FlipMemInputs);
259 void collectPairLoadMoveSet(BasicBlock &BB,
260 DenseMap<Value *, Value *> &ChosenPairs,
261 std::multimap<Value *, Value *> &LoadMoveSet,
264 void collectLoadMoveSet(BasicBlock &BB,
265 std::vector<Value *> &PairableInsts,
266 DenseMap<Value *, Value *> &ChosenPairs,
267 std::multimap<Value *, Value *> &LoadMoveSet);
269 bool canMoveUsesOfIAfterJ(BasicBlock &BB,
270 std::multimap<Value *, Value *> &LoadMoveSet,
271 Instruction *I, Instruction *J);
273 void moveUsesOfIAfterJ(BasicBlock &BB,
274 std::multimap<Value *, Value *> &LoadMoveSet,
275 Instruction *&InsertionPt,
276 Instruction *I, Instruction *J);
278 virtual bool runOnBasicBlock(BasicBlock &BB) {
279 AA = &getAnalysis<AliasAnalysis>();
280 SE = &getAnalysis<ScalarEvolution>();
281 TD = getAnalysisIfAvailable<TargetData>();
283 bool changed = false;
284 // Iterate a sufficient number of times to merge types of size 1 bit,
285 // then 2 bits, then 4, etc. up to half of the target vector width of the
286 // target vector register.
287 for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter);
289 DEBUG(dbgs() << "BBV: fusing loop #" << n <<
290 " for " << BB.getName() << " in " <<
291 BB.getParent()->getName() << "...\n");
292 if (vectorizePairs(BB))
298 DEBUG(dbgs() << "BBV: done!\n");
302 virtual void getAnalysisUsage(AnalysisUsage &AU) const {
303 BasicBlockPass::getAnalysisUsage(AU);
304 AU.addRequired<AliasAnalysis>();
305 AU.addRequired<ScalarEvolution>();
306 AU.addPreserved<AliasAnalysis>();
307 AU.addPreserved<ScalarEvolution>();
308 AU.setPreservesCFG();
311 // This returns the vector type that holds a pair of the provided type.
312 // If the provided type is already a vector, then its length is doubled.
313 static inline VectorType *getVecTypeForPair(Type *ElemTy) {
314 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
315 unsigned numElem = VTy->getNumElements();
316 return VectorType::get(ElemTy->getScalarType(), numElem*2);
319 return VectorType::get(ElemTy, 2);
322 // Returns the weight associated with the provided value. A chain of
323 // candidate pairs has a length given by the sum of the weights of its
324 // members (one weight per pair; the weight of each member of the pair
325 // is assumed to be the same). This length is then compared to the
326 // chain-length threshold to determine if a given chain is significant
327 // enough to be vectorized. The length is also used in comparing
328 // candidate chains where longer chains are considered to be better.
329 // Note: when this function returns 0, the resulting instructions are
330 // not actually fused.
331 static inline size_t getDepthFactor(Value *V) {
332 // InsertElement and ExtractElement have a depth factor of zero. This is
333 // for two reasons: First, they cannot be usefully fused. Second, because
334 // the pass generates a lot of these, they can confuse the simple metric
335 // used to compare the trees in the next iteration. Thus, giving them a
336 // weight of zero allows the pass to essentially ignore them in
337 // subsequent iterations when looking for vectorization opportunities
338 // while still tracking dependency chains that flow through those
340 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
346 // This determines the relative offset of two loads or stores, returning
347 // true if the offset could be determined to be some constant value.
348 // For example, if OffsetInElmts == 1, then J accesses the memory directly
349 // after I; if OffsetInElmts == -1 then I accesses the memory
350 // directly after J. This function assumes that both instructions
351 // have the same type.
352 bool getPairPtrInfo(Instruction *I, Instruction *J,
353 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
354 int64_t &OffsetInElmts) {
356 if (isa<LoadInst>(I)) {
357 IPtr = cast<LoadInst>(I)->getPointerOperand();
358 JPtr = cast<LoadInst>(J)->getPointerOperand();
359 IAlignment = cast<LoadInst>(I)->getAlignment();
360 JAlignment = cast<LoadInst>(J)->getAlignment();
362 IPtr = cast<StoreInst>(I)->getPointerOperand();
363 JPtr = cast<StoreInst>(J)->getPointerOperand();
364 IAlignment = cast<StoreInst>(I)->getAlignment();
365 JAlignment = cast<StoreInst>(J)->getAlignment();
368 const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
369 const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
371 // If this is a trivial offset, then we'll get something like
372 // 1*sizeof(type). With target data, which we need anyway, this will get
373 // constant folded into a number.
374 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
375 if (const SCEVConstant *ConstOffSCEV =
376 dyn_cast<SCEVConstant>(OffsetSCEV)) {
377 ConstantInt *IntOff = ConstOffSCEV->getValue();
378 int64_t Offset = IntOff->getSExtValue();
380 Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
381 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
383 assert(VTy == cast<PointerType>(JPtr->getType())->getElementType());
385 OffsetInElmts = Offset/VTyTSS;
386 return (abs64(Offset) % VTyTSS) == 0;
392 // Returns true if the provided CallInst represents an intrinsic that can
394 bool isVectorizableIntrinsic(CallInst* I) {
395 Function *F = I->getCalledFunction();
396 if (!F) return false;
398 unsigned IID = F->getIntrinsicID();
399 if (!IID) return false;
404 case Intrinsic::sqrt:
405 case Intrinsic::powi:
409 case Intrinsic::log2:
410 case Intrinsic::log10:
412 case Intrinsic::exp2:
420 // Returns true if J is the second element in some pair referenced by
421 // some multimap pair iterator pair.
422 template <typename V>
423 bool isSecondInIteratorPair(V J, std::pair<
424 typename std::multimap<V, V>::iterator,
425 typename std::multimap<V, V>::iterator> PairRange) {
426 for (typename std::multimap<V, V>::iterator K = PairRange.first;
427 K != PairRange.second; ++K)
428 if (K->second == J) return true;
434 // This function implements one vectorization iteration on the provided
435 // basic block. It returns true if the block is changed.
436 bool BBVectorize::vectorizePairs(BasicBlock &BB) {
438 BasicBlock::iterator Start = BB.getFirstInsertionPt();
440 std::vector<Value *> AllPairableInsts;
441 DenseMap<Value *, Value *> AllChosenPairs;
444 std::vector<Value *> PairableInsts;
445 std::multimap<Value *, Value *> CandidatePairs;
446 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
448 if (PairableInsts.empty()) continue;
450 // Now we have a map of all of the pairable instructions and we need to
451 // select the best possible pairing. A good pairing is one such that the
452 // users of the pair are also paired. This defines a (directed) forest
453 // over the pairs such that two pairs are connected iff the second pair
456 // Note that it only matters that both members of the second pair use some
457 // element of the first pair (to allow for splatting).
459 std::multimap<ValuePair, ValuePair> ConnectedPairs;
460 computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
461 if (ConnectedPairs.empty()) continue;
463 // Build the pairable-instruction dependency map
464 DenseSet<ValuePair> PairableInstUsers;
465 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
467 // There is now a graph of the connected pairs. For each variable, pick
468 // the pairing with the largest tree meeting the depth requirement on at
469 // least one branch. Then select all pairings that are part of that tree
470 // and remove them from the list of available pairings and pairable
473 DenseMap<Value *, Value *> ChosenPairs;
474 choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
475 PairableInstUsers, ChosenPairs);
477 if (ChosenPairs.empty()) continue;
478 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
479 PairableInsts.end());
480 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
481 } while (ShouldContinue);
483 if (AllChosenPairs.empty()) return false;
484 NumFusedOps += AllChosenPairs.size();
486 // A set of pairs has now been selected. It is now necessary to replace the
487 // paired instructions with vector instructions. For this procedure each
488 // operand much be replaced with a vector operand. This vector is formed
489 // by using build_vector on the old operands. The replaced values are then
490 // replaced with a vector_extract on the result. Subsequent optimization
491 // passes should coalesce the build/extract combinations.
493 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
497 // This function returns true if the provided instruction is capable of being
498 // fused into a vector instruction. This determination is based only on the
499 // type and other attributes of the instruction.
500 bool BBVectorize::isInstVectorizable(Instruction *I,
501 bool &IsSimpleLoadStore) {
502 IsSimpleLoadStore = false;
504 if (CallInst *C = dyn_cast<CallInst>(I)) {
505 if (!isVectorizableIntrinsic(C))
507 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
508 // Vectorize simple loads if possbile:
509 IsSimpleLoadStore = L->isSimple();
510 if (!IsSimpleLoadStore || NoMemOps)
512 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
513 // Vectorize simple stores if possbile:
514 IsSimpleLoadStore = S->isSimple();
515 if (!IsSimpleLoadStore || NoMemOps)
517 } else if (CastInst *C = dyn_cast<CastInst>(I)) {
518 // We can vectorize casts, but not casts of pointer types, etc.
522 Type *SrcTy = C->getSrcTy();
523 if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy())
526 Type *DestTy = C->getDestTy();
527 if (!DestTy->isSingleValueType() || DestTy->isPointerTy())
529 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
530 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
534 // We can't vectorize memory operations without target data
535 if (TD == 0 && IsSimpleLoadStore)
539 if (isa<StoreInst>(I)) {
540 // For stores, it is the value type, not the pointer type that matters
541 // because the value is what will come from a vector register.
543 Value *IVal = cast<StoreInst>(I)->getValueOperand();
544 T1 = IVal->getType();
550 T2 = cast<CastInst>(I)->getSrcTy();
554 // Not every type can be vectorized...
555 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
556 !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
559 if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
562 if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
565 if (T1->getPrimitiveSizeInBits() > VectorBits/2 ||
566 T2->getPrimitiveSizeInBits() > VectorBits/2)
572 // This function returns true if the two provided instructions are compatible
573 // (meaning that they can be fused into a vector instruction). This assumes
574 // that I has already been determined to be vectorizable and that J is not
575 // in the use tree of I.
576 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
577 bool IsSimpleLoadStore) {
578 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
579 " <-> " << *J << "\n");
581 // Loads and stores can be merged if they have different alignments,
582 // but are otherwise the same.
585 if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) {
586 if (I->getType() != J->getType())
589 if (LI->getPointerOperand()->getType() !=
590 LJ->getPointerOperand()->getType() ||
591 LI->isVolatile() != LJ->isVolatile() ||
592 LI->getOrdering() != LJ->getOrdering() ||
593 LI->getSynchScope() != LJ->getSynchScope())
595 } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) {
596 if (SI->getValueOperand()->getType() !=
597 SJ->getValueOperand()->getType() ||
598 SI->getPointerOperand()->getType() !=
599 SJ->getPointerOperand()->getType() ||
600 SI->isVolatile() != SJ->isVolatile() ||
601 SI->getOrdering() != SJ->getOrdering() ||
602 SI->getSynchScope() != SJ->getSynchScope())
604 } else if (!J->isSameOperationAs(I)) {
607 // FIXME: handle addsub-type operations!
609 if (IsSimpleLoadStore) {
611 unsigned IAlignment, JAlignment;
612 int64_t OffsetInElmts = 0;
613 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
614 OffsetInElmts) && abs64(OffsetInElmts) == 1) {
616 Type *aType = isa<StoreInst>(I) ?
617 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
618 // An aligned load or store is possible only if the instruction
619 // with the lower offset has an alignment suitable for the
622 unsigned BottomAlignment = IAlignment;
623 if (OffsetInElmts < 0) BottomAlignment = JAlignment;
625 Type *VType = getVecTypeForPair(aType);
626 unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
627 if (BottomAlignment < VecAlignment)
633 } else if (isa<ShuffleVectorInst>(I)) {
634 // Only merge two shuffles if they're both constant
635 return isa<Constant>(I->getOperand(2)) &&
636 isa<Constant>(J->getOperand(2));
637 // FIXME: We may want to vectorize non-constant shuffles also.
643 // Figure out whether or not J uses I and update the users and write-set
644 // structures associated with I. Specifically, Users represents the set of
645 // instructions that depend on I. WriteSet represents the set
646 // of memory locations that are dependent on I. If UpdateUsers is true,
647 // and J uses I, then Users is updated to contain J and WriteSet is updated
648 // to contain any memory locations to which J writes. The function returns
649 // true if J uses I. By default, alias analysis is used to determine
650 // whether J reads from memory that overlaps with a location in WriteSet.
651 // If LoadMoveSet is not null, then it is a previously-computed multimap
652 // where the key is the memory-based user instruction and the value is
653 // the instruction to be compared with I. So, if LoadMoveSet is provided,
654 // then the alias analysis is not used. This is necessary because this
655 // function is called during the process of moving instructions during
656 // vectorization and the results of the alias analysis are not stable during
658 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
659 AliasSetTracker &WriteSet, Instruction *I,
660 Instruction *J, bool UpdateUsers,
661 std::multimap<Value *, Value *> *LoadMoveSet) {
664 // This instruction may already be marked as a user due, for example, to
665 // being a member of a selected pair.
670 for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
673 if (I == V || Users.count(V)) {
678 if (!UsesI && J->mayReadFromMemory()) {
680 VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
681 UsesI = isSecondInIteratorPair<Value*>(I, JPairRange);
683 for (AliasSetTracker::iterator W = WriteSet.begin(),
684 WE = WriteSet.end(); W != WE; ++W) {
685 for (AliasSet::iterator A = W->begin(), AE = W->end();
687 AliasAnalysis::Location ptrLoc(A->getValue(), A->getSize(),
689 if (AA->getModRefInfo(J, ptrLoc) != AliasAnalysis::NoModRef) {
699 if (UsesI && UpdateUsers) {
700 if (J->mayWriteToMemory()) WriteSet.add(J);
707 // This function iterates over all instruction pairs in the provided
708 // basic block and collects all candidate pairs for vectorization.
709 bool BBVectorize::getCandidatePairs(BasicBlock &BB,
710 BasicBlock::iterator &Start,
711 std::multimap<Value *, Value *> &CandidatePairs,
712 std::vector<Value *> &PairableInsts) {
713 BasicBlock::iterator E = BB.end();
714 if (Start == E) return false;
716 bool ShouldContinue = false, IAfterStart = false;
717 for (BasicBlock::iterator I = Start++; I != E; ++I) {
718 if (I == Start) IAfterStart = true;
720 bool IsSimpleLoadStore;
721 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
723 // Look for an instruction with which to pair instruction *I...
724 DenseSet<Value *> Users;
725 AliasSetTracker WriteSet(*AA);
726 bool JAfterStart = IAfterStart;
727 BasicBlock::iterator J = llvm::next(I);
728 for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) {
729 if (J == Start) JAfterStart = true;
731 // Determine if J uses I, if so, exit the loop.
732 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep);
734 // Note: For this heuristic to be effective, independent operations
735 // must tend to be intermixed. This is likely to be true from some
736 // kinds of grouped loop unrolling (but not the generic LLVM pass),
737 // but otherwise may require some kind of reordering pass.
739 // When using fast dependency analysis,
740 // stop searching after first use:
746 // J does not use I, and comes before the first use of I, so it can be
747 // merged with I if the instructions are compatible.
748 if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue;
750 // J is a candidate for merging with I.
751 if (!PairableInsts.size() ||
752 PairableInsts[PairableInsts.size()-1] != I) {
753 PairableInsts.push_back(I);
756 CandidatePairs.insert(ValuePair(I, J));
758 // The next call to this function must start after the last instruction
759 // selected during this invocation.
761 Start = llvm::next(J);
762 IAfterStart = JAfterStart = false;
765 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
766 << *I << " <-> " << *J << "\n");
768 // If we have already found too many pairs, break here and this function
769 // will be called again starting after the last instruction selected
770 // during this invocation.
771 if (PairableInsts.size() >= MaxInsts) {
772 ShouldContinue = true;
781 DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
782 << " instructions with candidate pairs\n");
784 return ShouldContinue;
787 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
788 // it looks for pairs such that both members have an input which is an
789 // output of PI or PJ.
790 void BBVectorize::computePairsConnectedTo(
791 std::multimap<Value *, Value *> &CandidatePairs,
792 std::vector<Value *> &PairableInsts,
793 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
795 // For each possible pairing for this variable, look at the uses of
796 // the first value...
797 for (Value::use_iterator I = P.first->use_begin(),
798 E = P.first->use_end(); I != E; ++I) {
799 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
801 // For each use of the first variable, look for uses of the second
803 for (Value::use_iterator J = P.second->use_begin(),
804 E2 = P.second->use_end(); J != E2; ++J) {
805 VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
808 if (isSecondInIteratorPair<Value*>(*J, IPairRange))
809 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
812 if (isSecondInIteratorPair<Value*>(*I, JPairRange))
813 ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
816 if (SplatBreaksChain) continue;
817 // Look for cases where just the first value in the pair is used by
818 // both members of another pair (splatting).
819 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
820 if (isSecondInIteratorPair<Value*>(*J, IPairRange))
821 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
825 if (SplatBreaksChain) return;
826 // Look for cases where just the second value in the pair is used by
827 // both members of another pair (splatting).
828 for (Value::use_iterator I = P.second->use_begin(),
829 E = P.second->use_end(); I != E; ++I) {
830 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
832 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
833 if (isSecondInIteratorPair<Value*>(*J, IPairRange))
834 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
839 // This function figures out which pairs are connected. Two pairs are
840 // connected if some output of the first pair forms an input to both members
841 // of the second pair.
842 void BBVectorize::computeConnectedPairs(
843 std::multimap<Value *, Value *> &CandidatePairs,
844 std::vector<Value *> &PairableInsts,
845 std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
847 for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
848 PE = PairableInsts.end(); PI != PE; ++PI) {
849 VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI);
851 for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
852 P != choiceRange.second; ++P)
853 computePairsConnectedTo(CandidatePairs, PairableInsts,
857 DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
858 << " pair connections.\n");
861 // This function builds a set of use tuples such that <A, B> is in the set
862 // if B is in the use tree of A. If B is in the use tree of A, then B
863 // depends on the output of A.
864 void BBVectorize::buildDepMap(
866 std::multimap<Value *, Value *> &CandidatePairs,
867 std::vector<Value *> &PairableInsts,
868 DenseSet<ValuePair> &PairableInstUsers) {
869 DenseSet<Value *> IsInPair;
870 for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(),
871 E = CandidatePairs.end(); C != E; ++C) {
872 IsInPair.insert(C->first);
873 IsInPair.insert(C->second);
876 // Iterate through the basic block, recording all Users of each
877 // pairable instruction.
879 BasicBlock::iterator E = BB.end();
880 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
881 if (IsInPair.find(I) == IsInPair.end()) continue;
883 DenseSet<Value *> Users;
884 AliasSetTracker WriteSet(*AA);
885 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J)
886 (void) trackUsesOfI(Users, WriteSet, I, J);
888 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
890 PairableInstUsers.insert(ValuePair(I, *U));
894 // Returns true if an input to pair P is an output of pair Q and also an
895 // input of pair Q is an output of pair P. If this is the case, then these
896 // two pairs cannot be simultaneously fused.
897 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
898 DenseSet<ValuePair> &PairableInstUsers,
899 std::multimap<ValuePair, ValuePair> *PairableInstUserMap) {
900 // Two pairs are in conflict if they are mutual Users of eachother.
901 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
902 PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
903 PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
904 PairableInstUsers.count(ValuePair(P.second, Q.second));
905 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
906 PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
907 PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
908 PairableInstUsers.count(ValuePair(Q.second, P.second));
909 if (PairableInstUserMap) {
910 // FIXME: The expensive part of the cycle check is not so much the cycle
911 // check itself but this edge insertion procedure. This needs some
912 // profiling and probably a different data structure (same is true of
913 // most uses of std::multimap).
915 VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q);
916 if (!isSecondInIteratorPair(P, QPairRange))
917 PairableInstUserMap->insert(VPPair(Q, P));
920 VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P);
921 if (!isSecondInIteratorPair(Q, PPairRange))
922 PairableInstUserMap->insert(VPPair(P, Q));
926 return (QUsesP && PUsesQ);
929 // This function walks the use graph of current pairs to see if, starting
930 // from P, the walk returns to P.
931 bool BBVectorize::pairWillFormCycle(ValuePair P,
932 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
933 DenseSet<ValuePair> &CurrentPairs) {
934 DEBUG(if (DebugCycleCheck)
935 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
936 << *P.second << "\n");
937 // A lookup table of visisted pairs is kept because the PairableInstUserMap
938 // contains non-direct associations.
939 DenseSet<ValuePair> Visited;
940 SmallVector<ValuePair, 32> Q;
941 // General depth-first post-order traversal:
944 ValuePair QTop = Q.pop_back_val();
945 Visited.insert(QTop);
947 DEBUG(if (DebugCycleCheck)
948 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
949 << *QTop.second << "\n");
950 VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop);
951 for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first;
952 C != QPairRange.second; ++C) {
953 if (C->second == P) {
955 << "BBV: rejected to prevent non-trivial cycle formation: "
956 << *C->first.first << " <-> " << *C->first.second << "\n");
960 if (CurrentPairs.count(C->second) > 0 &&
961 Visited.count(C->second) == 0)
962 Q.push_back(C->second);
964 } while (!Q.empty());
969 // This function builds the initial tree of connected pairs with the
970 // pair J at the root.
971 void BBVectorize::buildInitialTreeFor(
972 std::multimap<Value *, Value *> &CandidatePairs,
973 std::vector<Value *> &PairableInsts,
974 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
975 DenseSet<ValuePair> &PairableInstUsers,
976 DenseMap<Value *, Value *> &ChosenPairs,
977 DenseMap<ValuePair, size_t> &Tree, ValuePair J) {
978 // Each of these pairs is viewed as the root node of a Tree. The Tree
979 // is then walked (depth-first). As this happens, we keep track of
980 // the pairs that compose the Tree and the maximum depth of the Tree.
981 SmallVector<ValuePairWithDepth, 32> Q;
982 // General depth-first post-order traversal:
983 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
985 ValuePairWithDepth QTop = Q.back();
987 // Push each child onto the queue:
988 bool MoreChildren = false;
989 size_t MaxChildDepth = QTop.second;
990 VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first);
991 for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first;
992 k != qtRange.second; ++k) {
993 // Make sure that this child pair is still a candidate:
994 bool IsStillCand = false;
995 VPIteratorPair checkRange =
996 CandidatePairs.equal_range(k->second.first);
997 for (std::multimap<Value *, Value *>::iterator m = checkRange.first;
998 m != checkRange.second; ++m) {
999 if (m->second == k->second.second) {
1006 DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second);
1007 if (C == Tree.end()) {
1008 size_t d = getDepthFactor(k->second.first);
1009 Q.push_back(ValuePairWithDepth(k->second, QTop.second+d));
1010 MoreChildren = true;
1012 MaxChildDepth = std::max(MaxChildDepth, C->second);
1017 if (!MoreChildren) {
1018 // Record the current pair as part of the Tree:
1019 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
1022 } while (!Q.empty());
1025 // Given some initial tree, prune it by removing conflicting pairs (pairs
1026 // that cannot be simultaneously chosen for vectorization).
1027 void BBVectorize::pruneTreeFor(
1028 std::multimap<Value *, Value *> &CandidatePairs,
1029 std::vector<Value *> &PairableInsts,
1030 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1031 DenseSet<ValuePair> &PairableInstUsers,
1032 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
1033 DenseMap<Value *, Value *> &ChosenPairs,
1034 DenseMap<ValuePair, size_t> &Tree,
1035 DenseSet<ValuePair> &PrunedTree, ValuePair J,
1036 bool UseCycleCheck) {
1037 SmallVector<ValuePairWithDepth, 32> Q;
1038 // General depth-first post-order traversal:
1039 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1041 ValuePairWithDepth QTop = Q.pop_back_val();
1042 PrunedTree.insert(QTop.first);
1044 // Visit each child, pruning as necessary...
1045 DenseMap<ValuePair, size_t> BestChilden;
1046 VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
1047 for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
1048 K != QTopRange.second; ++K) {
1049 DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second);
1050 if (C == Tree.end()) continue;
1052 // This child is in the Tree, now we need to make sure it is the
1053 // best of any conflicting children. There could be multiple
1054 // conflicting children, so first, determine if we're keeping
1055 // this child, then delete conflicting children as necessary.
1057 // It is also necessary to guard against pairing-induced
1058 // dependencies. Consider instructions a .. x .. y .. b
1059 // such that (a,b) are to be fused and (x,y) are to be fused
1060 // but a is an input to x and b is an output from y. This
1061 // means that y cannot be moved after b but x must be moved
1062 // after b for (a,b) to be fused. In other words, after
1063 // fusing (a,b) we have y .. a/b .. x where y is an input
1064 // to a/b and x is an output to a/b: x and y can no longer
1065 // be legally fused. To prevent this condition, we must
1066 // make sure that a child pair added to the Tree is not
1067 // both an input and output of an already-selected pair.
1069 // Pairing-induced dependencies can also form from more complicated
1070 // cycles. The pair vs. pair conflicts are easy to check, and so
1071 // that is done explicitly for "fast rejection", and because for
1072 // child vs. child conflicts, we may prefer to keep the current
1073 // pair in preference to the already-selected child.
1074 DenseSet<ValuePair> CurrentPairs;
1077 for (DenseMap<ValuePair, size_t>::iterator C2
1078 = BestChilden.begin(), E2 = BestChilden.end();
1080 if (C2->first.first == C->first.first ||
1081 C2->first.first == C->first.second ||
1082 C2->first.second == C->first.first ||
1083 C2->first.second == C->first.second ||
1084 pairsConflict(C2->first, C->first, PairableInstUsers,
1085 UseCycleCheck ? &PairableInstUserMap : 0)) {
1086 if (C2->second >= C->second) {
1091 CurrentPairs.insert(C2->first);
1094 if (!CanAdd) continue;
1096 // Even worse, this child could conflict with another node already
1097 // selected for the Tree. If that is the case, ignore this child.
1098 for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(),
1099 E2 = PrunedTree.end(); T != E2; ++T) {
1100 if (T->first == C->first.first ||
1101 T->first == C->first.second ||
1102 T->second == C->first.first ||
1103 T->second == C->first.second ||
1104 pairsConflict(*T, C->first, PairableInstUsers,
1105 UseCycleCheck ? &PairableInstUserMap : 0)) {
1110 CurrentPairs.insert(*T);
1112 if (!CanAdd) continue;
1114 // And check the queue too...
1115 for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
1116 E2 = Q.end(); C2 != E2; ++C2) {
1117 if (C2->first.first == C->first.first ||
1118 C2->first.first == C->first.second ||
1119 C2->first.second == C->first.first ||
1120 C2->first.second == C->first.second ||
1121 pairsConflict(C2->first, C->first, PairableInstUsers,
1122 UseCycleCheck ? &PairableInstUserMap : 0)) {
1127 CurrentPairs.insert(C2->first);
1129 if (!CanAdd) continue;
1131 // Last but not least, check for a conflict with any of the
1132 // already-chosen pairs.
1133 for (DenseMap<Value *, Value *>::iterator C2 =
1134 ChosenPairs.begin(), E2 = ChosenPairs.end();
1136 if (pairsConflict(*C2, C->first, PairableInstUsers,
1137 UseCycleCheck ? &PairableInstUserMap : 0)) {
1142 CurrentPairs.insert(*C2);
1144 if (!CanAdd) continue;
1146 // To check for non-trivial cycles formed by the addition of the
1147 // current pair we've formed a list of all relevant pairs, now use a
1148 // graph walk to check for a cycle. We start from the current pair and
1149 // walk the use tree to see if we again reach the current pair. If we
1150 // do, then the current pair is rejected.
1152 // FIXME: It may be more efficient to use a topological-ordering
1153 // algorithm to improve the cycle check. This should be investigated.
1154 if (UseCycleCheck &&
1155 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1158 // This child can be added, but we may have chosen it in preference
1159 // to an already-selected child. Check for this here, and if a
1160 // conflict is found, then remove the previously-selected child
1161 // before adding this one in its place.
1162 for (DenseMap<ValuePair, size_t>::iterator C2
1163 = BestChilden.begin(); C2 != BestChilden.end();) {
1164 if (C2->first.first == C->first.first ||
1165 C2->first.first == C->first.second ||
1166 C2->first.second == C->first.first ||
1167 C2->first.second == C->first.second ||
1168 pairsConflict(C2->first, C->first, PairableInstUsers))
1169 BestChilden.erase(C2++);
1174 BestChilden.insert(ValuePairWithDepth(C->first, C->second));
1177 for (DenseMap<ValuePair, size_t>::iterator C
1178 = BestChilden.begin(), E2 = BestChilden.end();
1180 size_t DepthF = getDepthFactor(C->first.first);
1181 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1183 } while (!Q.empty());
1186 // This function finds the best tree of mututally-compatible connected
1187 // pairs, given the choice of root pairs as an iterator range.
1188 void BBVectorize::findBestTreeFor(
1189 std::multimap<Value *, Value *> &CandidatePairs,
1190 std::vector<Value *> &PairableInsts,
1191 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1192 DenseSet<ValuePair> &PairableInstUsers,
1193 std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
1194 DenseMap<Value *, Value *> &ChosenPairs,
1195 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
1196 size_t &BestEffSize, VPIteratorPair ChoiceRange,
1197 bool UseCycleCheck) {
1198 for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
1199 J != ChoiceRange.second; ++J) {
1201 // Before going any further, make sure that this pair does not
1202 // conflict with any already-selected pairs (see comment below
1203 // near the Tree pruning for more details).
1204 DenseSet<ValuePair> ChosenPairSet;
1205 bool DoesConflict = false;
1206 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
1207 E = ChosenPairs.end(); C != E; ++C) {
1208 if (pairsConflict(*C, *J, PairableInstUsers,
1209 UseCycleCheck ? &PairableInstUserMap : 0)) {
1210 DoesConflict = true;
1214 ChosenPairSet.insert(*C);
1216 if (DoesConflict) continue;
1218 if (UseCycleCheck &&
1219 pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet))
1222 DenseMap<ValuePair, size_t> Tree;
1223 buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
1224 PairableInstUsers, ChosenPairs, Tree, *J);
1226 // Because we'll keep the child with the largest depth, the largest
1227 // depth is still the same in the unpruned Tree.
1228 size_t MaxDepth = Tree.lookup(*J);
1230 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {"
1231 << *J->first << " <-> " << *J->second << "} of depth " <<
1232 MaxDepth << " and size " << Tree.size() << "\n");
1234 // At this point the Tree has been constructed, but, may contain
1235 // contradictory children (meaning that different children of
1236 // some tree node may be attempting to fuse the same instruction).
1237 // So now we walk the tree again, in the case of a conflict,
1238 // keep only the child with the largest depth. To break a tie,
1239 // favor the first child.
1241 DenseSet<ValuePair> PrunedTree;
1242 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
1243 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
1244 PrunedTree, *J, UseCycleCheck);
1247 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
1248 E = PrunedTree.end(); S != E; ++S)
1249 EffSize += getDepthFactor(S->first);
1251 DEBUG(if (DebugPairSelection)
1252 dbgs() << "BBV: found pruned Tree for pair {"
1253 << *J->first << " <-> " << *J->second << "} of depth " <<
1254 MaxDepth << " and size " << PrunedTree.size() <<
1255 " (effective size: " << EffSize << ")\n");
1256 if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) {
1257 BestMaxDepth = MaxDepth;
1258 BestEffSize = EffSize;
1259 BestTree = PrunedTree;
1264 // Given the list of candidate pairs, this function selects those
1265 // that will be fused into vector instructions.
1266 void BBVectorize::choosePairs(
1267 std::multimap<Value *, Value *> &CandidatePairs,
1268 std::vector<Value *> &PairableInsts,
1269 std::multimap<ValuePair, ValuePair> &ConnectedPairs,
1270 DenseSet<ValuePair> &PairableInstUsers,
1271 DenseMap<Value *, Value *>& ChosenPairs) {
1272 bool UseCycleCheck = CandidatePairs.size() <= MaxCandPairsForCycleCheck;
1273 std::multimap<ValuePair, ValuePair> PairableInstUserMap;
1274 for (std::vector<Value *>::iterator I = PairableInsts.begin(),
1275 E = PairableInsts.end(); I != E; ++I) {
1276 // The number of possible pairings for this variable:
1277 size_t NumChoices = CandidatePairs.count(*I);
1278 if (!NumChoices) continue;
1280 VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
1282 // The best pair to choose and its tree:
1283 size_t BestMaxDepth = 0, BestEffSize = 0;
1284 DenseSet<ValuePair> BestTree;
1285 findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
1286 PairableInstUsers, PairableInstUserMap, ChosenPairs,
1287 BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
1290 // A tree has been chosen (or not) at this point. If no tree was
1291 // chosen, then this instruction, I, cannot be paired (and is no longer
1294 DEBUG(if (BestTree.size() > 0)
1295 dbgs() << "BBV: selected pairs in the best tree for: "
1296 << *cast<Instruction>(*I) << "\n");
1298 for (DenseSet<ValuePair>::iterator S = BestTree.begin(),
1299 SE2 = BestTree.end(); S != SE2; ++S) {
1300 // Insert the members of this tree into the list of chosen pairs.
1301 ChosenPairs.insert(ValuePair(S->first, S->second));
1302 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
1303 *S->second << "\n");
1305 // Remove all candidate pairs that have values in the chosen tree.
1306 for (std::multimap<Value *, Value *>::iterator K =
1307 CandidatePairs.begin(); K != CandidatePairs.end();) {
1308 if (K->first == S->first || K->second == S->first ||
1309 K->second == S->second || K->first == S->second) {
1310 // Don't remove the actual pair chosen so that it can be used
1311 // in subsequent tree selections.
1312 if (!(K->first == S->first && K->second == S->second))
1313 CandidatePairs.erase(K++);
1323 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
1326 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
1331 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
1332 (n > 0 ? "." + utostr(n) : "")).str();
1335 // Returns the value that is to be used as the pointer input to the vector
1336 // instruction that fuses I with J.
1337 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
1338 Instruction *I, Instruction *J, unsigned o,
1339 bool &FlipMemInputs) {
1341 unsigned IAlignment, JAlignment;
1342 int64_t OffsetInElmts;
1343 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
1346 // The pointer value is taken to be the one with the lowest offset.
1348 if (OffsetInElmts > 0) {
1351 FlipMemInputs = true;
1355 Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType();
1356 Type *VArgType = getVecTypeForPair(ArgType);
1357 Type *VArgPtrType = PointerType::get(VArgType,
1358 cast<PointerType>(IPtr->getType())->getAddressSpace());
1359 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
1360 /* insert before */ FlipMemInputs ? J : I);
1363 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
1364 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
1365 unsigned IdxOffset, std::vector<Constant*> &Mask) {
1366 for (unsigned v = 0; v < NumElem/2; ++v) {
1367 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
1369 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
1371 unsigned mm = m + (int) IdxOffset;
1372 if (m >= (int) NumInElem)
1373 mm += (int) NumInElem;
1375 Mask[v+MaskOffset] =
1376 ConstantInt::get(Type::getInt32Ty(Context), mm);
1381 // Returns the value that is to be used as the vector-shuffle mask to the
1382 // vector instruction that fuses I with J.
1383 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
1384 Instruction *I, Instruction *J) {
1385 // This is the shuffle mask. We need to append the second
1386 // mask to the first, and the numbers need to be adjusted.
1388 Type *ArgType = I->getType();
1389 Type *VArgType = getVecTypeForPair(ArgType);
1391 // Get the total number of elements in the fused vector type.
1392 // By definition, this must equal the number of elements in
1394 unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
1395 std::vector<Constant*> Mask(NumElem);
1397 Type *OpType = I->getOperand(0)->getType();
1398 unsigned NumInElem = cast<VectorType>(OpType)->getNumElements();
1400 // For the mask from the first pair...
1401 fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask);
1403 // For the mask from the second pair...
1404 fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem,
1407 return ConstantVector::get(Mask);
1410 // Returns the value to be used as the specified operand of the vector
1411 // instruction that fuses I with J.
1412 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
1413 Instruction *J, unsigned o, bool FlipMemInputs) {
1414 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
1415 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
1417 // Compute the fused vector type for this operand
1418 Type *ArgType = I->getOperand(o)->getType();
1419 VectorType *VArgType = getVecTypeForPair(ArgType);
1421 Instruction *L = I, *H = J;
1422 if (FlipMemInputs) {
1427 if (ArgType->isVectorTy()) {
1428 unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
1429 std::vector<Constant*> Mask(numElem);
1430 for (unsigned v = 0; v < numElem; ++v)
1431 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
1433 Instruction *BV = new ShuffleVectorInst(L->getOperand(o),
1435 ConstantVector::get(Mask),
1436 getReplacementName(I, true, o));
1437 BV->insertBefore(J);
1441 // If these two inputs are the output of another vector instruction,
1442 // then we should use that output directly. It might be necessary to
1443 // permute it first. [When pairings are fused recursively, you can
1444 // end up with cases where a large vector is decomposed into scalars
1445 // using extractelement instructions, then built into size-2
1446 // vectors using insertelement and the into larger vectors using
1447 // shuffles. InstCombine does not simplify all of these cases well,
1448 // and so we make sure that shuffles are generated here when possible.
1449 ExtractElementInst *LEE
1450 = dyn_cast<ExtractElementInst>(L->getOperand(o));
1451 ExtractElementInst *HEE
1452 = dyn_cast<ExtractElementInst>(H->getOperand(o));
1455 LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) {
1456 VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType());
1457 unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue();
1458 unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue();
1459 if (LEE->getOperand(0) == HEE->getOperand(0)) {
1460 if (LowIndx == 0 && HighIndx == 1)
1461 return LEE->getOperand(0);
1463 std::vector<Constant*> Mask(2);
1464 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
1465 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
1467 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
1468 UndefValue::get(EEType),
1469 ConstantVector::get(Mask),
1470 getReplacementName(I, true, o));
1471 BV->insertBefore(J);
1475 std::vector<Constant*> Mask(2);
1476 HighIndx += EEType->getNumElements();
1477 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
1478 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
1480 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
1482 ConstantVector::get(Mask),
1483 getReplacementName(I, true, o));
1484 BV->insertBefore(J);
1488 Instruction *BV1 = InsertElementInst::Create(
1489 UndefValue::get(VArgType),
1490 L->getOperand(o), CV0,
1491 getReplacementName(I, true, o, 1));
1492 BV1->insertBefore(I);
1493 Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o),
1495 getReplacementName(I, true, o, 2));
1496 BV2->insertBefore(J);
1500 // This function creates an array of values that will be used as the inputs
1501 // to the vector instruction that fuses I with J.
1502 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
1503 Instruction *I, Instruction *J,
1504 SmallVector<Value *, 3> &ReplacedOperands,
1505 bool &FlipMemInputs) {
1506 FlipMemInputs = false;
1507 unsigned NumOperands = I->getNumOperands();
1509 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
1510 // Iterate backward so that we look at the store pointer
1511 // first and know whether or not we need to flip the inputs.
1513 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
1514 // This is the pointer for a load/store instruction.
1515 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o,
1518 } else if (isa<CallInst>(I) && o == NumOperands-1) {
1519 Function *F = cast<CallInst>(I)->getCalledFunction();
1520 unsigned IID = F->getIntrinsicID();
1521 BasicBlock &BB = *I->getParent();
1523 Module *M = BB.getParent()->getParent();
1524 Type *ArgType = I->getType();
1525 Type *VArgType = getVecTypeForPair(ArgType);
1527 // FIXME: is it safe to do this here?
1528 ReplacedOperands[o] = Intrinsic::getDeclaration(M,
1529 (Intrinsic::ID) IID, VArgType);
1531 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
1532 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
1536 ReplacedOperands[o] =
1537 getReplacementInput(Context, I, J, o, FlipMemInputs);
1541 // This function creates two values that represent the outputs of the
1542 // original I and J instructions. These are generally vector shuffles
1543 // or extracts. In many cases, these will end up being unused and, thus,
1544 // eliminated by later passes.
1545 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
1546 Instruction *J, Instruction *K,
1547 Instruction *&InsertionPt,
1548 Instruction *&K1, Instruction *&K2,
1549 bool &FlipMemInputs) {
1550 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
1551 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
1553 if (isa<StoreInst>(I)) {
1554 AA->replaceWithNewValue(I, K);
1555 AA->replaceWithNewValue(J, K);
1557 Type *IType = I->getType();
1558 Type *VType = getVecTypeForPair(IType);
1560 if (IType->isVectorTy()) {
1561 unsigned numElem = cast<VectorType>(IType)->getNumElements();
1562 std::vector<Constant*> Mask1(numElem), Mask2(numElem);
1563 for (unsigned v = 0; v < numElem; ++v) {
1564 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
1565 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v);
1568 K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
1569 ConstantVector::get(
1570 FlipMemInputs ? Mask2 : Mask1),
1571 getReplacementName(K, false, 1));
1572 K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
1573 ConstantVector::get(
1574 FlipMemInputs ? Mask1 : Mask2),
1575 getReplacementName(K, false, 2));
1577 K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
1578 getReplacementName(K, false, 1));
1579 K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
1580 getReplacementName(K, false, 2));
1584 K2->insertAfter(K1);
1589 // Move all uses of the function I (including pairing-induced uses) after J.
1590 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
1591 std::multimap<Value *, Value *> &LoadMoveSet,
1592 Instruction *I, Instruction *J) {
1593 // Skip to the first instruction past I.
1594 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
1596 DenseSet<Value *> Users;
1597 AliasSetTracker WriteSet(*AA);
1598 for (; cast<Instruction>(L) != J; ++L)
1599 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet);
1601 assert(cast<Instruction>(L) == J &&
1602 "Tracking has not proceeded far enough to check for dependencies");
1603 // If J is now in the use set of I, then trackUsesOfI will return true
1604 // and we have a dependency cycle (and the fusing operation must abort).
1605 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet);
1608 // Move all uses of the function I (including pairing-induced uses) after J.
1609 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
1610 std::multimap<Value *, Value *> &LoadMoveSet,
1611 Instruction *&InsertionPt,
1612 Instruction *I, Instruction *J) {
1613 // Skip to the first instruction past I.
1614 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
1616 DenseSet<Value *> Users;
1617 AliasSetTracker WriteSet(*AA);
1618 for (; cast<Instruction>(L) != J;) {
1619 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) {
1620 // Move this instruction
1621 Instruction *InstToMove = L; ++L;
1623 DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
1624 " to after " << *InsertionPt << "\n");
1625 InstToMove->removeFromParent();
1626 InstToMove->insertAfter(InsertionPt);
1627 InsertionPt = InstToMove;
1634 // Collect all load instruction that are in the move set of a given first
1635 // pair member. These loads depend on the first instruction, I, and so need
1636 // to be moved after J (the second instruction) when the pair is fused.
1637 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
1638 DenseMap<Value *, Value *> &ChosenPairs,
1639 std::multimap<Value *, Value *> &LoadMoveSet,
1641 // Skip to the first instruction past I.
1642 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
1644 DenseSet<Value *> Users;
1645 AliasSetTracker WriteSet(*AA);
1647 // Note: We cannot end the loop when we reach J because J could be moved
1648 // farther down the use chain by another instruction pairing. Also, J
1649 // could be before I if this is an inverted input.
1650 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
1651 if (trackUsesOfI(Users, WriteSet, I, L)) {
1652 if (L->mayReadFromMemory())
1653 LoadMoveSet.insert(ValuePair(L, I));
1658 // In cases where both load/stores and the computation of their pointers
1659 // are chosen for vectorization, we can end up in a situation where the
1660 // aliasing analysis starts returning different query results as the
1661 // process of fusing instruction pairs continues. Because the algorithm
1662 // relies on finding the same use trees here as were found earlier, we'll
1663 // need to precompute the necessary aliasing information here and then
1664 // manually update it during the fusion process.
1665 void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
1666 std::vector<Value *> &PairableInsts,
1667 DenseMap<Value *, Value *> &ChosenPairs,
1668 std::multimap<Value *, Value *> &LoadMoveSet) {
1669 for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
1670 PIE = PairableInsts.end(); PI != PIE; ++PI) {
1671 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
1672 if (P == ChosenPairs.end()) continue;
1674 Instruction *I = cast<Instruction>(P->first);
1675 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I);
1679 // This function fuses the chosen instruction pairs into vector instructions,
1680 // taking care preserve any needed scalar outputs and, then, it reorders the
1681 // remaining instructions as needed (users of the first member of the pair
1682 // need to be moved to after the location of the second member of the pair
1683 // because the vector instruction is inserted in the location of the pair's
1685 void BBVectorize::fuseChosenPairs(BasicBlock &BB,
1686 std::vector<Value *> &PairableInsts,
1687 DenseMap<Value *, Value *> &ChosenPairs) {
1688 LLVMContext& Context = BB.getContext();
1690 // During the vectorization process, the order of the pairs to be fused
1691 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
1692 // list. After a pair is fused, the flipped pair is removed from the list.
1693 std::vector<ValuePair> FlippedPairs;
1694 FlippedPairs.reserve(ChosenPairs.size());
1695 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
1696 E = ChosenPairs.end(); P != E; ++P)
1697 FlippedPairs.push_back(ValuePair(P->second, P->first));
1698 for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
1699 E = FlippedPairs.end(); P != E; ++P)
1700 ChosenPairs.insert(*P);
1702 std::multimap<Value *, Value *> LoadMoveSet;
1703 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
1705 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
1707 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
1708 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
1709 if (P == ChosenPairs.end()) {
1714 if (getDepthFactor(P->first) == 0) {
1715 // These instructions are not really fused, but are tracked as though
1716 // they are. Any case in which it would be interesting to fuse them
1717 // will be taken care of by InstCombine.
1723 Instruction *I = cast<Instruction>(P->first),
1724 *J = cast<Instruction>(P->second);
1726 DEBUG(dbgs() << "BBV: fusing: " << *I <<
1727 " <-> " << *J << "\n");
1729 // Remove the pair and flipped pair from the list.
1730 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
1731 assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
1732 ChosenPairs.erase(FP);
1733 ChosenPairs.erase(P);
1735 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) {
1736 DEBUG(dbgs() << "BBV: fusion of: " << *I <<
1738 " aborted because of non-trivial dependency cycle\n");
1745 unsigned NumOperands = I->getNumOperands();
1746 SmallVector<Value *, 3> ReplacedOperands(NumOperands);
1747 getReplacementInputsForPair(Context, I, J, ReplacedOperands,
1750 // Make a copy of the original operation, change its type to the vector
1751 // type and replace its operands with the vector operands.
1752 Instruction *K = I->clone();
1753 if (I->hasName()) K->takeName(I);
1755 if (!isa<StoreInst>(K))
1756 K->mutateType(getVecTypeForPair(I->getType()));
1758 for (unsigned o = 0; o < NumOperands; ++o)
1759 K->setOperand(o, ReplacedOperands[o]);
1761 // If we've flipped the memory inputs, make sure that we take the correct
1763 if (FlipMemInputs) {
1764 if (isa<StoreInst>(K))
1765 cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment());
1767 cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment());
1772 // Instruction insertion point:
1773 Instruction *InsertionPt = K;
1774 Instruction *K1 = 0, *K2 = 0;
1775 replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2,
1778 // The use tree of the first original instruction must be moved to after
1779 // the location of the second instruction. The entire use tree of the
1780 // first instruction is disjoint from the input tree of the second
1781 // (by definition), and so commutes with it.
1783 moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
1785 if (!isa<StoreInst>(I)) {
1786 I->replaceAllUsesWith(K1);
1787 J->replaceAllUsesWith(K2);
1788 AA->replaceWithNewValue(I, K1);
1789 AA->replaceWithNewValue(J, K2);
1792 // Instructions that may read from memory may be in the load move set.
1793 // Once an instruction is fused, we no longer need its move set, and so
1794 // the values of the map never need to be updated. However, when a load
1795 // is fused, we need to merge the entries from both instructions in the
1796 // pair in case those instructions were in the move set of some other
1797 // yet-to-be-fused pair. The loads in question are the keys of the map.
1798 if (I->mayReadFromMemory()) {
1799 std::vector<ValuePair> NewSetMembers;
1800 VPIteratorPair IPairRange = LoadMoveSet.equal_range(I);
1801 VPIteratorPair JPairRange = LoadMoveSet.equal_range(J);
1802 for (std::multimap<Value *, Value *>::iterator N = IPairRange.first;
1803 N != IPairRange.second; ++N)
1804 NewSetMembers.push_back(ValuePair(K, N->second));
1805 for (std::multimap<Value *, Value *>::iterator N = JPairRange.first;
1806 N != JPairRange.second; ++N)
1807 NewSetMembers.push_back(ValuePair(K, N->second));
1808 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
1809 AE = NewSetMembers.end(); A != AE; ++A)
1810 LoadMoveSet.insert(*A);
1813 // Before removing I, set the iterator to the next instruction.
1814 PI = llvm::next(BasicBlock::iterator(I));
1815 if (cast<Instruction>(PI) == J)
1820 I->eraseFromParent();
1821 J->eraseFromParent();
1824 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
1828 char BBVectorize::ID = 0;
1829 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
1830 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
1831 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
1832 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
1833 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
1835 BasicBlockPass *llvm::createBBVectorizePass() {
1836 return new BBVectorize();