MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
cl::desc("The maximum number of pairing iterations"));
+static cl::opt<unsigned>
+MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
+ cl::desc("The maximum number of pairable instructions per group"));
+
static cl::opt<unsigned>
MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
cl::desc("Only generate aligned loads and stores"));
+static cl::opt<bool>
+NoMemOpBoost("bb-vectorize-no-mem-op-boost",
+ cl::init(false), cl::Hidden,
+ cl::desc("Don't boost the chain-depth contribution of loads and stores"));
+
static cl::opt<bool>
FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
cl::desc("Use a fast instruction dependency analysis"));
bool vectorizePairs(BasicBlock &BB);
- void getCandidatePairs(BasicBlock &BB,
+ bool getCandidatePairs(BasicBlock &BB,
+ BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts);
AliasSetTracker &WriteSet, Instruction *I,
Instruction *J, bool UpdateUsers = true,
std::multimap<Value *, Value *> *LoadMoveSet = 0);
-
+
void computePairsConnectedTo(
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
return 0;
+ // Give a load or store half of the required depth so that load/store
+ // pairs will vectorize.
+ if (!NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
+ return ReqChainDepth/2;
+
return 1;
}
// This function implements one vectorization iteration on the provided
// basic block. It returns true if the block is changed.
bool BBVectorize::vectorizePairs(BasicBlock &BB) {
- std::vector<Value *> PairableInsts;
- std::multimap<Value *, Value *> CandidatePairs;
- getCandidatePairs(BB, CandidatePairs, PairableInsts);
- if (PairableInsts.empty()) return false;
-
- // Now we have a map of all of the pairable instructions and we need to
- // select the best possible pairing. A good pairing is one such that the
- // users of the pair are also paired. This defines a (directed) forest
- // over the pairs such that two pairs are connected iff the second pair
- // uses the first.
-
- // Note that it only matters that both members of the second pair use some
- // element of the first pair (to allow for splatting).
-
- std::multimap<ValuePair, ValuePair> ConnectedPairs;
- computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
- if (ConnectedPairs.empty()) return false;
-
- // Build the pairable-instruction dependency map
- DenseSet<ValuePair> PairableInstUsers;
- buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
-
- // There is now a graph of the connected pairs. For each variable, pick the
- // pairing with the largest tree meeting the depth requirement on at least
- // one branch. Then select all pairings that are part of that tree and
- // remove them from the list of available pairings and pairable variables.
-
- DenseMap<Value *, Value *> ChosenPairs;
- choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
- PairableInstUsers, ChosenPairs);
-
- if (ChosenPairs.empty()) return false;
- NumFusedOps += ChosenPairs.size();
-
+ bool ShouldContinue;
+ BasicBlock::iterator Start = BB.getFirstInsertionPt();
+
+ std::vector<Value *> AllPairableInsts;
+ DenseMap<Value *, Value *> AllChosenPairs;
+
+ do {
+ std::vector<Value *> PairableInsts;
+ std::multimap<Value *, Value *> CandidatePairs;
+ ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
+ PairableInsts);
+ if (PairableInsts.empty()) continue;
+
+ // Now we have a map of all of the pairable instructions and we need to
+ // select the best possible pairing. A good pairing is one such that the
+ // users of the pair are also paired. This defines a (directed) forest
+ // over the pairs such that two pairs are connected iff the second pair
+ // uses the first.
+
+ // Note that it only matters that both members of the second pair use some
+ // element of the first pair (to allow for splatting).
+
+ std::multimap<ValuePair, ValuePair> ConnectedPairs;
+ computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+ if (ConnectedPairs.empty()) continue;
+
+ // Build the pairable-instruction dependency map
+ DenseSet<ValuePair> PairableInstUsers;
+ buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
+
+ // There is now a graph of the connected pairs. For each variable, pick
+ // the pairing with the largest tree meeting the depth requirement on at
+ // least one branch. Then select all pairings that are part of that tree
+ // and remove them from the list of available pairings and pairable
+ // variables.
+
+ DenseMap<Value *, Value *> ChosenPairs;
+ choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairableInstUsers, ChosenPairs);
+
+ if (ChosenPairs.empty()) continue;
+ AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
+ PairableInsts.end());
+ AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
+ } while (ShouldContinue);
+
+ if (AllChosenPairs.empty()) return false;
+ NumFusedOps += AllChosenPairs.size();
+
// A set of pairs has now been selected. It is now necessary to replace the
// paired instructions with vector instructions. For this procedure each
// operand much be replaced with a vector operand. This vector is formed
// by using build_vector on the old operands. The replaced values are then
// replaced with a vector_extract on the result. Subsequent optimization
// passes should coalesce the build/extract combinations.
-
- fuseChosenPairs(BB, PairableInsts, ChosenPairs);
-
+
+ fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
return true;
}
// An aligned load or store is possible only if the instruction
// with the lower offset has an alignment suitable for the
// vector type.
-
+
unsigned BottomAlignment = IAlignment;
if (OffsetInElmts < 0) BottomAlignment = JAlignment;
-
+
Type *VType = getVecTypeForPair(aType);
unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
if (BottomAlignment < VecAlignment)
} else {
for (AliasSetTracker::iterator W = WriteSet.begin(),
WE = WriteSet.end(); W != WE; ++W) {
- for (AliasSet::iterator A = W->begin(), AE = W->end();
- A != AE; ++A) {
- AliasAnalysis::Location ptrLoc(A->getValue(), A->getSize(),
- A->getTBAAInfo());
- if (AA->getModRefInfo(J, ptrLoc) != AliasAnalysis::NoModRef) {
- UsesI = true;
- break;
- }
+ if (W->aliasesUnknownInst(J, *AA)) {
+ UsesI = true;
+ break;
}
- if (UsesI) break;
}
}
}
// This function iterates over all instruction pairs in the provided
// basic block and collects all candidate pairs for vectorization.
- void BBVectorize::getCandidatePairs(BasicBlock &BB,
+ bool BBVectorize::getCandidatePairs(BasicBlock &BB,
+ BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts) {
BasicBlock::iterator E = BB.end();
- for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+ if (Start == E) return false;
+
+ bool ShouldContinue = false, IAfterStart = false;
+ for (BasicBlock::iterator I = Start++; I != E; ++I) {
+ if (I == Start) IAfterStart = true;
+
bool IsSimpleLoadStore;
if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
// Look for an instruction with which to pair instruction *I...
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
- BasicBlock::iterator J = I; ++J;
+ bool JAfterStart = IAfterStart;
+ BasicBlock::iterator J = llvm::next(I);
for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) {
+ if (J == Start) JAfterStart = true;
+
// Determine if J uses I, if so, exit the loop.
bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep);
if (FastDep) {
PairableInsts[PairableInsts.size()-1] != I) {
PairableInsts.push_back(I);
}
+
CandidatePairs.insert(ValuePair(I, J));
+
+ // The next call to this function must start after the last instruction
+ // selected during this invocation.
+ if (JAfterStart) {
+ Start = llvm::next(J);
+ IAfterStart = JAfterStart = false;
+ }
+
DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
<< *I << " <-> " << *J << "\n");
+
+ // If we have already found too many pairs, break here and this function
+ // will be called again starting after the last instruction selected
+ // during this invocation.
+ if (PairableInsts.size() >= MaxInsts) {
+ ShouldContinue = true;
+ break;
+ }
}
+
+ if (ShouldContinue)
+ break;
}
DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
<< " instructions with candidate pairs\n");
+
+ return ShouldContinue;
}
// Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
// A lookup table of visisted pairs is kept because the PairableInstUserMap
// contains non-direct associations.
DenseSet<ValuePair> Visited;
- std::vector<ValuePair> Q;
+ SmallVector<ValuePair, 32> Q;
// General depth-first post-order traversal:
Q.push_back(P);
- while (!Q.empty()) {
- ValuePair QTop = Q.back();
-
+ do {
+ ValuePair QTop = Q.pop_back_val();
Visited.insert(QTop);
- Q.pop_back();
DEBUG(if (DebugCycleCheck)
dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
return true;
}
- if (CurrentPairs.count(C->second) > 0 &&
- Visited.count(C->second) == 0)
+ if (CurrentPairs.count(C->second) && !Visited.count(C->second))
Q.push_back(C->second);
}
- }
+ } while (!Q.empty());
return false;
}
// Each of these pairs is viewed as the root node of a Tree. The Tree
// is then walked (depth-first). As this happens, we keep track of
// the pairs that compose the Tree and the maximum depth of the Tree.
- std::vector<ValuePairWithDepth> Q;
+ SmallVector<ValuePairWithDepth, 32> Q;
// General depth-first post-order traversal:
Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
- while (!Q.empty()) {
+ do {
ValuePairWithDepth QTop = Q.back();
// Push each child onto the queue:
bool MoreChildren = false;
size_t MaxChildDepth = QTop.second;
VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first);
- for (std::map<ValuePair, ValuePair>::iterator k = qtRange.first;
+ for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first;
k != qtRange.second; ++k) {
// Make sure that this child pair is still a candidate:
bool IsStillCand = false;
Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
Q.pop_back();
}
- }
+ } while (!Q.empty());
}
// Given some initial tree, prune it by removing conflicting pairs (pairs
DenseMap<ValuePair, size_t> &Tree,
DenseSet<ValuePair> &PrunedTree, ValuePair J,
bool UseCycleCheck) {
- std::vector<ValuePairWithDepth> Q;
+ SmallVector<ValuePairWithDepth, 32> Q;
// General depth-first post-order traversal:
Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
- while (!Q.empty()) {
- ValuePairWithDepth QTop = Q.back();
+ do {
+ ValuePairWithDepth QTop = Q.pop_back_val();
PrunedTree.insert(QTop.first);
- Q.pop_back();
// Visit each child, pruning as necessary...
DenseMap<ValuePair, size_t> BestChilden;
VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
- for (std::map<ValuePair, ValuePair>::iterator K = QTopRange.first;
+ for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
K != QTopRange.second; ++K) {
DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second);
if (C == Tree.end()) continue;
if (!CanAdd) continue;
// And check the queue too...
- for (std::vector<ValuePairWithDepth>::iterator C2 = Q.begin(),
+ for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
E2 = Q.end(); C2 != E2; ++C2) {
if (C2->first.first == C->first.first ||
C2->first.first == C->first.second ||
}
if (!CanAdd) continue;
- // To check for non-trivial cycles formed by the addition of the
- // current pair we've formed a list of all relevant pairs, now use a
- // graph walk to check for a cycle. We start from the current pair and
- // walk the use tree to see if we again reach the current pair. If we
- // do, then the current pair is rejected.
+ // To check for non-trivial cycles formed by the addition of the
+ // current pair we've formed a list of all relevant pairs, now use a
+ // graph walk to check for a cycle. We start from the current pair and
+ // walk the use tree to see if we again reach the current pair. If we
+ // do, then the current pair is rejected.
// FIXME: It may be more efficient to use a topological-ordering
// algorithm to improve the cycle check. This should be investigated.
size_t DepthF = getDepthFactor(C->first.first);
Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
}
- }
+ } while (!Q.empty());
}
// This function finds the best tree of mututally-compatible connected
if (LEE->getOperand(0) == HEE->getOperand(0)) {
if (LowIndx == 0 && HighIndx == 1)
return LEE->getOperand(0);
-
+
std::vector<Constant*> Mask(2);
Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
std::multimap<Value *, Value *> &LoadMoveSet,
Instruction *I, Instruction *J) {
// Skip to the first instruction past I.
- BasicBlock::iterator L = BB.begin();
- for (; cast<Instruction>(L) != I; ++L);
- ++L;
+ BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
Instruction *&InsertionPt,
Instruction *I, Instruction *J) {
// Skip to the first instruction past I.
- BasicBlock::iterator L = BB.begin();
- for (; cast<Instruction>(L) != I; ++L);
- ++L;
+ BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
std::multimap<Value *, Value *> &LoadMoveSet,
Instruction *I) {
// Skip to the first instruction past I.
- BasicBlock::iterator L = BB.begin();
- for (; cast<Instruction>(L) != I; ++L);
- ++L;
+ BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);