ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
cl::desc("The required chain depth for vectorization"));
+static cl::opt<bool>
+UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
+ cl::Hidden, cl::desc("Use the chain depth requirement with"
+ " target information"));
+
static cl::opt<unsigned>
SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
cl::desc("The maximum search distance for instruction pairs"));
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len);
+ // FIXME: The current implementation does not account for pairs that
+ // are connected in multiple ways. For example:
+ // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
enum PairConnectionType {
PairConnectionDirect,
PairConnectionSwap,
int EffSize = 0;
if (VTTI) {
+ DenseSet<Value *> PrunedTreeInstrs;
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+ E = PrunedTree.end(); S != E; ++S) {
+ PrunedTreeInstrs.insert(S->first);
+ PrunedTreeInstrs.insert(S->second);
+ }
+
+ // The set of pairs that have already contributed to the total cost.
+ DenseSet<ValuePair> IncomingPairs;
+
// The node weights represent the cost savings associated with
// fusing the pair of instructions.
for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
E = PrunedTree.end(); S != E; ++S) {
- if (getDepthFactor(S->first))
- EffSize += CandidatePairCostSavings.find(*S)->second;
+ bool FlipOrder = false;
+
+ if (getDepthFactor(S->first)) {
+ int ESContrib = CandidatePairCostSavings.find(*S)->second;
+ DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
+ << *S->first << " <-> " << *S->second << "} = " <<
+ ESContrib << "\n");
+ EffSize += ESContrib;
+ }
- // The edge weights contribute in a negative sense: they represent
- // the cost of shuffles.
+ // The edge weights contribute in a negative sense: they represent
+ // the cost of shuffles.
VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S);
if (IP.first != ConnectedPairDeps.end()) {
unsigned NumDepsDirect = 0, NumDepsSwap = 0;
for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
Q != IP.second; ++Q) {
+ if (!PrunedTree.count(Q->second))
+ continue;
DenseMap<VPPair, unsigned>::iterator R =
PairConnectionTypes.find(VPPair(Q->second, Q->first));
assert(R != PairConnectionTypes.end() &&
// If there are more swaps than direct connections, then
// the pair order will be flipped during fusion. So the real
// number of swaps is the minimum number.
- bool FlipOrder = !FixedOrderPairs.count(*S) &&
+ FlipOrder = !FixedOrderPairs.count(*S) &&
((NumDepsSwap > NumDepsDirect) ||
FixedOrderPairs.count(ValuePair(S->second, S->first)));
for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
Q != IP.second; ++Q) {
+ if (!PrunedTree.count(Q->second))
+ continue;
DenseMap<VPPair, unsigned>::iterator R =
PairConnectionTypes.find(VPPair(Q->second, Q->first));
assert(R != PairConnectionTypes.end() &&
Type *VTy = getVecTypeForPair(Ty1, Ty2);
if ((R->second == PairConnectionDirect && FlipOrder) ||
(R->second == PairConnectionSwap && !FlipOrder) ||
- R->second == PairConnectionSplat)
- EffSize -= (int) getInstrCost(Instruction::ShuffleVector,
- VTy, VTy);
+ R->second == PairConnectionSplat) {
+ int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, VTy);
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *Q->second.first << " <-> " << *Q->second.second <<
+ "} -> {" <<
+ *S->first << " <-> " << *S->second << "} = " <<
+ ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+ }
+ }
+
+ // Compute the cost of outgoing edges. We assume that edges outgoing
+ // to shuffles, inserts or extracts can be merged, and so contribute
+ // no additional cost.
+ if (!S->first->getType()->isVoidTy()) {
+ Type *Ty1 = S->first->getType(),
+ *Ty2 = S->second->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+ bool NeedsExtraction = false;
+ for (Value::use_iterator I = S->first->use_begin(),
+ IE = S->first->use_end(); I != IE; ++I) {
+ if (isa<ShuffleVectorInst>(*I) ||
+ isa<InsertElementInst>(*I) ||
+ isa<ExtractElementInst>(*I))
+ continue;
+ if (PrunedTreeInstrs.count(*I))
+ continue;
+ NeedsExtraction = true;
+ break;
+ }
+
+ if (NeedsExtraction) {
+ int ESContrib;
+ if (Ty1->isVectorTy())
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ Ty1, VTy);
+ else
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::ExtractElement, VTy, 0);
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *S->first << "} = " << ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+
+ NeedsExtraction = false;
+ for (Value::use_iterator I = S->second->use_begin(),
+ IE = S->second->use_end(); I != IE; ++I) {
+ if (isa<ShuffleVectorInst>(*I) ||
+ isa<InsertElementInst>(*I) ||
+ isa<ExtractElementInst>(*I))
+ continue;
+ if (PrunedTreeInstrs.count(*I))
+ continue;
+ NeedsExtraction = true;
+ break;
+ }
+
+ if (NeedsExtraction) {
+ int ESContrib;
+ if (Ty2->isVectorTy())
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ Ty2, VTy);
+ else
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::ExtractElement, VTy, 1);
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *S->second << "} = " << ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+ }
+
+ // Compute the cost of incoming edges.
+ if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
+ Instruction *S1 = cast<Instruction>(S->first),
+ *S2 = cast<Instruction>(S->second);
+ for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
+ Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
+
+ // Combining constants into vector constants (or small vector
+ // constants into larger ones are assumed free).
+ if (isa<Constant>(O1) && isa<Constant>(O2))
+ continue;
+
+ if (FlipOrder)
+ std::swap(O1, O2);
+
+ ValuePair VP = ValuePair(O1, O2);
+ ValuePair VPR = ValuePair(O2, O1);
+
+ // Internal edges are not handled here.
+ if (PrunedTree.count(VP) || PrunedTree.count(VPR))
+ continue;
+
+ Type *Ty1 = O1->getType(),
+ *Ty2 = O2->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+ // Combining vector operations of the same type is also assumed
+ // folded with other operations.
+ if (Ty1 == Ty2 &&
+ (isa<ShuffleVectorInst>(O1) ||
+ isa<InsertElementInst>(O1) ||
+ isa<InsertElementInst>(O1)) &&
+ (isa<ShuffleVectorInst>(O2) ||
+ isa<InsertElementInst>(O2) ||
+ isa<InsertElementInst>(O2)))
+ continue;
+
+ int ESContrib;
+ // This pair has already been formed.
+ if (IncomingPairs.count(VP)) {
+ continue;
+ } else if (IncomingPairs.count(VPR)) {
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, VTy);
+ } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, VTy, 0);
+ ESContrib += (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, VTy, 1);
+ } else if (!Ty1->isVectorTy()) {
+ // O1 needs to be inserted into a vector of size O2, and then
+ // both need to be shuffled together.
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty2, 0);
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, Ty2);
+ } else if (!Ty2->isVectorTy()) {
+ // O2 needs to be inserted into a vector of size O1, and then
+ // both need to be shuffled together.
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty1, 0);
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, Ty1);
+ } else {
+ Type *TyBig = Ty1, *TySmall = Ty2;
+ if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
+ std::swap(TyBig, TySmall);
+
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, TyBig);
+ if (TyBig != TySmall)
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ TyBig, TySmall);
+ }
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
+ << *O1 << " <-> " << *O2 << "} = " <<
+ ESContrib << "\n");
+ EffSize -= ESContrib;
+ IncomingPairs.insert(VP);
}
}
}
<< *J->first << " <-> " << *J->second << "} of depth " <<
MaxDepth << " and size " << PrunedTree.size() <<
" (effective size: " << EffSize << ")\n");
- if (MaxDepth >= Config.ReqChainDepth &&
+ if (((VTTI && !UseChainDepthWithTI) ||
+ MaxDepth >= Config.ReqChainDepth) &&
EffSize > 0 && EffSize > BestEffSize) {
BestMaxDepth = MaxDepth;
BestEffSize = EffSize;
; Basic depth-3 chain
define double @test1(double %A1, double %A2, double %B1, double %B2) {
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+ %Z1 = fadd double %Y1, %B1
+ %Z2 = fadd double %Y2, %B2
+ %R = fmul double %Z1, %Z2
+ ret double %R
; CHECK: @test1
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic chain
+define double @test1a(double %A1, double %A2, double %B1, double %B2) {
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+ %Z1 = fadd double %Y1, %B1
+ %Z2 = fadd double %Y2, %B2
+ %W1 = fadd double %Y1, %Z1
+ %W2 = fadd double %Y2, %Z2
+ %V1 = fadd double %W1, %Z1
+ %V2 = fadd double %W2, %Z2
+ %Q1 = fadd double %W1, %V1
+ %Q2 = fadd double %W2, %V2
+ %S1 = fadd double %W1, %Q1
+ %S2 = fadd double %W2, %Q2
+ %R = fmul double %S1, %S2
+ ret double %R
+; CHECK: @test1a
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %W1 = fadd <2 x double> %Y1, %Z1
+; CHECK: %V1 = fadd <2 x double> %W1, %Z1
+; CHECK: %Q1 = fadd <2 x double> %W1, %V1
+; CHECK: %S1 = fadd <2 x double> %W1, %Q1
+; CHECK: %S1.v.r1 = extractelement <2 x double> %S1, i32 0
+; CHECK: %S1.v.r2 = extractelement <2 x double> %S1, i32 1
+; CHECK: %R = fmul double %S1.v.r1, %S1.v.r2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair permuted)
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
-; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
%Y1 = fmul double %X1, %A1
%Y2 = fmul double %X2, %A2
-; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
- %Z1 = fadd double %Y1, %B1
- %Z2 = fadd double %Y2, %B2
-; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+ %Z1 = fadd double %Y2, %B1
+ %Z2 = fadd double %Y1, %B2
+ %R = fmul double %Z1, %Z2
+ ret double %R
+; CHECK: @test2
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic depth-4 chain (internal permutation)
+define double @test4(double %A1, double %A2, double %B1, double %B2) {
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+ %Z1 = fadd double %Y2, %B1
+ %Z2 = fadd double %Y1, %B2
+ %W1 = fadd double %Y2, %Z1
+ %W2 = fadd double %Y1, %Z2
%R = fmul double %Z1, %Z2
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
ret double %R
+; CHECK: @test4
+; CHECK-NOT: fmul <2 x double>
; CHECK: ret double %R
}
%Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
%R = mul <8 x i8> %Q1, %Q2
ret <8 x i8> %R
-; CHECK-TI: @test6
-; CHECK-TI-NOT: sub <16 x i8>
-; CHECK-TI: ret <8 x i8>
+; CHECK: @test6
+; CHECK-NOT: sub <16 x i8>
+; CHECK: ret <8 x i8>
}