"trip count that is smaller than this "
"value."));
+static cl::opt<bool> MaximizeBandwidth(
+ "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+ cl::desc("Maximize bandwidth when selecting vectorization factor which "
+ "will be determined by the smallest type in loop."));
+
/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
/// for (i = 0; i < N; ++i)
/// possible.
VectorizationFactor selectVectorizationFactor(bool OptForSize);
- /// \return The size (in bits) of the widest type in the code that
- /// needs to be vectorized. We ignore values that remain scalar such as
+ /// \return The size (in bits) of the smallest and widest types in the code
+ /// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
- unsigned getWidestType();
+ std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
/// \return The desired interleave count.
/// If interleave count has been specified by metadata it will be returned.
unsigned NumInstructions;
};
- /// \return information about the register usage of the loop.
- RegisterUsage calculateRegisterUsage();
+ /// \return Returns information about the register usages of the loop for the
+ /// given vectorization factors.
+ SmallVector<RegisterUsage, 8>
+ calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
private:
/// Returns the expected execution cost. The unit of the cost does
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
- unsigned WidestType = getWidestType();
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
unsigned MaxSafeDepDist = -1U;
if (Legal->getMaxSafeDepDistBytes() != -1U)
WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
- DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
+
+ DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
+ << WidestType << " bits.\n");
DEBUG(dbgs() << "LV: The Widest register is: "
<< WidestRegister << " bits.\n");
" into one vector!");
unsigned VF = MaxVectorSize;
+ if (MaximizeBandwidth && !OptForSize) {
+ // Collect all viable vectorization factors.
+ SmallVector<unsigned, 8> VFs;
+ unsigned NewMaxVectorSize = WidestRegister / SmallestType;
+ for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
+ VFs.push_back(VS);
+
+ // For each VF calculate its register usage.
+ auto RUs = calculateRegisterUsage(VFs);
+
+ // Select the largest VF which doesn't require more registers than existing
+ // ones.
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
+ for (int i = RUs.size() - 1; i >= 0; --i) {
+ if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
+ VF = VFs[i];
+ break;
+ }
+ }
+ }
// If we optimize the program for size, avoid creating the tail loop.
if (OptForSize) {
return Factor;
}
-unsigned LoopVectorizationCostModel::getWidestType() {
+std::pair<unsigned, unsigned>
+LoopVectorizationCostModel::getSmallestAndWidestTypes() {
+ unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
continue;
+ MinWidth = std::min(MinWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
MaxWidth = std::max(MaxWidth,
(unsigned)DL.getTypeSizeInBits(T->getScalarType()));
}
}
- return MaxWidth;
+ return {MinWidth, MaxWidth};
}
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
TargetNumRegisters = ForceTargetNumVectorRegs;
}
- LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
+ RegisterUsage R = calculateRegisterUsage({VF})[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
return 1;
}
-LoopVectorizationCostModel::RegisterUsage
-LoopVectorizationCostModel::calculateRegisterUsage() {
+SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+LoopVectorizationCostModel::calculateRegisterUsage(
+ const SmallVector<unsigned, 8> &VFs) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order in order and
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);
- RegisterUsage R;
- R.NumInstructions = 0;
+ RegisterUsage RU;
+ RU.NumInstructions = 0;
// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
unsigned Index = 0;
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
be = DFS.endRPO(); bb != be; ++bb) {
- R.NumInstructions += (*bb)->size();
+ RU.NumInstructions += (*bb)->size();
for (Instruction &I : **bb) {
IdxToInstr[Index++] = &I;
TransposeEnds[it->second].push_back(it->first);
SmallSet<Instruction*, 8> OpenIntervals;
- unsigned MaxUsage = 0;
+ // Get the size of the widest register.
+ unsigned MaxSafeDepDist = -1U;
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+ unsigned WidestRegister =
+ std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ SmallVector<RegisterUsage, 8> RUs(VFs.size());
+ SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
for (unsigned int i = 0; i < Index; ++i) {
Instruction *I = IdxToInstr[i];
// Ignore instructions that are never used within the loop.
// Remove all of the instructions that end at this location.
InstrList &List = TransposeEnds[i];
- for (unsigned int j=0, e = List.size(); j < e; ++j)
+ for (unsigned int j = 0, e = List.size(); j < e; ++j)
OpenIntervals.erase(List[j]);
- // Count the number of live interals.
- MaxUsage = std::max(MaxUsage, OpenIntervals.size());
+ // For each VF find the maximum usage of registers.
+ for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ // Count the number of live interals.
+ unsigned RegUsage = 0;
+ for (auto Inst : OpenIntervals) {
+ unsigned TypeSize =
+ DL.getTypeSizeInBits(Inst->getType()->getScalarType());
+ RegUsage += std::max<unsigned>(1, VFs[j] * TypeSize / WidestRegister);
+ }
+ MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+ }
- DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
- OpenIntervals.size() << '\n');
+ DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
// Add the current instruction to the list of open intervals.
OpenIntervals.insert(I);
}
- unsigned Invariant = LoopInvariants.size();
- DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
- DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
- DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
+ for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ unsigned Invariant = 0;
+ for (auto Inst : LoopInvariants) {
+ unsigned TypeSize =
+ DL.getTypeSizeInBits(Inst->getType()->getScalarType());
+ Invariant += std::max<unsigned>(1, VFs[i] * TypeSize / WidestRegister);
+ }
+
+ DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+ DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+ DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
+
+ RU.LoopInvariantRegs = Invariant;
+ RU.MaxLocalUsers = MaxUsages[i];
+ RUs[i] = RU;
+ }
- R.LoopInvariantRegs = Invariant;
- R.MaxLocalUsers = MaxUsage;
- return R;
+ return RUs;
}
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
--- /dev/null
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = global [1000 x i8] zeroinitializer, align 16
+@b = global [1000 x i8] zeroinitializer, align 16
+@c = global [1000 x i8] zeroinitializer, align 16
+@u = global [1000 x i32] zeroinitializer, align 16
+@v = global [1000 x i32] zeroinitializer, align 16
+@w = global [1000 x i32] zeroinitializer, align 16
+
+; Tests that the vectorization factor is determined by the smallest instead of
+; widest type in the loop for maximum bandwidth when
+; -vectorizer-maximize-bandwidth is indicated.
+;
+; CHECK-label: foo
+; CHECK: LV: Selecting VF: 16.
+define void @foo() {
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %add = add i8 %1, %0
+ %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv
+ store i8 %add, i8* %arrayidx6, align 1
+ %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv
+ %2 = load i32, i32* %arrayidx8, align 4
+ %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %arrayidx10, align 4
+ %add11 = add nsw i32 %3, %2
+ %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv
+ store i32 %add11, i32* %arrayidx13, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}