#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils/VectorUtils.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;
-#define DEBUG_TYPE "loop-vectorize"
+#define DEBUG_TYPE "loop-accesses"
+
+static cl::opt<unsigned, true>
+VectorizationFactor("force-vector-width", cl::Hidden,
+ cl::desc("Sets the SIMD width. Zero is autoselect."),
+ cl::location(VectorizerParams::VectorizationFactor));
+unsigned VectorizerParams::VectorizationFactor;
+
+static cl::opt<unsigned, true>
+VectorizationInterleave("force-vector-interleave", cl::Hidden,
+ cl::desc("Sets the vectorization interleave count. "
+ "Zero is autoselect."),
+ cl::location(
+ VectorizerParams::VectorizationInterleave));
+unsigned VectorizerParams::VectorizationInterleave;
+
+static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
+ "runtime-memory-check-threshold", cl::Hidden,
+ cl::desc("When performing memory disambiguation checks at runtime do not "
+ "generate more than this number of comparisons (default = 8)."),
+ cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
+unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
+
+/// \brief The maximum iterations used to merge memory checks
+static cl::opt<unsigned> MemoryCheckMergeThreshold(
+ "memory-check-merge-threshold", cl::Hidden,
+ cl::desc("Maximum number of comparisons done when trying to merge "
+ "runtime memory checks. (default = 100)"),
+ cl::init(100));
+
+/// Maximum SIMD width.
+const unsigned VectorizerParams::MaxVectorWidth = 64;
+
+/// \brief We collect dependences up to this threshold.
+static cl::opt<unsigned>
+ MaxDependences("max-dependences", cl::Hidden,
+ cl::desc("Maximum number of dependences collected by "
+ "loop-access analysis (default = 100)"),
+ cl::init(100));
+
+bool VectorizerParams::isInterleaveForced() {
+ return ::VectorizationInterleave.getNumOccurrences() > 0;
+}
-void VectorizationReport::emitAnalysis(VectorizationReport &Message,
- const Function *TheFunction,
- const Loop *TheLoop) {
+void LoopAccessReport::emitAnalysis(const LoopAccessReport &Message,
+ const Function *TheFunction,
+ const Loop *TheLoop,
+ const char *PassName) {
DebugLoc DL = TheLoop->getStartLoc();
- if (Instruction *I = Message.getInstr())
+ if (const Instruction *I = Message.getInstr())
DL = I->getDebugLoc();
- emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
+ emitOptimizationRemarkAnalysis(TheFunction->getContext(), PassName,
*TheFunction, DL, Message.str());
}
}
const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE,
- ValueToValueMap &PtrToStride,
+ const ValueToValueMap &PtrToStride,
+ SCEVUnionPredicate &Preds,
Value *Ptr, Value *OrigPtr) {
-
const SCEV *OrigSCEV = SE->getSCEV(Ptr);
// If there is an entry in the map return the SCEV of the pointer with the
// symbolic stride replaced by one.
- ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
+ ValueToValueMap::const_iterator SI =
+ PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
if (SI != PtrToStride.end()) {
Value *StrideVal = SI->second;
ValueToValueMap RewriteMap;
RewriteMap[StrideVal] = One;
- const SCEV *ByOne =
- SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
- DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
+ const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
+ const auto *CT =
+ static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType()));
+
+ Preds.add(SE->getEqualPredicate(U, CT));
+
+ const SCEV *ByOne = SE->rewriteUsingPredicate(OrigSCEV, Preds);
+ DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
<< "\n");
return ByOne;
}
// Otherwise, just return the SCEV of the original pointer.
- return SE->getSCEV(Ptr);
+ return OrigSCEV;
}
-void LoopAccessInfo::RuntimePointerCheck::insert(ScalarEvolution *SE, Loop *Lp,
- Value *Ptr, bool WritePtr,
- unsigned DepSetId,
- unsigned ASId,
- ValueToValueMap &Strides) {
+void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
+ unsigned DepSetId, unsigned ASId,
+ const ValueToValueMap &Strides,
+ SCEVUnionPredicate &Preds) {
// Get the stride replaced scev.
- const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
+ const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
assert(AR && "Invalid addrec expression");
const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
+
+ const SCEV *ScStart = AR->getStart();
const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
- Pointers.push_back(Ptr);
- Starts.push_back(AR->getStart());
- Ends.push_back(ScEnd);
- IsWritePtr.push_back(WritePtr);
- DependencySetId.push_back(DepSetId);
- AliasSetId.push_back(ASId);
+ const SCEV *Step = AR->getStepRecurrence(*SE);
+
+ // For expressions with negative step, the upper bound is ScStart and the
+ // lower bound is ScEnd.
+ if (const SCEVConstant *CStep = dyn_cast<const SCEVConstant>(Step)) {
+ if (CStep->getValue()->isNegative())
+ std::swap(ScStart, ScEnd);
+ } else {
+ // Fallback case: the step is not constant, but the we can still
+ // get the upper and lower bounds of the interval by using min/max
+ // expressions.
+ ScStart = SE->getUMinExpr(ScStart, ScEnd);
+ ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
+ }
+
+ Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
+}
+
+SmallVector<RuntimePointerChecking::PointerCheck, 4>
+RuntimePointerChecking::generateChecks() const {
+ SmallVector<PointerCheck, 4> Checks;
+
+ for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
+ for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
+ const RuntimePointerChecking::CheckingPtrGroup &CGI = CheckingGroups[I];
+ const RuntimePointerChecking::CheckingPtrGroup &CGJ = CheckingGroups[J];
+
+ if (needsChecking(CGI, CGJ))
+ Checks.push_back(std::make_pair(&CGI, &CGJ));
+ }
+ }
+ return Checks;
+}
+
+void RuntimePointerChecking::generateChecks(
+ MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
+ assert(Checks.empty() && "Checks is not empty");
+ groupChecks(DepCands, UseDependencies);
+ Checks = generateChecks();
+}
+
+bool RuntimePointerChecking::needsChecking(const CheckingPtrGroup &M,
+ const CheckingPtrGroup &N) const {
+ for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
+ for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
+ if (needsChecking(M.Members[I], N.Members[J]))
+ return true;
+ return false;
+}
+
+/// Compare \p I and \p J and return the minimum.
+/// Return nullptr in case we couldn't find an answer.
+static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
+ ScalarEvolution *SE) {
+ const SCEV *Diff = SE->getMinusSCEV(J, I);
+ const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
+
+ if (!C)
+ return nullptr;
+ if (C->getValue()->isNegative())
+ return J;
+ return I;
+}
+
+bool RuntimePointerChecking::CheckingPtrGroup::addPointer(unsigned Index) {
+ const SCEV *Start = RtCheck.Pointers[Index].Start;
+ const SCEV *End = RtCheck.Pointers[Index].End;
+
+ // Compare the starts and ends with the known minimum and maximum
+ // of this set. We need to know how we compare against the min/max
+ // of the set in order to be able to emit memchecks.
+ const SCEV *Min0 = getMinFromExprs(Start, Low, RtCheck.SE);
+ if (!Min0)
+ return false;
+
+ const SCEV *Min1 = getMinFromExprs(End, High, RtCheck.SE);
+ if (!Min1)
+ return false;
+
+ // Update the low bound expression if we've found a new min value.
+ if (Min0 == Start)
+ Low = Start;
+
+ // Update the high bound expression if we've found a new max value.
+ if (Min1 != End)
+ High = End;
+
+ Members.push_back(Index);
+ return true;
+}
+
+void RuntimePointerChecking::groupChecks(
+ MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
+ // We build the groups from dependency candidates equivalence classes
+ // because:
+ // - We know that pointers in the same equivalence class share
+ // the same underlying object and therefore there is a chance
+ // that we can compare pointers
+ // - We wouldn't be able to merge two pointers for which we need
+ // to emit a memcheck. The classes in DepCands are already
+ // conveniently built such that no two pointers in the same
+ // class need checking against each other.
+
+ // We use the following (greedy) algorithm to construct the groups
+ // For every pointer in the equivalence class:
+ // For each existing group:
+ // - if the difference between this pointer and the min/max bounds
+ // of the group is a constant, then make the pointer part of the
+ // group and update the min/max bounds of that group as required.
+
+ CheckingGroups.clear();
+
+ // If we need to check two pointers to the same underlying object
+ // with a non-constant difference, we shouldn't perform any pointer
+ // grouping with those pointers. This is because we can easily get
+ // into cases where the resulting check would return false, even when
+ // the accesses are safe.
+ //
+ // The following example shows this:
+ // for (i = 0; i < 1000; ++i)
+ // a[5000 + i * m] = a[i] + a[i + 9000]
+ //
+ // Here grouping gives a check of (5000, 5000 + 1000 * m) against
+ // (0, 10000) which is always false. However, if m is 1, there is no
+ // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
+ // us to perform an accurate check in this case.
+ //
+ // The above case requires that we have an UnknownDependence between
+ // accesses to the same underlying object. This cannot happen unless
+ // ShouldRetryWithRuntimeCheck is set, and therefore UseDependencies
+ // is also false. In this case we will use the fallback path and create
+ // separate checking groups for all pointers.
+
+ // If we don't have the dependency partitions, construct a new
+ // checking pointer group for each pointer. This is also required
+ // for correctness, because in this case we can have checking between
+ // pointers to the same underlying object.
+ if (!UseDependencies) {
+ for (unsigned I = 0; I < Pointers.size(); ++I)
+ CheckingGroups.push_back(CheckingPtrGroup(I, *this));
+ return;
+ }
+
+ unsigned TotalComparisons = 0;
+
+ DenseMap<Value *, unsigned> PositionMap;
+ for (unsigned Index = 0; Index < Pointers.size(); ++Index)
+ PositionMap[Pointers[Index].PointerValue] = Index;
+
+ // We need to keep track of what pointers we've already seen so we
+ // don't process them twice.
+ SmallSet<unsigned, 2> Seen;
+
+ // Go through all equivalence classes, get the "pointer check groups"
+ // and add them to the overall solution. We use the order in which accesses
+ // appear in 'Pointers' to enforce determinism.
+ for (unsigned I = 0; I < Pointers.size(); ++I) {
+ // We've seen this pointer before, and therefore already processed
+ // its equivalence class.
+ if (Seen.count(I))
+ continue;
+
+ MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
+ Pointers[I].IsWritePtr);
+
+ SmallVector<CheckingPtrGroup, 2> Groups;
+ auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
+
+ // Because DepCands is constructed by visiting accesses in the order in
+ // which they appear in alias sets (which is deterministic) and the
+ // iteration order within an equivalence class member is only dependent on
+ // the order in which unions and insertions are performed on the
+ // equivalence class, the iteration order is deterministic.
+ for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
+ MI != ME; ++MI) {
+ unsigned Pointer = PositionMap[MI->getPointer()];
+ bool Merged = false;
+ // Mark this pointer as seen.
+ Seen.insert(Pointer);
+
+ // Go through all the existing sets and see if we can find one
+ // which can include this pointer.
+ for (CheckingPtrGroup &Group : Groups) {
+ // Don't perform more than a certain amount of comparisons.
+ // This should limit the cost of grouping the pointers to something
+ // reasonable. If we do end up hitting this threshold, the algorithm
+ // will create separate groups for all remaining pointers.
+ if (TotalComparisons > MemoryCheckMergeThreshold)
+ break;
+
+ TotalComparisons++;
+
+ if (Group.addPointer(Pointer)) {
+ Merged = true;
+ break;
+ }
+ }
+
+ if (!Merged)
+ // We couldn't add this pointer to any existing set or the threshold
+ // for the number of comparisons has been reached. Create a new group
+ // to hold the current pointer.
+ Groups.push_back(CheckingPtrGroup(Pointer, *this));
+ }
+
+ // We've computed the grouped checks for this partition.
+ // Save the results and continue with the next one.
+ std::copy(Groups.begin(), Groups.end(), std::back_inserter(CheckingGroups));
+ }
}
-bool LoopAccessInfo::RuntimePointerCheck::needsChecking(unsigned I,
- unsigned J) const {
+bool RuntimePointerChecking::arePointersInSamePartition(
+ const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
+ unsigned PtrIdx2) {
+ return (PtrToPartition[PtrIdx1] != -1 &&
+ PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
+}
+
+bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
+ const PointerInfo &PointerI = Pointers[I];
+ const PointerInfo &PointerJ = Pointers[J];
+
// No need to check if two readonly pointers intersect.
- if (!IsWritePtr[I] && !IsWritePtr[J])
+ if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
return false;
// Only need to check pointers between two different dependency sets.
- if (DependencySetId[I] == DependencySetId[J])
+ if (PointerI.DependencySetId == PointerJ.DependencySetId)
return false;
// Only need to check pointers in the same alias set.
- if (AliasSetId[I] != AliasSetId[J])
+ if (PointerI.AliasSetId != PointerJ.AliasSetId)
return false;
return true;
}
+void RuntimePointerChecking::printChecks(
+ raw_ostream &OS, const SmallVectorImpl<PointerCheck> &Checks,
+ unsigned Depth) const {
+ unsigned N = 0;
+ for (const auto &Check : Checks) {
+ const auto &First = Check.first->Members, &Second = Check.second->Members;
+
+ OS.indent(Depth) << "Check " << N++ << ":\n";
+
+ OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
+ for (unsigned K = 0; K < First.size(); ++K)
+ OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
+
+ OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
+ for (unsigned K = 0; K < Second.size(); ++K)
+ OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
+ }
+}
+
+void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
+
+ OS.indent(Depth) << "Run-time memory checks:\n";
+ printChecks(OS, Checks, Depth);
+
+ OS.indent(Depth) << "Grouped accesses:\n";
+ for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
+ const auto &CG = CheckingGroups[I];
+
+ OS.indent(Depth + 2) << "Group " << &CG << ":\n";
+ OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
+ << ")\n";
+ for (unsigned J = 0; J < CG.Members.size(); ++J) {
+ OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
+ << "\n";
+ }
+ }
+}
+
namespace {
/// \brief Analyses memory accesses in a loop.
///
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
- /// \brief Set of potential dependent memory accesses.
- typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
-
- AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
- DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
+ AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI,
+ MemoryDepChecker::DepCandidates &DA, SCEVUnionPredicate &Preds)
+ : DL(Dl), AST(*AA), LI(LI), DepCands(DA), IsRTCheckAnalysisNeeded(false),
+ Preds(Preds) {}
/// \brief Register a load and whether it is only read from.
- void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
+ void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
Value *Ptr = const_cast<Value*>(Loc.Ptr);
- AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
+ AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
Accesses.insert(MemAccessInfo(Ptr, false));
if (IsReadOnly)
ReadOnlyPtr.insert(Ptr);
}
/// \brief Register a store.
- void addStore(AliasAnalysis::Location &Loc) {
+ void addStore(MemoryLocation &Loc) {
Value *Ptr = const_cast<Value*>(Loc.Ptr);
- AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
+ AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
Accesses.insert(MemAccessInfo(Ptr, true));
}
/// \brief Check whether we can check the pointers at runtime for
/// non-intersection.
- bool canCheckPtrAtRT(LoopAccessInfo::RuntimePointerCheck &RtCheck,
- unsigned &NumComparisons,
- ScalarEvolution *SE, Loop *TheLoop,
- ValueToValueMap &Strides,
+ ///
+ /// Returns true if we need no check or if we do and we can generate them
+ /// (i.e. the pointers have computable bounds).
+ bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
+ Loop *TheLoop, const ValueToValueMap &Strides,
bool ShouldCheckStride = false);
/// \brief Goes over all memory accesses, checks whether a RT check is needed
processMemAccesses();
}
- bool isRTCheckNeeded() { return IsRTCheckNeeded; }
-
+ /// \brief Initial processing of memory accesses determined that we need to
+ /// perform dependency checking.
+ ///
+ /// Note that this can later be cleared if we retry memcheck analysis without
+ /// dependency checking (i.e. ShouldRetryWithRuntimeCheck).
bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
- void resetDepChecks() { CheckDeps.clear(); }
+
+ /// We decided that no dependence analysis would be used. Reset the state.
+ void resetDepChecks(MemoryDepChecker &DepChecker) {
+ CheckDeps.clear();
+ DepChecker.clearDependences();
+ }
MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
typedef SetVector<MemAccessInfo> PtrAccessSet;
/// \brief Go over all memory access and check whether runtime pointer checks
- /// are needed /// and build sets of dependency check candidates.
+ /// are needed and build sets of dependency check candidates.
void processMemAccesses();
/// Set of all accesses.
PtrAccessSet Accesses;
+ const DataLayout &DL;
+
/// Set of accesses that need a further dependence check.
MemAccessInfoSet CheckDeps;
/// Set of pointers that are read only.
SmallPtrSet<Value*, 16> ReadOnlyPtr;
- const DataLayout *DL;
-
/// An alias set tracker to partition the access set by underlying object and
//intrinsic property (such as TBAA metadata).
AliasSetTracker AST;
+ LoopInfo *LI;
+
/// Sets of potentially dependent accesses - members of one set share an
/// underlying pointer. The set "CheckDeps" identfies which sets really need a
/// dependence check.
- DepCandidates &DepCands;
+ MemoryDepChecker::DepCandidates &DepCands;
- bool IsRTCheckNeeded;
+ /// \brief Initial processing of memory accesses determined that we may need
+ /// to add memchecks. Perform the analysis to determine the necessary checks.
+ ///
+ /// Note that, this is different from isDependencyCheckNeeded. When we retry
+ /// memcheck analysis without dependency checking
+ /// (i.e. ShouldRetryWithRuntimeCheck), isDependencyCheckNeeded is cleared
+ /// while this remains set if we have potentially dependent accesses.
+ bool IsRTCheckAnalysisNeeded;
+
+ /// The SCEV predicate containing all the SCEV-related assumptions.
+ SCEVUnionPredicate &Preds;
};
} // end anonymous namespace
/// \brief Check whether a pointer can participate in a runtime bounds check.
-static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
- Value *Ptr) {
- const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
+static bool hasComputableBounds(ScalarEvolution *SE,
+ const ValueToValueMap &Strides, Value *Ptr,
+ Loop *L, SCEVUnionPredicate &Preds) {
+ const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
if (!AR)
return false;
return AR->isAffine();
}
-/// \brief Check the stride of the pointer and ensure that it does not wrap in
-/// the address space.
-static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
- const Loop *Lp, ValueToValueMap &StridesMap);
-
-bool AccessAnalysis::canCheckPtrAtRT(
- LoopAccessInfo::RuntimePointerCheck &RtCheck,
- unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop,
- ValueToValueMap &StridesMap, bool ShouldCheckStride) {
+bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
+ ScalarEvolution *SE, Loop *TheLoop,
+ const ValueToValueMap &StridesMap,
+ bool ShouldCheckStride) {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
bool CanDoRT = true;
+ bool NeedRTCheck = false;
+ if (!IsRTCheckAnalysisNeeded) return true;
+
bool IsDepCheckNeeded = isDependencyCheckNeeded();
- NumComparisons = 0;
// We assign a consecutive id to access from different alias sets.
// Accesses between different groups doesn't need to be checked.
unsigned ASId = 1;
for (auto &AS : AST) {
- unsigned NumReadPtrChecks = 0;
- unsigned NumWritePtrChecks = 0;
+ int NumReadPtrChecks = 0;
+ int NumWritePtrChecks = 0;
// We assign consecutive id to access from different dependence sets.
// Accesses within the same set don't need a runtime check.
else
++NumReadPtrChecks;
- if (hasComputableBounds(SE, StridesMap, Ptr) &&
- // When we run after a failing dependency check we have to make sure we
- // don't have wrapping pointers.
+ if (hasComputableBounds(SE, StridesMap, Ptr, TheLoop, Preds) &&
+ // When we run after a failing dependency check we have to make sure
+ // we don't have wrapping pointers.
(!ShouldCheckStride ||
- isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
+ isStridedPtr(SE, Ptr, TheLoop, StridesMap, Preds) == 1)) {
// The id of the dependence set.
unsigned DepId;
// Each access has its own dependence set.
DepId = RunningDepId++;
- RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
+ RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, Preds);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
+ DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
} else {
+ DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n');
CanDoRT = false;
}
}
- if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
- NumComparisons += 0; // Only one dependence set.
- else {
- NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
- NumWritePtrChecks - 1));
- }
+ // If we have at least two writes or one write and a read then we need to
+ // check them. But there is no need to checks if there is only one
+ // dependence set for this alias set.
+ //
+ // Note that this function computes CanDoRT and NeedRTCheck independently.
+ // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
+ // for which we couldn't find the bounds but we don't actually need to emit
+ // any checks so it does not matter.
+ if (!(IsDepCheckNeeded && CanDoRT && RunningDepId == 2))
+ NeedRTCheck |= (NumWritePtrChecks >= 2 || (NumReadPtrChecks >= 1 &&
+ NumWritePtrChecks >= 1));
++ASId;
}
for (unsigned i = 0; i < NumPointers; ++i) {
for (unsigned j = i + 1; j < NumPointers; ++j) {
// Only need to check pointers between two different dependency sets.
- if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+ if (RtCheck.Pointers[i].DependencySetId ==
+ RtCheck.Pointers[j].DependencySetId)
continue;
// Only need to check pointers in the same alias set.
- if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
+ if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
continue;
- Value *PtrI = RtCheck.Pointers[i];
- Value *PtrJ = RtCheck.Pointers[j];
+ Value *PtrI = RtCheck.Pointers[i].PointerValue;
+ Value *PtrJ = RtCheck.Pointers[j].PointerValue;
unsigned ASi = PtrI->getType()->getPointerAddressSpace();
unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
if (ASi != ASj) {
- DEBUG(dbgs() << "LV: Runtime check would require comparison between"
- " different address spaces\n");
+ DEBUG(dbgs() << "LAA: Runtime check would require comparison between"
+ " different address spaces\n");
return false;
}
}
}
- return CanDoRT;
+ if (NeedRTCheck && CanDoRT)
+ RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
+
+ DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
+ << " pointer comparisons.\n");
+
+ RtCheck.Need = NeedRTCheck;
+
+ bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
+ if (!CanDoRTIfNeeded)
+ RtCheck.reset();
+ return CanDoRTIfNeeded;
}
void AccessAnalysis::processMemAccesses() {
// process read-only pointers. This allows us to skip dependence tests for
// read-only pointers.
- DEBUG(dbgs() << "LV: Processing memory accesses...\n");
+ DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
DEBUG(dbgs() << " AST: "; AST.dump());
- DEBUG(dbgs() << "LV: Accesses:\n");
+ DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
DEBUG({
for (auto A : Accesses)
dbgs() << "\t" << *A.getPointer() << " (" <<
// catch "a[i] = a[i] + " without having to do a dependence check).
if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
CheckDeps.insert(Access);
- IsRTCheckNeeded = true;
+ IsRTCheckAnalysisNeeded = true;
}
if (IsWrite)
// underlying object.
typedef SmallVector<Value *, 16> ValueVector;
ValueVector TempObjects;
- GetUnderlyingObjects(Ptr, TempObjects, DL);
+
+ GetUnderlyingObjects(Ptr, TempObjects, DL, LI);
+ DEBUG(dbgs() << "Underlying objects for pointer " << *Ptr << "\n");
for (Value *UnderlyingObj : TempObjects) {
+ // nullptr never alias, don't join sets for pointer that have "null"
+ // in their UnderlyingObjects list.
+ if (isa<ConstantPointerNull>(UnderlyingObj))
+ continue;
+
UnderlyingObjToAccessMap::iterator Prev =
ObjToLastAccess.find(UnderlyingObj);
if (Prev != ObjToLastAccess.end())
DepCands.unionSets(Access, Prev->second);
ObjToLastAccess[UnderlyingObj] = Access;
+ DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
}
}
}
}
}
-namespace {
-/// \brief Checks memory dependences among accesses to the same underlying
-/// object to determine whether there vectorization is legal or not (and at
-/// which vectorization factor).
-///
-/// This class works under the assumption that we already checked that memory
-/// locations with different underlying pointers are "must-not alias".
-/// We use the ScalarEvolution framework to symbolically evalutate access
-/// functions pairs. Since we currently don't restructure the loop we can rely
-/// on the program order of memory accesses to determine their safety.
-/// At the moment we will only deem accesses as safe for:
-/// * A negative constant distance assuming program order.
-///
-/// Safe: tmp = a[i + 1]; OR a[i + 1] = x;
-/// a[i] = tmp; y = a[i];
-///
-/// The latter case is safe because later checks guarantuee that there can't
-/// be a cycle through a phi node (that is, we check that "x" and "y" is not
-/// the same variable: a header phi can only be an induction or a reduction, a
-/// reduction can't have a memory sink, an induction can't have a memory
-/// source). This is important and must not be violated (or we have to
-/// resort to checking for cycles through memory).
-///
-/// * A positive constant distance assuming program order that is bigger
-/// than the biggest memory access.
-///
-/// tmp = a[i] OR b[i] = x
-/// a[i+2] = tmp y = b[i+2];
-///
-/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
-///
-/// * Zero distances and all accesses have the same size.
-///
-class MemoryDepChecker {
-public:
- typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
- typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
-
- MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L,
- const LoopAccessInfo::VectorizerParams &VectParams)
- : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
- ShouldRetryWithRuntimeCheck(false), VectParams(VectParams) {}
-
- /// \brief Register the location (instructions are given increasing numbers)
- /// of a write access.
- void addAccess(StoreInst *SI) {
- Value *Ptr = SI->getPointerOperand();
- Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
- InstMap.push_back(SI);
- ++AccessIdx;
- }
-
- /// \brief Register the location (instructions are given increasing numbers)
- /// of a write access.
- void addAccess(LoadInst *LI) {
- Value *Ptr = LI->getPointerOperand();
- Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
- InstMap.push_back(LI);
- ++AccessIdx;
- }
-
- /// \brief Check whether the dependencies between the accesses are safe.
- ///
- /// Only checks sets with elements in \p CheckDeps.
- bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
- MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
-
- /// \brief The maximum number of bytes of a vector register we can vectorize
- /// the accesses safely with.
- unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
-
- /// \brief In same cases when the dependency check fails we can still
- /// vectorize the loop with a dynamic array access check.
- bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
-
-private:
- ScalarEvolution *SE;
- const DataLayout *DL;
- const Loop *InnermostLoop;
-
- /// \brief Maps access locations (ptr, read/write) to program order.
- DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
-
- /// \brief Memory access instructions in program order.
- SmallVector<Instruction *, 16> InstMap;
-
- /// \brief The program order index to be used for the next instruction.
- unsigned AccessIdx;
+static bool isInBoundsGep(Value *Ptr) {
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+ return GEP->isInBounds();
+ return false;
+}
- // We can access this many bytes in parallel safely.
- unsigned MaxSafeDepDistBytes;
+/// \brief Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
+/// i.e. monotonically increasing/decreasing.
+static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
+ ScalarEvolution *SE, const Loop *L) {
+ // FIXME: This should probably only return true for NUW.
+ if (AR->getNoWrapFlags(SCEV::NoWrapMask))
+ return true;
- /// \brief If we see a non-constant dependence distance we can still try to
- /// vectorize this loop with runtime checks.
- bool ShouldRetryWithRuntimeCheck;
+ // Scalar evolution does not propagate the non-wrapping flags to values that
+ // are derived from a non-wrapping induction variable because non-wrapping
+ // could be flow-sensitive.
+ //
+ // Look through the potentially overflowing instruction to try to prove
+ // non-wrapping for the *specific* value of Ptr.
- /// \brief Vectorizer parameters used by the analysis.
- LoopAccessInfo::VectorizerParams VectParams;
+ // The arithmetic implied by an inbounds GEP can't overflow.
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP || !GEP->isInBounds())
+ return false;
- /// \brief Check whether there is a plausible dependence between the two
- /// accesses.
- ///
- /// Access \p A must happen before \p B in program order. The two indices
- /// identify the index into the program order map.
- ///
- /// This function checks whether there is a plausible dependence (or the
- /// absence of such can't be proved) between the two accesses. If there is a
- /// plausible dependence but the dependence distance is bigger than one
- /// element access it records this distance in \p MaxSafeDepDistBytes (if this
- /// distance is smaller than any other distance encountered so far).
- /// Otherwise, this function returns true signaling a possible dependence.
- bool isDependent(const MemAccessInfo &A, unsigned AIdx,
- const MemAccessInfo &B, unsigned BIdx,
- ValueToValueMap &Strides);
-
- /// \brief Check whether the data dependence could prevent store-load
- /// forwarding.
- bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
-};
+ // Make sure there is only one non-const index and analyze that.
+ Value *NonConstIndex = nullptr;
+ for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+ if (!isa<ConstantInt>(*Index)) {
+ if (NonConstIndex)
+ return false;
+ NonConstIndex = *Index;
+ }
+ if (!NonConstIndex)
+ // The recurrence is on the pointer, ignore for now.
+ return false;
-} // end anonymous namespace
+ // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
+ // AddRec using a NSW operation.
+ if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
+ if (OBO->hasNoSignedWrap() &&
+ // Assume constant for other the operand so that the AddRec can be
+ // easily found.
+ isa<ConstantInt>(OBO->getOperand(1))) {
+ auto *OpScev = SE->getSCEV(OBO->getOperand(0));
+
+ if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
+ return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
+ }
-static bool isInBoundsGep(Value *Ptr) {
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
- return GEP->isInBounds();
return false;
}
/// \brief Check whether the access through \p Ptr has a constant stride.
-static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
- const Loop *Lp, ValueToValueMap &StridesMap) {
- const Type *Ty = Ptr->getType();
+int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp,
+ const ValueToValueMap &StridesMap,
+ SCEVUnionPredicate &Preds) {
+ Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Unexpected non-ptr");
// Make sure that the pointer does not point to aggregate types.
- const PointerType *PtrTy = cast<PointerType>(Ty);
+ auto *PtrTy = cast<PointerType>(Ty);
if (PtrTy->getElementType()->isAggregateType()) {
- DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr
- << "\n");
+ DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type"
+ << *Ptr << "\n");
return 0;
}
- const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);
+ const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Preds, Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
if (!AR) {
- DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr
- << " SCEV: " << *PtrScev << "\n");
+ DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer "
+ << *Ptr << " SCEV: " << *PtrScev << "\n");
return 0;
}
// The accesss function must stride over the innermost loop.
if (Lp != AR->getLoop()) {
- DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr
- << " SCEV: " << *PtrScev << "\n");
+ DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " <<
+ *Ptr << " SCEV: " << *PtrScev << "\n");
}
// The address calculation must not wrap. Otherwise, a dependence could be
// to access the pointer value "0" which is undefined behavior in address
// space 0, therefore we can also vectorize this case.
bool IsInBoundsGEP = isInBoundsGep(Ptr);
- bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
+ bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, SE, Lp);
bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
- DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
- << *Ptr << " SCEV: " << *PtrScev << "\n");
+ DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
+ << *Ptr << " SCEV: " << *PtrScev << "\n");
return 0;
}
// Check the step is constant.
const SCEV *Step = AR->getStepRecurrence(*SE);
- // Calculate the pointer stride and check if it is consecutive.
+ // Calculate the pointer stride and check if it is constant.
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C) {
- DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr
- << " SCEV: " << *PtrScev << "\n");
+ DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr <<
+ " SCEV: " << *PtrScev << "\n");
return 0;
}
- int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
+ auto &DL = Lp->getHeader()->getModule()->getDataLayout();
+ int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
const APInt &APStepVal = C->getValue()->getValue();
// Huge step value - give up.
return Stride;
}
+bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
+ switch (Type) {
+ case NoDep:
+ case Forward:
+ case BackwardVectorizable:
+ return true;
+
+ case Unknown:
+ case ForwardButPreventsForwarding:
+ case Backward:
+ case BackwardVectorizableButPreventsForwarding:
+ return false;
+ }
+ llvm_unreachable("unexpected DepType!");
+}
+
+bool MemoryDepChecker::Dependence::isBackward() const {
+ switch (Type) {
+ case NoDep:
+ case Forward:
+ case ForwardButPreventsForwarding:
+ case Unknown:
+ return false;
+
+ case BackwardVectorizable:
+ case Backward:
+ case BackwardVectorizableButPreventsForwarding:
+ return true;
+ }
+ llvm_unreachable("unexpected DepType!");
+}
+
+bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
+ return isBackward() || Type == Unknown;
+}
+
+bool MemoryDepChecker::Dependence::isForward() const {
+ switch (Type) {
+ case Forward:
+ case ForwardButPreventsForwarding:
+ return true;
+
+ case NoDep:
+ case Unknown:
+ case BackwardVectorizable:
+ case Backward:
+ case BackwardVectorizableButPreventsForwarding:
+ return false;
+ }
+ llvm_unreachable("unexpected DepType!");
+}
+
bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
unsigned TypeByteSize) {
// If loads occur at a distance that is not a multiple of a feasible vector
const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
// Maximum vector factor.
unsigned MaxVFWithoutSLForwardIssues =
- VectParams.MaxVectorWidth * TypeByteSize;
- if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
+ VectorizerParams::MaxVectorWidth * TypeByteSize;
+ if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
}
}
- if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
- DEBUG(dbgs() << "LV: Distance " << Distance
- << " that could cause a store-load forwarding conflict\n");
+ if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
+ DEBUG(dbgs() << "LAA: Distance " << Distance <<
+ " that could cause a store-load forwarding conflict\n");
return true;
}
if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
- MaxVFWithoutSLForwardIssues != VectParams.MaxVectorWidth * TypeByteSize)
+ MaxVFWithoutSLForwardIssues !=
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
return false;
}
-bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
- const MemAccessInfo &B, unsigned BIdx,
- ValueToValueMap &Strides) {
+/// \brief Check the dependence for two accesses with the same stride \p Stride.
+/// \p Distance is the positive distance and \p TypeByteSize is type size in
+/// bytes.
+///
+/// \returns true if they are independent.
+static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride,
+ unsigned TypeByteSize) {
+ assert(Stride > 1 && "The stride must be greater than 1");
+ assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
+ assert(Distance > 0 && "The distance must be non-zero");
+
+ // Skip if the distance is not multiple of type byte size.
+ if (Distance % TypeByteSize)
+ return false;
+
+ unsigned ScaledDist = Distance / TypeByteSize;
+
+ // No dependence if the scaled distance is not multiple of the stride.
+ // E.g.
+ // for (i = 0; i < 1024 ; i += 4)
+ // A[i+2] = A[i] + 1;
+ //
+ // Two accesses in memory (scaled distance is 2, stride is 4):
+ // | A[0] | | | | A[4] | | | |
+ // | | | A[2] | | | | A[6] | |
+ //
+ // E.g.
+ // for (i = 0; i < 1024 ; i += 3)
+ // A[i+4] = A[i] + 1;
+ //
+ // Two accesses in memory (scaled distance is 4, stride is 3):
+ // | A[0] | | | A[3] | | | A[6] | | |
+ // | | | | | A[4] | | | A[7] | |
+ return ScaledDist % Stride;
+}
+
+MemoryDepChecker::Dependence::DepType
+MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+ const MemAccessInfo &B, unsigned BIdx,
+ const ValueToValueMap &Strides) {
assert (AIdx < BIdx && "Must pass arguments in program order");
Value *APtr = A.getPointer();
// Two reads are independent.
if (!AIsWrite && !BIsWrite)
- return false;
+ return Dependence::NoDep;
// We cannot check pointers in different address spaces.
if (APtr->getType()->getPointerAddressSpace() !=
BPtr->getType()->getPointerAddressSpace())
- return true;
+ return Dependence::Unknown;
- const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
- const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
+ const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, Preds, APtr);
+ const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, Preds, BPtr);
- int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
- int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);
+ int StrideAPtr = isStridedPtr(SE, APtr, InnermostLoop, Strides, Preds);
+ int StrideBPtr = isStridedPtr(SE, BPtr, InnermostLoop, Strides, Preds);
const SCEV *Src = AScev;
const SCEV *Sink = BScev;
const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
- DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
- << "(Induction step: " << StrideAPtr << ")\n");
- DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
- << *InstMap[BIdx] << ": " << *Dist << "\n");
+ DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
+ << "(Induction step: " << StrideAPtr << ")\n");
+ DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
+ << *InstMap[BIdx] << ": " << *Dist << "\n");
- // Need consecutive accesses. We don't want to vectorize
+ // Need accesses with constant stride. We don't want to vectorize
// "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
// the address space.
if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
- DEBUG(dbgs() << "Non-consecutive pointer access\n");
- return true;
+ DEBUG(dbgs() << "Pointer access with non-constant stride\n");
+ return Dependence::Unknown;
}
const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
if (!C) {
- DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
+ DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
ShouldRetryWithRuntimeCheck = true;
- return true;
+ return Dependence::Unknown;
}
Type *ATy = APtr->getType()->getPointerElementType();
Type *BTy = BPtr->getType()->getPointerElementType();
- unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
+ auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
+ unsigned TypeByteSize = DL.getTypeAllocSize(ATy);
// Negative distances are not plausible dependencies.
const APInt &Val = C->getValue()->getValue();
if (IsTrueDataDependence &&
(couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
ATy != BTy))
- return true;
+ return Dependence::ForwardButPreventsForwarding;
- DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
- return false;
+ DEBUG(dbgs() << "LAA: Dependence is negative: NoDep\n");
+ return Dependence::Forward;
}
// Write to the same location with the same size.
// Could be improved to assert type sizes are the same (i32 == float, etc).
if (Val == 0) {
if (ATy == BTy)
- return false;
- DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
- return true;
+ return Dependence::Forward;
+ DEBUG(dbgs() << "LAA: Zero dependence difference but different types\n");
+ return Dependence::Unknown;
}
assert(Val.isStrictlyPositive() && "Expect a positive value");
- // Positive distance bigger than max vectorization factor.
if (ATy != BTy) {
- DEBUG(dbgs()
- << "LV: ReadWrite-Write positive dependency with different types\n");
- return false;
+ DEBUG(dbgs() <<
+ "LAA: ReadWrite-Write positive dependency with different types\n");
+ return Dependence::Unknown;
}
unsigned Distance = (unsigned) Val.getZExtValue();
+ unsigned Stride = std::abs(StrideAPtr);
+ if (Stride > 1 &&
+ areStridedAccessesIndependent(Distance, Stride, TypeByteSize)) {
+ DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
+ return Dependence::NoDep;
+ }
+
// Bail out early if passed-in parameters make vectorization not feasible.
- unsigned ForcedFactor =
- (VectParams.VectorizationFactor ? VectParams.VectorizationFactor : 1);
- unsigned ForcedUnroll =
- (VectParams.VectorizationInterleave ? VectParams.VectorizationInterleave
- : 1);
-
- // The distance must be bigger than the size needed for a vectorized version
- // of the operation and the size of the vectorized operation must not be
- // bigger than the currrent maximum size.
- if (Distance < 2*TypeByteSize ||
- 2*TypeByteSize > MaxSafeDepDistBytes ||
- Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
- DEBUG(dbgs() << "LV: Failure because of Positive distance "
- << Val.getSExtValue() << '\n');
- return true;
+ unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
+ VectorizerParams::VectorizationFactor : 1);
+ unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
+ VectorizerParams::VectorizationInterleave : 1);
+ // The minimum number of iterations for a vectorized/unrolled version.
+ unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
+
+ // It's not vectorizable if the distance is smaller than the minimum distance
+ // needed for a vectroized/unrolled version. Vectorizing one iteration in
+ // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
+ // TypeByteSize (No need to plus the last gap distance).
+ //
+ // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
+ // foo(int *A) {
+ // int *B = (int *)((char *)A + 14);
+ // for (i = 0 ; i < 1024 ; i += 2)
+ // B[i] = A[i] + 1;
+ // }
+ //
+ // Two accesses in memory (stride is 2):
+ // | A[0] | | A[2] | | A[4] | | A[6] | |
+ // | B[0] | | B[2] | | B[4] |
+ //
+ // Distance needs for vectorizing iterations except the last iteration:
+ // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
+ // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
+ //
+ // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
+ // 12, which is less than distance.
+ //
+ // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
+ // the minimum distance needed is 28, which is greater than distance. It is
+ // not safe to do vectorization.
+ unsigned MinDistanceNeeded =
+ TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
+ if (MinDistanceNeeded > Distance) {
+ DEBUG(dbgs() << "LAA: Failure because of positive distance " << Distance
+ << '\n');
+ return Dependence::Backward;
}
- MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
- Distance : MaxSafeDepDistBytes;
+ // Unsafe if the minimum distance needed is greater than max safe distance.
+ if (MinDistanceNeeded > MaxSafeDepDistBytes) {
+ DEBUG(dbgs() << "LAA: Failure because it needs at least "
+ << MinDistanceNeeded << " size in bytes");
+ return Dependence::Backward;
+ }
+
+ // Positive distance bigger than max vectorization factor.
+ // FIXME: Should use max factor instead of max distance in bytes, which could
+ // not handle different types.
+ // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
+ // void foo (int *A, char *B) {
+ // for (unsigned i = 0; i < 1024; i++) {
+ // A[i+2] = A[i] + 1;
+ // B[i+2] = B[i] + 1;
+ // }
+ // }
+ //
+ // This case is currently unsafe according to the max safe distance. If we
+ // analyze the two accesses on array B, the max safe dependence distance
+ // is 2. Then we analyze the accesses on array A, the minimum distance needed
+ // is 8, which is less than 2 and forbidden vectorization, But actually
+ // both A and B could be vectorized by 2 iterations.
+ MaxSafeDepDistBytes =
+ Distance < MaxSafeDepDistBytes ? Distance : MaxSafeDepDistBytes;
bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
if (IsTrueDataDependence &&
couldPreventStoreLoadForward(Distance, TypeByteSize))
- return true;
+ return Dependence::BackwardVectorizableButPreventsForwarding;
- DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue()
- << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize
- << '\n');
+ DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
+ << " with max VF = "
+ << MaxSafeDepDistBytes / (TypeByteSize * Stride) << '\n');
- return false;
+ return Dependence::BackwardVectorizable;
}
-bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
MemAccessInfoSet &CheckDeps,
- ValueToValueMap &Strides) {
+ const ValueToValueMap &Strides) {
MaxSafeDepDistBytes = -1U;
while (!CheckDeps.empty()) {
I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
- if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
- return false;
- if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
+ auto A = std::make_pair(&*AI, *I1);
+ auto B = std::make_pair(&*OI, *I2);
+
+ assert(*I1 != *I2);
+ if (*I1 > *I2)
+ std::swap(A, B);
+
+ Dependence::DepType Type =
+ isDependent(*A.first, A.second, *B.first, B.second, Strides);
+ SafeForVectorization &= Dependence::isSafeForVectorization(Type);
+
+ // Gather dependences unless we accumulated MaxDependences
+ // dependences. In that case return as soon as we find the first
+ // unsafe dependence. This puts a limit on this quadratic
+ // algorithm.
+ if (RecordDependences) {
+ if (Type != Dependence::NoDep)
+ Dependences.push_back(Dependence(A.second, B.second, Type));
+
+ if (Dependences.size() >= MaxDependences) {
+ RecordDependences = false;
+ Dependences.clear();
+ DEBUG(dbgs() << "Too many dependences, stopped recording\n");
+ }
+ }
+ if (!RecordDependences && !SafeForVectorization)
return false;
}
++OI;
AI++;
}
}
+
+ DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
+ return SafeForVectorization;
+}
+
+SmallVector<Instruction *, 4>
+MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
+ MemAccessInfo Access(Ptr, isWrite);
+ auto &IndexVector = Accesses.find(Access)->second;
+
+ SmallVector<Instruction *, 4> Insts;
+ std::transform(IndexVector.begin(), IndexVector.end(),
+ std::back_inserter(Insts),
+ [&](unsigned Idx) { return this->InstMap[Idx]; });
+ return Insts;
+}
+
+const char *MemoryDepChecker::Dependence::DepName[] = {
+ "NoDep", "Unknown", "Forward", "ForwardButPreventsForwarding", "Backward",
+ "BackwardVectorizable", "BackwardVectorizableButPreventsForwarding"};
+
+void MemoryDepChecker::Dependence::print(
+ raw_ostream &OS, unsigned Depth,
+ const SmallVectorImpl<Instruction *> &Instrs) const {
+ OS.indent(Depth) << DepName[Type] << ":\n";
+ OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
+ OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
+}
+
+bool LoopAccessInfo::canAnalyzeLoop() {
+ // We need to have a loop header.
+ DEBUG(dbgs() << "LAA: Found a loop: " <<
+ TheLoop->getHeader()->getName() << '\n');
+
+ // We can only analyze innermost loops.
+ if (!TheLoop->empty()) {
+ DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
+ emitAnalysis(LoopAccessReport() << "loop is not the innermost loop");
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (TheLoop->getNumBackEdges() != 1) {
+ DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
+ emitAnalysis(
+ LoopAccessReport() <<
+ "loop control flow is not understood by analyzer");
+ return false;
+ }
+
+ // We must have a single exiting block.
+ if (!TheLoop->getExitingBlock()) {
+ DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
+ emitAnalysis(
+ LoopAccessReport() <<
+ "loop control flow is not understood by analyzer");
+ return false;
+ }
+
+ // We only handle bottom-tested loops, i.e. loop in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ DEBUG(dbgs() << "LAA: loop control flow is not understood by analyzer\n");
+ emitAnalysis(
+ LoopAccessReport() <<
+ "loop control flow is not understood by analyzer");
+ return false;
+ }
+
+ // ScalarEvolution needs to be able to find the exit count.
+ const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
+ if (ExitCount == SE->getCouldNotCompute()) {
+ emitAnalysis(LoopAccessReport() <<
+ "could not determine number of loop iterations");
+ DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
+ return false;
+ }
+
return true;
}
-bool LoopAccessInfo::canVectorizeMemory(ValueToValueMap &Strides) {
+void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {
typedef SmallVector<Value*, 16> ValueVector;
typedef SmallPtrSet<Value*, 16> ValueSet;
unsigned NumReads = 0;
unsigned NumReadWrites = 0;
- PtrRtCheck.Pointers.clear();
- PtrRtCheck.Need = false;
+ PtrRtChecking.Pointers.clear();
+ PtrRtChecking.Need = false;
const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
- MemoryDepChecker DepChecker(SE, DL, TheLoop, VectParams);
// For each block.
for (Loop::block_iterator bb = TheLoop->block_begin(),
if (Call && getIntrinsicIDForCall(Call, TLI))
continue;
+ // If the function has an explicit vectorized counterpart, we can safely
+ // assume that it can be vectorized.
+ if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
+ TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
+ continue;
+
LoadInst *Ld = dyn_cast<LoadInst>(it);
if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
- emitAnalysis(VectorizationReport(Ld)
+ emitAnalysis(LoopAccessReport(Ld)
<< "read with atomic ordering or volatile read");
- DEBUG(dbgs() << "LV: Found a non-simple load.\n");
- return false;
+ DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
+ CanVecMem = false;
+ return;
}
NumLoads++;
Loads.push_back(Ld);
if (it->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(it);
if (!St) {
- emitAnalysis(VectorizationReport(it)
- << "instruction cannot be vectorized");
- return false;
+ emitAnalysis(LoopAccessReport(&*it) <<
+ "instruction cannot be vectorized");
+ CanVecMem = false;
+ return;
}
if (!St->isSimple() && !IsAnnotatedParallel) {
- emitAnalysis(VectorizationReport(St)
+ emitAnalysis(LoopAccessReport(St)
<< "write with atomic ordering or volatile write");
- DEBUG(dbgs() << "LV: Found a non-simple store.\n");
- return false;
+ DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
+ CanVecMem = false;
+ return;
}
NumStores++;
Stores.push_back(St);
// Check if we see any stores. If there are no stores, then we don't
// care if the pointers are *restrict*.
if (!Stores.size()) {
- DEBUG(dbgs() << "LV: Found a read-only loop!\n");
- return true;
+ DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
+ CanVecMem = true;
+ return;
}
- AccessAnalysis::DepCandidates DependentAccesses;
- AccessAnalysis Accesses(DL, AA, DependentAccesses);
+ MemoryDepChecker::DepCandidates DependentAccesses;
+ AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(),
+ AA, LI, DependentAccesses, Preds);
// Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
StoreInst *ST = cast<StoreInst>(*I);
Value* Ptr = ST->getPointerOperand();
-
- if (isUniform(Ptr)) {
- emitAnalysis(
- VectorizationReport(ST)
- << "write to a loop invariant address could not be vectorized");
- DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
- return false;
- }
-
+ // Check for store to loop invariant address.
+ StoreToLoopInvariantAddress |= isUniform(Ptr);
// If we did *not* see this pointer before, insert it to the read-write
// list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr).second) {
++NumReadWrites;
- AliasAnalysis::Location Loc = AA->getLocation(ST);
+ MemoryLocation Loc = MemoryLocation::get(ST);
// The TBAA metadata could have a control dependency on the predication
// condition, so we cannot rely on it when determining whether or not we
// need runtime pointer checks.
}
if (IsAnnotatedParallel) {
- DEBUG(dbgs() << "LV: A loop annotated parallel, ignore memory dependency "
- << "checks.\n");
- return true;
+ DEBUG(dbgs()
+ << "LAA: A loop annotated parallel, ignore memory dependency "
+ << "checks.\n");
+ CanVecMem = true;
+ return;
}
for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
// words may be written to the same address.
bool IsReadOnlyPtr = false;
if (Seen.insert(Ptr).second ||
- !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
+ !isStridedPtr(SE, Ptr, TheLoop, Strides, Preds)) {
++NumReads;
IsReadOnlyPtr = true;
}
- AliasAnalysis::Location Loc = AA->getLocation(LD);
+ MemoryLocation Loc = MemoryLocation::get(LD);
// The TBAA metadata could have a control dependency on the predication
// condition, so we cannot rely on it when determining whether or not we
// need runtime pointer checks.
// If we write (or read-write) to a single destination and there are no
// other reads in this loop then is it safe to vectorize.
if (NumReadWrites == 1 && NumReads == 0) {
- DEBUG(dbgs() << "LV: Found a write-only loop!\n");
- return true;
+ DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
+ CanVecMem = true;
+ return;
}
// Build dependence sets and check whether we need a runtime pointer bounds
// check.
Accesses.buildDependenceSets();
- bool NeedRTCheck = Accesses.isRTCheckNeeded();
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
- unsigned NumComparisons = 0;
- bool CanDoRT = false;
- if (NeedRTCheck)
- CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
- Strides);
-
- DEBUG(dbgs() << "LV: We need to do " << NumComparisons
- << " pointer comparisons.\n");
-
- // If we only have one set of dependences to check pointers among we don't
- // need a runtime check.
- if (NumComparisons == 0 && NeedRTCheck)
- NeedRTCheck = false;
-
- // Check that we did not collect too many pointers or found an unsizeable
- // pointer.
- if (!CanDoRT || NumComparisons > VectParams.RuntimeMemoryCheckThreshold) {
- PtrRtCheck.reset();
- CanDoRT = false;
- }
-
- if (CanDoRT) {
- DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
- }
-
- if (NeedRTCheck && !CanDoRT) {
- emitAnalysis(VectorizationReport() << "cannot identify array bounds");
- DEBUG(dbgs() << "LV: We can't vectorize because we can't find "
+ bool CanDoRTIfNeeded =
+ Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides);
+ if (!CanDoRTIfNeeded) {
+ emitAnalysis(LoopAccessReport() << "cannot identify array bounds");
+ DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
<< "the array bounds.\n");
- PtrRtCheck.reset();
- return false;
+ CanVecMem = false;
+ return;
}
- PtrRtCheck.Need = NeedRTCheck;
+ DEBUG(dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
- bool CanVecMem = true;
+ CanVecMem = true;
if (Accesses.isDependencyCheckNeeded()) {
- DEBUG(dbgs() << "LV: Checking memory dependencies\n");
+ DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
CanVecMem = DepChecker.areDepsSafe(
DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
- DEBUG(dbgs() << "LV: Retrying with memory checks\n");
- NeedRTCheck = true;
+ DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
// Clear the dependency checks. We assume they are not needed.
- Accesses.resetDepChecks();
-
- PtrRtCheck.reset();
- PtrRtCheck.Need = true;
-
- CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
- TheLoop, Strides, true);
- // Check that we did not collect too many pointers or found an unsizeable
- // pointer.
- if (!CanDoRT || NumComparisons > VectParams.RuntimeMemoryCheckThreshold) {
- if (!CanDoRT && NumComparisons > 0)
- emitAnalysis(VectorizationReport()
- << "cannot check memory dependencies at runtime");
- else
- emitAnalysis(VectorizationReport()
- << NumComparisons << " exceeds limit of "
- << VectParams.RuntimeMemoryCheckThreshold
- << " dependent memory operations checked at runtime");
- DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
- PtrRtCheck.reset();
- return false;
+ Accesses.resetDepChecks(DepChecker);
+
+ PtrRtChecking.reset();
+ PtrRtChecking.Need = true;
+
+ CanDoRTIfNeeded =
+ Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides, true);
+
+ // Check that we found the bounds for the pointer.
+ if (!CanDoRTIfNeeded) {
+ emitAnalysis(LoopAccessReport()
+ << "cannot check memory dependencies at runtime");
+ DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
+ CanVecMem = false;
+ return;
}
CanVecMem = true;
}
}
- if (!CanVecMem)
- emitAnalysis(VectorizationReport()
- << "unsafe dependent memory operations in loop");
-
- DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't")
- << " need a runtime memory check.\n");
-
- return CanVecMem;
+ if (CanVecMem)
+ DEBUG(dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
+ << (PtrRtChecking.Need ? "" : " don't")
+ << " need runtime memory checks.\n");
+ else {
+ emitAnalysis(LoopAccessReport() <<
+ "unsafe dependent memory operations in loop");
+ DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
+ }
}
bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
return !DT->dominates(BB, Latch);
}
-void LoopAccessInfo::emitAnalysis(VectorizationReport &Message) {
- VectorizationReport::emitAnalysis(Message, TheFunction, TheLoop);
+void LoopAccessInfo::emitAnalysis(LoopAccessReport &Message) {
+ assert(!Report && "Multiple reports generated");
+ Report = Message;
}
-bool LoopAccessInfo::isUniform(Value *V) {
+bool LoopAccessInfo::isUniform(Value *V) const {
return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}
return nullptr;
}
-std::pair<Instruction *, Instruction *>
-LoopAccessInfo::addRuntimeCheck(Instruction *Loc) {
- Instruction *tnullptr = nullptr;
- if (!PtrRtCheck.Need)
- return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
+namespace {
+/// \brief IR Values for the lower and upper bounds of a pointer evolution. We
+/// need to use value-handles because SCEV expansion can invalidate previously
+/// expanded values. Thus expansion of a pointer can invalidate the bounds for
+/// a previous one.
+struct PointerBounds {
+ TrackingVH<Value> Start;
+ TrackingVH<Value> End;
+};
+} // end anonymous namespace
- unsigned NumPointers = PtrRtCheck.Pointers.size();
- SmallVector<TrackingVH<Value> , 2> Starts;
- SmallVector<TrackingVH<Value> , 2> Ends;
+/// \brief Expand code for the lower and upper bound of the pointer group \p CG
+/// in \p TheLoop. \return the values for the bounds.
+static PointerBounds
+expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
+ Instruction *Loc, SCEVExpander &Exp, ScalarEvolution *SE,
+ const RuntimePointerChecking &PtrRtChecking) {
+ Value *Ptr = PtrRtChecking.Pointers[CG->Members[0]].PointerValue;
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ if (SE->isLoopInvariant(Sc, TheLoop)) {
+ DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr
+ << "\n");
+ return {Ptr, Ptr};
+ } else {
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ LLVMContext &Ctx = Loc->getContext();
+
+ // Use this type for pointer arithmetic.
+ Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+ Value *Start = nullptr, *End = nullptr;
+
+ DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
+ Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
+ End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
+ DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High << "\n");
+ return {Start, End};
+ }
+}
- LLVMContext &Ctx = Loc->getContext();
- SCEVExpander Exp(*SE, "induction");
- Instruction *FirstInst = nullptr;
+/// \brief Turns a collection of checks into a collection of expanded upper and
+/// lower bounds for both pointers in the check.
+static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds(
+ const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks,
+ Loop *L, Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp,
+ const RuntimePointerChecking &PtrRtChecking) {
+ SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
+
+ // Here we're relying on the SCEV Expander's cache to only emit code for the
+ // same bounds once.
+ std::transform(
+ PointerChecks.begin(), PointerChecks.end(),
+ std::back_inserter(ChecksWithBounds),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ PointerBounds
+ First = expandBounds(Check.first, L, Loc, Exp, SE, PtrRtChecking),
+ Second = expandBounds(Check.second, L, Loc, Exp, SE, PtrRtChecking);
+ return std::make_pair(First, Second);
+ });
+
+ return ChecksWithBounds;
+}
- for (unsigned i = 0; i < NumPointers; ++i) {
- Value *Ptr = PtrRtCheck.Pointers[i];
- const SCEV *Sc = SE->getSCEV(Ptr);
-
- if (SE->isLoopInvariant(Sc, TheLoop)) {
- DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << *Ptr
- << "\n");
- Starts.push_back(Ptr);
- Ends.push_back(Ptr);
- } else {
- DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
-
- // Use this type for pointer arithmetic.
- Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
-
- Value *Start = Exp.expandCodeFor(PtrRtCheck.Starts[i], PtrArithTy, Loc);
- Value *End = Exp.expandCodeFor(PtrRtCheck.Ends[i], PtrArithTy, Loc);
- Starts.push_back(Start);
- Ends.push_back(End);
- }
- }
+std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
+ Instruction *Loc,
+ const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks)
+ const {
+ SCEVExpander Exp(*SE, DL, "induction");
+ auto ExpandedChecks =
+ expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, PtrRtChecking);
+
+ LLVMContext &Ctx = Loc->getContext();
+ Instruction *FirstInst = nullptr;
IRBuilder<> ChkBuilder(Loc);
// Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr;
- for (unsigned i = 0; i < NumPointers; ++i) {
- for (unsigned j = i+1; j < NumPointers; ++j) {
- if (!PtrRtCheck.needsChecking(i, j))
- continue;
-
- unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
- unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
-
- assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
- (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
- "Trying to bounds check pointers with different address spaces");
-
- Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
- Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
- Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
- Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
- Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc");
- Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc");
-
- Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
- FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
- Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
- FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
- Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+ for (const auto &Check : ExpandedChecks) {
+ const PointerBounds &A = Check.first, &B = Check.second;
+ // Check if two pointers (A and B) conflict where conflict is computed as:
+ // start(A) <= end(B) && start(B) <= end(A)
+ unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
+ unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
+
+ assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
+ (AS1 == A.End->getType()->getPointerAddressSpace()) &&
+ "Trying to bounds check pointers with different address spaces");
+
+ Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+ Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+ Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
+
+ Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
+ FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
+ Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
+ FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
+ Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+ FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
+ if (MemoryRuntimeCheck) {
+ IsConflict =
+ ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
- if (MemoryRuntimeCheck) {
- IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
- "conflict.rdx");
- FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
- }
- MemoryRuntimeCheck = IsConflict;
}
+ MemoryRuntimeCheck = IsConflict;
}
+ if (!MemoryRuntimeCheck)
+ return std::make_pair(nullptr, nullptr);
+
// We have to do this trickery because the IRBuilder might fold the check to a
// constant expression in which case there is no Instruction anchored in a
// the block.
FirstInst = getFirstInst(FirstInst, Check, Loc);
return std::make_pair(FirstInst, Check);
}
+
+std::pair<Instruction *, Instruction *>
+LoopAccessInfo::addRuntimeChecks(Instruction *Loc) const {
+ if (!PtrRtChecking.Need)
+ return std::make_pair(nullptr, nullptr);
+
+ return addRuntimeChecks(Loc, PtrRtChecking.getChecks());
+}
+
+LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI, AliasAnalysis *AA,
+ DominatorTree *DT, LoopInfo *LI,
+ const ValueToValueMap &Strides)
+ : PtrRtChecking(SE), DepChecker(SE, L, Preds), TheLoop(L), SE(SE), DL(DL),
+ TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0),
+ MaxSafeDepDistBytes(-1U), CanVecMem(false),
+ StoreToLoopInvariantAddress(false) {
+ if (canAnalyzeLoop())
+ analyzeLoop(Strides);
+}
+
+void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
+ if (CanVecMem) {
+ if (PtrRtChecking.Need)
+ OS.indent(Depth) << "Memory dependences are safe with run-time checks\n";
+ else
+ OS.indent(Depth) << "Memory dependences are safe\n";
+ }
+
+ if (Report)
+ OS.indent(Depth) << "Report: " << Report->str() << "\n";
+
+ if (auto *Dependences = DepChecker.getDependences()) {
+ OS.indent(Depth) << "Dependences:\n";
+ for (auto &Dep : *Dependences) {
+ Dep.print(OS, Depth + 2, DepChecker.getMemoryInstructions());
+ OS << "\n";
+ }
+ } else
+ OS.indent(Depth) << "Too many dependences, not recorded\n";
+
+ // List the pair of accesses need run-time checks to prove independence.
+ PtrRtChecking.print(OS, Depth);
+ OS << "\n";
+
+ OS.indent(Depth) << "Store to invariant address was "
+ << (StoreToLoopInvariantAddress ? "" : "not ")
+ << "found in loop.\n";
+
+ OS.indent(Depth) << "SCEV assumptions:\n";
+ Preds.print(OS, Depth);
+}
+
+const LoopAccessInfo &
+LoopAccessAnalysis::getInfo(Loop *L, const ValueToValueMap &Strides) {
+ auto &LAI = LoopAccessInfoMap[L];
+
+#ifndef NDEBUG
+ assert((!LAI || LAI->NumSymbolicStrides == Strides.size()) &&
+ "Symbolic strides changed for loop");
+#endif
+
+ if (!LAI) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ LAI =
+ llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI, Strides);
+#ifndef NDEBUG
+ LAI->NumSymbolicStrides = Strides.size();
+#endif
+ }
+ return *LAI.get();
+}
+
+void LoopAccessAnalysis::print(raw_ostream &OS, const Module *M) const {
+ LoopAccessAnalysis &LAA = *const_cast<LoopAccessAnalysis *>(this);
+
+ ValueToValueMap NoSymbolicStrides;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop)) {
+ OS.indent(2) << L->getHeader()->getName() << ":\n";
+ auto &LAI = LAA.getInfo(L, NoSymbolicStrides);
+ LAI.print(OS, 4);
+ }
+}
+
+bool LoopAccessAnalysis::runOnFunction(Function &F) {
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ return false;
+}
+
+void LoopAccessAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+
+ AU.setPreservesAll();
+}
+
+char LoopAccessAnalysis::ID = 0;
+static const char laa_name[] = "Loop Access Analysis";
+#define LAA_NAME "loop-accesses"
+
+INITIALIZE_PASS_BEGIN(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
+
+namespace llvm {
+ Pass *createLAAPass() {
+ return new LoopAccessAnalysis();
+ }
+}