From c89c6a964c6091e07a51a22ad8c1c38295eb8599 Mon Sep 17 00:00:00 2001
From: Chris Lattner
Date: Tue, 2 Dec 2008 08:16:11 +0000
Subject: [PATCH] Implement PRE of loads in the GVN pass with a pretty cheap
 and straight-forward implementation.

This does not require any extra alias analysis queries beyond what we
already do for non-local loads.

Some programs really really like load PRE.  For example, SPASS triggers
this ~1000 times, ~300 times in 255.vortex, and ~1500 times on 403.gcc.

The biggest limitation to the implementation is that it does not split
critical edges.  This is a huge killer on many programs and should be
addressed after the initial patch is enabled by default.

The implementation of this should incidentally speed up rejection of
non-local loads because it avoids creating the repl densemap in cases
when it won't be used for fully redundant loads.

This is currently disabled by default.  Before I turn this on, I need to
fix a couple of miscompilations in the testsuite, look at compile time
performance numbers, and look at perf impact.  This is pretty close to
ready though.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@60408 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/GVN.cpp   | 247 +++++++++++++++++++++++++-------
 test/Transforms/GVN/pre-load.ll |  18 +++
 2 files changed, 211 insertions(+), 54 deletions(-)
 create mode 100644 test/Transforms/GVN/pre-load.ll

diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 039d52029aa..e641b245d97 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -43,9 +43,11 @@ STATISTIC(NumGVNInstr, "Number of instructions deleted");
 STATISTIC(NumGVNLoad, "Number of loads deleted");
 STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
 STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumPRELoad, "Number of loads PRE'd");
 
 static cl::opt<bool> EnablePRE("enable-pre", cl::init(true), cl::Hidden);
+cl::opt<bool> EnableLoadPRE("enable-load-pre");
 
 //===----------------------------------------------------------------------===//
 //                         ValueTable Class
 //===----------------------------------------------------------------------===//
@@ -863,23 +865,46 @@ Value *GVN::GetValueForBlock(BasicBlock *BB, LoadInst* orig,
   return v;
 }
 
+/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block.  As we go, keep
+/// track of which blocks we know it is fully alive or not in
+/// FullyAvailableBlocks.
+static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
+                            DenseMap<BasicBlock*, bool> &FullyAvailableBlocks) {
+  // Optimistically assume that the block is fully available and check to see
+  // if we already know about this block in one lookup.
+  std::pair<DenseMap<BasicBlock*, bool>::iterator, bool> IV =
+    FullyAvailableBlocks.insert(std::make_pair(BB, true));
+
+  // If the entry already existed for this block, return the precomputed value.
+  if (!IV.second)
+    return IV.first->second;
+
+  // Otherwise, see if it is fully available in all predecessors.
+  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+  // If this block has no predecessors, it isn't live-in here.
+  if (PI == PE)
+    return FullyAvailableBlocks[BB] = false;
+
+  for (; PI != PE; ++PI)
+    // If the value isn't fully available in one of our predecessors, then it
+    // isn't fully available in this block either.  Undo our previous
+    // optimistic assumption and bail out.
+    if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks))
+      return FullyAvailableBlocks[BB] = false;
+
+  return true;
+}
+
 /// processNonLocalLoad - Attempt to eliminate a load whose dependencies are
 /// non-local by performing PHI construction.
-bool GVN::processNonLocalLoad(LoadInst* L,
+bool GVN::processNonLocalLoad(LoadInst *LI,
                               SmallVectorImpl<Instruction*> &toErase) {
-  // Find the non-local dependencies of the load
+  // Find the non-local dependencies of the load.
   const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
-    MD->getNonLocalDependency(L);
-  DEBUG(cerr << "INVESTIGATING NONLOCAL LOAD: " << deps.size() << *L);
-#if 0
-  DEBUG(for (unsigned i = 0, e = deps.size(); i != e; ++i) {
-          cerr << "  " << deps[i].first->getName();
-          if (Instruction *I = deps[i].second.getInst())
-            cerr << *I;
-          else
-            cerr << "\n";
-        });
-#endif
+    MD->getNonLocalDependency(LI);
+  //DEBUG(cerr << "INVESTIGATING NONLOCAL LOAD: " << deps.size() << *LI);
 
   // If we had to process more than one hundred blocks to find the
   // dependencies, this load isn't worth worrying about.  Optimizing
@@ -887,11 +912,15 @@ bool GVN::processNonLocalLoad(LoadInst* L,
   if (deps.size() > 100)
     return false;
 
-  BasicBlock *EntryBlock = &L->getParent()->getParent()->getEntryBlock();
+  BasicBlock *EntryBlock = &LI->getParent()->getParent()->getEntryBlock();
 
-  DenseMap<BasicBlock*, Value*> repl;
+  // Filter out useless results (non-locals, etc).  Keep track of the blocks
+  // where we have a value available in repl, also keep track of whether we see
+  // dependencies that produce an unknown value for the load (such as a call
+  // that could potentially clobber the load).
+  SmallVector<std::pair<BasicBlock*, Value*>, 16> ValuesPerBlock;
+  SmallVector<BasicBlock*, 16> UnavailableBlocks;
 
-  // Filter out useless results (non-locals, etc)
   for (unsigned i = 0, e = deps.size(); i != e; ++i) {
     BasicBlock *DepBB = deps[i].first;
     MemDepResult DepInfo = deps[i].second;
@@ -900,17 +929,14 @@ bool GVN::processNonLocalLoad(LoadInst* L,
       // If this is a non-local dependency in the entry block, then we depend on
       // the value live-in at the start of the function.  We could insert a load
       // in the entry block to get this, but for now we'll just bail out.
-      //
-      // FIXME: Consider emitting a load in the entry block to catch this case!
-      // Tricky part is to sink so that it doesn't execute in places where it
-      // isn't needed.
       if (DepBB == EntryBlock)
-        return false;
+        UnavailableBlocks.push_back(DepBB);
       continue;
     }
 
     if (DepInfo.isNone()) {
-      repl[DepBB] = UndefValue::get(L->getType());
+      ValuesPerBlock.push_back(std::make_pair(DepBB,
+                                              UndefValue::get(LI->getType())));
       continue;
     }
@@ -920,52 +946,165 @@ bool GVN::processNonLocalLoad(LoadInst* L,
       // NOTE: 403.gcc does have this case (e.g. in readonly_fields_p) because
       // of bitfield access, it would be interesting to optimize for it at some
       // point.
-      if (S->getOperand(0)->getType() != L->getType())
-        return false;
+      if (S->getOperand(0)->getType() != LI->getType()) {
+        UnavailableBlocks.push_back(DepBB);
+        continue;
+      }
 
-      if (S->getPointerOperand() != L->getPointerOperand() &&
+      if (S->getPointerOperand() != LI->getPointerOperand() &&
           VN.getAliasAnalysis()->alias(S->getPointerOperand(), 1,
-                                       L->getPointerOperand(), 1)
-            != AliasAnalysis::MustAlias)
-        return false;
-      repl[DepBB] = S->getOperand(0);
+                                       LI->getPointerOperand(), 1)
+            != AliasAnalysis::MustAlias) {
+        UnavailableBlocks.push_back(DepBB);
+        continue;
+      }
+      ValuesPerBlock.push_back(std::make_pair(DepBB, S->getOperand(0)));
+
     } else if (LoadInst* LD = dyn_cast<LoadInst>(DepInfo.getInst())) {
-      if (LD->getType() != L->getType())
-        return false;
+      if (LD->getType() != LI->getType()) {
+        UnavailableBlocks.push_back(DepBB);
+        continue;
+      }
 
-      if (LD->getPointerOperand() != L->getPointerOperand() &&
+      if (LD->getPointerOperand() != LI->getPointerOperand() &&
           VN.getAliasAnalysis()->alias(LD->getPointerOperand(), 1,
-                                       L->getPointerOperand(), 1)
-            != AliasAnalysis::MustAlias)
-        return false;
-      repl[DepBB] = LD;
+                                       LI->getPointerOperand(), 1)
+            != AliasAnalysis::MustAlias) {
+        UnavailableBlocks.push_back(DepBB);
+        continue;
+      }
+      ValuesPerBlock.push_back(std::make_pair(DepBB, LD));
     } else {
-      return false;
+      UnavailableBlocks.push_back(DepBB);
+      continue;
     }
   }
 
-  // Use cached PHI construction information from previous runs
-  SmallPtrSet<Instruction*, 4>& p = phiMap[L->getPointerOperand()];
-  for (SmallPtrSet<Instruction*, 4>::iterator I = p.begin(), E = p.end();
-       I != E; ++I) {
-    if ((*I)->getParent() == L->getParent()) {
-      L->replaceAllUsesWith(*I);
-      toErase.push_back(L);
-      NumGVNLoad++;
-      return true;
+  // If we have no predecessors that produce a known value for this load, exit
+  // early.
+  if (ValuesPerBlock.empty()) return false;
+
+  // If all of the instructions we depend on produce a known value for this
+  // load, then it is fully redundant and we can use PHI insertion to compute
+  // its value.  Insert PHIs and remove the fully redundant value now.
+  if (UnavailableBlocks.empty()) {
+    // Use cached PHI construction information from previous runs
+    SmallPtrSet<Instruction*, 4> &p = phiMap[LI->getPointerOperand()];
+    for (SmallPtrSet<Instruction*, 4>::iterator I = p.begin(), E = p.end();
+         I != E; ++I) {
+      if ((*I)->getParent() == LI->getParent()) {
+        DEBUG(cerr << "GVN REMOVING NONLOCAL LOAD #1: " << *LI);
+        LI->replaceAllUsesWith(*I);
+        toErase.push_back(LI);
+        NumGVNLoad++;
+        return true;
+      }
+
+      ValuesPerBlock.push_back(std::make_pair((*I)->getParent(), *I));
     }
-    repl.insert(std::make_pair((*I)->getParent(), *I));
+    DEBUG(cerr << "GVN REMOVING NONLOCAL LOAD: " << *LI);
+
+    DenseMap<BasicBlock*, Value*> BlockReplValues;
+    BlockReplValues.insert(ValuesPerBlock.begin(), ValuesPerBlock.end());
+    // Perform PHI construction.
+    Value* v = GetValueForBlock(LI->getParent(), LI, BlockReplValues, true);
+    LI->replaceAllUsesWith(v);
+    toErase.push_back(LI);
+    NumGVNLoad++;
+    return true;
   }
+
+  if (!EnablePRE || !EnableLoadPRE)
+    return false;
 
-  DEBUG(cerr << "GVN REMOVING NONLOCAL LOAD: " << *L);
+  // Okay, we have *some* definitions of the value.  This means that the value
+  // is available in some of our (transitive) predecessors.  Lets think about
+  // doing PRE of this load.  This will involve inserting a new load into the
+  // predecessor when it's not available.  We could do this in general, but
+  // prefer to not increase code size.  As such, we only do this when we know
+  // that we only have to insert *one* load (which means we're basically moving
+  // the load, not inserting a new one).
+
+  // Everything we do here is based on local predecessors of LI's block.  If it
+  // only has one predecessor, bail now.
+  BasicBlock *LoadBB = LI->getParent();
+  if (LoadBB->getSinglePredecessor())
+    return false;
+
+  // If we have a repl set with LI itself in it, this means we have a loop where
+  // at least one of the values is LI.  Since this means that we won't be able
+  // to eliminate LI even if we insert uses in the other predecessors, we will
+  // end up increasing code size.  Reject this by scanning for LI.
+  for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+    if (ValuesPerBlock[i].second == LI)
+      return false;
+
+  // Okay, we have some hope :).  Check to see if the loaded value is fully
+  // available in all but one predecessor.
+  // FIXME: If we could restructure the CFG, we could make a common pred with
+  // all the preds that don't have an available LI and insert a new load into
+  // that one block.
+  BasicBlock *UnavailablePred = 0;
-  // Perform PHI construction
-  SmallPtrSet<BasicBlock*, 4> visited;
-  Value* v = GetValueForBlock(L->getParent(), L, repl, true);
-  L->replaceAllUsesWith(v);
-  toErase.push_back(L);
-  NumGVNLoad++;
+  DenseMap<BasicBlock*, bool> FullyAvailableBlocks;
+  for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+    FullyAvailableBlocks[ValuesPerBlock[i].first] = true;
+  for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
+    FullyAvailableBlocks[UnavailableBlocks[i]] = false;
+
+  for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
+       PI != E; ++PI) {
+    if (IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks))
+      continue;
+
+    // If this load is not available in multiple predecessors, reject it.
+    if (UnavailablePred && UnavailablePred != *PI)
+      return false;
+    UnavailablePred = *PI;
+  }
+
+  assert(UnavailablePred != 0 &&
+         "Fully available value should be eliminated above!");
+
+  // If the loaded pointer is PHI node defined in this block, do PHI translation
+  // to get its value in the predecessor.
+  Value *LoadPtr = LI->getOperand(0)->DoPHITranslation(LoadBB, UnavailablePred);
+
+  // Make sure the value is live in the predecessor.  If it was defined by a
+  // non-PHI instruction in this block, we don't know how to recompute it above.
+  if (Instruction *LPInst = dyn_cast<Instruction>(LoadPtr))
+    if (!DT->dominates(LPInst->getParent(), UnavailablePred)) {
+      DEBUG(cerr << "COULDN'T PRE LOAD BECAUSE PTR IS UNAVAILABLE IN PRED: "
+                 << *LPInst << *LI << "\n");
+      return false;
+    }
+
+  // We don't currently handle critical edges :(
+  if (UnavailablePred->getTerminator()->getNumSuccessors() != 1) {
+    DEBUG(cerr << "COULD NOT PRE LOAD BECAUSE OF CRITICAL EDGE '"
+               << UnavailablePred->getName() << "': " << *LI);
+    return false;
+  }
+
+  // Okay, we can eliminate this load by inserting a reload in the predecessor
+  // and using PHI construction to get the value in the other predecessors, do
+  // it.
+  DEBUG(cerr << "GVN REMOVING PRE LOAD: " << *LI);
+
+  Value *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
+                                LI->getAlignment(),
+                                UnavailablePred->getTerminator());
+
+  DenseMap<BasicBlock*, Value*> BlockReplValues;
+  BlockReplValues.insert(ValuesPerBlock.begin(), ValuesPerBlock.end());
+  BlockReplValues[UnavailablePred] = NewLoad;
+
+  // Perform PHI construction.
+  Value* v = GetValueForBlock(LI->getParent(), LI, BlockReplValues, true);
+  LI->replaceAllUsesWith(v);
+  toErase.push_back(LI);
+  NumPRELoad++;
   return true;
 }
diff --git a/test/Transforms/GVN/pre-load.ll b/test/Transforms/GVN/pre-load.ll
new file mode 100644
index 00000000000..bc3ec5ba1a1
--- /dev/null
+++ b/test/Transforms/GVN/pre-load.ll
@@ -0,0 +1,18 @@
+; RUN: llvm-as < %s | opt -gvn -enable-load-pre | llvm-dis | grep {%PRE.rle = phi}
+
+define i32 @test(i32* %p, i1 %C) {
+block1:
+  br i1 %C, label %block2, label %block3
+
+block2:
+  br label %block4
+
+block3:
+  %b = bitcast i32 0 to i32
+  store i32 %b, i32* %p
+  br label %block4
+
+block4:
+  %PRE = load i32* %p
+  ret i32 %PRE
+}
-- 
2.34.1
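
As an illustration (not part of the patch itself): in the pre-load.ll test above, block2 is
the one predecessor of block4 in which no value of the load is available, so load PRE should
insert a reload there and merge it with the value stored in block3 via a phi in block4. A
rough sketch of the expected IR follows; the exact value names are assumptions, inferred from
the LI->getName()+".pre" naming in the code and the "%PRE.rle = phi" pattern the test greps
for.

block2:
  %PRE.pre = load i32* %p        ; reload inserted into the unavailable predecessor
  br label %block4

block3:
  %b = bitcast i32 0 to i32
  store i32 %b, i32* %p
  br label %block4

block4:
  ; PHI construction merges the reloaded value with the value stored in block3
  %PRE.rle = phi i32 [ %PRE.pre, %block2 ], [ %b, %block3 ]
  ret i32 %PRE.rle

Note that the block2->block4 edge is non-critical (block2 has a single successor), which is
exactly what the getNumSuccessors() != 1 check above requires; if block2 branched to a second
successor, this PRE would currently be rejected.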