From c05d30601ced172b55be81bb529df6be91d6ae15 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 6 Sep 2012 09:17:37 +0000 Subject: [PATCH] Add a new optimization pass: Stack Coloring, that merges disjoint static allocations (allocas). Allocas are known to be disjoint if they are marked by disjoint lifetime markers (@llvm.lifetime.XXX intrinsics). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@163299 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/ReleaseNotes.html | 7 + include/llvm/ADT/BitVector.h | 21 +- include/llvm/CodeGen/ISDOpcodes.h | 4 + include/llvm/CodeGen/MachineFrameInfo.h | 24 +- include/llvm/CodeGen/Passes.h | 4 + include/llvm/InitializePasses.h | 1 + include/llvm/Target/Target.td | 12 + include/llvm/Target/TargetOpcodes.h | 6 +- lib/CodeGen/CMakeLists.txt | 1 + lib/CodeGen/CodeGen.cpp | 1 + lib/CodeGen/MachineFunction.cpp | 4 +- lib/CodeGen/Passes.cpp | 4 + .../SelectionDAG/FunctionLoweringInfo.cpp | 2 +- lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 11 + .../SelectionDAG/ScheduleDAGRRList.cpp | 2 + lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 + .../SelectionDAG/SelectionDAGBuilder.cpp | 21 +- .../SelectionDAG/SelectionDAGDumper.cpp | 2 + lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 13 +- lib/CodeGen/StackColoring.cpp | 657 ++++++++++++++++++ test/CodeGen/X86/StackColoring.ll | 272 ++++++++ utils/TableGen/CodeGenTarget.cpp | 2 + 22 files changed, 1057 insertions(+), 16 deletions(-) create mode 100644 lib/CodeGen/StackColoring.cpp create mode 100644 test/CodeGen/X86/StackColoring.ll diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html index e31bad77475..75a6fd1ca10 100644 --- a/docs/ReleaseNotes.html +++ b/docs/ReleaseNotes.html @@ -499,6 +499,13 @@ Release Notes.
+

Stack Coloring - We have implemented a new optimization pass + to merge stack objects which are used in disjoin areas of the code. + This optimization reduces the required stack space significantly, in cases + where it is clear to the optimizer that the stack slot is not shared. + We use the lifetime markers to tell the codegen that a certain alloca + is used within a region.

+

We have put a significant amount of work into the code generator infrastructure, which allows us to implement more aggressive algorithms and make it run faster:

diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h index df896b98f4d..26ec346b182 100644 --- a/include/llvm/ADT/BitVector.h +++ b/include/llvm/ADT/BitVector.h @@ -311,7 +311,7 @@ public: return !(*this == RHS); } - // Intersection, union, disjoint union. + /// Intersection, union, disjoint union. BitVector &operator&=(const BitVector &RHS) { unsigned ThisWords = NumBitWords(size()); unsigned RHSWords = NumBitWords(RHS.size()); @@ -328,7 +328,7 @@ public: return *this; } - // reset - Reset bits that are set in RHS. Same as *this &= ~RHS. + /// reset - Reset bits that are set in RHS. Same as *this &= ~RHS. BitVector &reset(const BitVector &RHS) { unsigned ThisWords = NumBitWords(size()); unsigned RHSWords = NumBitWords(RHS.size()); @@ -338,6 +338,23 @@ public: return *this; } + /// test - Check if (This - RHS) is zero. + /// This is the same as reset(RHS) and any(). + bool test(const BitVector &RHS) const { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + if ((Bits[i] & ~RHS.Bits[i]) != 0) + return true; + + for (; i != ThisWords ; ++i) + if (Bits[i] != 0) + return true; + + return false; + } + BitVector &operator|=(const BitVector &RHS) { if (size() < RHS.size()) resize(RHS.size()); diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index f387bd518f1..5d0a3b4c706 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -637,6 +637,10 @@ namespace ISD { ATOMIC_LOAD_UMIN, ATOMIC_LOAD_UMAX, + /// This corresponds to the llvm.lifetime.* intrinsics. The first operand + /// is the chain and the second operand is the alloca pointer. + LIFETIME_START, LIFETIME_END, + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h index 8b958e437ed..3c07cebfcc6 100644 --- a/include/llvm/CodeGen/MachineFrameInfo.h +++ b/include/llvm/CodeGen/MachineFrameInfo.h @@ -28,6 +28,7 @@ class MachineFunction; class MachineBasicBlock; class TargetFrameLowering; class BitVector; +class Value; /// The CalleeSavedInfo class tracks the information need to locate where a /// callee saved register is in the current frame. @@ -103,14 +104,18 @@ class MachineFrameInfo { // protector. bool MayNeedSP; + /// Alloca - If this stack object is originated from an Alloca instruction + /// this value saves the original IR allocation. Can be NULL. + const Value *Alloca; + // PreAllocated - If true, the object was mapped into the local frame // block and doesn't need additional handling for allocation beyond that. bool PreAllocated; StackObject(uint64_t Sz, unsigned Al, int64_t SP, bool IM, - bool isSS, bool NSP) + bool isSS, bool NSP, const Value *Val) : SPOffset(SP), Size(Sz), Alignment(Al), isImmutable(IM), - isSpillSlot(isSS), MayNeedSP(NSP), PreAllocated(false) {} + isSpillSlot(isSS), MayNeedSP(NSP), Alloca(Val), PreAllocated(false) {} }; /// Objects - The list of stack objects allocated... @@ -362,6 +367,14 @@ public: ensureMaxAlignment(Align); } + /// getObjectAllocation - Return the underlying Alloca of the specified + /// stack object if it exists. Returns 0 if none exists. + const Value* getObjectAllocation(int ObjectIdx) const { + assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() && + "Invalid Object Idx!"); + return Objects[ObjectIdx+NumFixedObjects].Alloca; + } + /// NeedsStackProtector - Returns true if the object may need stack /// protectors. bool MayNeedStackProtector(int ObjectIdx) const { @@ -482,9 +495,10 @@ public: /// a nonnegative identifier to represent it. /// int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS, - bool MayNeedSP = false) { + bool MayNeedSP = false, const Value *Alloca = 0) { assert(Size != 0 && "Cannot allocate zero size stack objects!"); - Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP)); + Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP, + Alloca)); int Index = (int)Objects.size() - NumFixedObjects - 1; assert(Index >= 0 && "Bad frame index!"); ensureMaxAlignment(Alignment); @@ -516,7 +530,7 @@ public: /// int CreateVariableSizedObject(unsigned Alignment) { HasVarSizedObjects = true; - Objects.push_back(StackObject(0, Alignment, 0, false, false, true)); + Objects.push_back(StackObject(0, Alignment, 0, false, false, true, 0)); ensureMaxAlignment(Alignment); return (int)Objects.size()-NumFixedObjects-1; } diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 07b3b45873a..7bd576494ef 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -404,6 +404,10 @@ namespace llvm { /// inserting cmov instructions. extern char &EarlyIfConverterID; + /// StackSlotColoring - This pass performs stack coloring and merging. + /// It merges disjoint allocas to reduce the stack size. + extern char &StackColoringID; + /// IfConverter - This pass performs machine code if conversion. extern char &IfConverterID; diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 994311334cd..bede4eb621a 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -232,6 +232,7 @@ void initializeSinkingPass(PassRegistry&); void initializeSlotIndexesPass(PassRegistry&); void initializeSpillPlacementPass(PassRegistry&); void initializeStackProtectorPass(PassRegistry&); +void initializeStackColoringPass(PassRegistry&); void initializeStackSlotColoringPass(PassRegistry&); void initializeStripDeadDebugInfoPass(PassRegistry&); void initializeStripDeadPrototypesPassPass(PassRegistry&); diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index f0458ecba6d..52b491d4349 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -745,6 +745,18 @@ def BUNDLE : Instruction { let InOperandList = (ins variable_ops); let AsmString = "BUNDLE"; } +def LIFETIME_START : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins i32imm:$id); + let AsmString = "LIFETIME_START"; + let neverHasSideEffects = 1; +} +def LIFETIME_END : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins i32imm:$id); + let AsmString = "LIFETIME_END"; + let neverHasSideEffects = 1; +} } //===----------------------------------------------------------------------===// diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h index f0b181e345b..516e0706b89 100644 --- a/include/llvm/Target/TargetOpcodes.h +++ b/include/llvm/Target/TargetOpcodes.h @@ -87,7 +87,11 @@ namespace TargetOpcode { /// BUNDLE - This instruction represents an instruction bundle. Instructions /// which immediately follow a BUNDLE instruction which are marked with /// 'InsideBundle' flag are inside the bundle. - BUNDLE + BUNDLE = 14, + + /// Lifetime markers. + LIFETIME_START = 15, + LIFETIME_END = 16 }; } // end namespace TargetOpcode } // end namespace llvm diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 2e189ad7e7d..386509b702e 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -95,6 +95,7 @@ add_llvm_library(LLVMCodeGen SplitKit.cpp StackProtector.cpp StackSlotColoring.cpp + StackColoring.cpp StrongPHIElimination.cpp TailDuplication.cpp TargetFrameLoweringImpl.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index fb2c2e83f1b..65f09412876 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -56,6 +56,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRegisterCoalescerPass(Registry); initializeSlotIndexesPass(Registry); initializeStackProtectorPass(Registry); + initializeStackColoringPass(Registry); initializeStackSlotColoringPass(Registry); initializeStrongPHIEliminationPass(Registry); initializeTailDuplicatePassPass(Registry); diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 8d6452f7141..70d1bb32b1b 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -458,7 +458,9 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, unsigned StackAlign = TFI.getStackAlignment(); unsigned Align = MinAlign(SPOffset, StackAlign); Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, - /*isSS*/false, false)); + /*isSS*/ false, + /*NeedSP*/ false, + /*Alloca*/ 0)); return -++NumFixedObjects; } diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 9564b5dbe1e..a6dd5decbea 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -529,6 +529,10 @@ void TargetPassConfig::addMachineSSAOptimization() { // instructions dead. addPass(&OptimizePHIsID); + // This pass merges large allocas. StackSlotColoring is a different pass + // which merges spill slots. + addPass(&StackColoringID); + // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. addPass(&LocalStackSlotAllocationID); diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 3e18ea7ac95..b2a2a5cb255 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -97,7 +97,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) { cast(Ty)->getElementType()->isIntegerTy(8))); StaticAllocaMap[AI] = MF->getFrameInfo()->CreateStackObject(TySize, Align, false, - MayNeedSP); + MayNeedSP, AI); } for (; BB != EB; ++BB) diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 46ddab0d761..6d2cdeabd15 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -873,6 +873,17 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, break; } + case ISD::LIFETIME_START: + case ISD::LIFETIME_END: { + unsigned TarOp = (Node->getOpcode() == ISD::LIFETIME_START) ? + TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END; + + FrameIndexSDNode *FI = dyn_cast(Node->getOperand(1)); + BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TarOp)) + .addFrameIndex(FI->getIndex()); + break; + } + case ISD::INLINEASM: { unsigned NumOps = Node->getNumOperands(); if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue) diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index bf0a43785b7..a8f6f0dff80 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -656,6 +656,8 @@ void ScheduleDAGRRList::EmitNode(SUnit *SU) { break; case ISD::MERGE_VALUES: case ISD::TokenFactor: + case ISD::LIFETIME_START: + case ISD::LIFETIME_END: case ISD::CopyToReg: case ISD::CopyFromReg: case ISD::EH_LABEL: diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 74500d517b7..928385a0ad1 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4161,6 +4161,8 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList, assert((Opcode == ISD::INTRINSIC_VOID || Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::PREFETCH || + Opcode == ISD::LIFETIME_START || + Opcode == ISD::LIFETIME_END || (Opcode <= INT_MAX && (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) && "Opcode is not a memory-accessing opcode!"); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 31d633c89b4..b587884b504 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Constants.h" #include "llvm/CallingConv.h" #include "llvm/DebugInfo.h" @@ -5215,14 +5216,30 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { rw==1)); /* write */ return 0; } + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: { + SDValue Ops[2]; + AllocaInst *LifetimeObject =dyn_cast_or_null( + GetUnderlyingObject(I.getArgOperand(1), TD)); + // Could not find an Alloca. + if (!LifetimeObject) + return 0; + + int FI = FuncInfo.StaticAllocaMap[LifetimeObject]; + Ops[0] = getRoot(); + Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true); + bool IsStart = (Intrinsic == Intrinsic::lifetime_start); + unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END); + Res = DAG.getNode(Opcode, dl, MVT::Other, Ops, 2); + DAG.setRoot(Res); + return 0; + } case Intrinsic::invariant_start: - case Intrinsic::lifetime_start: // Discard region information. setValue(&I, DAG.getUNDEF(TLI.getPointerTy())); return 0; case Intrinsic::invariant_end: - case Intrinsic::lifetime_end: // Discard region information. return 0; case Intrinsic::donothing: diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 2f41e648424..75989ad48fa 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -267,6 +267,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::STACKRESTORE: return "stackrestore"; case ISD::TRAP: return "trap"; case ISD::DEBUGTRAP: return "debugtrap"; + case ISD::LIFETIME_START: return "lifetime.start"; + case ISD::LIFETIME_END: return "lifetime.end"; // Bit manipulation case ISD::BSWAP: return "bswap"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 1f5f8250266..754294129a8 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1801,10 +1801,13 @@ WalkChainUsers(const SDNode *ChainedNode, User->getOpcode() == ISD::HANDLENODE) // Root of the graph. continue; - if (User->getOpcode() == ISD::CopyToReg || - User->getOpcode() == ISD::CopyFromReg || - User->getOpcode() == ISD::INLINEASM || - User->getOpcode() == ISD::EH_LABEL) { + unsigned UserOpcode = User->getOpcode(); + if (UserOpcode == ISD::CopyToReg || + UserOpcode == ISD::CopyFromReg || + UserOpcode == ISD::INLINEASM || + UserOpcode == ISD::EH_LABEL || + UserOpcode == ISD::LIFETIME_START || + UserOpcode == ISD::LIFETIME_END) { // If their node ID got reset to -1 then they've already been selected. // Treat them like a MachineOpcode. if (User->getNodeId() == -1) @@ -2220,6 +2223,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, case ISD::CopyFromReg: case ISD::CopyToReg: case ISD::EH_LABEL: + case ISD::LIFETIME_START: + case ISD::LIFETIME_END: NodeToMatch->setNodeId(-1); // Mark selected. return 0; case ISD::AssertSext: diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp new file mode 100644 index 00000000000..f1818f642c2 --- /dev/null +++ b/lib/CodeGen/StackColoring.cpp @@ -0,0 +1,657 @@ +//===-- StackColoring.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements the stack-coloring optimization that looks for +// lifetime markers machine instructions (LIFESTART_BEGIN and LIFESTART_END), +// which represent the possible lifetime of stack slots. It attempts to +// merge disjoint stack slots and reduce the used stack space. +// NOTE: This pass is not StackSlotColoring, which optimizes spill slots. +// +// TODO: In the future we plan to improve stack coloring in the following ways: +// 1. Allow merging multiple small slots into a single larger slot at different +// offsets. +// 2. Merge this pass with StackSlotColoring and allow merging of allocas with +// spill slots. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "stackcoloring" +#include "MachineTraceMetrics.h" +#include "llvm/Function.h" +#include "llvm/Module.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SparseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/DebugInfo.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt +DisableColoring("no-stack-coloring", + cl::init(false), cl::Hidden, + cl::desc("Suppress stack coloring")); + +STATISTIC(NumMarkerSeen, "Number of life markers found."); +STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots."); +STATISTIC(StackSlotMerged, "Number of stack slot merged."); + +//===----------------------------------------------------------------------===// +// StackColoring Pass +//===----------------------------------------------------------------------===// + +namespace { +/// StackColoring - A machine pass for merging disjoint stack allocations, +/// marked by the LIFETIME_START and LIFETIME_END pseudo instructions. +class StackColoring : public MachineFunctionPass { + MachineFrameInfo *MFI; + MachineFunction *MF; + + /// A class representing liveness information for a single basic block. + /// Each bit in the BitVector represents the liveness property + /// for a different stack slot. + struct BlockLifetimeInfo { + /// Which slots BEGINs in each basic block. + BitVector Begin; + /// Which slots ENDs in each basic block. + BitVector End; + /// Which slots are marked as LIVE_IN, coming into each basic block. + BitVector LiveIn; + /// Which slots are marked as LIVE_OUT, coming out of each basic block. + BitVector LiveOut; + }; + + /// Maps active slots (per bit) for each basic block. + DenseMap BlockLiveness; + + /// Maps serial numbers to basic blocks. + DenseMap BasicBlocks; + /// Maps basic blocks to a serial number. + SmallVector BasicBlockNumbering; + + /// Maps liveness intervals for each slot. + SmallVector Intervals; + /// VNInfo is used for the construction of LiveIntervals. + VNInfo::Allocator VNInfoAllocator; + /// SlotIndex analysis object. + SlotIndexes* Indexes; + + /// The list of lifetime markers found. These markers are to be removed + /// once the coloring is done. + SmallVector Markers; + + /// SlotSizeSorter - A Sort utility for arranging stack slots according + /// to their size. + struct SlotSizeSorter { + MachineFrameInfo *MFI; + SlotSizeSorter(MachineFrameInfo *mfi) : MFI(mfi) { } + bool operator()(int LHS, int RHS) { + // We use -1 to denote a uninteresting slot. Place these slots at the end. + if (LHS == -1) return false; + if (RHS == -1) return true; + // Sort according to size. + return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS); + } +}; + +public: + static char ID; + StackColoring() : MachineFunctionPass(ID) { + initializeStackColoringPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnMachineFunction(MachineFunction &MF); + +private: + /// Debug. + void dump(); + + /// Removes all of the lifetime marker instructions from the function. + /// \returns true if any markers were removed. + bool removeAllMarkers(); + + /// Scan the machine function and find all of the lifetime markers. + /// Record the findings in the BEGIN and END vectors. + /// \returns the number of markers found. + unsigned collectMarkers(unsigned NumSlot); + + /// Perform the dataflow calculation and calculate the lifetime for each of + /// the slots, based on the BEGIN/END vectors. Set the LifetimeLIVE_IN and + /// LifetimeLIVE_OUT maps that represent which stack slots are live coming + /// in and out blocks. + void calculateLocalLiveness(); + + /// Construct the LiveIntervals for the slots. + void calculateLiveIntervals(unsigned NumSlots); + + /// Go over the machine function and change instructions which use stack + /// slots to use the joint slots. + void remapInstructions(DenseMap &SlotRemap); + + /// Map entries which point to other entries to their destination. + /// A->B->C becomes A->C. + void expungeSlotMap(DenseMap &SlotRemap, unsigned NumSlots); +}; +} // end anonymous namespace + +char StackColoring::ID = 0; +char &llvm::StackColoringID = StackColoring::ID; + +INITIALIZE_PASS_BEGIN(StackColoring, + "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(StackColoring, + "stack-coloring", "Merge disjoint stack slots", false, false) + +void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +void StackColoring::dump() { + for (df_iterator FI = df_begin(MF), FE = df_end(MF); + FI != FE; ++FI) { + unsigned Num = BasicBlocks[*FI]; + DEBUG(dbgs()<<"Inspecting block #"<getName()<<"]\n"); + Num = 0; + DEBUG(dbgs()<<"BEGIN : {"); + for (unsigned i=0; i < BlockLiveness[*FI].Begin.size(); ++i) + DEBUG(dbgs()< FI = df_begin(MF), FE = df_end(MF); + FI != FE; ++FI) { + + // Assign a serial number to this basic block. + BasicBlocks[*FI] = BasicBlockNumbering.size();; + BasicBlockNumbering.push_back(*FI); + + BlockLiveness[*FI].Begin.resize(NumSlot); + BlockLiveness[*FI].End.resize(NumSlot); + + for (MachineBasicBlock::iterator BI = (*FI)->begin(), BE = (*FI)->end(); + BI != BE; ++BI) { + + if (BI->getOpcode() != TargetOpcode::LIFETIME_START && + BI->getOpcode() != TargetOpcode::LIFETIME_END) + continue; + + Markers.push_back(BI); + + bool IsStart = BI->getOpcode() == TargetOpcode::LIFETIME_START; + MachineOperand &MI = BI->getOperand(0); + unsigned Slot = MI.getIndex(); + + MarkersFound++; + + const Value* Allocation = MFI->getObjectAllocation(Slot); + if (Allocation) { + DEBUG(dbgs()<<"Found lifetime marker for allocation: "<< + Allocation->getName()<<"\n"); + } + + if (IsStart) { + BlockLiveness[*FI].Begin.set(Slot); + } else { + if (BlockLiveness[*FI].Begin.test(Slot)) { + // Allocas that start and end within a single block are handled + // specially when computing the LiveIntervals to avoid pessimizing + // the liveness propagation. + BlockLiveness[*FI].Begin.reset(Slot); + } else { + BlockLiveness[*FI].End.set(Slot); + } + } + } + } + + // Update statistics. + NumMarkerSeen += MarkersFound; + return MarkersFound; +} + +void StackColoring::calculateLocalLiveness() { + // Perform a standard reverse dataflow computation to solve for + // global liveness. The BEGIN set here is equivalent to KILL in the standard + // formulation, and END is equivalent to GEN. The result of this computation + // is a map from blocks to bitvectors where the bitvectors represent which + // allocas are live in/out of that block. + SmallPtrSet BBSet(BasicBlockNumbering.begin(), + BasicBlockNumbering.end()); + unsigned NumSSMIters = 0; + bool changed = true; + while (changed) { + changed = false; + ++NumSSMIters; + + SmallPtrSet NextBBSet; + + for (SmallVector::iterator + PI = BasicBlockNumbering.begin(), PE = BasicBlockNumbering.end(); + PI != PE; ++PI) { + + MachineBasicBlock *BB = *PI; + if (!BBSet.count(BB)) continue; + + BitVector LocalLiveIn; + BitVector LocalLiveOut; + + // Forward propagation from begins to ends. + for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(), + PE = BB->pred_end(); PI != PE; ++PI) + LocalLiveIn |= BlockLiveness[*PI].LiveOut; + LocalLiveIn |= BlockLiveness[BB].End; + LocalLiveIn.reset(BlockLiveness[BB].Begin); + + // Reverse propagation from ends to begins. + for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + SE = BB->succ_end(); SI != SE; ++SI) + LocalLiveOut |= BlockLiveness[*SI].LiveIn; + LocalLiveOut |= BlockLiveness[BB].Begin; + LocalLiveOut.reset(BlockLiveness[BB].End); + + LocalLiveIn |= LocalLiveOut; + LocalLiveOut |= LocalLiveIn; + + // After adopting the live bits, we need to turn-off the bits which + // are de-activated in this block. + LocalLiveOut.reset(BlockLiveness[BB].End); + LocalLiveIn.reset(BlockLiveness[BB].Begin); + + if (LocalLiveIn.test(BlockLiveness[BB].LiveIn)) { + changed = true; + BlockLiveness[BB].LiveIn |= LocalLiveIn; + + for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(), + PE = BB->pred_end(); PI != PE; ++PI) + NextBBSet.insert(*PI); + } + + if (LocalLiveOut.test(BlockLiveness[BB].LiveOut)) { + changed = true; + BlockLiveness[BB].LiveOut |= LocalLiveOut; + + for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + SE = BB->succ_end(); SI != SE; ++SI) + NextBBSet.insert(*SI); + } + } + + BBSet = NextBBSet; + }// while changed. +} + +void StackColoring::calculateLiveIntervals(unsigned NumSlots) { + SmallVector Starts; + SmallVector Finishes; + + // For each block, find which slots are active within this block + // and update the live intervals. + for (MachineFunction::iterator MBB = MF->begin(), MBBe = MF->end(); + MBB != MBBe; ++MBB) { + Starts.clear(); + Starts.resize(NumSlots); + Finishes.clear(); + Finishes.resize(NumSlots); + + BitVector Alive = BlockLiveness[MBB].LiveIn; + Alive |= BlockLiveness[MBB].LiveOut; + + if (Alive.any()) { + for (int pos = Alive.find_first(); pos != -1; + pos = Alive.find_next(pos)) { + Starts[pos] = Indexes->getMBBStartIdx(MBB); + Finishes[pos] = Indexes->getMBBEndIdx(MBB); + } + } + + for (SmallVector::iterator it = Markers.begin(), + e = Markers.end(); it != e; ++it) { + MachineInstr *MI = *it; + assert((MI->getOpcode() == TargetOpcode::LIFETIME_START || + MI->getOpcode() == TargetOpcode::LIFETIME_END) && + "Invalid Lifetime marker"); + + if (MI->getParent() == MBB) { + bool IsStart = MI->getOpcode() == TargetOpcode::LIFETIME_START; + MachineOperand &Mo = MI->getOperand(0); + int Slot = Mo.getIndex(); + assert(Slot >= 0 && "Invalid slot"); + if (IsStart) { + Starts[Slot] = Indexes->getInstructionIndex(MI); + } else { + Finishes[Slot] = Indexes->getInstructionIndex(MI); + } + } + } + + for (unsigned i = 0; i < NumSlots; ++i) { + assert(!!Starts[i] == !!Finishes[i] && "Unmatched range"); + if (Starts[i] == Finishes[i]) + continue; + + assert(Starts[i] && Finishes[i] && "Invalid interval"); + VNInfo *ValNum = Intervals[i]->getValNumInfo(0); + SlotIndex S = Starts[i]; + SlotIndex F = Finishes[i]; + if (S < F) { + // We have a single consecutive region. + Intervals[i]->addRange(LiveRange(S, F, ValNum)); + } else { + // We have two non consecutive regions. This happens when + // LIFETIME_START appears after the LIFETIME_END marker. + SlotIndex NewStart = Indexes->getMBBStartIdx(MBB); + SlotIndex NewFin = Indexes->getMBBEndIdx(MBB); + Intervals[i]->addRange(LiveRange(NewStart, F, ValNum)); + Intervals[i]->addRange(LiveRange(S, NewFin, ValNum)); + } + } + } +} + +bool StackColoring::removeAllMarkers() { + unsigned Count = 0; + for (unsigned i = 0; i < Markers.size(); ++i) { + Markers[i]->eraseFromParent(); + Count++; + } + Markers.clear(); + + DEBUG(dbgs()<<"Removed "< &SlotRemap) { + unsigned FixedInstr = 0; + unsigned FixedMemOp = 0; + unsigned FixedDbg = 0; + MachineModuleInfo *MMI = &MF->getMMI(); + + // Remap debug information that refers to stack slots. + MachineModuleInfo::VariableDbgInfoMapTy &VMap = MMI->getVariableDbgInfo(); + for (MachineModuleInfo::VariableDbgInfoMapTy::iterator VI = VMap.begin(), + VE = VMap.end(); VI != VE; ++VI) { + const MDNode *Var = VI->first; + if (!Var) continue; + std::pair &VP = VI->second; + if (SlotRemap.count(VP.first)) { + DEBUG(dbgs()<<"Remapping debug info for ["<getName()<<"].\n"); + VP.first = SlotRemap[VP.first]; + FixedDbg++; + } + } + + // Keep a list of *allocas* which need to be remapped. + DenseMap Allocas; + for (DenseMap::iterator it = SlotRemap.begin(), + e = SlotRemap.end(); it != e; ++it) { + const Value* From = MFI->getObjectAllocation(it->first); + const Value* To = MFI->getObjectAllocation(it->second); + assert(To && From && "Invalid allocation object"); + Allocas[From] = To; + } + + // Remap all instructions to the new stack slots. + MachineFunction::iterator BB, BBE; + MachineBasicBlock::iterator I, IE; + for (BB = MF->begin(), BBE = MF->end(); BB != BBE; ++BB) + for (I = BB->begin(), IE = BB->end(); I != IE; ++I) { + + // Update the MachineMemOperand to use the new alloca. + for (MachineInstr::mmo_iterator MM = I->memoperands_begin(), + E = I->memoperands_end(); MM != E; ++MM) { + MachineMemOperand *MMO = *MM; + + const Value *V = MMO->getValue(); + + if (!V) + continue; + + // Climb up and find the original alloca. + V = GetUnderlyingObject(V); + // If we did not find one, or if the one that we found is not in our + // map, then move on. + if (!V || !Allocas.count(V)) + continue; + + MMO->setValue(Allocas[V]); + FixedMemOp++; + } + + // Update all of the machine instruction operands. + for (unsigned i = 0 ; i < I->getNumOperands(); ++i) { + MachineOperand &MO = I->getOperand(i); + + if (!MO.isFI()) + continue; + int FromSlot = MO.getIndex(); + + // Don't touch arguments. + if (FromSlot<0) + continue; + + // Only look at mapped slots. + if (!SlotRemap.count(FromSlot)) + continue; + + // Fix the machine instructions. + int ToSlot = SlotRemap[FromSlot]; + MO.setIndex(ToSlot); + FixedInstr++; + } + } + + DEBUG(dbgs()<<"Fixed "< &SlotRemap, + unsigned NumSlots) { + // Expunge slot remap map. + for (unsigned i=0; i < NumSlots; ++i) { + // If we are remapping i + if (SlotRemap.count(i)) { + int Target = SlotRemap[i]; + // As long as our target is mapped to something else, follow it. + while (SlotRemap.count(Target)) { + Target = SlotRemap[Target]; + SlotRemap[i] = Target; + } + } + } +} + +bool StackColoring::runOnMachineFunction(MachineFunction &Func) { + DEBUG(dbgs() << "********** Stack Coloring **********\n" + << "********** Function: " + << ((Value*)Func.getFunction())->getName() << '\n'); + MF = &Func; + MFI = MF->getFrameInfo(); + Indexes = &getAnalysis(); + BlockLiveness.clear(); + BasicBlocks.clear(); + BasicBlockNumbering.clear(); + Markers.clear(); + Intervals.clear(); + VNInfoAllocator.Reset(); + + unsigned NumSlots = MFI->getObjectIndexEnd(); + + // If there are no stack slots then there are no markers to remove. + if (!NumSlots) + return false; + + SmallVector SortedSlots; + + SortedSlots.reserve(NumSlots); + Intervals.reserve(NumSlots); + + unsigned NumMarkers = collectMarkers(NumSlots); + + unsigned TotalSize = 0; + DEBUG(dbgs()<<"Found "<getObjectIndexEnd(); ++i) { + DEBUG(dbgs()<<"Slot #"<getObjectSize(i)<<" bytes.\n"); + TotalSize += MFI->getObjectSize(i); + } + + DEBUG(dbgs()<<"Total Stack size: "<getNextValue(Indexes->getZeroIndex(), VNInfoAllocator); + SortedSlots.push_back(i); + } + + // Calculate the liveness of each block. + calculateLocalLiveness(); + + // Propagate the liveness information. + calculateLiveIntervals(NumSlots); + + // Maps old slots to new slots. + DenseMap SlotRemap; + unsigned RemovedSlots = 0; + unsigned ReducedSize = 0; + + // Do not bother looking at empty intervals. + for (unsigned I = 0; I < NumSlots; ++I) { + if (Intervals[SortedSlots[I]]->empty()) + SortedSlots[I] = -1; + } + + // This is a simple greedy algorithm for merging allocas. First, sort the + // slots, placing the largest slots first. Next, perform an n^2 scan and look + // for disjoint slots. When you find disjoint slots, merge the samller one + // into the bigger one and update the live interval. Remove the small alloca + // and continue. + + // Sort the slots according to their size. Place unused slots at the end. + std::sort(SortedSlots.begin(), SortedSlots.end(), SlotSizeSorter(MFI)); + + bool Chanded = true; + while (Chanded) { + Chanded = false; + for (unsigned I = 0; I < NumSlots; ++I) { + if (SortedSlots[I] == -1) + continue; + + for (unsigned J=0; J < NumSlots; ++J) { + if (SortedSlots[J] == -1) + continue; + + int FirstSlot = SortedSlots[I]; + int SecondSlot = SortedSlots[J]; + LiveInterval *First = Intervals[FirstSlot]; + LiveInterval *Second = Intervals[SecondSlot]; + assert (!First->empty() && !Second->empty() && "Found an empty range"); + + // Merge disjoint slots. + if (!First->overlaps(*Second)) { + Chanded = true; + First->MergeRangesInAsValue(*Second, First->getValNumInfo(0)); + SlotRemap[SecondSlot] = FirstSlot; + SortedSlots[J] = -1; + DEBUG(dbgs()<<"Merging #"<getObjectAlignment(FirstSlot), + MFI->getObjectAlignment(SecondSlot)); + + assert(MFI->getObjectSize(FirstSlot) >= + MFI->getObjectSize(SecondSlot) && + "Merging a small object into a larger one"); + + RemovedSlots+=1; + ReducedSize += MFI->getObjectSize(SecondSlot); + MFI->setObjectAlignment(FirstSlot, MaxAlignment); + MFI->RemoveStackObject(SecondSlot); + } + } + } + }// While changed. + + // Record statistics. + StackSpaceSaved += ReducedSize; + StackSlotMerged += RemovedSlots; + DEBUG(dbgs()<<"Merge "< &Insts = getInstructions(); -- 2.34.1