From c34693f6efc670b71e11f3479844c36d9696b535 Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Fri, 11 Oct 2013 18:01:14 +0000 Subject: [PATCH] [DAGCombiner] Slice a big load in two loads when the element are next to each other in memory and the target has paired load and performs post-isel loads combining. E.g., this optimization will transform something like this: a = load i64* addr b = trunc i64 a to i32 c = lshr i64 a, 32 d = trunc i64 c to i32 into: b = load i32* addr1 d = load i32* addr2 Where addr1 = addr2 +/- sizeof(i32), if the target supports paired load and performs post-isel loads combining. One should overload TargetLowering::hasPairedLoad to provide this information. The default is false. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192471 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 29 ++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 576 ++++++++++++++++++++++- test/CodeGen/X86/load-slice.ll | 140 ++++++ 3 files changed, 743 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/load-slice.ll diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 0130e07c49f..1c0ad63ac61 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -1183,6 +1183,35 @@ public: return false; } + /// Return true if the target supplies and combines to a paired load + /// two loaded values of type LoadedType next to each other in memory. + /// RequiredAlignment gives the minimal alignment constraints that must be met to + /// be able to select this paired load. + /// + /// This information is *not* used to generate actual paired loads, but it is used + /// to generate a sequence of loads that is easier to combine into a paired load. 
+ /// For instance, something like this: + /// a = load i64* addr + /// b = trunc i64 a to i32 + /// c = lshr i64 a, 32 + /// d = trunc i64 c to i32 + /// will be optimized into: + /// b = load i32* addr1 + /// d = load i32* addr2 + /// Where addr1 = addr2 +/- sizeof(i32). + /// + /// In other words, unless the target performs a post-isel load combining, this + /// information should not be provided because it will generate more loads. + virtual bool hasPairedLoad(Type * /*LoadedType*/, + unsigned & /*RequiredAlignment*/) const { + return false; + } + + virtual bool hasPairedLoad(EVT /*LoadedType*/, + unsigned & /*RequiredAlignment*/) const { + return false; + } + /// Return true if zero-extending the specific node Val to type VT2 is free /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or /// because it's folded such as X86 zero-extending loads). diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 72e001af5f8..8d6eab7c7b9 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -35,6 +35,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -44,6 +45,7 @@ STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created"); STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created"); STATISTIC(OpsNarrowed , "Number of load/op/store narrowed"); STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int"); +STATISTIC(SlicedLoads, "Number of load sliced"); namespace { static cl::opt<bool> @@ -54,6 +56,14 @@ namespace { CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, cl::desc("Include global information in alias analysis")); + /// Hidden option to stress test load slicing, i.e., when this option + /// is enabled, load slicing
bypasses most of its profitability guards. + static cl::opt<bool> + StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden, + cl::desc("Bypass the profitability model of load " + "slicing"), + cl::init(false)); + //------------------------------ DAGCombiner ---------------------------------// class DAGCombiner { @@ -63,6 +73,7 @@ namespace { CodeGenOpt::Level OptLevel; bool LegalOperations; bool LegalTypes; + bool ForCodeSize; // Worklist of all of the nodes that need to be simplified. // @@ -145,6 +156,7 @@ namespace { bool CombineToPreIndexedLoadStore(SDNode *N); bool CombineToPostIndexedLoadStore(SDNode *N); + bool SliceUpLoad(SDNode *N); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); @@ -316,8 +328,15 @@ namespace { public: DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) - : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), - OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {} + : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), + OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { + AttributeSet FnAttrs = + DAG.getMachineFunction().getFunction()->getAttributes(); + ForCodeSize = + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize) || + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + } /// Run - runs the dag combiner on all nodes in the work list void Run(CombineLevel AtLevel); @@ -7579,9 +7598,562 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); + // Try to slice up N to more direct loads if the slices are mapped to + // different register banks or pairing can take place. + if (SliceUpLoad(N)) + return SDValue(N, 0); + return SDValue(); } +namespace { +/// \brief Helper structure used to slice a load in smaller loads.
+/// Basically a slice is obtained from the following sequence: +/// Origin = load Ty1, Base +/// Shift = srl Ty1 Origin, CstTy Amount +/// Inst = trunc Shift to Ty2 +/// +/// Then, it will be rewritten into: +/// Slice = load SliceTy, Base + SliceOffset +/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2 +/// +/// SliceTy is deduced from the number of bits that are actually used to +/// build Inst. +struct LoadedSlice { + /// \brief Helper structure used to compute the cost of a slice. + struct Cost { + /// Are we optimizing for code size. + bool ForCodeSize; + /// Various costs. + unsigned Loads; + unsigned Truncates; + unsigned CrossRegisterBanksCopies; + unsigned ZExts; + unsigned Shift; + + Cost(bool ForCodeSize = false) + : ForCodeSize(ForCodeSize), Loads(0), Truncates(0), + CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {} + + /// \brief Get the cost of one isolated slice. + Cost(const LoadedSlice &LS, bool ForCodeSize = false) + : ForCodeSize(ForCodeSize), Loads(1), Truncates(0), + CrossRegisterBanksCopies(0), ZExts(0), Shift(0) { + EVT TruncType = LS.Inst->getValueType(0); + EVT LoadedType = LS.getLoadedType(); + if (TruncType != LoadedType && + !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType)) + ZExts = 1; + } + + /// \brief Account for slicing gain in the current cost. + /// Slicing provides a few gains like removing a shift or a + /// truncate. This method allows to grow the cost of the original + /// load with the gain from this slice. + void addSliceGain(const LoadedSlice &LS) { + // Each slice saves a truncate. + const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); + if (!TLI.isTruncateFree(LS.Inst->getValueType(0), + LS.Inst->getOperand(0).getValueType())) + ++Truncates; + // If there is a shift amount, this slice gets rid of it. + if (LS.Shift) + ++Shift; + // If this slice can merge a cross register bank copy, account for it.
+ if (LS.canMergeExpensiveCrossRegisterBankCopy()) + ++CrossRegisterBanksCopies; + } + + Cost &operator+=(const Cost &RHS) { + Loads += RHS.Loads; + Truncates += RHS.Truncates; + CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies; + ZExts += RHS.ZExts; + Shift += RHS.Shift; + return *this; + } + + bool operator==(const Cost &RHS) const { + return Loads == RHS.Loads && Truncates == RHS.Truncates && + CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies && + ZExts == RHS.ZExts && Shift == RHS.Shift; + } + + bool operator!=(const Cost &RHS) const { return !(*this == RHS); } + + bool operator<(const Cost &RHS) const { + // Assume cross register banks copies are as expensive as loads. + // FIXME: Do we want some more target hooks? + unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies; + unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies; + // Unless we are optimizing for code size, consider the + // expensive operation first. + if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS) + return ExpensiveOpsLHS < ExpensiveOpsRHS; + return (Truncates + ZExts + Shift + ExpensiveOpsLHS) < + (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS); + } + + bool operator>(const Cost &RHS) const { return RHS < *this; } + + bool operator<=(const Cost &RHS) const { return !(RHS < *this); } + + bool operator>=(const Cost &RHS) const { return !(*this < RHS); } + }; + // The last instruction that represent the slice. This should be a + // truncate instruction. + SDNode *Inst; + // The original load instruction. + LoadSDNode *Origin; + // The right shift amount in bits from the original load. + unsigned Shift; + // The DAG from which Origin came from. + // This is used to get some contextual information about legal types, etc. 
+ SelectionDAG *DAG; + + LoadedSlice(SDNode *Inst = NULL, LoadSDNode *Origin = NULL, + unsigned Shift = 0, SelectionDAG *DAG = NULL) + : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {} + + LoadedSlice(const LoadedSlice &LS) + : Inst(LS.Inst), Origin(LS.Origin), Shift(LS.Shift), DAG(LS.DAG) {} + + /// \brief Get the bits used in a chunk of bits \p BitWidth large. + /// \return Result is \p BitWidth and has used bits set to 1 and + /// not used bits set to 0. + APInt getUsedBits() const { + // Reproduce the trunc(lshr) sequence: + // - Start from the truncated value. + // - Zero extend to the desired bit width. + // - Shift left. + assert(Origin && "No original load to compare against."); + unsigned BitWidth = Origin->getValueSizeInBits(0); + assert(Inst && "This slice is not bound to an instruction"); + assert(Inst->getValueSizeInBits(0) <= BitWidth && + "Extracted slice is bigger than the whole type!"); + APInt UsedBits(Inst->getValueSizeInBits(0), 0); + UsedBits.setAllBits(); + UsedBits = UsedBits.zext(BitWidth); + UsedBits <<= Shift; + return UsedBits; + } + + /// \brief Get the size of the slice to be loaded in bytes. + unsigned getLoadedSize() const { + unsigned SliceSize = getUsedBits().countPopulation(); + assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte."); + return SliceSize / 8; + } + + /// \brief Get the type that will be loaded for this slice. + /// Note: This may not be the final type for the slice. + EVT getLoadedType() const { + assert(DAG && "Missing context"); + LLVMContext &Ctxt = *DAG->getContext(); + return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8); + } + + /// \brief Get the alignment of the load used for this slice. + unsigned getAlignment() const { + unsigned Alignment = Origin->getAlignment(); + unsigned Offset = getOffsetFromBase(); + if (Offset != 0) + Alignment = MinAlign(Alignment, Alignment + Offset); + return Alignment; + } + + /// \brief Check if this slice can be rewritten with legal operations. 
+ bool isLegal() const { + // An invalid slice is not legal. + if (!Origin || !Inst || !DAG) + return false; + + // Offsets are for indexed load only, we do not handle that. + if (Origin->getOffset().getOpcode() != ISD::UNDEF) + return false; + + const TargetLowering &TLI = DAG->getTargetLoweringInfo(); + + // Check that the type is legal. + EVT SliceType = getLoadedType(); + if (!TLI.isTypeLegal(SliceType)) + return false; + + // Check that the load is legal for this type. + if (!TLI.isOperationLegal(ISD::LOAD, SliceType)) + return false; + + // Check that the offset can be computed. + // 1. Check its type. + EVT PtrType = Origin->getBasePtr().getValueType(); + if (PtrType == MVT::Untyped || PtrType.isExtended()) + return false; + + // 2. Check that it fits in the immediate. + if (!TLI.isLegalAddImmediate(getOffsetFromBase())) + return false; + + // 3. Check that the computation is legal. + if (!TLI.isOperationLegal(ISD::ADD, PtrType)) + return false; + + // Check that the zext is legal if it needs one. + EVT TruncateType = Inst->getValueType(0); + if (TruncateType != SliceType && + !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType)) + return false; + + return true; + } + + /// \brief Get the offset in bytes of this slice in the original chunk of + /// bits. + /// \pre DAG != NULL. + uint64_t getOffsetFromBase() const { + assert(DAG && "Missing context."); + bool IsBigEndian = + DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian(); + assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported."); + uint64_t Offset = Shift / 8; + unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8; + assert(!(Origin->getValueSizeInBits(0) & 0x7) && + "The size of the original loaded type is not a multiple of a" + " byte."); + // If Offset is bigger than TySizeInBytes, it means we are loading all + // zeros. This should have been optimized before in the process. 
+ assert(TySizeInBytes > Offset && + "Invalid shift amount for given loaded size"); + if (IsBigEndian) + Offset = TySizeInBytes - Offset - getLoadedSize(); + return Offset; + } + + /// \brief Generate the sequence of instructions to load the slice + /// represented by this object and redirect the uses of this slice to + /// this new sequence of instructions. + /// \pre this->Inst && this->Origin are valid Instructions and this + /// object passed the legal check: LoadedSlice::isLegal returned true. + /// \return The last instruction of the sequence used to load the slice. + SDValue loadSlice() const { + assert(Inst && Origin && "Unable to replace a non-existing slice."); + const SDValue &OldBaseAddr = Origin->getBasePtr(); + SDValue BaseAddr = OldBaseAddr; + // Get the offset in that chunk of bytes w.r.t. the endianness. + int64_t Offset = static_cast<int64_t>(getOffsetFromBase()); + assert(Offset >= 0 && "Offset too big to fit in int64_t!"); + if (Offset) { + // BaseAddr = BaseAddr + Offset. + EVT ArithType = BaseAddr.getValueType(); + BaseAddr = DAG->getNode(ISD::ADD, SDLoc(Origin), ArithType, BaseAddr, + DAG->getConstant(Offset, ArithType)); + } + + // Create the type of the loaded slice according to its size. + EVT SliceType = getLoadedType(); + + // Create the load for the slice. + SDValue LastInst = DAG->getLoad( + SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr, + Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(), + Origin->isNonTemporal(), Origin->isInvariant(), getAlignment()); + // If the final type is not the same as the loaded type, this means that + // we have to pad with zero. Create a zero extend for that. + EVT FinalType = Inst->getValueType(0); + if (SliceType != FinalType) + LastInst = + DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst); + return LastInst; + } + + /// \brief Check if this slice can be merged with an expensive cross register + /// bank copy.
E.g., + /// i = load i32 + /// f = bitcast i32 i to float + bool canMergeExpensiveCrossRegisterBankCopy() const { + if (!Inst || !Inst->hasOneUse()) + return false; + SDNode *Use = *Inst->use_begin(); + if (Use->getOpcode() != ISD::BITCAST) + return false; + assert(DAG && "Missing context"); + const TargetLowering &TLI = DAG->getTargetLoweringInfo(); + EVT ResVT = Use->getValueType(0); + const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT()); + const TargetRegisterClass *ArgRC = + TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT()); + if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT)) + return false; + + // At this point, we know that we perform a cross-register-bank copy. + // Check if it is expensive. + const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo(); + // Assume bitcasts are cheap, unless both register classes do not + // explicitly share a common sub class. + if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) + return false; + + // Check if it will be merged with the load. + // 1. Check the alignment constraint. + unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment( + ResVT.getTypeForEVT(*DAG->getContext())); + + if (RequiredAlignment > getAlignment()) + return false; + + // 2. Check that the load is a legal operation for that type. + if (!TLI.isOperationLegal(ISD::LOAD, ResVT)) + return false; + + // 3. Check that we do not have a zext in the way. + if (Inst->getValueType(0) != getLoadedType()) + return false; + + return true; + } +}; +} + +/// \brief Sorts LoadedSlice according to their offset. +struct LoadedSliceSorter { + bool operator()(const LoadedSlice &LHS, const LoadedSlice &RHS) { + assert(LHS.Origin == RHS.Origin && "Different bases not implemented."); + return LHS.getOffsetFromBase() < RHS.getOffsetFromBase(); + } +}; + +/// \brief Check that all bits set in \p UsedBits form a dense region, i.e., +/// \p UsedBits looks like 0..0 1..1 0..0. 
+static bool areUsedBitsDense(const APInt &UsedBits) { + // If all the bits are one, this is dense! + if (UsedBits.isAllOnesValue()) + return true; + + // Get rid of the unused bits on the right. + APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros()); + // Get rid of the unused bits on the left. + if (NarrowedUsedBits.countLeadingZeros()) + NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits()); + // Check that the chunk of bits is completely used. + return NarrowedUsedBits.isAllOnesValue(); +} + +/// \brief Check whether or not \p First and \p Second are next to each other +/// in memory. This means that there is no hole between the bits loaded +/// by \p First and the bits loaded by \p Second. +static bool areSlicesNextToEachOther(const LoadedSlice &First, + const LoadedSlice &Second) { + assert(First.Origin == Second.Origin && First.Origin && + "Unable to match different memory origins."); + APInt UsedBits = First.getUsedBits(); + assert((UsedBits & Second.getUsedBits()) == 0 && + "Slices are not supposed to overlap."); + UsedBits |= Second.getUsedBits(); + return areUsedBitsDense(UsedBits); +} + +/// \brief Adjust the \p GlobalLSCost according to the target +/// pairing capabilities and the layout of the slices. +/// \pre \p GlobalLSCost should account for at least as many loads as +/// there are in the slices in \p LoadedSlices. +static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices, + LoadedSlice::Cost &GlobalLSCost) { + unsigned NumberOfSlices = LoadedSlices.size(); + // If there are fewer than 2 elements, no pairing is possible. + if (NumberOfSlices < 2) + return; + + // Sort the slices so that elements that are likely to be next to each + // other in memory are next to each other in the list. + std::sort(LoadedSlices.begin(), LoadedSlices.end(), LoadedSliceSorter()); + const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo(); + // First (resp. Second) is the first (resp.
Second) potential candidate + // to be placed in a paired load. + const LoadedSlice *First = NULL; + const LoadedSlice *Second = NULL; + for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice, + // Set the beginning of the pair. + First = Second) { + + Second = &LoadedSlices[CurrSlice]; + + // If First is NULL, it means we start a new pair. + // Get to the next slice. + if (!First) + continue; + + EVT LoadedType = First->getLoadedType(); + + // If the types of the slices are different, we cannot pair them. + if (LoadedType != Second->getLoadedType()) + continue; + + // Check if the target supplies paired loads for this type. + unsigned RequiredAlignment = 0; + if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) { + // move to the next pair, this type is hopeless. + Second = NULL; + continue; + } + // Check if we meet the alignment requirement. + if (RequiredAlignment > First->getAlignment()) + continue; + + // Check that both loads are next to each other in memory. + if (!areSlicesNextToEachOther(*First, *Second)) + continue; + + assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!"); + --GlobalLSCost.Loads; + // Move to the next pair. + Second = NULL; + } +} + +/// \brief Check the profitability of all involved LoadedSlice. +/// Currently, it is considered profitable if there are exactly two +/// involved slices (1) which are (2) next to each other in memory, and +/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3). +/// +/// Note: The order of the elements in \p LoadedSlices may be modified, but not +/// the elements themselves. +/// +/// FIXME: When the cost model will be mature enough, we can relax +/// constraints (1) and (2). +static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices, + const APInt &UsedBits, bool ForCodeSize) { + unsigned NumberOfSlices = LoadedSlices.size(); + if (StressLoadSlicing) + return NumberOfSlices > 1; + + // Check (1).
+ if (NumberOfSlices != 2) + return false; + + // Check (2). + if (!areUsedBitsDense(UsedBits)) + return false; + + // Check (3). + LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize); + // The original code has one big load. + OrigCost.Loads = 1; + for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) { + const LoadedSlice &LS = LoadedSlices[CurrSlice]; + // Accumulate the cost of all the slices. + LoadedSlice::Cost SliceCost(LS, ForCodeSize); + GlobalSlicingCost += SliceCost; + + // Account as cost in the original configuration the gain obtained + // with the current slices. + OrigCost.addSliceGain(LS); + } + + // If the target supports paired load, adjust the cost accordingly. + adjustCostForPairing(LoadedSlices, GlobalSlicingCost); + return OrigCost > GlobalSlicingCost; +} + +/// \brief If the given load, \p N, is used only by trunc or trunc(lshr) +/// operations, split it in the various pieces being extracted. +/// +/// This sort of thing is introduced by SROA. +/// This slicing takes care not to insert overlapping loads. +/// \pre N is a simple load (i.e., not an atomic or volatile load). +bool DAGCombiner::SliceUpLoad(SDNode *N) { + if (Level < AfterLegalizeDAG) + return false; + + LoadSDNode *LD = cast<LoadSDNode>(N); + if (LD->isVolatile() || !ISD::isNormalLoad(LD) || + !LD->getValueType(0).isInteger()) + return false; + + // Keep track of already used bits to detect overlapping values. + // In that case, we will just abort the transformation. + APInt UsedBits(LD->getValueSizeInBits(0), 0); + + SmallVector<LoadedSlice, 4> LoadedSlices; + + // Check if this load is used as several smaller chunks of bits. + // Basically, look for uses in trunc or trunc(lshr) and record a new chain + // of computation for each trunc. + for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end(); + UI != UIEnd; ++UI) { + // Skip the uses of the chain.
+ if (UI.getUse().getResNo() != 0) + continue; + + SDNode *User = *UI; + unsigned Shift = 0; + + // Check if this is a trunc(lshr). + if (User->getOpcode() == ISD::SRL && User->hasOneUse() && + isa<ConstantSDNode>(User->getOperand(1))) { + Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue(); + User = *User->use_begin(); + } + + // At this point, User is a Truncate, iff we encountered trunc or + // trunc(lshr). + if (User->getOpcode() != ISD::TRUNCATE) + return false; + + // The width of the type must be a power of 2 and at least 8 bits. + // Otherwise the load cannot be represented in LLVM IR. + // Moreover, if we shifted with a non 8-bits multiple, the slice + // will be across several bytes. We do not support that. + unsigned Width = User->getValueSizeInBits(0); + if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7)) + return 0; + + // Build the slice for this chain of computations. + LoadedSlice LS(User, LD, Shift, &DAG); + APInt CurrentUsedBits = LS.getUsedBits(); + + // Check if this slice overlaps with another. + if ((CurrentUsedBits & UsedBits) != 0) + return false; + // Update the bits used globally. + UsedBits |= CurrentUsedBits; + + // Check if the new slice would be legal. + if (!LS.isLegal()) + return false; + + // Record the slice. + LoadedSlices.push_back(LS); + } + + // Abort slicing if it does not seem to be profitable. + if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize)) + return false; + + ++SlicedLoads; + + // Rewrite each chain to use an independent load. + // By construction, each chain can be represented by a unique load. + + // Prepare the argument for the new token factor for all the slices.
+ SmallVector<SDValue, 8> ArgChains; + for (SmallVectorImpl<LoadedSlice>::const_iterator + LSIt = LoadedSlices.begin(), + LSItEnd = LoadedSlices.end(); + LSIt != LSItEnd; ++LSIt) { + SDValue SliceInst = LSIt->loadSlice(); + CombineTo(LSIt->Inst, SliceInst, true); + if (SliceInst.getNode()->getOpcode() != ISD::LOAD) + SliceInst = SliceInst.getOperand(0); + assert(SliceInst->getOpcode() == ISD::LOAD && + "It takes more than a zext to get to the loaded slice!!"); + ArgChains.push_back(SliceInst.getValue(1)); + } + + SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, + &ArgChains[0], ArgChains.size()); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); + return true; +} + /// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the /// load is having specific bytes cleared out. If so, return the byte size /// being masked out and the shift amount. diff --git a/test/CodeGen/X86/load-slice.ll b/test/CodeGen/X86/load-slice.ll new file mode 100644 index 00000000000..83c7aa7a10c --- /dev/null +++ b/test/CodeGen/X86/load-slice.ll @@ -0,0 +1,140 @@ +; RUN: llc -mtriple x86_64-apple-macosx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS +; RUN: llc -mtriple x86_64-apple-macosx < %s -o - | FileCheck %s --check-prefix=REGULAR +; +; + +%class.Complex = type { float, float } + + +; Check that independent slices lead to independent loads when the slices lead to +; different register files. +; +; The layout is: +; LSB 0 1 2 3 | 4 5 6 7 MSB +; Low High +; The base address points to 0 and is 8-bytes aligned. +; Low slice starts at 0 (base) and is 8-bytes aligned. +; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned. +; +; STRESS-LABEL: t1: +; Load out[out_start + 8].imm, this is base + 8 * 8 + 4. +; STRESS: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]] +; Add high slice: out[out_start].imm, this is base + 4.
+; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]] +; Load out[out_start + 8].real, this is base + 8 * 8 + 0. +; STRESS-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]] +; Add low slice: out[out_start].real, this is base + 0. +; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]] +; Swap Imm and Real. +; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]] +; Put the results back into out[out_start]. +; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]]) +; +; Same for REGULAR, we eliminate register bank copy with each slices. +; REGULAR-LABEL: t1: +; Load out[out_start + 8].imm, this is base + 8 * 8 + 4. +; REGULAR: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]] +; Add high slice: out[out_start].imm, this is base + 4. +; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]] +; Load out[out_start + 8].real, this is base + 8 * 8 + 0. +; REGULAR-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]] +; Add low slice: out[out_start].real, this is base + 0. +; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]] +; Swap Imm and Real. +; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]] +; Put the results back into out[out_start]. 
+; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]]) +define void @t1(%class.Complex* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start + %tmp = bitcast %class.Complex* %arrayidx to i64* + %tmp1 = load i64* %tmp, align 8 + %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32 + %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float + %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 + %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0 + %tmp4 = load float* %i.i, align 4 + %add.i = fadd float %tmp4, %tmp2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1 + %tmp5 = load float* %r.i, align 4 + %add5.i = fadd float %tmp5, %tmp3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>* + store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1 + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) + +; Check that we do not read outside of the chunk of bits of the original loads. +; +; The 64-bits should have been split in one 32-bits and one 16-bits slices. +; The 16-bits should be zero extended to match the final type. +; +; The memory layout is: +; LSB 0 1 2 3 | 4 5 | 6 7 MSB +; Low High +; The base address points to 0 and is 8-bytes aligned. 
+; Low slice starts at 0 (base) and is 8-bytes aligned. +; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned. +; +; STRESS-LABEL: t2: +; STRESS: movzwl 6([[BASE:[^)]+]]), %eax +; STRESS-NEXT: addl ([[BASE]]), %eax +; STRESS-NEXT: ret +; +; For the REGULAR heuristic, this is not profitable to slice things that are not +; next to each other in memory. Here we have a hole with bytes #4-5. +; REGULAR-LABEL: t2: +; REGULAR: shrq $48 +define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) { + %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start + %bitcast = bitcast %class.Complex* %arrayidx to i64* + %chunk64 = load i64* %bitcast, align 8 + %slice32_low = trunc i64 %chunk64 to i32 + %shift48 = lshr i64 %chunk64, 48 + %slice32_high = trunc i64 %shift48 to i32 + %res = add i32 %slice32_high, %slice32_low + ret i32 %res +} + +; Check that we do not optimize overlapping slices. +; +; The 64-bits should NOT have been split in as slices are overlapping. +; First slice uses bytes numbered 0 to 3. +; Second slice uses bytes numbered 6 and 7. +; Third slice uses bytes numbered 4 to 7. +; +; STRESS-LABEL: t3: +; STRESS: shrq $48 +; STRESS: shrq $32 +; +; REGULAR-LABEL: t3: +; REGULAR: shrq $48 +; REGULAR: shrq $32 +define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) { + %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start + %bitcast = bitcast %class.Complex* %arrayidx to i64* + %chunk64 = load i64* %bitcast, align 8 + %slice32_low = trunc i64 %chunk64 to i32 + %shift48 = lshr i64 %chunk64, 48 + %slice32_high = trunc i64 %shift48 to i32 + %shift32 = lshr i64 %chunk64, 32 + %slice32_lowhigh = trunc i64 %shift32 to i32 + %tmpres = add i32 %slice32_high, %slice32_low + %res = add i32 %slice32_lowhigh, %tmpres + ret i32 %res +} + -- 2.34.1