X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelDAGToDAG.cpp;h=6041669f81825b746c279bd0bc9d7f57d6324a0e;hb=b8f0d89d0584e37e205c04ed5753f57a23365403;hp=1311dbabb54d23db90835dc34d55f427bc2d48d3;hpb=b20e0b1fddfd9099e12b84a71fbc8ccff5a12b10;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 1311dbabb54..6041669f818 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -19,24 +19,21 @@ #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/Support/CFG.h" -#include "llvm/Type.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); @@ -60,7 +57,7 @@ namespace { int Base_FrameIndex; unsigned Scale; - SDValue IndexReg; + SDValue IndexReg; int32_t Disp; SDValue Segment; const GlobalValue *GV; @@ -80,11 +77,11 @@ namespace { bool hasSymbolicDisplacement() const { return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0; } - + bool hasBaseOrIndexReg() const { return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0; } - + /// isRIPRelative - Return true if this addressing mode is already RIP /// relative. bool isRIPRelative() const { @@ -94,17 +91,18 @@ namespace { return RegNode->getReg() == X86::RIP; return false; } - + void setBaseReg(SDValue Reg) { BaseType = RegBase; Base_Reg = Reg; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump() { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; if (Base_Reg.getNode() != 0) - Base_Reg.getNode()->dump(); + Base_Reg.getNode()->dump(); else dbgs() << "nul"; dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' @@ -113,7 +111,7 @@ namespace { if (IndexReg.getNode() != 0) IndexReg.getNode()->dump(); else - dbgs() << "nul"; + dbgs() << "nul"; dbgs() << " Disp " << Disp << '\n' << "GV "; if (GV) @@ -133,6 +131,7 @@ namespace { dbgs() << "nul"; dbgs() << " JT" << JT << " Align" << Align << '\n'; } +#endif }; } @@ -187,9 +186,11 @@ namespace { private: SDNode *Select(SDNode *N); + SDNode *SelectGather(SDNode *N, unsigned Opc); SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); - SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT); + SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); + bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); bool MatchAddress(SDValue N, X86ISelAddressMode &AM); @@ -210,21 +211,21 @@ namespace { SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &NodeWithChain); - + bool TryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, std::vector &OutOps); - + void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI); - inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, + inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? @@ -241,13 +242,15 @@ namespace { else if (AM.CP) Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp, AM.SymbolFlags); - else if (AM.ES) + else if (AM.ES) { + assert(!AM.Disp && "Non-zero displacement is ignored with ES."); Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); - else if (AM.JT != -1) + } else if (AM.JT != -1) { + assert(!AM.Disp && "Non-zero displacement is ignored with JT."); Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); - else if (AM.BlockAddr) - Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32, - true, AM.SymbolFlags); + } else if (AM.BlockAddr) + Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, + AM.SymbolFlags); else Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32); @@ -277,13 +280,13 @@ namespace { /// getTargetMachine - Return a reference to the TargetMachine, casted /// to the target-specific type. - const X86TargetMachine &getTargetMachine() { + const X86TargetMachine &getTargetMachine() const { return static_cast(TM); } /// getInstrInfo - Return a reference to the TargetInstrInfo, casted /// to the target-specific type. - const X86InstrInfo *getInstrInfo() { + const X86InstrInfo *getInstrInfo() const { return getTargetMachine().getInstrInfo(); } }; @@ -356,7 +359,7 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { /// MoveBelowCallOrigChain - Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, - SDValue Call, SDValue OrigChain) { + SDValue Call, SDValue OrigChain) { SmallVector Ops; SDValue Chain = OrigChain.getOperand(0); if (Chain.getNode() == Load.getNode()) @@ -380,11 +383,13 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size()); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); + + unsigned NumOps = Call.getNode()->getNumOperands(); Ops.clear(); Ops.push_back(SDValue(Load.getNode(), 1)); - for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i) + for (unsigned i = 1, e = NumOps; i != e; ++i) Ops.push_back(Call.getOperand(i)); - CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], Ops.size()); + CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps); } /// isCalleeLoad - Return true if call address is a load and it can be @@ -393,6 +398,10 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, /// In the case of a tail call, there isn't a callseq node between the call /// chain and the load. static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { + // The transformation is somewhat dangerous if the call's chain was glued to + // the call. After MoveBelowOrigChain the load is moved between the call and + // the chain, this can create a cycle if the load is not folded. So it is + // *really* important that we are sure the load will be folded. if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) return false; LoadSDNode *LD = dyn_cast(Callee.getNode()); @@ -411,6 +420,11 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { if (!Chain.getNumOperands()) return false; + // Since we are not checking for AA here, conservatively abort if the chain + // writes to memory. It's not safe to move the callee (a load) across a store. + if (isa(Chain.getNode()) && + cast(Chain.getNode())->writeMem()) + return false; if (Chain.getOperand(0).getNode() == Callee.getNode()) return true; if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && @@ -422,15 +436,21 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { void X86DAGToDAGISel::PreprocessISelDAG() { // OptForSize is used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize); - + OptForSize = MF->getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && - (N->getOpcode() == X86ISD::CALL || - N->getOpcode() == X86ISD::TC_RETURN)) { + // Only does this when target favors doesn't favor register indirect + // call. + ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) || + (N->getOpcode() == X86ISD::TC_RETURN && + // Only does this if load can be folded into TC_RETURN. + (Subtarget->is64Bit() || + getTargetMachine().getRelocationModel() != Reloc::PIC_)))) { /// Also try moving call address load from outside callseq_start to just /// before the call to allow it to be folded. /// @@ -459,7 +479,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { ++NumLoadMoved; continue; } - + // Lower fpround and fpextend nodes that target the FP stack to be store and // load to the stack. This is a gross hack. We would like to simply mark // these as being illegal, but when we do that, legalize produces these when @@ -470,11 +490,16 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // FIXME: This should only happen when not compiled with -O0. if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) continue; - - // If the source and destination are SSE registers, then this is a legal - // conversion that should not be lowered. + EVT SrcVT = N->getOperand(0).getValueType(); EVT DstVT = N->getValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) + continue; + + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT); bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) @@ -488,7 +513,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { if (N->getConstantOperandVal(1)) continue; } - + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. @@ -497,16 +522,16 @@ void X86DAGToDAGISel::PreprocessISelDAG() { MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. else MemVT = SrcIsSSE ? SrcVT : DstVT; - + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); DebugLoc dl = N->getDebugLoc(); - + // FIXME: optimize the case where the src/dest is a load or store? SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MachinePointerInfo(), MemVT, false, false, 0); - SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, DstVT, dl, Store, MemTmp, + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, MachinePointerInfo(), MemVT, false, false, 0); @@ -516,12 +541,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // To avoid invalidating 'I', back it up to the convert node. --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); - + // Now that we did that, the node is dead. Increment the iterator to the // next node to process, then delete N. ++I; CurDAG->DeleteNode(N); - } + } } @@ -530,9 +555,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() { void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI) { const TargetInstrInfo *TII = TM.getInstrInfo(); - if (Subtarget->isTargetCygMing()) + if (Subtarget->isTargetCygMing()) { + unsigned CallOp = + Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32; BuildMI(BB, DebugLoc(), - TII->get(X86::CALLpcrel32)).addExternalSymbol("__main"); + TII->get(CallOp)).addExternalSymbol("__main"); + } } void X86DAGToDAGISel::EmitFunctionEntryCode() { @@ -542,10 +570,38 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() { EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo()); } +static bool isDispSafeForFrameIndex(int64_t Val) { + // On 64-bit platforms, we can run into an issue where a frame index + // includes a displacement that, when added to the explicit displacement, + // will overflow the displacement field. Assuming that the frame index + // displacement fits into a 31-bit integer (which is only slightly more + // aggressive than the current fundamental assumption that it fits into + // a 32-bit integer), a 31-bit disp should always be safe. + return isInt<31>(Val); +} + +bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, + X86ISelAddressMode &AM) { + int64_t Val = AM.Disp + Offset; + CodeModel::Model M = TM.getCodeModel(); + if (Subtarget->is64Bit()) { + if (!X86::isOffsetSuitableForCodeModel(Val, M, + AM.hasSymbolicDisplacement())) + return true; + // In addition to the checks required for a register base, check that + // we do not try to use an unsafe Disp with a frame index. + if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && + !isDispSafeForFrameIndex(Val)) + return true; + } + AM.Disp = Val; + return false; + +} bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ SDValue Address = N->getOperand(1); - + // load gs:0 -> GS segment register. // load fs:0 -> FS segment register. // @@ -554,7 +610,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ // For more information see http://people.redhat.com/drepper/tls.pdf if (ConstantSDNode *C = dyn_cast(Address)) if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 && - Subtarget->isTargetELF()) + Subtarget->isTargetLinux()) switch (N->getPointerInfo().getAddrSpace()) { case 256: AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); @@ -563,7 +619,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); return false; } - + return true; } @@ -582,37 +638,47 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { // Handle X86-64 rip-relative addresses. We check this before checking direct // folding because RIP is preferable to non-RIP accesses. - if (Subtarget->is64Bit() && + if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP && // Under X86-64 non-small code model, GV (and friends) are 64-bits, so // they cannot be folded into immediate fields. // FIXME: This can be improved for kernel and other models? - (M == CodeModel::Small || M == CodeModel::Kernel) && - // Base and index reg must be 0 in order to use %rip as base and lowering - // must allow RIP. - !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) { + (M == CodeModel::Small || M == CodeModel::Kernel)) { + // Base and index reg must be 0 in order to use %rip as base. + if (AM.hasBaseOrIndexReg()) + return true; if (GlobalAddressSDNode *G = dyn_cast(N0)) { - int64_t Offset = AM.Disp + G->getOffset(); - if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true; + X86ISelAddressMode Backup = AM; AM.GV = G->getGlobal(); - AM.Disp = Offset; AM.SymbolFlags = G->getTargetFlags(); + if (FoldOffsetIntoAddress(G->getOffset(), AM)) { + AM = Backup; + return true; + } } else if (ConstantPoolSDNode *CP = dyn_cast(N0)) { - int64_t Offset = AM.Disp + CP->getOffset(); - if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true; + X86ISelAddressMode Backup = AM; AM.CP = CP->getConstVal(); AM.Align = CP->getAlignment(); - AM.Disp = Offset; AM.SymbolFlags = CP->getTargetFlags(); + if (FoldOffsetIntoAddress(CP->getOffset(), AM)) { + AM = Backup; + return true; + } } else if (ExternalSymbolSDNode *S = dyn_cast(N0)) { AM.ES = S->getSymbol(); AM.SymbolFlags = S->getTargetFlags(); } else if (JumpTableSDNode *J = dyn_cast(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); - } else { - AM.BlockAddr = cast(N0)->getBlockAddress(); - AM.SymbolFlags = cast(N0)->getTargetFlags(); - } + } else if (BlockAddressSDNode *BA = dyn_cast(N0)) { + X86ISelAddressMode Backup = AM; + AM.BlockAddr = BA->getBlockAddress(); + AM.SymbolFlags = BA->getTargetFlags(); + if (FoldOffsetIntoAddress(BA->getOffset(), AM)) { + AM = Backup; + return true; + } + } else + llvm_unreachable("Unhandled symbol reference node."); if (N.getOpcode() == X86ISD::WrapperRIP) AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); @@ -620,11 +686,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { } // Handle the case when globals fit in our immediate field: This is true for - // X86-32 always and X86-64 when in -static -mcmodel=small mode. In 64-bit - // mode, this results in a non-RIP-relative computation. + // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit + // mode, this only applies to a non-RIP-relative computation. if (!Subtarget->is64Bit() || - ((M == CodeModel::Small || M == CodeModel::Kernel) && - TM.getRelocationModel() == Reloc::Static)) { + M == CodeModel::Small || M == CodeModel::Kernel) { + assert(N.getOpcode() != X86ISD::WrapperRIP && + "RIP-relative addressing already handled"); if (GlobalAddressSDNode *G = dyn_cast(N0)) { AM.GV = G->getGlobal(); AM.Disp += G->getOffset(); @@ -640,10 +707,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { } else if (JumpTableSDNode *J = dyn_cast(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); - } else { - AM.BlockAddr = cast(N0)->getBlockAddress(); - AM.SymbolFlags = cast(N0)->getTargetFlags(); - } + } else if (BlockAddressSDNode *BA = dyn_cast(N0)) { + AM.BlockAddr = BA->getBlockAddress(); + AM.Disp += BA->getOffset(); + AM.SymbolFlags = BA->getTargetFlags(); + } else + llvm_unreachable("Unhandled symbol reference node."); return false; } @@ -682,28 +751,215 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { return false; } -/// isLogicallyAddWithConstant - Return true if this node is semantically an -/// add of a value with a constantint. -static bool isLogicallyAddWithConstant(SDValue V, SelectionDAG *CurDAG) { - // Check for (add x, Cst) - if (V->getOpcode() == ISD::ADD) - return isa(V->getOperand(1)); +// Insert a node into the DAG at least before the Pos node's position. This +// will reposition the node as needed, and will assign it a node ID that is <= +// the Pos node's ID. Note that this does *not* preserve the uniqueness of node +// IDs! The selection DAG must no longer depend on their uniqueness when this +// is used. +static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { + if (N.getNode()->getNodeId() == -1 || + N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { + DAG.RepositionNode(Pos.getNode(), N.getNode()); + N.getNode()->setNodeId(Pos.getNode()->getNodeId()); + } +} - // Check for (or x, Cst), where Cst & x == 0. - if (V->getOpcode() != ISD::OR || - !isa(V->getOperand(1))) - return false; - - // Handle "X | C" as "X + C" iff X is known to have C bits clear. - ConstantSDNode *CN = cast(V->getOperand(1)); - - // Check to see if the LHS & C is zero. - return CurDAG->MaskedValueIsZero(V->getOperand(0), CN->getAPIntValue()); +// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This +// allows us to convert the shift and and into an h-register extract and +// a scaled index. Returns false if the simplification is performed. +static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || + !isa(Shift.getOperand(1)) || + !Shift.hasOneUse()) + return true; + + int ScaleLog = 8 - Shift.getConstantOperandVal(1); + if (ScaleLog <= 0 || ScaleLog >= 4 || + Mask != (0xffu << ScaleLog)) + return true; + + EVT VT = N.getValueType(); + DebugLoc DL = N.getDebugLoc(); + SDValue Eight = DAG.getConstant(8, MVT::i8); + SDValue NewMask = DAG.getConstant(0xff, VT); + SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); + SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + InsertDAGNode(DAG, N, Eight); + InsertDAGNode(DAG, N, Srl); + InsertDAGNode(DAG, N, NewMask); + InsertDAGNode(DAG, N, And); + InsertDAGNode(DAG, N, ShlCount); + InsertDAGNode(DAG, N, Shl); + DAG.ReplaceAllUsesWith(N, Shl); + AM.IndexReg = And; + AM.Scale = (1 << ScaleLog); + return false; +} + +// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this +// allows us to fold the shift into this addressing mode. Returns false if the +// transform succeeded. +static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SHL || + !isa(Shift.getOperand(1))) + return true; + + // Not likely to be profitable if either the AND or SHIFT node has more + // than one use (unless all uses are for address computation). Besides, + // isel mechanism requires their node ids to be reused. + if (!N.hasOneUse() || !Shift.hasOneUse()) + return true; + + // Verify that the shift amount is something we can fold. + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) + return true; + + EVT VT = N.getValueType(); + DebugLoc DL = N.getDebugLoc(); + SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); + SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + InsertDAGNode(DAG, N, NewMask); + InsertDAGNode(DAG, N, NewAnd); + InsertDAGNode(DAG, N, NewShift); + DAG.ReplaceAllUsesWith(N, NewShift); + + AM.Scale = 1 << ShiftAmt; + AM.IndexReg = NewAnd; + return false; +} + +// Implement some heroics to detect shifts of masked values where the mask can +// be replaced by extending the shift and undoing that in the addressing mode +// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and +// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in +// the addressing mode. This results in code such as: +// +// int f(short *y, int *lookup_table) { +// ... +// return *y + lookup_table[*y >> 11]; +// } +// +// Turning into: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $11, %ecx +// addl (%rsi,%rcx,4), %eax +// +// Instead of: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $9, %ecx +// andl $124, %rcx +// addl (%rsi,%rcx), %eax +// +// Note that this function assumes the mask is provided as a mask *after* the +// value is shifted. The input chain may or may not match that, but computing +// such a mask is trivial. +static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || + !isa(Shift.getOperand(1))) + return true; + + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + unsigned MaskLZ = CountLeadingZeros_64(Mask); + unsigned MaskTZ = CountTrailingZeros_64(Mask); + + // The amount of shift we're trying to fit into the addressing mode is taken + // from the trailing zeros of the mask. + unsigned AMShiftAmt = MaskTZ; + + // There is nothing we can do here unless the mask is removing some bits. + // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. + if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + + // We also need to ensure that mask is a continuous run of bits. + if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; + + // Scale the leading zero count down based on the actual size of the value. + // Also scale it down based on the size of the shift. + MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt; + + // The final check is to ensure that any masked out high bits of X are + // already known to be zero. Otherwise, the mask has a semantic impact + // other than masking out a couple of low bits. Unfortunately, because of + // the mask, zero extensions will be removed from operands in some cases. + // This code works extra hard to look through extensions because we can + // replace them with zero extensions cheaply if necessary. + bool ReplacingAnyExtend = false; + if (X.getOpcode() == ISD::ANY_EXTEND) { + unsigned ExtendBits = + X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits(); + // Assume that we'll replace the any-extend with a zero-extend, and + // narrow the search to the extended value. + X = X.getOperand(0); + MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; + ReplacingAnyExtend = true; + } + APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(), + MaskLZ); + APInt KnownZero, KnownOne; + DAG.ComputeMaskedBits(X, KnownZero, KnownOne); + if (MaskedHighBits != KnownZero) return true; + + // We've identified a pattern that can be transformed into a single shift + // and an addressing mode. Make it so. + EVT VT = N.getValueType(); + if (ReplacingAnyExtend) { + assert(X.getValueType() != VT); + // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. + SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X); + InsertDAGNode(DAG, N, NewX); + X = NewX; + } + DebugLoc DL = N.getDebugLoc(); + SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8); + SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); + SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8); + SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + InsertDAGNode(DAG, N, NewSRLAmt); + InsertDAGNode(DAG, N, NewSRL); + InsertDAGNode(DAG, N, NewSHLAmt); + InsertDAGNode(DAG, N, NewSHL); + DAG.ReplaceAllUsesWith(N, NewSHL); + + AM.Scale = 1 << AMShiftAmt; + AM.IndexReg = NewSRL; + return false; } bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { - bool is64Bit = Subtarget->is64Bit(); DebugLoc dl = N.getDebugLoc(); DEBUG({ dbgs() << "MatchAddress: "; @@ -713,8 +969,6 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, if (Depth > 5) return MatchAddressBase(N, AM); - CodeModel::Model M = TM.getCodeModel(); - // If this is already a %rip relative address, we can only merge immediates // into it. Instead of handling this in every case, we handle it here. // RIP relative addressing: %rip + 32-bit displacement! @@ -724,14 +978,9 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // consistency. if (!AM.ES && AM.JT != -1) return true; - if (ConstantSDNode *Cst = dyn_cast(N)) { - int64_t Val = AM.Disp + Cst->getSExtValue(); - if (X86::isOffsetSuitableForCodeModel(Val, M, - AM.hasSymbolicDisplacement())) { - AM.Disp = Val; + if (ConstantSDNode *Cst = dyn_cast(N)) + if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM)) return false; - } - } return true; } @@ -739,12 +988,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, default: break; case ISD::Constant: { uint64_t Val = cast(N)->getSExtValue(); - if (!is64Bit || - X86::isOffsetSuitableForCodeModel(AM.Disp + Val, M, - AM.hasSymbolicDisplacement())) { - AM.Disp += Val; + if (!FoldOffsetIntoAddress(Val, AM)) return false; - } break; } @@ -760,8 +1005,9 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, break; case ISD::FrameIndex: - if (AM.BaseType == X86ISelAddressMode::RegBase - && AM.Base_Reg.getNode() == 0) { + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0 && + (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; @@ -771,7 +1017,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::SHL: if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; - + if (ConstantSDNode *CN = dyn_cast(N.getNode()->getOperand(1))) { unsigned Val = CN->getZExtValue(); @@ -786,24 +1032,47 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. - if (isLogicallyAddWithConstant(ShVal, CurDAG)) { + if (CurDAG->isBaseWithConstantOffset(ShVal)) { AM.IndexReg = ShVal.getNode()->getOperand(0); ConstantSDNode *AddVal = cast(ShVal.getNode()->getOperand(1)); - uint64_t Disp = AM.Disp + (AddVal->getSExtValue() << Val); - if (!is64Bit || - X86::isOffsetSuitableForCodeModel(Disp, M, - AM.hasSymbolicDisplacement())) - AM.Disp = Disp; - else - AM.IndexReg = ShVal; - } else { - AM.IndexReg = ShVal; + uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; + if (!FoldOffsetIntoAddress(Disp, AM)) + return false; } + + AM.IndexReg = ShVal; return false; } - break; } + break; + + case ISD::SRL: { + // Scale must not be used already. + if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + + SDValue And = N.getOperand(0); + if (And.getOpcode() != ISD::AND) break; + SDValue X = And.getOperand(0); + + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getValueSizeInBits() > 64) break; + + // The mask used for the transform is expected to be post-shift, but we + // found the shift first so just apply the shift to the mask before passing + // it down. + if (!isa(N.getOperand(1)) || + !isa(And.getOperand(1))) + break; + uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); + + // Try to fold the mask and shift into the scale, and return false if we + // succeed. + if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + return false; + break; + } case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: @@ -833,13 +1102,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, Reg = MulVal.getNode()->getOperand(0); ConstantSDNode *AddVal = cast(MulVal.getNode()->getOperand(1)); - uint64_t Disp = AM.Disp + AddVal->getSExtValue() * - CN->getZExtValue(); - if (!is64Bit || - X86::isOffsetSuitableForCodeModel(Disp, M, - AM.hasSymbolicDisplacement())) - AM.Disp = Disp; - else + uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); + if (FoldOffsetIntoAddress(Disp, AM)) Reg = N.getNode()->getOperand(0); } else { Reg = N.getNode()->getOperand(0); @@ -913,16 +1177,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.Scale = 1; // Insert the new nodes into the topological ordering. - if (Zero.getNode()->getNodeId() == -1 || - Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), Zero.getNode()); - Zero.getNode()->setNodeId(N.getNode()->getNodeId()); - } - if (Neg.getNode()->getNodeId() == -1 || - Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), Neg.getNode()); - Neg.getNode()->setNodeId(N.getNode()->getNodeId()); - } + InsertDAGNode(*CurDAG, N, Zero); + InsertDAGNode(*CurDAG, N, Neg); return false; } @@ -930,24 +1186,18 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. HandleSDNode Handle(N); - SDValue LHS = Handle.getValue().getNode()->getOperand(0); - SDValue RHS = Handle.getValue().getNode()->getOperand(1); X86ISelAddressMode Backup = AM; - if (!MatchAddressRecursively(LHS, AM, Depth+1) && - !MatchAddressRecursively(RHS, AM, Depth+1)) + if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) return false; AM = Backup; - LHS = Handle.getValue().getNode()->getOperand(0); - RHS = Handle.getValue().getNode()->getOperand(1); // Try again after commuting the operands. - if (!MatchAddressRecursively(RHS, AM, Depth+1) && - !MatchAddressRecursively(LHS, AM, Depth+1)) + if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&& + !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) return false; AM = Backup; - LHS = Handle.getValue().getNode()->getOperand(0); - RHS = Handle.getValue().getNode()->getOperand(1); // If we couldn't fold both operands into the address at the same time, // see if we can just put each operand into a register and fold at least @@ -955,155 +1205,62 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode() && !AM.IndexReg.getNode()) { - AM.Base_Reg = LHS; - AM.IndexReg = RHS; + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); AM.Scale = 1; return false; } + N = Handle.getValue(); break; } case ISD::OR: // Handle "X | C" as "X + C" iff X is known to have C bits clear. - if (isLogicallyAddWithConstant(N, CurDAG)) { + if (CurDAG->isBaseWithConstantOffset(N)) { X86ISelAddressMode Backup = AM; ConstantSDNode *CN = cast(N.getOperand(1)); - uint64_t Offset = CN->getSExtValue(); // Start with the LHS as an addr mode. if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - // Address could not have picked a GV address for the displacement. - AM.GV == NULL && - // On x86-64, the resultant disp must fit in 32-bits. - (!is64Bit || - X86::isOffsetSuitableForCodeModel(AM.Disp + Offset, M, - AM.hasSymbolicDisplacement()))) { - AM.Disp += Offset; + !FoldOffsetIntoAddress(CN->getSExtValue(), AM)) return false; - } AM = Backup; } break; - + case ISD::AND: { // Perform some heroic transforms on an and of a constant-count shift // with a constant to enable use of the scaled offset field. - SDValue Shift = N.getOperand(0); - if (Shift.getNumOperands() != 2) break; - // Scale must not be used already. if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + SDValue Shift = N.getOperand(0); + if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; SDValue X = Shift.getOperand(0); - ConstantSDNode *C2 = dyn_cast(N.getOperand(1)); - ConstantSDNode *C1 = dyn_cast(Shift.getOperand(1)); - if (!C1 || !C2) break; - - // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This - // allows us to convert the shift and and into an h-register extract and - // a scaled index. - if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) { - unsigned ScaleLog = 8 - C1->getZExtValue(); - if (ScaleLog > 0 && ScaleLog < 4 && - C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) { - SDValue Eight = CurDAG->getConstant(8, MVT::i8); - SDValue Mask = CurDAG->getConstant(0xff, N.getValueType()); - SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(), - X, Eight); - SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(), - Srl, Mask); - SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8); - SDValue Shl = CurDAG->getNode(ISD::SHL, dl, N.getValueType(), - And, ShlCount); - - // Insert the new nodes into the topological ordering. - if (Eight.getNode()->getNodeId() == -1 || - Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), Eight.getNode()); - Eight.getNode()->setNodeId(X.getNode()->getNodeId()); - } - if (Mask.getNode()->getNodeId() == -1 || - Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), Mask.getNode()); - Mask.getNode()->setNodeId(X.getNode()->getNodeId()); - } - if (Srl.getNode()->getNodeId() == -1 || - Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) { - CurDAG->RepositionNode(Shift.getNode(), Srl.getNode()); - Srl.getNode()->setNodeId(Shift.getNode()->getNodeId()); - } - if (And.getNode()->getNodeId() == -1 || - And.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), And.getNode()); - And.getNode()->setNodeId(N.getNode()->getNodeId()); - } - if (ShlCount.getNode()->getNodeId() == -1 || - ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), ShlCount.getNode()); - ShlCount.getNode()->setNodeId(N.getNode()->getNodeId()); - } - if (Shl.getNode()->getNodeId() == -1 || - Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), Shl.getNode()); - Shl.getNode()->setNodeId(N.getNode()->getNodeId()); - } - CurDAG->ReplaceAllUsesWith(N, Shl); - AM.IndexReg = And; - AM.Scale = (1 << ScaleLog); - return false; - } - } - // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this - // allows us to fold the shift into this addressing mode. - if (Shift.getOpcode() != ISD::SHL) break; + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getValueSizeInBits() > 64) break; - // Not likely to be profitable if either the AND or SHIFT node has more - // than one use (unless all uses are for address computation). Besides, - // isel mechanism requires their node ids to be reused. - if (!N.hasOneUse() || !Shift.hasOneUse()) + if (!isa(N.getOperand(1))) break; - - // Verify that the shift amount is something we can fold. - unsigned ShiftCst = C1->getZExtValue(); - if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3) - break; - - // Get the new AND mask, this folds to a constant. - SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(), - SDValue(C2, 0), SDValue(C1, 0)); - SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X, - NewANDMask); - SDValue NewSHIFT = CurDAG->getNode(ISD::SHL, dl, N.getValueType(), - NewAND, SDValue(C1, 0)); + uint64_t Mask = N.getConstantOperandVal(1); - // Insert the new nodes into the topological ordering. - if (C1->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), C1); - C1->setNodeId(X.getNode()->getNodeId()); - } - if (NewANDMask.getNode()->getNodeId() == -1 || - NewANDMask.getNode()->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), NewANDMask.getNode()); - NewANDMask.getNode()->setNodeId(X.getNode()->getNodeId()); - } - if (NewAND.getNode()->getNodeId() == -1 || - NewAND.getNode()->getNodeId() > Shift.getNode()->getNodeId()) { - CurDAG->RepositionNode(Shift.getNode(), NewAND.getNode()); - NewAND.getNode()->setNodeId(Shift.getNode()->getNodeId()); - } - if (NewSHIFT.getNode()->getNodeId() == -1 || - NewSHIFT.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), NewSHIFT.getNode()); - NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId()); - } + // Try to fold the mask and shift into an extract and scale. + if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + return false; - CurDAG->ReplaceAllUsesWith(N, NewSHIFT); - - AM.Scale = 1 << ShiftCst; - AM.IndexReg = NewAND; - return false; + // Try to fold the mask and shift directly into the scale. + if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to swap the mask and shift to place shifts which can be done as + // a scale on the outside of the mask. + if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + return false; + break; } } @@ -1143,13 +1300,15 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; - + if (Parent && // This list of opcodes are all the nodes that have an "addr:$ptr" operand // that are not a MemSDNode, and thus don't have proper addrspace info. Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores - Parent->getOpcode() != X86ISD::TLSCALL) { // Fixme + Parent->getOpcode() != X86ISD::TLSCALL && // Fixme + Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp + Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp unsigned AddrSpace = cast(Parent)->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS. @@ -1158,7 +1317,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, if (AddrSpace == 257) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); } - + if (MatchAddress(N, AM)) return false; @@ -1204,7 +1363,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, // elements. This is a vector shuffle from the zero vector. if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() && // Check to see if the top elements are all zeros (or bitcast of zeros). - N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && N.getOperand(0).getNode()->hasOneUse() && ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) && N.getOperand(0).getOperand(0).hasOneUse() && @@ -1279,7 +1438,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, // If it isn't worth using an LEA, reject it. if (Complexity <= 2) return false; - + getAddressOperands(AM, Base, Scale, Index, Disp, Segment); return true; } @@ -1290,7 +1449,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); const GlobalAddressSDNode *GA = cast(N); - + X86ISelAddressMode AM; AM.GV = GA->getGlobal(); AM.Disp += GA->getOffset(); @@ -1303,7 +1462,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, } else { AM.IndexReg = CurDAG->getRegister(0, MVT::i64); } - + getAddressOperands(AM, Base, Scale, Index, Disp, Segment); return true; } @@ -1317,7 +1476,7 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, !IsProfitableToFold(N, P, P) || !IsLegalToFold(N, P, P, OptLevel)) return false; - + return SelectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } @@ -1336,6 +1495,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { SDValue In1 = Node->getOperand(1); SDValue In2L = Node->getOperand(2); SDValue In2H = Node->getOperand(3); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) return NULL; @@ -1349,16 +1509,192 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { return ResNode; } -SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { +/// Atomic opcode table +/// +enum AtomicOpc { + ADD, + SUB, + INC, + DEC, + OR, + AND, + XOR, + AtomicOpcEnd +}; + +enum AtomicSz { + ConstantI8, + I8, + SextConstantI16, + ConstantI16, + I16, + SextConstantI32, + ConstantI32, + I32, + SextConstantI64, + ConstantI64, + I64, + AtomicSzEnd +}; + +static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { + { + X86::LOCK_ADD8mi, + X86::LOCK_ADD8mr, + X86::LOCK_ADD16mi8, + X86::LOCK_ADD16mi, + X86::LOCK_ADD16mr, + X86::LOCK_ADD32mi8, + X86::LOCK_ADD32mi, + X86::LOCK_ADD32mr, + X86::LOCK_ADD64mi8, + X86::LOCK_ADD64mi32, + X86::LOCK_ADD64mr, + }, + { + X86::LOCK_SUB8mi, + X86::LOCK_SUB8mr, + X86::LOCK_SUB16mi8, + X86::LOCK_SUB16mi, + X86::LOCK_SUB16mr, + X86::LOCK_SUB32mi8, + X86::LOCK_SUB32mi, + X86::LOCK_SUB32mr, + X86::LOCK_SUB64mi8, + X86::LOCK_SUB64mi32, + X86::LOCK_SUB64mr, + }, + { + 0, + X86::LOCK_INC8m, + 0, + 0, + X86::LOCK_INC16m, + 0, + 0, + X86::LOCK_INC32m, + 0, + 0, + X86::LOCK_INC64m, + }, + { + 0, + X86::LOCK_DEC8m, + 0, + 0, + X86::LOCK_DEC16m, + 0, + 0, + X86::LOCK_DEC32m, + 0, + 0, + X86::LOCK_DEC64m, + }, + { + X86::LOCK_OR8mi, + X86::LOCK_OR8mr, + X86::LOCK_OR16mi8, + X86::LOCK_OR16mi, + X86::LOCK_OR16mr, + X86::LOCK_OR32mi8, + X86::LOCK_OR32mi, + X86::LOCK_OR32mr, + X86::LOCK_OR64mi8, + X86::LOCK_OR64mi32, + X86::LOCK_OR64mr, + }, + { + X86::LOCK_AND8mi, + X86::LOCK_AND8mr, + X86::LOCK_AND16mi8, + X86::LOCK_AND16mi, + X86::LOCK_AND16mr, + X86::LOCK_AND32mi8, + X86::LOCK_AND32mi, + X86::LOCK_AND32mr, + X86::LOCK_AND64mi8, + X86::LOCK_AND64mi32, + X86::LOCK_AND64mr, + }, + { + X86::LOCK_XOR8mi, + X86::LOCK_XOR8mr, + X86::LOCK_XOR16mi8, + X86::LOCK_XOR16mi, + X86::LOCK_XOR16mr, + X86::LOCK_XOR32mi8, + X86::LOCK_XOR32mi, + X86::LOCK_XOR32mr, + X86::LOCK_XOR64mi8, + X86::LOCK_XOR64mi32, + X86::LOCK_XOR64mr, + } +}; + +// Return the target constant operand for atomic-load-op and do simple +// translations, such as from atomic-load-add to lock-sub. The return value is +// one of the following 3 cases: +// + target-constant, the operand could be supported as a target constant. +// + empty, the operand is not needed any more with the new op selected. +// + non-empty, otherwise. +static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, + DebugLoc dl, + enum AtomicOpc &Op, EVT NVT, + SDValue Val) { + if (ConstantSDNode *CN = dyn_cast(Val)) { + int64_t CNVal = CN->getSExtValue(); + // Quit if not 32-bit imm. + if ((int32_t)CNVal != CNVal) + return Val; + // For atomic-load-add, we could do some optimizations. + if (Op == ADD) { + // Translate to INC/DEC if ADD by 1 or -1. + if ((CNVal == 1) || (CNVal == -1)) { + Op = (CNVal == 1) ? INC : DEC; + // No more constant operand after being translated into INC/DEC. + return SDValue(); + } + // Translate to SUB if ADD by negative value. + if (CNVal < 0) { + Op = SUB; + CNVal = -CNVal; + } + } + return CurDAG->getTargetConstant(CNVal, NVT); + } + + // If the value operand is single-used, try to optimize it. + if (Op == ADD && Val.hasOneUse()) { + // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x). + if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) { + Op = SUB; + return Val.getOperand(1); + } + // A special case for i16, which needs truncating as, in most cases, it's + // promoted to i32. We will translate + // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x)) + if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 && + Val.getOperand(0).getOpcode() == ISD::SUB && + X86::isZeroNode(Val.getOperand(0).getOperand(0))) { + Op = SUB; + Val = Val.getOperand(0); + return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT, + Val.getOperand(1)); + } + } + + return Val; +} + +SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { if (Node->hasAnyUseOfValue(0)) return 0; - // Optimize common patterns for __sync_add_and_fetch and - // __sync_sub_and_fetch where the result is not used. This allows us - // to use "lock" version of add, sub, inc, dec instructions. - // FIXME: Do not use special instructions but instead add the "lock" - // prefix to the target node somehow. The extra information will then be - // transferred to machine instruction and it denotes the prefix. + DebugLoc dl = Node->getDebugLoc(); + + // Optimize common patterns for __sync_or_and_fetch and similar arith + // operations where the result is not used. This allows us to use the "lock" + // version of the arithmetic instruction. SDValue Chain = Node->getOperand(0); SDValue Ptr = Node->getOperand(1); SDValue Val = Node->getOperand(2); @@ -1366,137 +1702,86 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) return 0; - bool isInc = false, isDec = false, isSub = false, isCN = false; - ConstantSDNode *CN = dyn_cast(Val); - if (CN) { - isCN = true; - int64_t CNVal = CN->getSExtValue(); - if (CNVal == 1) - isInc = true; - else if (CNVal == -1) - isDec = true; - else if (CNVal >= 0) - Val = CurDAG->getTargetConstant(CNVal, NVT); - else { - isSub = true; - Val = CurDAG->getTargetConstant(-CNVal, NVT); - } - } else if (Val.hasOneUse() && - Val.getOpcode() == ISD::SUB && - X86::isZeroNode(Val.getOperand(0))) { - isSub = true; - Val = Val.getOperand(1); + // Which index into the table. + enum AtomicOpc Op; + switch (Node->getOpcode()) { + default: + return 0; + case ISD::ATOMIC_LOAD_OR: + Op = OR; + break; + case ISD::ATOMIC_LOAD_AND: + Op = AND; + break; + case ISD::ATOMIC_LOAD_XOR: + Op = XOR; + break; + case ISD::ATOMIC_LOAD_ADD: + Op = ADD; + break; } + + Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val); + bool isUnOp = !Val.getNode(); + bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); unsigned Opc = 0; switch (NVT.getSimpleVT().SimpleTy) { - default: return 0; - case MVT::i8: - if (isInc) - Opc = X86::LOCK_INC8m; - else if (isDec) - Opc = X86::LOCK_DEC8m; - else if (isSub) { - if (isCN) - Opc = X86::LOCK_SUB8mi; - else - Opc = X86::LOCK_SUB8mr; - } else { + default: return 0; + case MVT::i8: if (isCN) - Opc = X86::LOCK_ADD8mi; + Opc = AtomicOpcTbl[Op][ConstantI8]; else - Opc = X86::LOCK_ADD8mr; - } - break; - case MVT::i16: - if (isInc) - Opc = X86::LOCK_INC16m; - else if (isDec) - Opc = X86::LOCK_DEC16m; - else if (isSub) { - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_SUB16mi8; - else - Opc = X86::LOCK_SUB16mi; - } else - Opc = X86::LOCK_SUB16mr; - } else { - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_ADD16mi8; - else - Opc = X86::LOCK_ADD16mi; - } else - Opc = X86::LOCK_ADD16mr; - } - break; - case MVT::i32: - if (isInc) - Opc = X86::LOCK_INC32m; - else if (isDec) - Opc = X86::LOCK_DEC32m; - else if (isSub) { + Opc = AtomicOpcTbl[Op][I8]; + break; + case MVT::i16: if (isCN) { if (immSext8(Val.getNode())) - Opc = X86::LOCK_SUB32mi8; + Opc = AtomicOpcTbl[Op][SextConstantI16]; else - Opc = X86::LOCK_SUB32mi; + Opc = AtomicOpcTbl[Op][ConstantI16]; } else - Opc = X86::LOCK_SUB32mr; - } else { + Opc = AtomicOpcTbl[Op][I16]; + break; + case MVT::i32: if (isCN) { if (immSext8(Val.getNode())) - Opc = X86::LOCK_ADD32mi8; + Opc = AtomicOpcTbl[Op][SextConstantI32]; else - Opc = X86::LOCK_ADD32mi; + Opc = AtomicOpcTbl[Op][ConstantI32]; } else - Opc = X86::LOCK_ADD32mr; - } - break; - case MVT::i64: - if (isInc) - Opc = X86::LOCK_INC64m; - else if (isDec) - Opc = X86::LOCK_DEC64m; - else if (isSub) { - Opc = X86::LOCK_SUB64mr; - if (isCN) { - if (immSext8(Val.getNode())) - Opc = X86::LOCK_SUB64mi8; - else if (i64immSExt32(Val.getNode())) - Opc = X86::LOCK_SUB64mi32; - } - } else { - Opc = X86::LOCK_ADD64mr; + Opc = AtomicOpcTbl[Op][I32]; + break; + case MVT::i64: + Opc = AtomicOpcTbl[Op][I64]; if (isCN) { if (immSext8(Val.getNode())) - Opc = X86::LOCK_ADD64mi8; + Opc = AtomicOpcTbl[Op][SextConstantI64]; else if (i64immSExt32(Val.getNode())) - Opc = X86::LOCK_ADD64mi32; + Opc = AtomicOpcTbl[Op][ConstantI64]; } - } - break; + break; } - DebugLoc dl = Node->getDebugLoc(); + assert(Opc != 0 && "Invalid arith lock transform!"); + + SDValue Ret; SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, NVT), 0); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(Node)->getMemOperand(); - if (isInc || isDec) { + if (isUnOp) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain }; - SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0); - cast(Ret)->setMemRefs(MemOp, MemOp + 1); - SDValue RetVals[] = { Undef, Ret }; - return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, + array_lengthof(Ops)), 0); } else { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; - SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0); - cast(Ret)->setMemRefs(MemOp, MemOp + 1); - SDValue RetVals[] = { Undef, Ret }; - return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, + array_lengthof(Ops)), 0); } + cast(Ret)->setMemRefs(MemOp, MemOp + 1); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); } /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has @@ -1561,12 +1846,148 @@ static bool HasNoSignedComparisonUses(SDNode *N) { return true; } +/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode +/// is suitable for doing the {load; increment or decrement; store} to modify +/// transformation. +static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, + SDValue StoredVal, SelectionDAG *CurDAG, + LoadSDNode* &LoadNode, SDValue &InputChain) { + + // is the value stored the result of a DEC or INC? + if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false; + + // is the stored value result 0 of the load? + if (StoredVal.getResNo() != 0) return false; + + // are there other uses of the loaded value than the inc or dec? + if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; + + // is the store non-extending and non-indexed? + if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) + return false; + + SDValue Load = StoredVal->getOperand(0); + // Is the stored value a non-extending and non-indexed load? + if (!ISD::isNormalLoad(Load.getNode())) return false; + + // Return LoadNode by reference. + LoadNode = cast(Load); + // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8) + EVT LdVT = LoadNode->getMemoryVT(); + if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && + LdVT != MVT::i8) + return false; + + // Is store the only read of the loaded value? + if (!Load.hasOneUse()) + return false; + + // Is the address of the store the same as the load? + if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || + LoadNode->getOffset() != StoreNode->getOffset()) + return false; + + // Check if the chain is produced by the load or is a TokenFactor with + // the load output chain as an operand. Return InputChain by reference. + SDValue Chain = StoreNode->getChain(); + + bool ChainCheck = false; + if (Chain == Load.getValue(1)) { + ChainCheck = true; + InputChain = LoadNode->getChain(); + } else if (Chain.getOpcode() == ISD::TokenFactor) { + SmallVector ChainOps; + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { + SDValue Op = Chain.getOperand(i); + if (Op == Load.getValue(1)) { + ChainCheck = true; + continue; + } + + // Make sure using Op as part of the chain would not cause a cycle here. + // In theory, we could check whether the chain node is a predecessor of + // the load. But that can be very expensive. Instead visit the uses and + // make sure they all have smaller node id than the load. + int LoadId = LoadNode->getNodeId(); + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = UI->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != 0) + continue; + if (UI->getNodeId() > LoadId) + return false; + } + + ChainOps.push_back(Op); + } + + if (ChainCheck) + // Make a new TokenFactor with all the other input chains except + // for the load. + InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(), + MVT::Other, &ChainOps[0], ChainOps.size()); + } + if (!ChainCheck) + return false; + + return true; +} + +/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory +/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC. +static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { + if (Opc == X86ISD::DEC) { + if (LdVT == MVT::i64) return X86::DEC64m; + if (LdVT == MVT::i32) return X86::DEC32m; + if (LdVT == MVT::i16) return X86::DEC16m; + if (LdVT == MVT::i8) return X86::DEC8m; + } else { + assert(Opc == X86ISD::INC && "unrecognized opcode"); + if (LdVT == MVT::i64) return X86::INC64m; + if (LdVT == MVT::i32) return X86::INC32m; + if (LdVT == MVT::i16) return X86::INC16m; + if (LdVT == MVT::i8) return X86::INC8m; + } + llvm_unreachable("unrecognized size for LdVT"); +} + +/// SelectGather - Customized ISel for GATHER operations. +/// +SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { + // Operands of Gather: VSrc, Base, VIdx, VMask, Scale + SDValue Chain = Node->getOperand(0); + SDValue VSrc = Node->getOperand(2); + SDValue Base = Node->getOperand(3); + SDValue VIdx = Node->getOperand(4); + SDValue VMask = Node->getOperand(5); + ConstantSDNode *Scale = dyn_cast(Node->getOperand(6)); + if (!Scale) + return 0; + + SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), + MVT::Other); + + // Memory Operands: Base, Scale, Index, Disp, Segment + SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i32); + const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx, + Disp, Segment, VMask, Chain}; + SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), + VTs, Ops, array_lengthof(Ops)); + // Node has 2 outputs: VDst and MVT::Other. + // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. + // We replace VDst of Node with VDst of ResNode, and Other of Node with Other + // of ResNode. + ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2)); + return ResNode; +} + SDNode *X86DAGToDAGISel::Select(SDNode *Node) { EVT NVT = Node->getValueType(0); unsigned Opc, MOpc; unsigned Opcode = Node->getOpcode(); DebugLoc dl = Node->getDebugLoc(); - + DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); if (Node->isMachineOpcode()) { @@ -1576,69 +1997,219 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: break; + case Intrinsic::x86_avx2_gather_d_pd: + case Intrinsic::x86_avx2_gather_d_pd_256: + case Intrinsic::x86_avx2_gather_q_pd: + case Intrinsic::x86_avx2_gather_q_pd_256: + case Intrinsic::x86_avx2_gather_d_ps: + case Intrinsic::x86_avx2_gather_d_ps_256: + case Intrinsic::x86_avx2_gather_q_ps: + case Intrinsic::x86_avx2_gather_q_ps_256: + case Intrinsic::x86_avx2_gather_d_q: + case Intrinsic::x86_avx2_gather_d_q_256: + case Intrinsic::x86_avx2_gather_q_q: + case Intrinsic::x86_avx2_gather_q_q_256: + case Intrinsic::x86_avx2_gather_d_d: + case Intrinsic::x86_avx2_gather_d_d_256: + case Intrinsic::x86_avx2_gather_q_d: + case Intrinsic::x86_avx2_gather_q_d_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break; + case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break; + case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break; + case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break; + case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break; + case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break; + case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break; + case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break; + case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break; + case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break; + case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break; + case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break; + case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break; + case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break; + case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; + case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; + } + SDNode *RetVal = SelectGather(Node, Opc); + if (RetVal) + // We already called ReplaceUses inside SelectGather. + return NULL; + break; + } + } + break; + } case X86ISD::GlobalBaseReg: return getGlobalBaseReg(); + case X86ISD::ATOMOR64_DAG: - return SelectAtomic64(Node, X86::ATOMOR6432); case X86ISD::ATOMXOR64_DAG: - return SelectAtomic64(Node, X86::ATOMXOR6432); case X86ISD::ATOMADD64_DAG: - return SelectAtomic64(Node, X86::ATOMADD6432); case X86ISD::ATOMSUB64_DAG: - return SelectAtomic64(Node, X86::ATOMSUB6432); case X86ISD::ATOMNAND64_DAG: - return SelectAtomic64(Node, X86::ATOMNAND6432); case X86ISD::ATOMAND64_DAG: - return SelectAtomic64(Node, X86::ATOMAND6432); - case X86ISD::ATOMSWAP64_DAG: - return SelectAtomic64(Node, X86::ATOMSWAP6432); + case X86ISD::ATOMMAX64_DAG: + case X86ISD::ATOMMIN64_DAG: + case X86ISD::ATOMUMAX64_DAG: + case X86ISD::ATOMUMIN64_DAG: + case X86ISD::ATOMSWAP64_DAG: { + unsigned Opc; + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break; + case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break; + case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break; + case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break; + case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break; + case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break; + case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break; + case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break; + case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break; + case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break; + case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break; + } + SDNode *RetVal = SelectAtomic64(Node, Opc); + if (RetVal) + return RetVal; + break; + } + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_ADD: { - SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT); + SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); if (RetVal) return RetVal; break; } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast(N1); + ConstantSDNode *ShlCst = dyn_cast(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; + if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) + break; + + unsigned ShlOp, Op; + EVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. + if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) + CstVT = MVT::i8; + else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) + CstVT = MVT::i32; + + // Bail if there is no smaller encoding. + if (NVT == CstVT) + break; + + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + assert(CstVT == MVT::i8); + ShlOp = X86::SHL32ri; + + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case ISD::AND: Op = X86::AND32ri8; break; + case ISD::OR: Op = X86::OR32ri8; break; + case ISD::XOR: Op = X86::XOR32ri8; break; + } + break; + case MVT::i64: + assert(CstVT == MVT::i8 || CstVT == MVT::i32); + ShlOp = X86::SHL64ri; + + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; + case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; + case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + } + break; + } + + // Emit the smaller op and the shift. + SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT); + SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), + getI8Imm(ShlVal)); + } case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - - unsigned LoReg, HiReg; + + unsigned LoReg; switch (NVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; Opc = X86::MUL8r; break; - case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; Opc = X86::MUL16r; break; - case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; Opc = X86::MUL32r; break; - case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; Opc = X86::MUL64r; break; + case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break; + case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break; + case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break; + case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break; } - + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); - + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); SDValue Ops[] = {N1, InFlag}; SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, 2); - + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2)); return NULL; } - + case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); bool isSigned = Opcode == ISD::SMUL_LOHI; + bool hasBMI2 = Subtarget->hasBMI2(); if (!isSigned) { switch (NVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break; case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break; - case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break; - case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break; + case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r; + MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break; + case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r; + MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break; } } else { switch (NVT.getSimpleVT().SimpleTy) { @@ -1650,13 +2221,31 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } } - unsigned LoReg, HiReg; - switch (NVT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break; - case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break; - case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break; - case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break; + unsigned SrcReg, LoReg, HiReg; + switch (Opc) { + default: llvm_unreachable("Unknown MUL opcode!"); + case X86::IMUL8r: + case X86::MUL8r: + SrcReg = LoReg = X86::AL; HiReg = X86::AH; + break; + case X86::IMUL16r: + case X86::MUL16r: + SrcReg = LoReg = X86::AX; HiReg = X86::DX; + break; + case X86::IMUL32r: + case X86::MUL32r: + SrcReg = LoReg = X86::EAX; HiReg = X86::EDX; + break; + case X86::IMUL64r: + case X86::MUL64r: + SrcReg = LoReg = X86::RAX; HiReg = X86::RDX; + break; + case X86::MULX32rr: + SrcReg = X86::EDX; LoReg = HiReg = 0; + break; + case X86::MULX64rr: + SrcReg = X86::RDX; LoReg = HiReg = 0; + break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; @@ -1668,22 +2257,47 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { std::swap(N0, N1); } - SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, - N0, SDValue()).getValue(1); + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg, + N0, SDValue()).getValue(1); + SDValue ResHi, ResLo; if (foldedLoad) { + SDValue Chain; SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; - SDNode *CNode = - CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops, - array_lengthof(Ops)); - InFlag = SDValue(CNode, 1); + if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops, + array_lengthof(Ops)); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + Chain = SDValue(CNode, 2); + InFlag = SDValue(CNode, 3); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops, + array_lengthof(Ops)); + Chain = SDValue(CNode, 0); + InFlag = SDValue(CNode, 1); + } // Update the chain. - ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); + ReplaceUses(N1.getValue(1), Chain); } else { - SDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag); - InFlag = SDValue(CNode, 0); + SDValue Ops[] = { N1, InFlag }; + if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, + array_lengthof(Ops)); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + InFlag = SDValue(CNode, 2); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, + array_lengthof(Ops)); + InFlag = SDValue(CNode, 0); + } } // Prevent use of AH in a REX instruction by referencing AX instead. @@ -1708,21 +2322,27 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - LoReg, NVT, InFlag); - InFlag = Result.getValue(2); - ReplaceUses(SDValue(Node, 0), Result); - DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + if (ResLo.getNode() == 0) { + assert(LoReg && "Register for low half is not defined!"); + ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, + InFlag); + InFlag = ResLo.getValue(2); + } + ReplaceUses(SDValue(Node, 0), ResLo); + DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - HiReg, NVT, InFlag); - InFlag = Result.getValue(2); - ReplaceUses(SDValue(Node, 1), Result); - DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + if (ResHi.getNode() == 0) { + assert(HiReg && "Register for high half is not defined!"); + ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, + InFlag); + InFlag = ResHi.getValue(2); + } + ReplaceUses(SDValue(Node, 1), ResHi); + DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } - + return NULL; } @@ -1788,17 +2408,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = - SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16, + SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, MVT::Other, Ops, array_lengthof(Ops)), 0); Chain = Move.getValue(1); ReplaceUses(N0.getValue(1), Chain); } else { Move = - SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0); + SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0); Chain = CurDAG->getEntryNode(); } - Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue()); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue()); InFlag = Chain.getValue(1); } else { InFlag = @@ -1807,7 +2427,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (isSigned && !signBitIsZero) { // Sign extend the low part into the high part. InFlag = - SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Flag, InFlag),0); + SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. SDValue ClrNode = @@ -1821,14 +2441,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; SDNode *CNode = - CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops, + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops, array_lengthof(Ops)); InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); } else { InFlag = - SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0); + SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); } // Prevent use of AH in a REX instruction by referencing AX instead. @@ -1873,7 +2493,12 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return NULL; } - case X86ISD::CMP: { + case X86ISD::CMP: + case X86ISD::SUB: { + // Sometimes a SUB is used to perform comparison. + if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0)) + // This node is not a CMP. + break; SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); @@ -1883,7 +2508,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { HasNoSignedComparisonUses(Node)) // Look past the truncate if CMP is the only use of it. N0 = N0.getOperand(0); - if (N0.getNode()->getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && + if ((N0.getNode()->getOpcode() == ISD::AND || + (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) && + N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8 && X86::isZeroNode(N1)) { ConstantSDNode *C = dyn_cast(N0.getNode()->getOperand(1)); @@ -1898,7 +2525,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // On x86-32, only the ABCD registers have 8-bit subregisters. if (!Subtarget->is64Bit()) { - TargetRegisterClass *TRC = 0; + const TargetRegisterClass *TRC; switch (N0.getValueType().getSimpleVT().SimpleTy) { case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; @@ -1914,7 +2541,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MVT::i8, Reg); // Emit a testb. - return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, Subreg, Imm); + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; } // For example, "testl %eax, $2048" to "testb %ah, $8". @@ -1927,7 +2560,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Reg = N0.getNode()->getOperand(0); // Put the value in an ABCD register. - TargetRegisterClass *TRC = 0; + const TargetRegisterClass *TRC; switch (N0.getValueType().getSimpleVT().SimpleTy) { case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break; case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; @@ -1942,10 +2575,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, Reg); - // Emit a testb. No special NOREX tricks are needed since there's - // only one GPR operand! - return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, - Subreg, ShiftedImm); + // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only + // target GR8_NOREX registers, so make sure the register class is + // forced. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, + MVT::i32, Subreg, ShiftedImm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; } // For example, "testl %eax, $32776" to "testw %ax, $32776". @@ -1961,7 +2600,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MVT::i16, Reg); // Emit a testw. - return CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, Subreg, Imm); + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; } // For example, "testq %rax, $268468232" to "testl %eax, $268468232". @@ -1977,11 +2622,67 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MVT::i32, Reg); // Emit a testl. - return CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, Subreg, Imm); + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return NULL; } } break; } + case ISD::STORE: { + // Change a chain of {load; incr or dec; store} of the same value into + // a simple increment or decrement through memory of that value, if the + // uses of the modified value and its address are suitable. + // The DEC64m tablegen pattern is currently not able to match the case where + // the EFLAGS on the original DEC are used. (This also applies to + // {INC,DEC}X{64,32,16,8}.) + // We'll need to improve tablegen to allow flags to be transferred from a + // node in the pattern to the result node. probably with a new keyword + // for example, we have this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (implicit EFLAGS)]>; + // but maybe need something like this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (transferrable EFLAGS)]>; + + StoreSDNode *StoreNode = cast(Node); + SDValue StoredVal = StoreNode->getOperand(1); + unsigned Opc = StoredVal->getOpcode(); + + LoadSDNode *LoadNode = 0; + SDValue InputChain; + if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG, + LoadNode, InputChain)) + break; + + SDValue Base, Scale, Index, Disp, Segment; + if (!SelectAddr(LoadNode, LoadNode->getBasePtr(), + Base, Scale, Index, Disp, Segment)) + break; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); + MemOp[0] = StoreNode->getMemOperand(); + MemOp[1] = LoadNode->getMemOperand(); + const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain }; + EVT LdVT = LoadNode->getMemoryVT(); + unsigned newOpc = getFusedLdStOpcode(LdVT, Opc); + MachineSDNode *Result = CurDAG->getMachineNode(newOpc, + Node->getDebugLoc(), + MVT::i32, MVT::Other, Ops, + array_lengthof(Ops)); + Result->setMemRefs(MemOp, MemOp + 2); + + ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); + ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); + + return Result; + } } SDNode *ResNode = SelectCode(Node); @@ -2009,7 +2710,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, return true; break; } - + OutOps.push_back(Op0); OutOps.push_back(Op1); OutOps.push_back(Op2); @@ -2018,10 +2719,10 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, return false; } -/// createX86ISelDag - This pass converts a legalized DAG into a +/// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. /// FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, - llvm::CodeGenOpt::Level OptLevel) { + CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); }