X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMISelLowering.cpp;h=7f89b2d1685697782a411b0186b9266316831983;hb=56a1a69a35ca2ee0d1249d705de8936096c6f008;hp=4fb9599687794c46ffa5c88a897e2f4f8e8227a4;hpb=d0910c4534d323d101729d796e70805d3c2f0716;p=oota-llvm.git diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 4fb95996877..7f89b2d1685 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMConstantPoolValue.h" @@ -40,12 +41,32 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include using namespace llvm; +STATISTIC(NumTailCalls, "Number of tail calls"); + +// This option should go away when tail calls fully work. +static cl::opt +EnableARMTailCalls("arm-tail-calls", cl::Hidden, + cl::desc("Generate tail calls (TEMPORARY OPTION)."), + cl::init(true)); + +static cl::opt +EnableARMLongCalls("arm-long-calls", cl::Hidden, + cl::desc("Generate calls via indirect call instructions."), + cl::init(false)); + +static cl::opt +ARMInterworking("arm-interworking", cl::Hidden, + cl::desc("Enable / disable ARM interworking (for debugging only)"), + cl::init(true)); + static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, @@ -88,7 +109,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, } setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); @@ -354,8 +375,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::BSWAP, MVT::i32, Expand); // These are expanded into libcalls. - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); + if (!Subtarget->hasDivide()) { + // v7M has a hardware divider + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); @@ -367,6 +391,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + setOperationAction(ISD::TRAP, MVT::Other, Legal); + // Use the default implementation. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Expand); @@ -378,13 +404,59 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // FIXME: Shouldn't need this, since no register is used, but the legalizer // doesn't yet know how to not do that for SjLj. setExceptionSelectorRegister(ARM::R0); - if (Subtarget->isThumb()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); - setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); - - if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) { + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + // Handle atomics directly for ARMv[67] (except for Thumb1), otherwise + // use the default expansion. + bool canHandleAtomics = + (Subtarget->hasV7Ops() || + (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())); + if (canHandleAtomics) { + // membarrier needs custom lowering; the rest are legal and handled + // normally. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + } else { + // Set them all for expansion, which will force libcalls. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + } + // 64-bit versions are always libcalls (for now) + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand); + + // If the subtarget does not have extract instructions, sign_extend_inreg + // needs to be expanded. Extract is available in ARM mode on v6 and up, + // and on most Thumb2 implementations. + if (!Subtarget->hasV6Ops() + || (Subtarget->isThumb2() && !Subtarget->hasT2ExtractPack())) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); } @@ -448,9 +520,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::MUL); setStackPointerRegisterToSaveRestore(ARM::SP); - setSchedulingPreference(SchedulingForRegPressure); + + if (UseSoftFloat || Subtarget->isThumb1Only() || !Subtarget->hasVFP2()) + setSchedulingPreference(Sched::RegPressure); + else + setSchedulingPreference(Sched::Hybrid); // FIXME: If-converter should use instruction latency to determine // profitability rather than relying on fixed limits. @@ -515,6 +592,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; + case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; @@ -559,16 +638,59 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; } } +/// getRegClassFor - Return the register class that should be used for the +/// specified value type. +TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { + // Map v4i64 to QQ registers but do not make the type legal. Similarly map + // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to + // load / store 4 to 8 consecutive D registers. + if (Subtarget->hasNEON()) { + if (VT == MVT::v4i64) + return ARM::QQPRRegisterClass; + else if (VT == MVT::v8i64) + return ARM::QQQQPRRegisterClass; + } + return TargetLowering::getRegClassFor(VT); +} + /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { return getTargetMachine().getSubtarget().isThumb() ? 0 : 1; } +Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { + unsigned NumVals = N->getNumValues(); + if (!NumVals) + return Sched::RegPressure; + + for (unsigned i = 0; i != NumVals; ++i) { + EVT VT = N->getValueType(i); + if (VT.isFloatingPoint() || VT.isVector()) + return Sched::Latency; + } + + if (!N->isMachineOpcode()) + return Sched::RegPressure; + + // Load are scheduled for latency even if there instruction itinerary + // is not available. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrDesc &TID = TII->get(N->getMachineOpcode()); + if (TID.mayLoad()) + return Sched::Latency; + + const InstrItineraryData &Itins = getTargetMachine().getInstrItineraryData(); + if (!Itins.isEmpty() && Itins.getStageLatency(TID.getSchedClass()) > 2) + return Sched::Latency; + return Sched::RegPressure; +} + //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// @@ -785,7 +907,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { + SmallVectorImpl &InVals) const { // Assign locations to each value returned by this call. SmallVector RVLocs; @@ -873,7 +995,7 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, DebugLoc dl, SelectionDAG &DAG, const CCValAssign &VA, - ISD::ArgFlagsTy Flags) { + ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); @@ -891,7 +1013,7 @@ void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVector &MemOpChains, - ISD::ArgFlagsTy Flags) { + ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); @@ -920,9 +1042,25 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { - // ARM target does not yet support tail call optimization. - isTailCall = false; + SmallVectorImpl &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsSibCall = false; + // Temporarily disable tail calls so things don't break. + if (!EnableARMTailCalls) + isTailCall = false; + if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, Ins, DAG); + // We don't support GuaranteedTailCallOpt for ARM, only automatically + // detected sibcalls. + if (isTailCall) { + ++NumTailCalls; + IsSibCall = true; + } + } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -935,9 +1073,14 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); + // For tail calls, memory operands are available in our caller's stack. + if (IsSibCall) + NumBytes = 0; + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); @@ -998,7 +1141,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { + } else if (!IsSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1013,10 +1156,32 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); + // Tail call byval lowering might overwrite argument registers so in case of + // tail call optimization the copies to registers are lowered later. + if (!isTailCall) + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // For tail calls lower the arguments to the 'real' stack slot. + if (isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + + // Do not flag preceeding copytoreg stuff together with the following stuff. + InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + InFlag =SDValue(); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every @@ -1025,17 +1190,52 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, bool isDirect = false; bool isARMFunc = false; bool isLocalARMFunc = false; - MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); - if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - GlobalValue *GV = G->getGlobal(); + + if (EnableARMLongCalls) { + assert (getTargetMachine().getRelocationModel() == Reloc::Static + && "long-calls with non-static relocation model!"); + // Handle a global address or an external symbol. If it's not one of + // those, the target's already in a register, so we don't need to do + // anything extra. + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + const GlobalValue *GV = G->getGlobal(); + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, + ARMPCLabelIndex, + ARMCP::CPValue, 0); + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + } else if (ExternalSymbolSDNode *S=dyn_cast(Callee)) { + const char *Sym = S->getSymbol(); + + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(), + Sym, ARMPCLabelIndex, 0); + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0, + false, false, 0); + } + } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + const GlobalValue *GV = G->getGlobal(); isDirect = true; bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); bool isStub = (isExt && Subtarget->isTargetDarwin()) && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || isStub; // ARM call to a local ARM function is predicable. - isLocalARMFunc = !Subtarget->isThumb() && !isExt; + isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); // tBX takes a register source operand. if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); @@ -1051,7 +1251,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, getPointerTy(), Callee, PICLabel); - } else + } else Callee = DAG.getTargetGlobalAddress(GV, getPointerTy()); } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; @@ -1107,9 +1307,13 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + if (isTailCall) + return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), - &Ops[0], Ops.size()); + Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), @@ -1123,11 +1327,198 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, dl, DAG, InVals); } +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const ARMInstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast(Arg.getOperand(1))->getReg(); + if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { +// unsigned Opcode = Def->getOpcode(); +// if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && +// Def->getOperand(1).isFI()) { +// FI = Def->getOperand(1).getIndex(); +// Bytes = Flags.getByValSize(); +// } else + return false; + } + } else if (LoadSDNode *Ld = dyn_cast(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. +bool +ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &Ins, + SelectionDAG& DAG) const { + const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Do not sibcall optimize vararg calls unless the call site is not passing + // any arguments. + if (isVarArg && !Outs.empty()) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // FIXME: Completely disable sibcal for Thumb1 since Thumb1RegisterInfo:: + // emitEpilogue is not ready for them. + if (Subtarget->isThumb1Only()) + return false; + + if (isa(Callee)) + return false; + + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + if (Subtarget->isThumb1Only()) + return false; + + // On Thumb, for the moment, we can only do this to functions defined in this + // compilation, or to indirect calls. A Thumb B to an ARM function is not + // easily fixed up in the linker, unlike BL. + if (Subtarget->isThumb()) { + const GlobalValue *GV = G->getGlobal(); + if (GV->isDeclaration() || GV->isWeakForLinker()) + return false; + } + } + + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector RVLocs1; + CCState CCInfo1(CalleeCC, false, getTargetMachine(), + RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); + + SmallVector RVLocs2; + CCState CCInfo2(CallerCC, false, getTargetMachine(), + RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), + ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CalleeCC, false, isVarArg)); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const ARMInstrInfo *TII = + ((ARMTargetMachine&)getTargetMachine()).getInstrInfo(); + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = Outs[realArgIdx].Val; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (VA.needsCustom()) { + // f64 and vector types are split into multiple registers or + // register/stack-slot combinations. The types will not match + // the registers; give up on memory f64 refs until we figure + // out what to do about this. + if (!VA.isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + if (RegVT == MVT::v2f64) { + if (!ArgLocs[++i].isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + } + } else if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + } + + return true; +} + SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, - DebugLoc dl, SelectionDAG &DAG) { + DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; @@ -1234,13 +1625,14 @@ static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } -SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; DebugLoc DL = Op.getDebugLoc(); EVT PtrVT = getPointerTy(); - BlockAddress *BA = cast(Op)->getBlockAddress(); + const BlockAddress *BA = cast(Op)->getBlockAddress(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; if (RelocM == Reloc::Static) { @@ -1266,7 +1658,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { DebugLoc dl = GA->getDebugLoc(); EVT PtrVT = getPointerTy(); unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; @@ -1305,8 +1697,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, // "local exec" model. SDValue ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, - SelectionDAG &DAG) { - GlobalValue *GV = GA->getGlobal(); + SelectionDAG &DAG) const { + const GlobalValue *GV = GA->getGlobal(); DebugLoc dl = GA->getDebugLoc(); SDValue Offset; SDValue Chain = DAG.getEntryNode(); @@ -1352,7 +1744,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, } SDValue -ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { +ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // TODO: implement the "local dynamic" model assert(Subtarget->isTargetELF() && "TLS not implemented for non-ELF targets"); @@ -1366,10 +1758,10 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { } SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); - GlobalValue *GV = cast(Op)->getGlobal(); + const GlobalValue *GV = cast(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); if (RelocM == Reloc::PIC_) { bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); @@ -1406,13 +1798,13 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); - GlobalValue *GV = cast(Op)->getGlobal(); + const GlobalValue *GV = cast(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; if (RelocM == Reloc::Static) @@ -1445,7 +1837,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, } SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, - SelectionDAG &DAG){ + SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); MachineFunction &MF = DAG.getMachineFunction(); @@ -1466,9 +1858,24 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } +SDValue +ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + SDValue Val = DAG.getConstant(0, MVT::i32); + return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0), + Op.getOperand(1), Val); +} + +SDValue +ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), + Op.getOperand(1), DAG.getConstant(0, MVT::i32)); +} + SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { + const ARMSubtarget *Subtarget) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); DebugLoc dl = Op.getDebugLoc(); switch (IntNo) { @@ -1504,51 +1911,44 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, } return Result; } - case Intrinsic::eh_sjlj_setjmp: - SDValue Val = Subtarget->isThumb() ? - DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::SP, MVT::i32) : - DAG.getConstant(0, MVT::i32); - return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(1), - Val); } } static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { + const ARMSubtarget *Subtarget) { DebugLoc dl = Op.getDebugLoc(); SDValue Op5 = Op.getOperand(5); - SDValue Res; unsigned isDeviceBarrier = cast(Op5)->getZExtValue(); - if (isDeviceBarrier) { - if (Subtarget->hasV7Ops()) - Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0)); - else - Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); - } else { - if (Subtarget->hasV7Ops()) - Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); - else - Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); - } - return Res; + // v6 and v7 can both handle barriers directly, but need handled a bit + // differently. Thumb1 and pre-v6 ARM mode use a libcall instead and should + // never get here. + unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER; + if (Subtarget->hasV7Ops()) + return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0)); + else if (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only()) + return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); + assert(0 && "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); + return SDValue(); } -static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, - unsigned VarArgsFrameIndex) { +static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *FuncInfo = MF.getInfo(); + // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. DebugLoc dl = Op.getDebugLoc(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, false, false, 0); } SDValue -ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) { +ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); DebugLoc dl = Node->getDebugLoc(); EVT VT = Node->getValueType(0); @@ -1597,7 +1997,7 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) { SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, - DebugLoc dl) { + DebugLoc dl) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -1608,15 +2008,13 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, RC = ARM::GPRRegisterClass; // Transform the arguments stored in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); SDValue ArgValue2; if (NextVA.isMemLoc()) { - unsigned ArgSize = NextVA.getLocVT().getSizeInBits()/8; MachineFrameInfo *MFI = MF.getFrameInfo(); - int FI = MFI->CreateFixedObject(ArgSize, NextVA.getLocMemOffset(), - true, false); + int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true, false); // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -1637,7 +2035,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { + SmallVectorImpl &InVals) + const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1665,14 +2064,22 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.needsCustom()) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. - RegVT = MVT::i32; - if (VA.getLocVT() == MVT::v2f64) { SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); VA = ArgLocs[++i]; // skip ahead to next loc - SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], - Chain, DAG, dl); + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), + true, false); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, + PseudoSourceValue::getFixedStack(FI), 0, + false, false, 0); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + } ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue1, DAG.getIntPtrConstant(0)); @@ -1760,10 +2167,12 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // to their spots on the stack so that they may be loaded by deferencing // the result of va_next. AFI->setVarArgsRegSaveSize(VARegSaveSize); - VarArgsFrameIndex = MFI->CreateFixedObject(VARegSaveSize, ArgOffset + - VARegSaveSize - VARegSize, - true, false); - SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); + AFI->setVarArgsFrameIndex( + MFI->CreateFixedObject(VARegSaveSize, + ArgOffset + VARegSaveSize - VARegSize, + true, false)); + SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), + getPointerTy()); SmallVector MemOps; for (; NumGPRs < 4; ++NumGPRs) { @@ -1775,9 +2184,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - PseudoSourceValue::getFixedStack(VarArgsFrameIndex), 0, - false, false, 0); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), + 0, false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getConstant(4, getPointerTy())); @@ -1787,7 +2197,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, &MemOps[0], MemOps.size()); } else // This will point to the next argument passed via stack. - VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset, true, false); + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, + true, false)); } return Chain; @@ -1802,7 +2213,7 @@ static bool isFloatingPointZero(SDValue Op) { if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { SDValue WrapperOp = Op.getOperand(1).getOperand(0); if (ConstantPoolSDNode *CP = dyn_cast(WrapperOp)) - if (ConstantFP *CFP = dyn_cast(CP->getConstVal())) + if (const ConstantFP *CFP = dyn_cast(CP->getConstVal())) return CFP->getValueAPF().isPosZero(); } } @@ -1813,7 +2224,8 @@ static bool isFloatingPointZero(SDValue Op) { /// the given operands. SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) { + SDValue &ARMCC, SelectionDAG &DAG, + DebugLoc dl) const { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { unsigned C = RHSC->getZExtValue(); if (!isLegalICmpImmediate(C)) { @@ -1879,7 +2291,7 @@ static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Flag, Cmp); } -SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -1913,7 +2325,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) { return Result; } -SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); @@ -1947,7 +2359,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) { return Res; } -SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); @@ -2036,9 +2448,31 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp); } -SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(4, MVT::i32); + return DAG.getLoad(VT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), + NULL, 0, false, false, 0); + } + + // Return LR, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(ARM::LR, ARM::GPRRegisterClass); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); +} + +SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); @@ -2051,132 +2485,42 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { return FrameAddr; } -SDValue -ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - const Value *DstSV, uint64_t DstSVOff, - const Value *SrcSV, uint64_t SrcSVOff){ - // Do repeated 4-byte loads and stores. To be improved. - // This requires 4-byte alignment. - if ((Align & 3) != 0) - return SDValue(); - // This requires the copy size to be a constant, preferrably - // within a subtarget-specific limit. - ConstantSDNode *ConstantSize = dyn_cast(Size); - if (!ConstantSize) - return SDValue(); - uint64_t SizeVal = ConstantSize->getZExtValue(); - if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) - return SDValue(); - - unsigned BytesLeft = SizeVal & 3; - unsigned NumMemOps = SizeVal >> 2; - unsigned EmittedNumMemOps = 0; - EVT VT = MVT::i32; - unsigned VTSize = 4; - unsigned i = 0; - const unsigned MAX_LOADS_IN_LDM = 6; - SDValue TFOps[MAX_LOADS_IN_LDM]; - SDValue Loads[MAX_LOADS_IN_LDM]; - uint64_t SrcOff = 0, DstOff = 0; - - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. - while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, MVT::i32)), - SrcSV, SrcSVOff + SrcOff, isVolatile, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, MVT::i32)), - DstSV, DstSVOff + DstOff, isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - EmittedNumMemOps += i; - } - - if (BytesLeft == 0) - return Chain; - - // Issue loads / stores for the trailing (1 - 3) bytes. - unsigned BytesLeftSave = BytesLeft; - i = 0; - while (BytesLeft) { - if (BytesLeft >= 2) { - VT = MVT::i16; - VTSize = 2; - } else { - VT = MVT::i8; - VTSize = 1; - } - - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, MVT::i32)), - SrcSV, SrcSVOff + SrcOff, false, false, 0); - TFOps[i] = Loads[i].getValue(1); - ++i; - SrcOff += VTSize; - BytesLeft -= VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - i = 0; - BytesLeft = BytesLeftSave; - while (BytesLeft) { - if (BytesLeft >= 2) { - VT = MVT::i16; - VTSize = 2; - } else { - VT = MVT::i8; - VTSize = 1; - } - - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, MVT::i32)), - DstSV, DstSVOff + DstOff, false, false, 0); - ++i; - DstOff += VTSize; - BytesLeft -= VTSize; - } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); -} - +/// ExpandBIT_CONVERT - If the target supports VFP, this function is called to +/// expand a bit convert where either the source or destination type is i64 to +/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 +/// operand type is illegal (e.g., v2f32 for a target that doesn't support +/// vectors), since the legalizer won't know what to do with that. static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { - SDValue Op = N->getOperand(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); DebugLoc dl = N->getDebugLoc(); - if (N->getValueType(0) == MVT::f64) { - // Turn i64->f64 into VMOVDRR. + SDValue Op = N->getOperand(0); + + // This function is only supposed to be called for i64 types, either as the + // source or destination of the bit convert. + EVT SrcVT = Op.getValueType(); + EVT DstVT = N->getValueType(0); + assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && + "ExpandBIT_CONVERT called for non-i64 type"); + + // Turn i64->f64 into VMOVDRR. + if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(1, MVT::i32)); - return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + return DAG.getNode(ISD::BIT_CONVERT, dl, DstVT, + DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } // Turn f64->i64 into VMOVRRD. - SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, - DAG.getVTList(MVT::i32, MVT::i32), &Op, 1); + if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { + SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), &Op, 1); + // Merge the pieces into a single i64 value. + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); + } - // Merge the pieces into a single i64 value. - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); + return SDValue(); } /// getZeroVector - Returns a vector of specified type with all zero elements. @@ -2229,7 +2573,8 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. -SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -2264,7 +2609,8 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) { /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. -SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -2357,9 +2703,9 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, MVT::i32)); // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and // captures the result into a carry flag. @@ -2473,76 +2819,152 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { return Result; } -/// isVMOVSplat - Check if the specified splat value corresponds to an immediate -/// VMOV instruction, and if so, return the constant being splatted. -static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef, - unsigned SplatBitSize, SelectionDAG &DAG) { +/// isNEONModifiedImm - Check if the specified splat value corresponds to a +/// valid vector constant for a NEON instruction with a "modified immediate" +/// operand (e.g., VMOV). If so, return either the constant being +/// splatted or the encoded value, depending on the DoEncode parameter. The +/// format of the encoded value is: bit12=Op, bits11-8=Cmode, +/// bits7-0=Immediate. +static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG, + bool isVMOV, bool DoEncode) { + unsigned Op, Cmode, Imm; + EVT VT; + + // SplatBitSize is set to the smallest size that splats the vector, so a + // zero vector will always have SplatBitSize == 8. However, NEON modified + // immediate instructions others than VMOV do not support the 8-bit encoding + // of a zero vector, and the default encoding of zero is supposed to be the + // 32-bit version. + if (SplatBits == 0) + SplatBitSize = 32; + + Op = 0; switch (SplatBitSize) { case 8: - // Any 1-byte value is OK. + // Any 1-byte value is OK. Op=0, Cmode=1110. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); - return DAG.getTargetConstant(SplatBits, MVT::i8); + Cmode = 0xe; + Imm = SplatBits; + VT = MVT::i8; + break; case 16: // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. - if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i16); - break; + VT = MVT::i16; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x00nn: Op=x, Cmode=100x. + Cmode = 0x8; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0xnn00: Op=x, Cmode=101x. + Cmode = 0xa; + Imm = SplatBits >> 8; + break; + } + return SDValue(); case 32: // NEON's 32-bit VMOV supports splat values where: // * only one byte is nonzero, or // * the least significant byte is 0xff and the second byte is nonzero, or // * the least significant 2 bytes are 0xff and the third is nonzero. - if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0 || - (SplatBits & ~0xff0000) == 0 || - (SplatBits & ~0xff000000) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i32); + VT = MVT::i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn: Op=x, Cmode=000x. + Cmode = 0; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00: Op=x, Cmode=001x. + Cmode = 0x2; + Imm = SplatBits >> 8; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000: Op=x, Cmode=010x. + Cmode = 0x4; + Imm = SplatBits >> 16; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000: Op=x, Cmode=011x. + Cmode = 0x6; + Imm = SplatBits >> 24; + break; + } if ((SplatBits & ~0xffff) == 0 && - ((SplatBits | SplatUndef) & 0xff) == 0xff) - return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32); + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff: Op=x, Cmode=1100. + Cmode = 0xc; + Imm = SplatBits >> 8; + SplatBits |= 0xff; + break; + } if ((SplatBits & ~0xffffff) == 0 && - ((SplatBits | SplatUndef) & 0xffff) == 0xffff) - return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32); + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff: Op=x, Cmode=1101. + Cmode = 0xd; + Imm = SplatBits >> 16; + SplatBits |= 0xffff; + break; + } // Note: there are a few 32-bit splat values (specifically: 00ffff00, // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not // VMOV.I32. A (very) minor optimization would be to replicate the value // and fall through here to test for a valid 64-bit splat. But, then the // caller would also need to check and handle the change in size. - break; + return SDValue(); case 64: { // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. + if (!isVMOV) + return SDValue(); uint64_t BitMask = 0xff; uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { - if (((SplatBits | SplatUndef) & BitMask) == BitMask) + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { Val |= BitMask; - else if ((SplatBits & BitMask) != 0) + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { return SDValue(); + } BitMask <<= 8; + ImmMask <<= 1; } - return DAG.getTargetConstant(Val, MVT::i64); + // Op=1, Cmode=1110. + Op = 1; + Cmode = 0xe; + SplatBits = Val; + VT = MVT::i64; + break; } default: - llvm_unreachable("unexpected size for isVMOVSplat"); - break; + llvm_unreachable("unexpected size for isNEONModifiedImm"); + return SDValue(); } - return SDValue(); + if (DoEncode) + return DAG.getTargetConstant((Op << 12) | (Cmode << 8) | Imm, MVT::i32); + return DAG.getTargetConstant(SplatBits, VT); } -/// getVMOVImm - If this is a build_vector of constants which can be -/// formed by using a VMOV instruction of the specified element size, -/// return the constant being splatted. The ByteSize field indicates the -/// number of bytes of each element [1248]. -SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { + +/// getNEONModImm - If this is a valid vector constant for a NEON instruction +/// with a "modified immediate" operand (e.g., VMOV) of the specified element +/// size, return the encoded value for that immediate. The ByteSize field +/// indicates the number of bytes of each element [1248]. +SDValue ARM::getNEONModImm(SDNode *N, unsigned ByteSize, bool isVMOV, + SelectionDAG &DAG) { BuildVectorSDNode *BVN = dyn_cast(N); APInt SplatBits, SplatUndef; unsigned SplatBitSize; @@ -2554,8 +2976,8 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { if (SplatBitSize > ByteSize * 8) return SDValue(); - return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG); + return isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, isVMOV, true); } static bool isVEXTMask(const SmallVectorImpl &M, EVT VT, @@ -2795,28 +3217,65 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { - SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, DAG); + // Check if an immediate VMOV works. + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), + SplatBitSize, DAG, true, false); if (Val.getNode()) return BuildSplat(Val, VT, DAG, dl); } } - // If there are only 2 elements in a 128-bit vector, insert them into an - // undef vector. This handles the common case for 128-bit vector argument - // passing, where the insertions should be translated to subreg accesses - // with no real instructions. - if (VT.is128BitVector() && Op.getNumOperands() == 2) { - SDValue Val = DAG.getUNDEF(VT); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - if (Op0.getOpcode() != ISD::UNDEF) - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op0, - DAG.getIntPtrConstant(0)); - if (Op1.getOpcode() != ISD::UNDEF) - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op1, - DAG.getIntPtrConstant(1)); - return Val; + // Scan through the operands to see if only one value is used. + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool isConstant = true; + SDValue Value; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa(V) && !isa(V)) + isConstant = false; + + if (!Value.getNode()) + Value = V; + else if (V != Value) + usesOnlyOneValue = false; + } + + if (!Value.getNode()) + return DAG.getUNDEF(VT); + + if (isOnlyLowElement) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + + // If all elements are constants, fall back to the default expansion, which + // will generate a load from the constant pool. + if (isConstant) + return SDValue(); + + // Use VDUP for non-constant splats. + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (usesOnlyOneValue && EltSize <= 32) + return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + + // Vectors with 32- or 64-bit elements can be built by directly assigning + // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands + // will be legalized. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. + EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + SmallVector Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Op.getOperand(i))); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); } return SDValue(); @@ -2852,7 +3311,9 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, bool ReverseVEXT; unsigned Imm, WhichResult; - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + return (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || @@ -2950,64 +3411,67 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // of the same time so that they get CSEd properly. SVN->getMask(ShuffleMask); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) Lane = 0; + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize <= 32) { + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) Lane = 0; - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { - return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + } + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i32)); } - return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, - DAG.getConstant(Lane, MVT::i32)); - } - bool ReverseVEXT; - unsigned Imm; - if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { - if (ReverseVEXT) - std::swap(V1, V2); - return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } - - if (isVREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARMISD::VREV64, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARMISD::VREV32, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - - // Check for Neon shuffles that modify both input vectors in place. - // If both results are used, i.e., if there are two shuffles with the same - // source operands and with masks corresponding to both results of one of - // these operations, DAG memoization will ensure that a single node is - // used for both shuffles. - unsigned WhichResult; - if (isVTRNMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVUZPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVZIPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); + bool ReverseVEXT; + unsigned Imm; + if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ReverseVEXT) + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } - if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); + if (isVREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(ARMISD::VREV64, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(ARMISD::VREV32, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. + unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. - if (VT.getVectorNumElements() == 4 && - (VT.is128BitVector() || VT.is64BitVector())) { + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) @@ -3019,7 +3483,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); @@ -3027,6 +3490,28 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } + // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. + EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V1); + V2 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V2); + SmallVector Ops; + for (unsigned i = 0; i < NumElts; ++i) { + if (ShuffleMask[i] < 0) + Ops.push_back(DAG.getUNDEF(EltVT)); + else + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + ShuffleMask[i] < (int)NumElts ? V1 : V2, + DAG.getConstant(ShuffleMask[i] & (NumElts-1), + MVT::i32))); + } + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); + } + return SDValue(); } @@ -3061,7 +3546,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val); } -SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); @@ -3074,16 +3559,18 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); - case ISD::VASTART: return LowerVASTART(Op, DAG, VarArgsFrameIndex); + case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); - case ISD::RETURNADDR: break; + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG); @@ -3107,22 +3594,22 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { /// type with new values built out of custom code. void ARMTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { + SDValue Res; switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this!"); - return; + break; case ISD::BIT_CONVERT: - Results.push_back(ExpandBIT_CONVERT(N, DAG)); - return; + Res = ExpandBIT_CONVERT(N, DAG); + break; case ISD::SRL: - case ISD::SRA: { - SDValue Res = LowerShift(N, DAG, Subtarget); - if (Res.getNode()) - Results.push_back(Res); - return; - } + case ISD::SRA: + Res = LowerShift(N, DAG, Subtarget); + break; } + if (Res.getNode()) + Results.push_back(Res); } //===----------------------------------------------------------------------===// @@ -3304,8 +3791,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB, - DenseMap *EM) const { + MachineBasicBlock *BB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -3389,12 +3875,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, F->insert(It, sinkMBB); // Update machine-CFG edges by first adding all successors of the current // block to the new block which will contain the Phi node for the select. - // Also inform sdisel of the edge changes. for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) { - EM->insert(std::make_pair(*I, sinkMBB)); + E = BB->succ_end(); I != E; ++I) sinkMBB->addSuccessor(*I); - } // Next, remove all successors of the current block, and add the true // and fallthrough blocks as its successors. while (!BB->succ_empty()) @@ -3594,6 +4077,62 @@ static SDValue PerformSUBCombine(SDNode *N, return SDValue(); } +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + + if (Subtarget->isThumb1Only()) + return SDValue(); + + if (DAG.getMachineFunction(). + getFunction()->hasFnAttr(Attribute::OptimizeForSize)) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32) + return SDValue(); + + ConstantSDNode *C = dyn_cast(N->getOperand(1)); + if (!C) + return SDValue(); + + uint64_t MulAmt = C->getZExtValue(); + unsigned ShiftAmt = CountTrailingZeros_64(MulAmt); + ShiftAmt = ShiftAmt & (32 - 1); + SDValue V = N->getOperand(0); + DebugLoc DL = N->getDebugLoc(); + + SDValue Res; + MulAmt >>= ShiftAmt; + if (isPowerOf2_32(MulAmt - 1)) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + Res = DAG.getNode(ISD::ADD, DL, VT, + V, DAG.getNode(ISD::SHL, DL, VT, + V, DAG.getConstant(Log2_32(MulAmt-1), + MVT::i32))); + } else if (isPowerOf2_32(MulAmt + 1)) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + Res = DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, + V, DAG.getConstant(Log2_32(MulAmt+1), + MVT::i32)), + V); + } else + return SDValue(); + + if (ShiftAmt != 0) + Res = DAG.getNode(ISD::SHL, DL, VT, Res, + DAG.getConstant(ShiftAmt, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); +} + /// PerformVMOVRRDCombine - Target-specific dag combine xforms for /// ARMISD::VMOVRRD. static SDValue PerformVMOVRRDCombine(SDNode *N, @@ -3735,7 +4274,8 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Narrowing shifts require an immediate right shift. if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) break; - llvm_unreachable("invalid shift count for narrowing vector shift intrinsic"); + llvm_unreachable("invalid shift count for narrowing vector shift " + "intrinsic"); default: llvm_unreachable("unhandled vector shift"); @@ -3980,6 +4520,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, default: break; case ISD::ADD: return PerformADDCombine(N, DCI); case ISD::SUB: return PerformSUBCombine(N, DCI); + case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: @@ -4359,9 +4900,11 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); } else return false; @@ -4369,13 +4912,25 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isLegal = false; if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, - isInc, DAG); + isInc, DAG); else isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); if (!isLegal) return false; + if (Ptr != Base) { + // Swap base ptr and offset to catch more post-index load / store when + // it's legal. In Thumb2 mode, offset must be an immediate. + if (Ptr == Offset && Op->getOpcode() == ISD::ADD && + !Subtarget->isThumb2()) + std::swap(Base, Offset); + + // Post-indexed load / store update the base pointer. + if (Ptr != Base) + return false; + } + AM = isInc ? ISD::POST_INC : ISD::POST_DEC; return true; } @@ -4446,7 +5001,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(0U, ARM::CCRRegisterClass); + return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); }