X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=478bf71c686c6d7b732cc94e7a36846a13de90ba;hb=1292c226458b68a119d3a387a0527f453b2065c2;hp=95dbb61766874a5c21d35d81c80928364f664cc7;hpb=bbfc31012bad41d1eda6d65c0081760e8fdbc323;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 95dbb617668..478bf71c686 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -63,9 +63,9 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2); static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { - + bool is64Bit = TM.getSubtarget().is64Bit(); - + if (TM.getSubtarget().isTargetDarwin()) { if (is64Bit) return new X8664_MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); @@ -74,7 +74,7 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { return new X8632_ELFTargetObjectFile(TM); } else if (TM.getSubtarget().isTargetCOFF()) { return new TargetLoweringObjectFileCOFF(); - } + } llvm_unreachable("unknown subtarget type"); } @@ -96,6 +96,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); + if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { + // Setup Windows compiler runtime calls. + setLibcallName(RTLIB::SDIV_I64, "_alldiv"); + setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); + setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); + } + if (Subtarget->isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); @@ -213,16 +225,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // TODO: when we have SSE, these could be more efficient, by using movd/movq. - if (!X86ScalarSSEf64) { - setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); - setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); + if (!X86ScalarSSEf64) { + setOperationAction(ISD::BITCAST , MVT::f32 , Expand); + setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget->is64Bit()) { - setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand); - // Without SSE, i64->f64 goes through memory; i64->MMX is Legal. - if (Subtarget->hasMMX() && !DisableMMX) - setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Custom); - else - setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand); + setOperationAction(ISD::BITCAST , MVT::f64 , Expand); + // Without SSE, i64->f64 goes through memory. 
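For reference, "goes through memory" here means the i64<->f64 reinterpretation is realized as a store plus a reload rather than a register-to-register move, since without SSE no single register class holds both views. A minimal standalone C++ sketch of the same bit-level reinterpretation (illustration only, not part of this patch):

#include <cstdint>
#include <cstring>

// Illustration only: reinterpret the bits of a double as a 64-bit integer.
// Conceptually this is what the expanded BITCAST does, via a stack slot.
uint64_t bitsOfDouble(double d) {
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof bits);
  return bits;
}
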
+ setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } @@ -276,21 +285,27 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); - setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTTZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i8 , Custom); - setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); - setOperationAction(ISD::CTPOP , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); if (Subtarget->is64Bit()) { - setOperationAction(ISD::CTPOP , MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ , MVT::i64 , Custom); } + if (Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP , MVT::i8 , Promote); + } else { + setOperationAction(ISD::CTPOP , MVT::i8 , Expand); + setOperationAction(ISD::CTPOP , MVT::i16 , Expand); + setOperationAction(ISD::CTPOP , MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + } + setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); setOperationAction(ISD::BSWAP , MVT::i16 , Expand); @@ -346,7 +361,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // We may not have a libcall for MEMBARRIER so we should lower this. setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); - + // On X86 and X86-64, atomic operations are lowered to locked instructions. // Locked instructions, in turn, have implicit fence semantics (all memory // operations are flushed before issuing the locked instruction, and they @@ -415,7 +430,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - if (Subtarget->isTargetCygMing()) + if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); @@ -512,13 +527,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { - bool ignored; - APFloat TmpFlt(+0.0); - TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, - &ignored); + APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS + + bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, &ignored); @@ -614,88 +628,41 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. 
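Regarding the CTPOP hunk above: without POPCNT the population count is Expanded into shift-and-mask arithmetic, while with POPCNT the i8 form is Promoted so a single instruction can serve all widths. A rough standalone model of the expanded form (illustration only; not the exact sequence the legalizer emits):

#include <cstdint>

// Classic parallel bit count, roughly what ISD::CTPOP "Expand" amounts to
// when no popcnt instruction is available. Illustration only.
unsigned popcount32(uint32_t x) {
  x = x - ((x >> 1) & 0x55555555u);                 // count bit pairs
  x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u); // count nibbles
  x = (x + (x >> 4)) & 0x0F0F0F0Fu;                 // count bytes
  return (x * 0x01010101u) >> 24;                   // sum the byte counts
}
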
if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { - addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false); - addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false); - addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false); - - addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false); - - setOperationAction(ISD::ADD, MVT::v8i8, Legal); - setOperationAction(ISD::ADD, MVT::v4i16, Legal); - setOperationAction(ISD::ADD, MVT::v2i32, Legal); - setOperationAction(ISD::ADD, MVT::v1i64, Legal); - - setOperationAction(ISD::SUB, MVT::v8i8, Legal); - setOperationAction(ISD::SUB, MVT::v4i16, Legal); - setOperationAction(ISD::SUB, MVT::v2i32, Legal); - setOperationAction(ISD::SUB, MVT::v1i64, Legal); - - setOperationAction(ISD::MULHS, MVT::v4i16, Legal); - setOperationAction(ISD::MUL, MVT::v4i16, Legal); - - setOperationAction(ISD::AND, MVT::v8i8, Promote); - AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64); - setOperationAction(ISD::AND, MVT::v4i16, Promote); - AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64); - setOperationAction(ISD::AND, MVT::v2i32, Promote); - AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64); - setOperationAction(ISD::AND, MVT::v1i64, Legal); - - setOperationAction(ISD::OR, MVT::v8i8, Promote); - AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64); - setOperationAction(ISD::OR, MVT::v4i16, Promote); - AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64); - setOperationAction(ISD::OR, MVT::v2i32, Promote); - AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64); - setOperationAction(ISD::OR, MVT::v1i64, Legal); - - setOperationAction(ISD::XOR, MVT::v8i8, Promote); - AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64); - setOperationAction(ISD::XOR, MVT::v4i16, Promote); - AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64); - setOperationAction(ISD::XOR, MVT::v2i32, Promote); - AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64); - setOperationAction(ISD::XOR, MVT::v1i64, Legal); - - setOperationAction(ISD::LOAD, MVT::v8i8, Promote); - AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64); - setOperationAction(ISD::LOAD, MVT::v4i16, Promote); - AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); - setOperationAction(ISD::LOAD, MVT::v2i32, Promote); - AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); - setOperationAction(ISD::LOAD, MVT::v1i64, Legal); - - setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); - - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); - - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); - - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); - - setOperationAction(ISD::SELECT, MVT::v8i8, Promote); - setOperationAction(ISD::SELECT, MVT::v4i16, Promote); - setOperationAction(ISD::SELECT, MVT::v2i32, Promote); - setOperationAction(ISD::SELECT, MVT::v1i64, Custom); - setOperationAction(ISD::VSETCC, MVT::v8i8, Custom); - setOperationAction(ISD::VSETCC, MVT::v4i16, Custom); - setOperationAction(ISD::VSETCC, MVT::v2i32, Custom); - - if (!X86ScalarSSEf64 && Subtarget->is64Bit()) { - 
setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom); - setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom); - setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom); - setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom); - } - } + addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); + // No operations on x86mmx supported, everything uses intrinsics. + } + + // MMX-sized vectors (other than x86mmx) are expected to be expanded + // into smaller operations. + setOperationAction(ISD::MULHS, MVT::v8i8, Expand); + setOperationAction(ISD::MULHS, MVT::v4i16, Expand); + setOperationAction(ISD::MULHS, MVT::v2i32, Expand); + setOperationAction(ISD::MULHS, MVT::v1i64, Expand); + setOperationAction(ISD::AND, MVT::v8i8, Expand); + setOperationAction(ISD::AND, MVT::v4i16, Expand); + setOperationAction(ISD::AND, MVT::v2i32, Expand); + setOperationAction(ISD::AND, MVT::v1i64, Expand); + setOperationAction(ISD::OR, MVT::v8i8, Expand); + setOperationAction(ISD::OR, MVT::v4i16, Expand); + setOperationAction(ISD::OR, MVT::v2i32, Expand); + setOperationAction(ISD::OR, MVT::v1i64, Expand); + setOperationAction(ISD::XOR, MVT::v8i8, Expand); + setOperationAction(ISD::XOR, MVT::v4i16, Expand); + setOperationAction(ISD::XOR, MVT::v2i32, Expand); + setOperationAction(ISD::XOR, MVT::v1i64, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); + setOperationAction(ISD::SELECT, MVT::v8i8, Expand); + setOperationAction(ISD::SELECT, MVT::v4i16, Expand); + setOperationAction(ISD::SELECT, MVT::v2i32, Expand); + setOperationAction(ISD::SELECT, MVT::v1i64, Expand); + setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); + setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); + setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); + setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); if (!UseSoftFloat && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); @@ -795,7 +762,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Do not attempt to promote non-128-bit vectors if (!VT.is128BitVector()) continue; - + setOperationAction(ISD::AND, SVT, Promote); AddPromotedToType (ISD::AND, SVT, MVT::v2i64); setOperationAction(ISD::OR, SVT, Promote); @@ -818,10 +785,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); - if (!DisableMMX && Subtarget->hasMMX()) { - setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - } } if (Subtarget->hasSSE41()) { @@ -1139,21 +1102,11 @@ unsigned X86TargetLowering::getJumpTableEncoding() const { if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; - + // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } -/// getPICBaseSymbol - Return the X86-32 PIC base. 
-MCSymbol * -X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, - MCContext &Ctx) const { - const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); - return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ - Twine(MF->getFunctionNumber())+"$pb"); -} - - const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, @@ -1188,7 +1141,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. - return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx); + return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); } /// getFunctionAlignment - Return the Log2 alignment of this function. @@ -1207,8 +1160,7 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{ RRC = (Subtarget->is64Bit() ? X86::GR64RegisterClass : X86::GR32RegisterClass); break; - case MVT::v8i8: case MVT::v4i16: - case MVT::v2i32: case MVT::v1i64: + case MVT::x86mmx: RRC = X86::VR64RegisterClass; break; case MVT::f32: case MVT::f64: @@ -1225,7 +1177,9 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{ unsigned X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0; + const TargetFrameInfo *TFI = MF.getTarget().getFrameInfo(); + + unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; switch (RC->getID()) { default: return 0; @@ -1267,7 +1221,7 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, #include "X86GenCallingConv.inc" -bool +bool X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { @@ -1312,9 +1266,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); - // If this is x86-64, and we disabled SSE, we can't return FP values - if ((ValVT == MVT::f32 || ValVT == MVT::f64) && - (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { + // If this is x86-64, and we disabled SSE, we can't return FP values, + // or SSE or MMX vectors. + if ((ValVT == MVT::f32 || ValVT == MVT::f64 || + VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && + (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but @@ -1340,20 +1296,19 @@ X86TargetLowering::LowerReturn(SDValue Chain, // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget->is64Bit()) { - if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { - ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); + if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); - // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. 
if (!Subtarget->hasSSE2()) - ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy); + ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); } } } - + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); } @@ -1367,7 +1322,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); unsigned Reg = FuncInfo->getSRetReturnReg(); - assert(Reg && + assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); @@ -1388,6 +1343,28 @@ X86TargetLowering::LowerReturn(SDValue Chain, MVT::Other, &RetOps[0], RetOps.size()); } +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() != ISD::CopyToReg && + Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != X86ISD::RET_FLAG) + return false; + HasRet = true; + } + + return HasRet; +} + /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// @@ -1456,7 +1433,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, MVT::i64, InFlag).getValue(1); Val = Chain.getValue(0); } - Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); + Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); @@ -1499,30 +1476,6 @@ ArgsAreStructReturn(const SmallVectorImpl &Ins) { return Ins[0].Flags.isSRet(); } -/// CCAssignFnForNode - Selects the correct CCAssignFn for a the -/// given CallingConvention value. -CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { - if (Subtarget->is64Bit()) { - if (CC == CallingConv::GHC) - return CC_X86_64_GHC; - else if (Subtarget->isTargetWin64()) - return CC_X86_Win64_C; - else - return CC_X86_64_C; - } - - if (CC == CallingConv::X86_FastCall) - return CC_X86_32_FastCall; - else if (CC == CallingConv::X86_ThisCall) - return CC_X86_32_ThisCall; - else if (CC == CallingConv::Fast) - return CC_X86_32_FastCC; - else if (CC == CallingConv::GHC) - return CC_X86_32_GHC; - else - return CC_X86_32_C; -} - /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified /// by "Src" to address "Dst" with size and alignment information specified by /// the specific parameter attribute. 
The copy will be passed as a byval @@ -1531,10 +1484,11 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, DebugLoc dl) { - SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, - NULL, 0, NULL, 0); + MachinePointerInfo(), MachinePointerInfo()); } /// IsTailCallConvention - Return true if the calling convention is one that @@ -1583,7 +1537,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, VA.getLocMemOffset(), isImmutable); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); return DAG.getLoad(ValVT, dl, Chain, FIN, - PseudoSourceValue::getFixedStack(FI), 0, + MachinePointerInfo::getFixedStack(FI), false, false, 0); } } @@ -1617,7 +1571,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); + CCInfo.AnalyzeFormalArguments(Ins, CC_X86); unsigned LastVal = ~0U; SDValue ArgValue; @@ -1644,7 +1598,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = X86::VR256RegisterClass; else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) RC = X86::VR128RegisterClass; - else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) + else if (RegVT == MVT::x86mmx) RC = X86::VR64RegisterClass; else llvm_unreachable("Unknown argument type!"); @@ -1662,14 +1616,13 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) - ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); + ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector()) { - ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, - ArgValue, DAG.getConstant(0, MVT::i64)); - ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); + ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), + ArgValue); } else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } @@ -1680,8 +1633,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) - ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, - false, false, 0); + ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, + MachinePointerInfo(), false, false, 0); InVals.push_back(ArgValue); } @@ -1708,8 +1661,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. 
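The frame index set up below backs the expansion of llvm.va_start; at the source level this corresponds to the ordinary stdarg pattern. A minimal example of the construct being supported (illustration only, not part of this patch):

#include <cstdarg>

// Illustration only: va_start must find the first unnamed argument, which is
// what the vararg frame index created below records for the backend.
int sumInts(int n, ...) {
  va_list ap;
  va_start(ap, n);
  int total = 0;
  for (int i = 0; i < n; ++i)
    total += va_arg(ap, int);
  va_end(ap);
  return total;
}
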
if (isVarArg) { - if (Is64Bit || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall)) { + if (!IsWin64 && (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); } if (Is64Bit) { @@ -1719,9 +1672,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, static const unsigned GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; - static const unsigned XMMArgRegsWin64[] = { - X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 - }; static const unsigned GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; @@ -1729,21 +1679,23 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - const unsigned *GPR64ArgRegs, *XMMArgRegs; + const unsigned *GPR64ArgRegs; + unsigned NumXMMRegs = 0; if (IsWin64) { - TotalNumIntRegs = 4; TotalNumXMMRegs = 4; + // The XMM registers which might contain var arg parameters are shadowed + // in their paired GPR. So we only need to save the GPR to their home + // slots. + TotalNumIntRegs = 4; GPR64ArgRegs = GPR64ArgRegsWin64; - XMMArgRegs = XMMArgRegsWin64; } else { TotalNumIntRegs = 6; TotalNumXMMRegs = 8; GPR64ArgRegs = GPR64ArgRegs64Bit; - XMMArgRegs = XMMArgRegs64Bit; + + NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); } unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, TotalNumIntRegs); - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, - TotalNumXMMRegs); bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && @@ -1755,14 +1707,24 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // on the stack. TotalNumXMMRegs = 0; - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so they - // may be loaded by deferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex( - MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, + if (IsWin64) { + const TargetFrameInfo &TFI = *getTargetMachine().getFrameInfo(); + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so they + // may be loaded by deferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex( + MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, false)); + } // Store the integer parameter registers. 
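On Win64 the code above reuses the caller-allocated home ("shadow") area instead of creating a fresh save area, which is why 8 is added for the return address. A hedged sketch of the stack layout this relies on at function entry (illustration of the Win64 ABI, not part of this patch):

#include <cstdint>

// Illustration only: Win64 stack as seen at function entry, lowest address
// first. The caller always reserves 32 bytes of home space for RCX/RDX/R8/R9
// directly above the return address, so home slot i sits at [rsp + 8 + 8*i].
struct Win64EntryFrame {
  uint64_t ReturnAddress; // [rsp + 0], pushed by the call
  uint64_t HomeRCX;       // [rsp + 8]
  uint64_t HomeRDX;       // [rsp + 16]
  uint64_t HomeR8;        // [rsp + 24]
  uint64_t HomeR9;        // [rsp + 32]
};
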
SmallVector MemOps; @@ -1777,9 +1739,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - PseudoSourceValue::getFixedStack( - FuncInfo->getRegSaveFrameIndex()), - Offset, false, false, 0); + MachinePointerInfo::getFixedStack( + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); MemOps.push_back(Store); Offset += 8; } @@ -1799,7 +1761,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->getVarArgsFPOffset())); for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { - unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], + unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], X86::VR128RegisterClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); SaveXMMOps.push_back(Val); @@ -1847,11 +1809,11 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - if (Flags.isByVal()) { + if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - } + return DAG.getStore(Chain, dl, Arg, PtrOff, - PseudoSourceValue::getStack(), LocMemOffset, + MachinePointerInfo::getStack(LocMemOffset), false, false, 0); } @@ -1867,7 +1829,8 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. - OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), + false, false, 0); return SDValue(OutRetAddr.getNode(), 1); } @@ -1886,7 +1849,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, EVT VT = Is64Bit ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, - PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, + MachinePointerInfo::getFixedStack(NewReturnAddrFI), false, false, 0); return Chain; } @@ -1927,7 +1890,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); + CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); @@ -1986,21 +1949,21 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, case CCValAssign::AExt: if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { // Special case: passing MMX values in XMM registers. - Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); break; case CCValAssign::Indirect: { // Store the argument. 
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, - PseudoSourceValue::getFixedStack(FI), 0, + MachinePointerInfo::getFixedStack(FI), false, false, 0); Arg = SpillSlot; break; @@ -2143,7 +2106,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Store relative to framepointer. MemOpChains2.push_back( DAG.getStore(ArgChain, dl, Arg, FIN, - PseudoSourceValue::getFixedStack(FI), 0, + MachinePointerInfo::getFixedStack(FI), false, false, 0)); } } @@ -2192,8 +2155,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (GV->isDeclaration() || GV->isWeakForLinker()) && + Subtarget->getDarwinVers() < 9) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -2206,13 +2169,13 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned char OpFlags = 0; - // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external - // symbols should go through the PLT. + // On ELF targets, in either X86-64 or X86-32 mode, direct calls to + // external symbols should go through the PLT. if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - Subtarget->getDarwinVers() < 9) { + Subtarget->getDarwinVers() < 9) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -2510,7 +2473,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); + CCInfo.AnalyzeCallOperands(Outs, CC_X86); if (CCInfo.getNextStackOffset()) { MachineFunction &MF = DAG.getMachineFunction(); if (MF.getInfo()->getBytesToPopOnReturn()) @@ -2564,6 +2527,11 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + // An stdcall caller is expected to clean up its arguments; the callee + // isn't going to do that. 
+ if (!CCMatch && CallerCC==CallingConv::X86_StdCall) + return false; + return true; } @@ -2592,6 +2560,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFPD: + case X86ISD::PALIGN: case X86ISD::SHUFPS: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: @@ -2600,6 +2569,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVLPD: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKLPS: @@ -2625,6 +2595,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: return DAG.getNode(Opc, dl, VT, V1); } @@ -2648,6 +2619,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PALIGN: case X86ISD::SHUFPD: case X86ISD::SHUFPS: return DAG.getNode(Opc, dl, VT, V1, V2, @@ -2865,7 +2837,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) { /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference /// the second operand. static bool isPSHUFDMask(const SmallVectorImpl &Mask, EVT VT) { - if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) + if (VT == MVT::v4f32 || VT == MVT::v4i32 ) return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); if (VT == MVT::v2f64 || VT == MVT::v2i64) return (Mask[0] < 2 && Mask[1] < 2); @@ -2933,15 +2905,15 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { static bool isPALIGNRMask(const SmallVectorImpl &Mask, EVT VT, bool hasSSSE3) { int i, e = VT.getVectorNumElements(); - + // Do not handle v2i64 / v2f64 shuffles with palignr. if (e < 4 || !hasSSSE3) return false; - + for (i = 0; i != e; ++i) if (Mask[i] >= 0) break; - + // All undef, not a palignr. if (i == e) return false; @@ -2952,13 +2924,13 @@ static bool isPALIGNRMask(const SmallVectorImpl &Mask, EVT VT, bool NeedsUnary = false; int s = Mask[i] - i; - + // Check the rest of the elements to see if they are consecutive. for (++i; i != e; ++i) { int m = Mask[i]; - if (m < 0) + if (m < 0) continue; - + Unary = Unary && (m < (int)e); NeedsUnary = NeedsUnary || (m < s); @@ -3046,10 +3018,10 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { /// <2, 3, 2, 3> bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { unsigned NumElems = N->getValueType(0).getVectorNumElements(); - + if (NumElems != 4) return false; - + return isUndefOrEqual(N->getMaskElt(0), 2) && isUndefOrEqual(N->getMaskElt(1), 3) && isUndefOrEqual(N->getMaskElt(2), 2) && @@ -3537,13 +3509,10 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted + // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. 
SDValue Vec; - if (VT.getSizeInBits() == 64) { // MMX - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - } else if (VT.getSizeInBits() == 128) { + if (VT.getSizeInBits() == 128) { // SSE if (HasSSE2) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); @@ -3559,7 +3528,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); } - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } /// getOnesVector - Returns a vector of specified type with all bits set. @@ -3571,11 +3540,8 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { // type. This ensures they get CSE'd. SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; - if (VT.getSizeInBits() == 64) // MMX - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); - else // SSE - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -3640,9 +3606,6 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, /// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { - if (SV->getValueType(0).getVectorNumElements() <= 4) - return SDValue(SV, 0); - EVT PVT = MVT::v4f32; EVT VT = SV->getValueType(0); DebugLoc dl = SV->getDebugLoc(); @@ -3663,9 +3626,9 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // Perform the splat. int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); + V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1); V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); + return DAG.getNode(ISD::BITCAST, dl, VT, V1); } /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified @@ -3789,7 +3752,7 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, } // Actual nodes that may contain scalar elements - if (Opcode == ISD::BIT_CONVERT) { + if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); @@ -3978,7 +3941,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } } - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); } /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. @@ -4017,11 +3980,10 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, DebugLoc dl) { - bool isMMX = VT.getSizeInBits() == 64; - EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; + EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? 
X86ISD::VSHL : X86ISD::VSRL; - SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); } @@ -4029,7 +3991,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, SDValue X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, SelectionDAG &DAG) const { - + // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. @@ -4084,41 +4046,42 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, int EltNo = (Offset - StartOffset) >> 2; int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; - SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, + SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, + LD->getPointerInfo().getWithOffset(StartOffset), false, false, 0); // Canonicalize it to a v4i32 shuffle. - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getVectorShuffle(MVT::v4i32, dl, V1, - DAG.getUNDEF(MVT::v4i32), &Mask[0])); + DAG.getUNDEF(MVT::v4i32),&Mask[0])); } return SDValue(); } -/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a -/// vector of type 'VT', see if the elements can be replaced by a single large +/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a +/// vector of type 'VT', see if the elements can be replaced by a single large /// load which has the same value as a build_vector whose operands are 'elts'. /// /// Example: -> zextload a -/// +/// /// FIXME: we'd also like to handle the case where the last elements are zero /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, - DebugLoc &dl, SelectionDAG &DAG) { + DebugLoc &DL, SelectionDAG &DAG) { EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); - + LoadSDNode *LDBase = NULL; unsigned LastLoadedElt = -1U; - + // For each element in the initializer, see if we've found a load or an undef. - // If we don't find an initial load element, or later load elements are + // If we don't find an initial load element, or later load elements are // non-consecutive, bail out. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Elts[i]; - + if (!Elt.getNode() || (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) return SDValue(); @@ -4143,18 +4106,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, // consecutive loads for the low half, generate a vzext_load node. 
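For context on the fold below: a build_vector whose operands are loads from consecutive addresses describes exactly the bytes of one wide load, which is what this routine recognizes. A standalone C++ analogy (illustration only, not part of this patch):

#include <array>
#include <cstdint>
#include <cstring>

// Illustration only: four adjacent 32-bit elements occupy the same 16 bytes
// as a single <4 x i32> value, so the gather below can become one wide load.
std::array<uint32_t, 4> gatherScalars(const uint32_t *p) {
  return {p[0], p[1], p[2], p[3]};    // four scalar loads
}

std::array<uint32_t, 4> wideLoad(const uint32_t *p) {
  std::array<uint32_t, 4> v;
  std::memcpy(v.data(), p, sizeof v); // one 16-byte load
  return v;
}
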
if (LastLoadedElt == NumElems - 1) { if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) - return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), 0); - return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->getAlignment()); } else if (NumElems == 4 && LastLoadedElt == 1) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; - SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); + SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, + Ops, 2, MVT::i32, + LDBase->getMemOperand()); + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); } @@ -4169,10 +4134,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ISD::isBuildVectorAllZeros(Op.getNode()) || (Op.getValueType().getSizeInBits() != 256 && ISD::isBuildVectorAllOnes(Op.getNode()))) { - // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to + // Canonicalize this to <4 x i32> (SSE) to // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are // eliminated on x86-32 hosts. - if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) + if (Op.getValueType() == MVT::v4i32) return Op; if (ISD::isBuildVectorAllOnes(Op.getNode())) @@ -4223,9 +4188,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && (!IsAllConstants || Idx == 0)) { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { - // Handle MMX and SSE both. - EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; - unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; + // Handle SSE only. + assert(VT == MVT::v2i64 && "Expected an SSE value type!"); + EVT VecVT = MVT::v4i32; + unsigned VecElts = 4; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). @@ -4245,7 +4211,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(Item.getValueType()), &Mask[0]); } - return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); } } @@ -4264,11 +4230,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG); } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + EVT MiddleVT = MVT::v4i32; Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), DAG); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); + return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -4394,20 +4361,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Check for a build vector of consecutive loads. 
for (unsigned i = 0; i < NumElems; ++i) V[i] = Op.getOperand(i); - + // Check for elements which are consecutive loads. SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); if (LD.getNode()) return LD; - - // For SSE 4.1, use insertps to put the high elements into the low element. + + // For SSE 4.1, use insertps to put the high elements into the low element. if (getSubtarget()->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); - + for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, @@ -4415,7 +4382,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } return Result; } - + // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). @@ -4441,7 +4408,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (V[i+EltStride].getOpcode() == ISD::UNDEF && EltStride == NumElems/2) continue; - + V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); } EltStride >>= 1; @@ -4461,21 +4428,21 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || ResVT == MVT::v8i16 || ResVT == MVT::v16i8); int Mask[2]; - SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); + SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); InVec = Op.getOperand(1); if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { unsigned NumElts = ResVT.getVectorNumElements(); - VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); + VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); } else { - InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); + InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); Mask[0] = 0; Mask[1] = 2; VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); } - return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); + return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); } // v8i16 shuffles - Prefer shuffles in the following order: @@ -4557,9 +4524,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); NewV = DAG.getVectorShuffle(MVT::v2i64, dl, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); - NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); + NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the // source words for the shuffle, to aid later transformations. 
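The next hunk builds PSHUFB control vectors. For reference, a hedged byte-level model of the selection rule those masks are written against (illustration only, not part of this patch):

#include <array>
#include <cstdint>

// Illustration only: for each destination byte, pshufb either zeroes it
// (selector high bit set) or copies the source byte picked by the low
// four selector bits.
std::array<uint8_t, 16> pshufbModel(const std::array<uint8_t, 16> &src,
                                    const std::array<uint8_t, 16> &sel) {
  std::array<uint8_t, 16> dst = {};
  for (int i = 0; i != 16; ++i)
    dst[i] = (sel[i] & 0x80) ? 0 : src[sel[i] & 0x0F];
  return dst;
}
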
@@ -4628,12 +4595,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); } - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); if (!TwoInputs) - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); // Calculate the shuffle mask for the second input, shuffle it, and // OR it with the first shuffled input. @@ -4648,12 +4615,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); } - V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); + V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); } // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, @@ -4820,8 +4787,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // No SSSE3 - Calculate in place words and then fix all out of place words // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from // the 16 different words that comprise the two doublequadword input vectors. - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); - V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); SDValue NewV = V2Only ? V2 : V1; for (int i = 0; i != 8; ++i) { int Elt0 = MaskVals[i*2]; @@ -4883,25 +4850,23 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, DAG.getIntPtrConstant(i)); } - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); } /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide -/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be +/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be /// done when every pair / quad of shuffle mask elements point to elements in /// the right sequence. e.g. -/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> +/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, - const TargetLowering &TLI, DebugLoc dl) { + SelectionDAG &DAG, DebugLoc dl) { EVT VT = SVOp->getValueType(0); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); unsigned NumElems = VT.getVectorNumElements(); unsigned NewWidth = (NumElems == 4) ? 2 : 4; - EVT MaskVT = (NewWidth == 4) ? 
MVT::v4i16 : MVT::v2i32; - EVT NewVT = MaskVT; + EVT NewVT; switch (VT.getSimpleVT().SimpleTy) { default: assert(false && "Unexpected!"); case MVT::v4f32: NewVT = MVT::v2f64; break; @@ -4910,12 +4875,6 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, case MVT::v16i8: NewVT = MVT::v4i32; break; } - if (NewWidth == 2) { - if (VT.isInteger()) - NewVT = MVT::v2i64; - else - NewVT = MVT::v2f64; - } int Scale = NumElems / NewWidth; SmallVector MaskVec; for (unsigned i = 0; i < NumElems; i += Scale) { @@ -4935,8 +4894,8 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, MaskVec.push_back(StartIdx / Scale); } - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); } @@ -4953,13 +4912,13 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, // movssrr and movsdrr do not clear top bits. Try to use movd, movq // instead. MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; - if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && + if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && - SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && + SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { // PR2108 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, OpVT, @@ -4969,9 +4928,9 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, } } - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, - DAG.getNode(ISD::BIT_CONVERT, dl, + DAG.getNode(ISD::BITCAST, dl, OpVT, SrcOp))); } @@ -5125,7 +5084,7 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } static bool MayFoldVectorLoad(SDValue V) { - if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); @@ -5134,6 +5093,110 @@ static bool MayFoldVectorLoad(SDValue V) { return false; } +// FIXME: the version above should always be used. Since there's +// a bug where several vector shuffles can't be folded because the +// DAG is not updated during lowering and a node claims to have two +// uses while it only has one, use this version, and let isel match +// another instruction if the load really happens to have more than +// one use. Remove this version after this bug get fixed. +// rdar://8434668, PR8156 +static bool RelaxedMayFoldVectorLoad(SDValue V) { + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) + V = V.getOperand(0); + if (ISD::isNormalLoad(V.getNode())) + return true; + return false; +} + +/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by +/// a vector extract, and if both can be later optimized into a single load. +/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked +/// here because otherwise a target specific shuffle node is going to be +/// emitted for this shuffle, and the optimization not done. 
+/// FIXME: This is probably not the best approach, but fix the problem +/// until the right path is decided. +static +bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, + const TargetLowering &TLI) { + EVT VT = V.getValueType(); + ShuffleVectorSDNode *SVOp = dyn_cast(V); + + // Be sure that the vector shuffle is present in a pattern like this: + // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) + if (!V.hasOneUse()) + return false; + + SDNode *N = *V.getNode()->use_begin(); + if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + SDValue EltNo = N->getOperand(1); + if (!isa(EltNo)) + return false; + + // If the bit convert changed the number of elements, it is unsafe + // to examine the mask. + bool HasShuffleIntoBitcast = false; + if (V.getOpcode() == ISD::BITCAST) { + EVT SrcVT = V.getOperand(0).getValueType(); + if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) + return false; + V = V.getOperand(0); + HasShuffleIntoBitcast = true; + } + + // Select the input vector, guarding against out of range extract vector. + unsigned NumElems = VT.getVectorNumElements(); + unsigned Elt = cast(EltNo)->getZExtValue(); + int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); + V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); + + // Skip one more bit_convert if necessary + if (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + if (ISD::isNormalLoad(V.getNode())) { + // Is the original load suitable? + LoadSDNode *LN0 = cast(V); + + // FIXME: avoid the multi-use bug that is preventing lots of + // of foldings to be detected, this is still wrong of course, but + // give the temporary desired behavior, and if it happens that + // the load has real more uses, during isel it will not fold, and + // will generate poor code. + if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() + return false; + + if (!HasShuffleIntoBitcast) + return true; + + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + unsigned NewAlign = + TLI.getTargetData()->getABITypeAlignment( + VT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) + return false; + } + + return true; +} + +static +SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Canonizalize to v2f64. 
+ V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); + return DAG.getNode(ISD::BITCAST, dl, VT, + getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, + V1, DAG)); +} + static SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { @@ -5247,58 +5310,106 @@ static inline unsigned getUNPCKHOpcode(EVT VT) { return 0; } -SDValue -X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { +static +SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, + const TargetLowering &TLI, + const X86Subtarget *Subtarget) { ShuffleVectorSDNode *SVOp = cast(Op); - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); - unsigned NumElems = VT.getVectorNumElements(); - bool isMMX = VT.getSizeInBits() == 64; - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - bool V1IsSplat = false; - bool V2IsSplat = false; - bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); - bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); - MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); if (isZeroShuffle(SVOp)) return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); - // Promote splats to v4f32. + // Handle splat operations if (SVOp->isSplat()) { - if (isMMX || NumElems < 4) + // Special case, this is the only place now where it's + // allowed to return a vector_shuffle operation without + // using a target specific node, because *hopefully* it + // will be optimized away by the dag combiner. + if (VT.getVectorNumElements() <= 4 && + CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) return Op; + + // Handle splats by matching through known masks + if (VT.getVectorNumElements() <= 4) + return SDValue(); + + // Canonicalize all of the remaining to v4f32. return PromoteSplat(SVOp, DAG); } // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! if (VT == MVT::v8i16 || VT == MVT::v16i8) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, - LowerVECTOR_SHUFFLE(NewOp, DAG)); + return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
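A hedged model of the effect the code below tries to form: movq performs a zero-extending move of the low 64-bit element, clearing the upper half of the destination (illustration only, not part of this patch):

#include <array>
#include <cstdint>

// Illustration only: element-level effect of movq on a two-element vector.
std::array<uint64_t, 2> movqModel(const std::array<uint64_t, 2> &src) {
  return {src[0], 0};
}
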
     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
-      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
       if (NewOp.getNode()) {
         if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
           return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
                               DAG, Subtarget, dl);
       }
     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
-      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
       if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
         return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                             DAG, Subtarget, dl);
     }
   }
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned NumElems = VT.getVectorNumElements();
+  bool isMMX = VT.getSizeInBits() == 64;
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsSplat = false;
+  bool V2IsSplat = false;
+  bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
+  bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
+  bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+
+  // Shuffle operations on MMX are not supported.
+  if (isMMX)
+    return Op;
+
+  // Vector shuffle lowering takes 3 steps:
+  //
+  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
+  //    narrowing and commutation of operands should be handled.
+  // 2) Matching of shuffles with known shuffle masks to x86 target-specific
+  //    shuffle nodes.
+  // 3) Rewriting of unmatched masks into new generic shuffle operations,
+  //    so the shuffle can be broken into other shuffles and the legalizer can
+  //    try the lowering again.
+  //
+  // The general idea is that no vector_shuffle operation should be left to
+  // be matched during isel; all of them must be converted to a target-specific
+  // node here.
+
+  // Normalize the input vectors. Here splats, zeroed vectors, profitable
+  // narrowing and commutation of operands should be handled. The actual code
+  // doesn't include all of those yet; work in progress...
+  SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
+  if (NewOp.getNode())
+    return NewOp;

   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
   // unpckh_undef). Only use pshufd if speed is more important than size.
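// A minimal sketch of the "known mask" matching referred to in step 2 above:
// for 4-lane types it boils down to folding the shuffle mask into the 8-bit
// immediate carried by PSHUFD/SHUFPS-style nodes, two bits per result lane.
// The helper name is illustrative; in-tree, X86::getShuffleSHUFImmediate does
// roughly this for the DAG node.
static unsigned getSketchSHUFImmediate(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 3; i >= 0; --i) {
    Imm <<= 2;
    Imm |= (Mask[i] < 0 ? 0 : Mask[i]) & 3; // undef lanes may pick any element
  }
  return Imm; // e.g. mask <2,3,2,3> yields 0xEE
}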
@@ -5309,6 +5420,18 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (VT != MVT::v2i64 && VT != MVT::v2f64) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && + RelaxedMayFoldVectorLoad(V1)) + return getMOVDDup(Op, dl, V1, DAG); + + if (X86::isMOVHLPS_v_undef_Mask(SVOp)) + return getMOVHighToLow(Op, dl, DAG); + + // Use to match splats + if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && + (VT == MVT::v2f64 || VT == MVT::v2i64)) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (X86::isPSHUFDMask(SVOp)) { // The actual implementation will match the mask in the if above and then // during isel it can match several different instructions, not only pshufd @@ -5349,7 +5472,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return V2; if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!isMMX && !X86::isMOVLPMask(SVOp)) { + if (!X86::isMOVLPMask(SVOp)) { if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); @@ -5359,22 +5482,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // FIXME: fold these into legal mask. - if (!isMMX) { - if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) - return getMOVLowToHigh(Op, dl, DAG, HasSSE2); + if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) + return getMOVLowToHigh(Op, dl, DAG, HasSSE2); - if (X86::isMOVHLPSMask(SVOp)) - return getMOVHighToLow(Op, dl, DAG); + if (X86::isMOVHLPSMask(SVOp)) + return getMOVHighToLow(Op, dl, DAG); - if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) - return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); + if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); - if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) - return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); + if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); - if (X86::isMOVLPMask(SVOp)) - return getMOVLP(Op, dl, DAG, HasSSE2); - } + if (X86::isMOVLPMask(SVOp)) + return getMOVLP(Op, dl, DAG, HasSSE2); if (ShouldXformToMOVHLPS(SVOp) || ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) @@ -5415,12 +5536,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (X86::isUNPCKLMask(SVOp)) - return (isMMX) ? - Op : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); if (X86::isUNPCKHMask(SVOp)) - return (isMMX) ? - Op : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first @@ -5444,18 +5563,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *NewSVOp = cast(NewOp); if (X86::isUNPCKLMask(NewSVOp)) - return (isMMX) ? - NewOp : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); if (X86::isUNPCKHMask(NewSVOp)) - return (isMMX) ? 
- NewOp : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); } - // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. - // Normalize the node to match x86 shuffle ops if needed - if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) + if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are @@ -5464,15 +5579,18 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SmallVector M; SVOp->getMask(M); - // Very little shuffling can be done for 64-bit vectors right now. - if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ? Op : SDValue(); + if (isPALIGNRMask(M, VT, HasSSSE3)) + return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, + X86::getShufflePALIGNRImmediate(SVOp), + DAG); - // FIXME: pshufb, blends, shifts. - if (VT.getVectorNumElements() == 2 || - ShuffleVectorSDNode::isSplatMask(&M[0], VT) || - isPALIGNRMask(M, VT, Subtarget->hasSSSE3())) - return Op; + if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && + SVOp->getSplatIndex() == 0 && V2IsUndef) { + if (VT == MVT::v2f64) + return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); + if (VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); + } if (isPSHUFHWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, @@ -5514,8 +5632,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return NewOp; } - // Handle all 4 wide cases with a number of shuffles except for MMX. - if (NumElems == 4 && !isMMX) + // Handle all 4 wide cases with a number of shuffles. + if (NumElems == 4) return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); return SDValue(); @@ -5538,7 +5656,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, if (Idx == 0) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, dl, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), Op.getOperand(1))); @@ -5559,14 +5677,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, if ((User->getOpcode() != ISD::STORE || (isa(Op.getOperand(1)) && cast(Op.getOperand(1))->isNullValue())) && - (User->getOpcode() != ISD::BIT_CONVERT || + (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); + return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); } else if (VT == MVT::i32) { // ExtractPS works with constant index. if (isa(Op.getOperand(1))) @@ -5597,7 +5715,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (Idx == 0) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, dl, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. 
@@ -5657,8 +5775,6 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, unsigned Opc; if (VT == MVT::v8i16) Opc = X86ISD::PINSRW; - else if (VT == MVT::v4i16) - Opc = X86ISD::MMX_PINSRW; else if (VT == MVT::v16i8) Opc = X86ISD::PINSRB; else @@ -5714,8 +5830,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(cast(N2)->getZExtValue()); - return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW, - dl, VT, N0, N1, N2); + return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); } return SDValue(); } @@ -5723,22 +5838,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - + if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); - EVT VT = MVT::v2i32; - switch (Op.getValueType().getSimpleVT().SimpleTy) { - default: break; - case MVT::v16i8: - case MVT::v8i16: - VT = MVT::v4i32; - break; - } - return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); + assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && + "Expected an SSE type!"); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as @@ -5804,12 +5913,11 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. - if (OpFlag) { + if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()), Result); - } return Result; } @@ -5913,7 +6021,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, - PseudoSourceValue::getGOT(), 0, false, false, 0); + MachinePointerInfo::getGOT(), false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -5985,14 +6093,14 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit) { DebugLoc dl = GA->getDebugLoc(); - // Get the Thread Pointer - SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, - DebugLoc(), PtrVT, - DAG.getRegister(is64Bit? X86::FS : X86::GS, - MVT::i32)); - SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, - NULL, 0, false, false, 0); + // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). + Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), + is64Bit ? 257 : 256)); + + SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getIntPtrConstant(0), + MachinePointerInfo(Ptr), false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. 
One exception is @@ -6011,14 +6119,14 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial // exec) - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - PseudoSourceValue::getGOT(), 0, false, false, 0); + MachinePointerInfo::getGOT(), false, false, 0); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. @@ -6027,29 +6135,29 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - + GlobalAddressSDNode *GA = cast(Op); const GlobalValue *GV = GA->getGlobal(); if (Subtarget->isTargetELF()) { // TODO: implement the "local dynamic" model // TODO: implement the "initial exec"model for pic executables - + // If GV is an alias then use the aliasee for determining // thread-localness. if (const GlobalAlias *GA = dyn_cast(GV)) GV = GA->resolveAliasedGlobal(false); - - TLSModel::Model model + + TLSModel::Model model = getTLSModel(GV, getTargetMachine().getRelocationModel()); - + switch (model) { case TLSModel::GeneralDynamic: case TLSModel::LocalDynamic: // not implemented if (Subtarget->is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); - + case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, @@ -6060,7 +6168,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; - + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && @@ -6069,24 +6177,24 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; - DebugLoc DL = Op.getDebugLoc(); + DebugLoc DL = Op.getDebugLoc(); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, getPointerTy(), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); - + // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()), Offset); - + // Lowering the machine isd will make sure everything is in the right // location. SDValue Args[] = { Offset }; SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); - + // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setAdjustsStack(true); @@ -6096,7 +6204,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); } - + assert(false && "TLS not implemented for this target."); @@ -6155,12 +6263,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); - if (SrcVT.isVector()) { - if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { - return Op; - } + if (SrcVT.isVector()) return SDValue(); - } assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); @@ -6181,25 +6285,36 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, - PseudoSourceValue::getFixedStack(SSFI), 0, + MachinePointerInfo::getFixedStack(SSFI), false, false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, - SDValue StackSlot, + SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD - DebugLoc dl = Op.getDebugLoc(); + DebugLoc DL = Op.getDebugLoc(); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); + + unsigned ByteSize = SrcVT.getSizeInBits()/8; + + int SSFI = cast(StackSlot)->getIndex(); + MachineMemOperand *MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); + SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; - SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, - Tys, Ops, array_lengthof(Ops)); + SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : + X86ISD::FILD, DL, + Tys, Ops, array_lengthof(Ops), + SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); @@ -6209,15 +6324,23 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. 
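  // As a rough illustration, when a value converted through the x87 FILD has
  // to end up in an SSE register (e.g. the 64-bit unsigned-to-double path),
  // the emitted code follows this shape (registers and stack offsets are
  // examples only, not what the allocator will necessarily pick):
  //   movq   %rax, (%rsp)       # spill the integer operand
  //   fildll (%rsp)             # X86ISD::FILD: x87 load-and-convert
  //   fstpl  8(%rsp)            # X86ISD::FST: round and store as f64
  //   movsd  8(%rsp), %xmm0     # reload into an SSE register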
MachineFunction &MF = DAG.getMachineFunction(); - int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); + unsigned SSFISize = Op.getValueType().getSizeInBits()/8; + int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; - Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); - Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, - PseudoSourceValue::getFixedStack(SSFI), 0, + MachineMemOperand *MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); + + Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, + Ops, array_lengthof(Ops), + Op.getValueType(), MMO); + Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(SSFI), false, false, 0); } @@ -6291,12 +6414,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, DAG.getIntPtrConstant(0))); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 16); SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); - SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); + SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 16); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); @@ -6324,19 +6447,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, DAG.getIntPtrConstant(0))); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), DAG.getIntPtrConstant(0)); // Or the load with the bias. SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), DAG.getIntPtrConstant(0)); // Subtract the bias. 
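// The vector sequence above is the classic bias trick; in scalar terms it is
// roughly the sketch below (the function name is illustrative and IEEE-754
// doubles are assumed): OR the 32-bit value into the mantissa of 2^52, then
// subtract 2^52 so only (double)X remains.
#include <cstdint>
#include <cstring>
static double SketchUint32ToDouble(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // bit pattern of 2^52 + X, exact
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 4503599627370496.0;             // subtract the 2^52 bias
}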
@@ -6381,24 +6504,34 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackSlot, WordOff); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, NULL, 0, false, false, 0); + StackSlot, MachinePointerInfo(), + false, false, 0); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), - OffsetSlot, NULL, 0, false, false, 0); + OffsetSlot, MachinePointerInfo(), + false, false, 0); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); return Fild; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, NULL, 0, false, false, 0); + StackSlot, MachinePointerInfo(), + false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) + int SSFI = cast(StackSlot)->getIndex(); + MachineMemOperand *MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, 8, 8); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; - SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3); + SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, + MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); @@ -6422,8 +6555,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), - FudgePtr, PseudoSourceValue::getConstantPool(), - 0, MVT::f32, false, false, 4); + FudgePtr, MachinePointerInfo::getConstantPool(), + MVT::f32, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. 
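  // In scalar terms the i64 path here computes, roughly (a sketch only; the
  // real code keeps the intermediate in 80-bit precision for the reason given
  // above):
  //   double D = (double)(int64_t)X;                    // what FILD produces
  //   if ((int64_t)X < 0) D += 18446744073709551616.0;  // re-add 2^64, the 0x5F800000 fudge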
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); @@ -6431,7 +6564,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, std::pair X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { - DebugLoc dl = Op.getDebugLoc(); + DebugLoc DL = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); @@ -6460,6 +6593,8 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + + unsigned Opc; switch (DstTy.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); @@ -6470,37 +6605,43 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); - if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { + EVT TheVT = Op.getOperand(0).getValueType(); + if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); - Chain = DAG.getStore(Chain, dl, Value, StackSlot, - PseudoSourceValue::getFixedStack(SSFI), 0, + Chain = DAG.getStore(Chain, DL, Value, StackSlot, + MachinePointerInfo::getFixedStack(SSFI), false, false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { - Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) + Chain, StackSlot, DAG.getValueType(TheVT) }; - Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, + DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); } + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + Ops, 3, DstTy, MMO); return std::make_pair(FIST, StackSlot); } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) { - if (Op.getValueType() == MVT::v2i32 && - Op.getOperand(0).getValueType() == MVT::v2f64) { - return Op; - } + if (Op.getValueType().isVector()) return SDValue(); - } std::pair Vals = FP_TO_INTHelper(Op, DAG, true); SDValue FIST = Vals.first, StackSlot = Vals.second; @@ -6509,7 +6650,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, // Load the result. return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, NULL, 0, false, false, 0); + FIST, StackSlot, MachinePointerInfo(), false, false, 0); } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, @@ -6520,7 +6661,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, // Load the result. 
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, NULL, 0, false, false, 0); + FIST, StackSlot, MachinePointerInfo(), false, false, 0); } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -6546,7 +6687,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 16); return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -6573,14 +6714,14 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 16); if (VT.isVector()) { - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::XOR, dl, MVT::v2i64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op.getOperand(0)), - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); } else { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } @@ -6622,7 +6763,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); @@ -6632,7 +6773,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, DAG.getConstant(32, MVT::i32)); - SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); + SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, DAG.getIntPtrConstant(0)); } @@ -6651,7 +6792,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 16); SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); @@ -7003,11 +7144,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { switch (VT.getSimpleVT().SimpleTy) { default: break; - case MVT::v8i8: case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; - case MVT::v4i16: case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; - case MVT::v2i32: case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; } @@ -7109,7 +7247,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { ConstantSDNode *C = dyn_cast(Cond.getOperand(1)); - if (C && C->getAPIntValue() 
== 1) + if (C && C->getAPIntValue() == 1) Cond = Cond.getOperand(0); } @@ -7142,7 +7280,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. - if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); if (NewSetCC.getNode()) { CC = NewSetCC.getOperand(0); @@ -7216,7 +7354,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { ConstantSDNode *C = dyn_cast(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) + if (C && C->getAPIntValue() == 1) Cond = Cond.getOperand(0); } @@ -7317,7 +7455,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. - if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); if (NewSetCC.getNode()) { CC = NewSetCC.getOperand(0); @@ -7344,8 +7482,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetCygMing() && - "This should be used only on Cygwin/Mingw targets"); + assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && + "This should be used only on Windows targets"); DebugLoc dl = Op.getDebugLoc(); // Get the inputs. @@ -7362,7 +7500,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); - Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); + Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); Flag = Chain.getValue(1); Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); @@ -7376,15 +7514,15 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); - DebugLoc dl = Op.getDebugLoc(); + DebugLoc DL = Op.getDebugLoc(); - if (!Subtarget->is64Bit()) { + if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. 
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), getPointerTy()); - return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, - false, false, 0); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); } // __va_list_tag: @@ -7395,48 +7533,107 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset - SDValue Store = DAG.getStore(Op.getOperand(0), dl, + SDValue Store = DAG.getStore(Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), MVT::i32), - FIN, SV, 0, false, false, 0); + FIN, MachinePointerInfo(SV), false, false, 0); MemOps.push_back(Store); // Store fp_offset - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getIntPtrConstant(4)); - Store = DAG.getStore(Op.getOperand(0), dl, + Store = DAG.getStore(Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), MVT::i32), - FIN, SV, 4, false, false, 0); + FIN, MachinePointerInfo(SV, 4), false, false, 0); MemOps.push_back(Store); // Store ptr to overflow_arg_area - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getIntPtrConstant(4)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), getPointerTy()); - Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, + Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, + MachinePointerInfo(SV, 8), false, false, 0); MemOps.push_back(Store); // Store ptr to reg_save_area. - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getIntPtrConstant(8)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); - Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, - false, false, 0); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, + MachinePointerInfo(SV, 16), false, false, 0); MemOps.push_back(Store); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], MemOps.size()); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { - // X86-64 va_list is a struct { i32, i32, i8*, i8* }. - assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); + assert(Subtarget->is64Bit() && + "LowerVAARG only handles 64-bit va_arg!"); + assert((Subtarget->isTargetLinux() || + Subtarget->isTargetDarwin()) && + "Unhandled target in LowerVAARG"); + assert(Op.getNode()->getNumOperands() == 4); + SDValue Chain = Op.getOperand(0); + SDValue SrcPtr = Op.getOperand(1); + const Value *SV = cast(Op.getOperand(2))->getValue(); + unsigned Align = Op.getConstantOperandVal(3); + DebugLoc dl = Op.getDebugLoc(); - report_fatal_error("VAArgInst is not yet implemented for x86-64!"); - return SDValue(); + EVT ArgVT = Op.getNode()->getValueType(0); + const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); + uint8_t ArgMode; + + // Decide which area this value should be read from. + // TODO: Implement the AMD64 ABI in its entirety. This simple + // selection mechanism works only for the basic types. + if (ArgVT == MVT::f80) { + llvm_unreachable("va_arg for f80 not yet implemented"); + } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { + ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 
+ } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { + ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. + } else { + llvm_unreachable("Unhandled argument type in LowerVAARG"); + } + + if (ArgMode == 2) { + // Sanity Check: Make sure using fp_offset makes sense. + assert(!UseSoftFloat && + !(DAG.getMachineFunction() + .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && + Subtarget->hasSSE1()); + } + + // Insert VAARG_64 node into the DAG + // VAARG_64 returns two values: Variable Argument Address, Chain + SmallVector InstOps; + InstOps.push_back(Chain); + InstOps.push_back(SrcPtr); + InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); + InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); + InstOps.push_back(DAG.getConstant(Align, MVT::i32)); + SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); + SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, + VTs, &InstOps[0], InstOps.size(), + MVT::i64, + MachinePointerInfo(SV), + /*Align=*/0, + /*Volatile=*/false, + /*ReadMem=*/true, + /*WriteMem=*/true); + Chain = VAARG.getValue(1); + + // Load the next argument and return it + return DAG.getLoad(ArgVT, dl, + Chain, + VAARG, + MachinePointerInfo(), + false, false, 0); } SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { @@ -7447,11 +7644,12 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); - DebugLoc dl = Op.getDebugLoc(); + DebugLoc DL = Op.getDebugLoc(); - return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, - false, DstSV, 0, SrcSV, 0); + false, + MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } SDValue @@ -7720,10 +7918,11 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); } else { ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); +// FIXME this must be lowered to get rid of the invalid type. } EVT VT = Op.getValueType(); - ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); + ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(NewIntNo, MVT::i32), Op.getOperand(1), ShAmt); @@ -7747,13 +7946,13 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), - NULL, 0, false, false, 0); + MachinePointerInfo(), false, false, 0); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, NULL, 0, false, false, 0); + RetAddrFI, MachinePointerInfo(), false, false, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { @@ -7766,7 +7965,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { unsigned FrameReg = Subtarget->is64Bit() ? 
X86::RBP : X86::EBP; SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) - FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), false, false, 0); return FrameAddr; } @@ -7791,7 +7991,8 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, DAG.getIntPtrConstant(TD->getPointerSize())); StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); - Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); + Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), + false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); MF.getRegInfo().addLiveOut(StoreAddrReg); @@ -7826,11 +8027,13 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), - Addr, TrmpAddr, 0, false, false, 0); + Addr, MachinePointerInfo(TrmpAddr), + false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, MVT::i64)); - OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, + MachinePointerInfo(TrmpAddr, 2), false, false, 2); // Load the 'nest' parameter value into R10. @@ -7839,11 +8042,13 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), - Addr, TrmpAddr, 10, false, false, 0); + Addr, MachinePointerInfo(TrmpAddr, 10), + false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, MVT::i64)); - OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 12), false, false, 2); // Jump to the nested function. @@ -7851,13 +8056,15 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), - Addr, TrmpAddr, 20, false, false, 0); + Addr, MachinePointerInfo(TrmpAddr, 20), + false, false, 0); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, - TrmpAddr, 22, false, false, 0); + MachinePointerInfo(TrmpAddr, 22), + false, false, 0); SDValue Ops[] = { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; @@ -7919,22 +8126,26 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri|N86Reg, MVT::i8), - Trmp, TrmpAddr, 0, false, false, 0); + Trmp, MachinePointerInfo(TrmpAddr), + false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, MVT::i32)); - OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 1), false, false, 1); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
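    // For reference, the 32-bit trampoline assembled here is 10 bytes (a
    // sketch; %reg is the nest register chosen earlier):
    //   offset 0:    0xB8+reg             movl $<nest value>, %reg
    //   offset 1-4:  <nest value>
    //   offset 5:    0xE9                 jmp <rel32>
    //   offset 6-9:  <rel32 to the nested function>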
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, - TrmpAddr, 5, false, false, 1); + MachinePointerInfo(TrmpAddr, 5), + false, false, 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, MVT::i32)); - OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, + MachinePointerInfo(TrmpAddr, 6), false, false, 1); SDValue Ops[] = @@ -7969,41 +8180,48 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, const TargetFrameInfo &TFI = *TM.getFrameInfo(); unsigned StackAlignment = TFI.getStackAlignment(); EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + DebugLoc DL = Op.getDebugLoc(); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, - DAG.getEntryNode(), StackSlot); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOStore, 2, 2); + + SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, + DAG.getVTList(MVT::Other), + Ops, 2, MVT::i16, MMO); // Load FP Control Word from stack slot - SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, - false, false, 0); + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, + MachinePointerInfo(), false, false, 0); // Transform as necessary SDValue CWD1 = - DAG.getNode(ISD::SRL, dl, MVT::i16, - DAG.getNode(ISD::AND, dl, MVT::i16, + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x800, MVT::i16)), DAG.getConstant(11, MVT::i8)); SDValue CWD2 = - DAG.getNode(ISD::SRL, dl, MVT::i16, - DAG.getNode(ISD::AND, dl, MVT::i16, + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x400, MVT::i16)), DAG.getConstant(9, MVT::i8)); SDValue RetVal = - DAG.getNode(ISD::AND, dl, MVT::i16, - DAG.getNode(ISD::ADD, dl, MVT::i16, - DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), + DAG.getNode(ISD::AND, DL, MVT::i16, + DAG.getNode(ISD::ADD, DL, MVT::i16, + DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), DAG.getConstant(1, MVT::i16)), DAG.getConstant(3, MVT::i16)); return DAG.getNode((VT.getSizeInBits() < 16 ? 
- ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); + ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { @@ -8129,16 +8347,16 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { Op.getOperand(1), DAG.getConstant(23, MVT::i32)); ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); - + std::vector CV(4, CI); Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 16); Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); - Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } @@ -8156,7 +8374,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { Constant *C = ConstantVector::get(CVM1); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 16); // r = pblendv(r, psllw(r & (char16)15, 4), a); @@ -8169,12 +8387,13 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { R, M, Op); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - + C = ConstantVector::get(CVM2); CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, false, false, 16); - + MachinePointerInfo::getConstantPool(), + false, false, 16); + // r = pblendv(r, psllw(r & (char16)63, 2), a); M = DAG.getNode(ISD::AND, dl, VT, R, M); M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, @@ -8185,7 +8404,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { R, M, Op); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - + // return pblendv(r, r+r, a); R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), @@ -8265,10 +8484,10 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ DebugLoc dl = Op.getDebugLoc(); - + if (!Subtarget->hasSSE2()) { SDValue Chain = Op.getOperand(0); - SDValue Zero = DAG.getConstant(0, + SDValue Zero = DAG.getConstant(0, Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32); SDValue Ops[] = { DAG.getRegister(X86::ESP, MVT::i32), // Base @@ -8279,37 +8498,37 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ Zero, Chain }; - SDNode *Res = + SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, array_lengthof(Ops)); return SDValue(Res, 0); } - + unsigned isDev = cast(Op.getOperand(5))->getZExtValue(); if (!isDev) return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); - + unsigned Op1 = cast(Op.getOperand(1))->getZExtValue(); unsigned Op2 = cast(Op.getOperand(2))->getZExtValue(); unsigned Op3 = cast(Op.getOperand(3))->getZExtValue(); unsigned Op4 = cast(Op.getOperand(4))->getZExtValue(); - + // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; if (!Op1 && !Op2 && !Op3 && Op4) return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); - + // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; if (Op1 && !Op2 && !Op3 && !Op4) return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); - - // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), + + // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), // (MFENCE)>; return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); } SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { EVT T = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + DebugLoc DL = Op.getDebugLoc(); unsigned Reg = 0; unsigned size = 0; switch(T.getSimpleVT().SimpleTy) { @@ -8323,7 +8542,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { Reg = X86::RAX; size = 8; break; } - SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, + SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), @@ -8331,9 +8550,11 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { DAG.getTargetConstant(size, MVT::i8), cpIn.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); + MachineMemOperand *MMO = cast(Op)->getMemOperand(); + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, + Ops, 5, T, MMO); SDValue cpOut = - DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); + DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); return cpOut; } @@ -8356,16 +8577,16 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, return DAG.getMergeValues(Ops, 2, dl); } -SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, +SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); - assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && + assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && !DisableMMX) && - "Unexpected custom BIT_CONVERT"); - assert((DstVT == MVT::i64 || + "Unexpected custom BITCAST"); + assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && - "Unexpected custom BIT_CONVERT"); + "Unexpected custom BITCAST"); // i64 <=> MMX conversions are Legal. 
if (SrcVT==MVT::i64 && DstVT.isVector()) return Op; @@ -8448,7 +8669,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); - case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); + case ISD::BITCAST: return LowerBITCAST(Op, DAG); } } @@ -8492,8 +8713,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (FIST.getNode() != 0) { EVT VT = N->getValueType(0); // Return a load from the stack slot. - Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, - false, false, 0)); + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, + MachinePointerInfo(), false, false, 0)); } return; } @@ -8535,7 +8756,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, N->getOperand(1), swapInH.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); + MachineMemOperand *MMO = cast(N)->getMemOperand(); + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, + Ops, 3, T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, MVT::i32, Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, @@ -8608,7 +8831,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; - case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; @@ -8616,7 +8838,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; - case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; @@ -8688,7 +8909,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; - case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; + case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; + case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; } } @@ -9210,15 +9432,12 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && "Target must have SSE4.2 or AVX features enabled"); DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - unsigned Opc; - if (!Subtarget->hasAVX()) { if (memArg) Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; @@ -9231,23 +9450,317 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, Opc = numArgs == 3 ? 
X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; } - MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); - + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); for (unsigned i = 0; i < numArgs; ++i) { MachineOperand &Op = MI->getOperand(i+1); - if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } - - BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) + BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) .addReg(X86::XMM0); MI->eraseFromParent(); + return BB; +} +MachineBasicBlock * +X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + // Address into RAX/EAX, other two args into ECX, EDX. + unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + + unsigned ValOps = X86::AddrNumOperands; + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI->getOperand(ValOps).getReg()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) + .addReg(MI->getOperand(ValOps+1).getReg()); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); + + MI->eraseFromParent(); // The pseudo is gone now. return BB; } +MachineBasicBlock * +X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + // First arg in ECX, the second in EAX. + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI->getOperand(0).getReg()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI->getOperand(1).getReg()); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAARG64WithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit va_arg instruction on X86-64. 
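  // The machine code emitted below implements the usual AMD64 va_arg
  // algorithm; as a plain C sketch (names are illustrative, 'size' is the
  // 8-byte-rounded argument size, the fp_offset case differs only in its bound
  // and stride, and the extra overflow-area alignment handling is omitted):
  //
  //   char *va_arg_gp_sketch(struct va_list_tag *vl, unsigned size) {
  //     if (vl->gp_offset < 6 * 8 + 8 - size) {      // room left in reg_save_area?
  //       char *addr = vl->reg_save_area + vl->gp_offset;
  //       vl->gp_offset += 8;                        // one GPR slot consumed
  //       return addr;
  //     }
  //     char *addr = vl->overflow_arg_area;          // otherwise read from the stack
  //     vl->overflow_arg_area += size;               // keep it 8-byte aligned
  //     return addr;
  //   }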
+ + // Operands to this pseudo-instruction: + // 0 ) Output : destination address (reg) + // 1-5) Input : va_list address (addr, i64mem) + // 6 ) ArgSize : Size (in bytes) of vararg type + // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset + // 8 ) Align : Alignment of type + // 9 ) EFLAGS (implicit-def) + + assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); + assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); + + unsigned DestReg = MI->getOperand(0).getReg(); + MachineOperand &Base = MI->getOperand(1); + MachineOperand &Scale = MI->getOperand(2); + MachineOperand &Index = MI->getOperand(3); + MachineOperand &Disp = MI->getOperand(4); + MachineOperand &Segment = MI->getOperand(5); + unsigned ArgSize = MI->getOperand(6).getImm(); + unsigned ArgMode = MI->getOperand(7).getImm(); + unsigned Align = MI->getOperand(8).getImm(); + + // Memory Reference + assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + // Machine Information + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); + const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); + DebugLoc DL = MI->getDebugLoc(); + + // struct va_list { + // i32 gp_offset + // i32 fp_offset + // i64 overflow_area (address) + // i64 reg_save_area (address) + // } + // sizeof(va_list) = 24 + // alignment(va_list) = 8 + + unsigned TotalNumIntRegs = 6; + unsigned TotalNumXMMRegs = 8; + bool UseGPOffset = (ArgMode == 1); + bool UseFPOffset = (ArgMode == 2); + unsigned MaxOffset = TotalNumIntRegs * 8 + + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); + + /* Align ArgSize to a multiple of 8 */ + unsigned ArgSizeA8 = (ArgSize + 7) & ~7; + bool NeedsAlign = (Align > 8); + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *overflowMBB; + MachineBasicBlock *offsetMBB; + MachineBasicBlock *endMBB; + + unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB + unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB + unsigned OffsetReg = 0; + + if (!UseGPOffset && !UseFPOffset) { + // If we only pull from the overflow region, we don't create a branch. + // We don't need to alter control flow. + OffsetDestReg = 0; // unused + OverflowDestReg = DestReg; + + offsetMBB = NULL; + overflowMBB = thisMBB; + endMBB = thisMBB; + } else { + // First emit code to check if gp_offset (or fp_offset) is below the bound. + // If so, pull the argument from reg_save_area. (branch to offsetMBB) + // If not, pull from overflow_area. (branch to overflowMBB) + // + // thisMBB + // | . + // | . + // offsetMBB overflowMBB + // | . + // | . 
+ // endMBB + + // Registers for the PHI in endMBB + OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); + OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); + + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *MF = MBB->getParent(); + overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); + offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); + endMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + + // Insert the new basic blocks + MF->insert(MBBIter, offsetMBB); + MF->insert(MBBIter, overflowMBB); + MF->insert(MBBIter, endMBB); + + // Transfer the remainder of MBB and its successor edges to endMBB. + endMBB->splice(endMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(MI)), + thisMBB->end()); + endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); + + // Make offsetMBB and overflowMBB successors of thisMBB + thisMBB->addSuccessor(offsetMBB); + thisMBB->addSuccessor(overflowMBB); + + // endMBB is a successor of both offsetMBB and overflowMBB + offsetMBB->addSuccessor(endMBB); + overflowMBB->addSuccessor(endMBB); + + // Load the offset value into a register + OffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Check if there is enough room left to pull this argument. + BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) + .addReg(OffsetReg) + .addImm(MaxOffset + 8 - ArgSizeA8); + + // Branch to "overflowMBB" if offset >= max + // Fall through to "offsetMBB" otherwise + BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) + .addMBB(overflowMBB); + } + + // In offsetMBB, emit code to use the reg_save_area. + if (offsetMBB) { + assert(OffsetReg != 0); + + // Read the reg_save_area address. + unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 16) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Zero-extend the offset + unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + .addImm(0) + .addReg(OffsetReg) + .addImm(X86::sub_32bit); + + // Add the offset to the reg_save_area to get the final address. + BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) + .addReg(OffsetReg64) + .addReg(RegSaveReg); + + // Compute the offset for the next argument + unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) + .addReg(OffsetReg) + .addImm(UseFPOffset ? 16 : 8); + + // Store it back into the va_list. + BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .addReg(NextOffsetReg) + .setMemRefs(MMOBegin, MMOEnd); + + // Jump to endMBB + BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) + .addMBB(endMBB); + } + + // + // Emit code to use overflow area + // + + // Load the overflow_area address into a register. 
+ unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // If we need to align it, do so. Otherwise, just copy the address + // to OverflowDestReg. + if (NeedsAlign) { + // Align the overflow address + assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); + unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); + + // aligned_addr = (addr + (align-1)) & ~(align-1) + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) + .addReg(OverflowAddrReg) + .addImm(Align-1); + + BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) + .addReg(TmpReg) + .addImm(~(uint64_t)(Align-1)); + } else { + BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) + .addReg(OverflowAddrReg); + } + + // Compute the next overflow address after this argument. + // (the overflow address should be kept 8-byte aligned) + unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) + .addReg(OverflowDestReg) + .addImm(ArgSizeA8); + + // Store the new overflow address. + BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .addReg(NextAddrReg) + .setMemRefs(MMOBegin, MMOEnd); + + // If we branched, emit the PHI to the front of endMBB. + if (offsetMBB) { + BuildMI(*endMBB, endMBB->begin(), DL, + TII->get(X86::PHI), DestReg) + .addReg(OffsetDestReg).addMBB(offsetMBB) + .addReg(OverflowDestReg).addMBB(overflowMBB); + } + + // Erase the pseudo instruction + MI->eraseFromParent(); + + return endMBB; +} + MachineBasicBlock * X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineInstr *MI, @@ -9303,8 +9816,8 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( - PseudoSourceValue::getFixedStack(RegSaveFrameIndex), - MachineMemOperand::MOStore, Offset, + MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), + MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) .addFrameIndex(RegSaveFrameIndex) @@ -9396,7 +9909,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, +X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -9406,8 +9919,11 @@ X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, // FIXME: The code should be tweaked as soon as we'll try to do codegen for // mingw-w64. + const char *StackProbeSymbol = + Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; + BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) - .addExternalSymbol("_alloca") + .addExternalSymbol(StackProbeSymbol) .addReg(X86::EAX, RegState::Implicit) .addReg(X86::ESP, RegState::Implicit) .addReg(X86::EAX, RegState::Define | RegState::Implicit) @@ -9425,30 +9941,30 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. 
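The VAARG_64 inserter that ends here is the MachineInstr-level form of the standard System V x86-64 va_arg algorithm: take the register save area while gp_offset/fp_offset is below its bound, otherwise take (and advance) the 8-byte-granular overflow area, realigning it first when the type needs more than 8-byte alignment. A hedged plain-C++ sketch of the same decision follows; VaList64, next_vararg_addr and the sample main are illustrative names of mine, and the sketch assumes a 64-bit host so pointers fit in uint64_t.

// Plain C++ model of the address computation the VAARG_64 pseudo expands to.
#include <cstdint>
#include <cstdio>

struct VaList64 {            // ABI layout documented in the inserter above
  uint32_t gp_offset;        // offset of the next unused GPR slot
  uint32_t fp_offset;        // offset of the next unused XMM slot
  uint64_t overflow_area;    // address of the next stack argument
  uint64_t reg_save_area;    // base of the spilled argument registers
};

uint64_t next_vararg_addr(VaList64 &vl, unsigned ArgSize, unsigned Align,
                          unsigned ArgMode) {  // 0=stack only, 1=gp, 2=fp
  const unsigned TotalNumIntRegs = 6, TotalNumXMMRegs = 8;
  const bool UseFPOffset = (ArgMode == 2);
  const unsigned MaxOffset =
      TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
  const unsigned ArgSizeA8 = (ArgSize + 7) & ~7u;       // round size up to 8

  if (ArgMode != 0) {
    uint32_t &Off = UseFPOffset ? vl.fp_offset : vl.gp_offset;
    // Same bound the inserter compares against before branching to overflow.
    if (Off < MaxOffset + 8 - ArgSizeA8) {
      uint64_t Addr = vl.reg_save_area + Off;           // ADD64rr above
      Off += UseFPOffset ? 16 : 8;                      // stored back via MOV32mr
      return Addr;
    }
  }

  // Overflow path: realign if the type is over-aligned, then bump by ArgSizeA8.
  uint64_t Addr = vl.overflow_area;
  if (Align > 8)
    Addr = (Addr + Align - 1) & ~uint64_t(Align - 1);   // ADD64ri32 + AND64ri32
  vl.overflow_area = Addr + ArgSizeA8;
  return Addr;
}

int main() {
  alignas(16) unsigned char regsave[176] = {};
  unsigned char stack[64] = {};
  VaList64 vl = {0, 6 * 8, reinterpret_cast<uint64_t>(&stack[0]),
                 reinterpret_cast<uint64_t>(&regsave[0])};
  for (int i = 0; i < 8; ++i) {
    uint64_t a = next_vararg_addr(vl, /*ArgSize=*/8, /*Align=*/8, /*ArgMode=*/1);
    bool fromRegs = a >= vl.reg_save_area && a < vl.reg_save_area + 176;
    std::printf("i64 vararg %d comes from %s\n", i,
                fromRegs ? "reg_save_area" : "overflow_area");
  }
  return 0;
}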
- const X86InstrInfo *TII + const X86InstrInfo *TII = static_cast(getTargetMachine().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); MachineFunction *F = BB->getParent(); - bool IsWin64 = Subtarget->isTargetWin64(); - + + assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); - + if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0).addReg(0) - .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); - MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? X86::WINCALL64m : X86::CALL64m)); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0).addReg(0) - .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); @@ -9458,13 +9974,13 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0).addReg(0) - .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, MI->getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); } - + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -9474,13 +9990,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { default: assert(false && "Unexpected instr type to insert"); - case X86::MINGW_ALLOCA: - return EmitLoweredMingwAlloca(MI, BB); + case X86::WIN_ALLOCA: + return EmitLoweredWinAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_GR8: - case X86::CMOV_V1I64: case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_V4F32: @@ -9590,6 +10105,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: return EmitPCMP(MI, BB, 5, true /* in mem */); + // Thread synchronization. + case X86::MONITOR: + return EmitMonitor(MI, BB); + case X86::MWAIT: + return EmitMwait(MI, BB); + // Atomic Lowering. case X86::ATOMAND32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, @@ -9754,6 +10275,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, false); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + + case X86::VAARG_64: + return EmitVAARG64WithCustomInserter(MI, BB); } } @@ -9798,6 +10322,16 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, } } +unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth) const { + // SETCC_CARRY sets the dest to ~0 for true or 0 for false. + if (Op.getOpcode() == X86ISD::SETCC_CARRY) + return Op.getValueType().getScalarType().getSizeInBits(); + + // Fallback case. + return 1; +} + /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the /// node is a GlobalAddress + offset. 
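The ComputeNumSignBitsForTargetNode hook added above lets generic DAG combines know that X86ISD::SETCC_CARRY (the SBB reg,reg idiom) produces either all-zero or all-one bits, so every bit is a copy of the sign bit. A small standalone illustration of that property follows; num_sign_bits and setcc_carry are helpers of my own naming, not LLVM APIs.

// A value that is always 0 or ~0 has as many sign bits as it has bits,
// which is exactly what the hook reports for SETCC_CARRY.
#include <cstdint>
#include <cstdio>

unsigned num_sign_bits(uint32_t v) {      // leading bits equal to the sign bit
  unsigned n = 1;
  uint32_t sign = v >> 31;
  for (int bit = 30; bit >= 0 && ((v >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}

uint32_t setcc_carry(bool carry) {        // models SBB %reg, %reg
  return carry ? ~0u : 0u;
}

int main() {
  std::printf("setcc_carry(true):  %u sign bits\n",
              num_sign_bits(setcc_carry(true)));    // 32
  std::printf("setcc_carry(false): %u sign bits\n",
              num_sign_bits(setcc_carry(false)));   // 32
  std::printf("constant 5:         %u sign bits\n", num_sign_bits(5));  // 29
  return 0;
}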
bool X86TargetLowering::isGAPlusOffset(SDNode *N, @@ -9884,8 +10418,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Store the value to a temporary stack slot. SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); - SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, - 0, false, false, 0); + SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, + MachinePointerInfo(), false, false, 0); // Replace each use (extract) with a load of the appropriate element. for (SmallVectorImpl::iterator UI = Uses.begin(), @@ -9900,11 +10434,12 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), - OffsetVal, StackPtr); + StackPtr, OffsetVal); // Load the scalar. SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, - ScalarAddr, NULL, 0, false, false, 0); + ScalarAddr, MachinePointerInfo(), + false, false, 0); // Replace the exact with the load. DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); @@ -10607,9 +11142,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // pair instead. if (Subtarget->is64Bit() || F64IsLegal) { EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; - SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), - Ld->getBasePtr(), Ld->getSrcValue(), - Ld->getSrcValueOffset(), Ld->isVolatile(), + SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->getAlignment()); SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { @@ -10618,7 +11152,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Ops.size()); } return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), - St->getSrcValue(), St->getSrcValueOffset(), + St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); } @@ -10629,11 +11163,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(4, MVT::i32)); SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, - Ld->getSrcValue(), Ld->getSrcValueOffset(), + Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->getAlignment()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, - Ld->getSrcValue(), Ld->getSrcValueOffset()+4, + Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), MinAlign(Ld->getAlignment(), 4)); @@ -10650,12 +11184,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(4, MVT::i32)); SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, - St->getSrcValue(), St->getSrcValueOffset(), + St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, - St->getSrcValue(), - St->getSrcValueOffset() + 4, + St->getPointerInfo().getWithOffset(4), St->isVolatile(), St->isNonTemporal(), MinAlign(St->getAlignment(), 4)); @@ -10713,13 +11246,13 @@ static SDValue PerformBTCombine(SDNode *N, static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { SDValue Op = N->getOperand(0); - if (Op.getOpcode() == ISD::BIT_CONVERT) + if (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); EVT VT = N->getValueType(0), OpVT = Op.getValueType(); if (Op.getOpcode() == X86ISD::VZEXT_LOAD && VT.getVectorElementType().getSizeInBits() == 
OpVT.getVectorElementType().getSizeInBits()) { - return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); + return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); } return SDValue(); } @@ -10773,6 +11306,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); case X86ISD::SHUFPS: // Handle all target specific shuffles case X86ISD::SHUFPD: + case X86ISD::PALIGN: case X86ISD::PUNPCKHBW: case X86ISD::PUNPCKHWD: case X86ISD::PUNPCKHDQ: @@ -10930,13 +11464,13 @@ static bool LowerToBSwap(CallInst *CI) { bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast(CI->getCalledValue()); - std::vector Constraints = IA->ParseConstraints(); + InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); std::string AsmStr = IA->getAsmString(); // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector AsmPieces; - SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? + SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; @@ -10977,6 +11511,35 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { } break; case 3: + if (CI->getType()->isIntegerTy(32) && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { + SmallVector Words; + SplitString(AsmPieces[0], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && + Words[2] == "${0:w}") { + Words.clear(); + SplitString(AsmPieces[1], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && + Words[2] == "$0") { + Words.clear(); + SplitString(AsmPieces[2], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && + Words[2] == "${0:w}") { + AsmPieces.clear(); + const std::string &Constraints = IA->getConstraintString(); + SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); + std::sort(AsmPieces.begin(), AsmPieces.end()); + if (AsmPieces.size() == 4 && + AsmPieces[0] == "~{cc}" && + AsmPieces[1] == "~{dirflag}" && + AsmPieces[2] == "~{flags}" && + AsmPieces[3] == "~{fpsr}") { + return LowerToBSwap(CI); + } + } + } + } + } if (CI->getType()->isIntegerTy(64) && Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && @@ -11010,18 +11573,32 @@ X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(const std::string &Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { - case 'A': - return C_Register; - case 'f': - case 'r': case 'R': - case 'l': case 'q': case 'Q': - case 'x': + case 'f': + case 't': + case 'u': case 'y': + case 'x': case 'Y': return C_RegisterClass; + case 'a': + case 'b': + case 'c': + case 'd': + case 'S': + case 'D': + case 'A': + return C_Register; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'G': + case 'C': case 'e': case 'Z': return C_Other; @@ -11032,6 +11609,110 @@ X86TargetLowering::getConstraintType(const std::string &Constraint) const { return TargetLowering::getConstraintType(Constraint); } +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. 
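Stepping back to the ExpandInlineAsm change above: the new three-piece match recognizes a 32-bit byte swap hand-written as "rorw $8, ${0:w}; rorl $16, $0; rorw $8, ${0:w}" and rewrites the call to use bswap via LowerToBSwap. The identity it relies on can be checked in isolation; the sketch below uses portable rotate helpers of my own naming, not anything from the patch.

// Check that rotating the low word by 8, the full register by 16, and the
// low word by 8 again equals a 32-bit byte swap.
#include <cstdint>
#include <cstdio>

static uint16_t ror16(uint16_t v, unsigned s) {
  return (uint16_t)((v >> s) | (v << (16 - s)));
}
static uint32_t ror32(uint32_t v, unsigned s) {
  return (v >> s) | (v << (32 - s));
}

static uint32_t rotate_sequence(uint32_t x) {
  x = (x & 0xFFFF0000u) | ror16((uint16_t)x, 8);  // rorw $8, ${0:w}
  x = ror32(x, 16);                               // rorl $16, $0
  x = (x & 0xFFFF0000u) | ror16((uint16_t)x, 8);  // rorw $8, ${0:w}
  return x;
}

static uint32_t bswap32(uint32_t x) {
  return (x >> 24) | ((x >> 8) & 0xFF00u) | ((x << 8) & 0xFF0000u) | (x << 24);
}

int main() {
  const uint32_t tests[] = {0x12345678u, 0xDEADBEEFu, 0u, 0xFFu};
  for (uint32_t t : tests)
    std::printf("%08x -> rotates %08x, bswap %08x%s\n", t, rotate_sequence(t),
                bswap32(t), rotate_sequence(t) == bswap32(t) ? "" : "  MISMATCH");
  return 0;
}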
+TargetLowering::ConstraintWeight
+  X86TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  const Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+  case 'R':
+  case 'q':
+  case 'Q':
+  case 'a':
+  case 'b':
+  case 'c':
+  case 'd':
+  case 'S':
+  case 'D':
+  case 'A':
+    if (CallOperandVal->getType()->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'f':
+  case 't':
+  case 'u':
+    if (type->isFloatingPointTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'y':
+    if (type->isX86_MMXTy() && !DisableMMX && Subtarget->hasMMX())
+      weight = CW_SpecificReg;
+    break;
+  case 'x':
+  case 'Y':
+    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1())
+      weight = CW_Register;
+    break;
+  case 'I':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+      if (C->getZExtValue() <= 31)
+        weight = CW_Constant;
+    }
+    break;
+  case 'J':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 63)
+        weight = CW_Constant;
+    }
+    break;
+  case 'K':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+        weight = CW_Constant;
+    }
+    break;
+  case 'L':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+        weight = CW_Constant;
+    }
+    break;
+  case 'M':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 3)
+        weight = CW_Constant;
+    }
+    break;
+  case 'N':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xff)
+        weight = CW_Constant;
+    }
+    break;
+  case 'G':
+  case 'C':
+    if (dyn_cast<ConstantFP>(CallOperandVal)) {
+      weight = CW_Constant;
+    }
+    break;
+  case 'e':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80000000LL) &&
+          (C->getSExtValue() <= 0x7fffffffLL))
+        weight = CW_Constant;
+    }
+    break;
+  case 'Z':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xffffffff)
+        weight = CW_Constant;
+    }
+    break;
+  }
+  return weight;
+}
+
 /// LowerXConstraint - try to replace an X constraint, which matches anything,
 /// with another that has more specific requirements based on the type of the
 /// corresponding operand.
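The immediate classes scored in getSingleConstraintMatchWeight above accept fixed ranges ('I' 0..31, 'J' 0..63, 'K' signed 8-bit, 'L' exactly 0xff or 0xffff, 'M' 0..3, 'N' 0..255, 'e' signed 32-bit, 'Z' unsigned 32-bit; 'G'/'C' take floating-point constants and are omitted here). A small checker restating those bounds, with function and variable names of my own choosing:

// Integer immediate ranges mirrored from the ConstantInt checks above.
#include <cstdint>
#include <cstdio>

bool fits_x86_imm_constraint(char c, int64_t v) {
  switch (c) {
  case 'I': return v >= 0 && v <= 31;                        // shift counts, 0-31
  case 'J': return v >= 0 && v <= 63;                        // shift counts, 0-63
  case 'K': return v >= -0x80 && v <= 0x7f;                  // signed 8-bit
  case 'L': return v == 0xff || v == 0xffff;                 // zero-extending masks
  case 'M': return v >= 0 && v <= 3;                         // lea scale shifts
  case 'N': return v >= 0 && v <= 0xff;                      // unsigned 8-bit (in/out)
  case 'e': return v >= -0x80000000LL && v <= 0x7fffffffLL;  // sign-extended 32-bit
  case 'Z': return v >= 0 && v <= 0xffffffffLL;              // zero-extended 32-bit
  default:  return false;                                    // not an immediate class
  }
}

int main() {
  const struct { char c; int64_t v; } samples[] = {
      {'I', 31}, {'I', 32}, {'K', -128}, {'K', 200},
      {'e', -2147483648LL}, {'Z', 0xfffffffeLL}, {'Z', 0x100000000LL}};
  for (const auto &s : samples)
    std::printf("'%c' %lld -> %s\n", s.c, (long long)s.v,
                fits_x86_imm_constraint(s.c, s.v) ? "fits" : "out of range");
  return 0;
}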