diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index efe7bac72eb..724899b00ee 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -45,7 +45,7 @@ static cl::opt<bool>
 DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
 
 // Forward declarations.
-static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG);
+static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG, DebugLoc dl);
 
 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   : TargetLowering(TM) {
@@ -80,7 +80,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setUseUnderscoreSetJmp(true);
     setUseUnderscoreLongJmp(true);
   }
-  
+
   // Set up the register classes.
   addRegisterClass(MVT::i8, X86::GR8RegisterClass);
   addRegisterClass(MVT::i16, X86::GR16RegisterClass);
@@ -90,7 +90,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
 
-  // We don't accept any truncstore of integer registers.  
+  // We don't accept any truncstore of integer registers.
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
@@ -113,30 +113,38 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
   if (Subtarget->is64Bit()) {
-    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
     setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
   } else {
-    if (X86ScalarSSEf64) {
+    if (!UseSoftFloat && !NoImplicitFloat && X86ScalarSSEf64) {
       // We have an impenetrably clever algorithm for ui64->double only.
       setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
-      // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
-      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
-    } else
+
+      // We have faster algorithm for ui32->single only.
+      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+    } else {
       setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+    }
   }
 
   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
   // this operation.
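// How the actions set throughout this constructor are consumed: each
// setOperationAction(Op, VT, Action) entry is looked up by the target-independent
// DAG legalizer for every node of that opcode and value type:
//   Legal   - an instruction pattern exists, leave the node alone;
//   Promote - redo the operation in a wider type (e.g. the i1/i8 -> i32 cases here);
//   Expand  - rewrite it in terms of other legal nodes or a libcall;
//   Custom  - hand it to X86TargetLowering::LowerOperation.
// A minimal sketch of that dispatch; the helper names are illustrative only, not
// LLVM's actual legalizer internals:
//   switch (TLI.getOperationAction(N->getOpcode(), N->getValueType(0))) {
//   case Legal:   return SDValue(N, 0);
//   case Promote: return PromoteNode(N);    // e.g. UINT_TO_FP of i16 done as i32
//   case Expand:  return ExpandNode(N);     // open-code or emit a libcall
//   case Custom:  return TLI.LowerOperation(SDValue(N, 0), DAG);
//   }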
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - // SSE has no i16 to fp conversion, only i32 - if (X86ScalarSSEf32) { - setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + + if (!UseSoftFloat && !NoImplicitFloat) { + // SSE has no i16 to fp conversion, only i32 + if (X86ScalarSSEf32) { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } } else { - setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 @@ -233,7 +241,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); - + setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTTZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i8 , Custom); @@ -306,24 +314,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand); // Expand certain atomics - setOperationAction(ISD::ATOMIC_CMP_SWAP_8 , MVT::i8, Custom); - setOperationAction(ISD::ATOMIC_CMP_SWAP_16, MVT::i16, Custom); - setOperationAction(ISD::ATOMIC_CMP_SWAP_32, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_CMP_SWAP_64, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB_8 , MVT::i8, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB_16, MVT::i16, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB_32, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB_64, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); if (!Subtarget->is64Bit()) { - setOperationAction(ISD::ATOMIC_LOAD_ADD_64, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB_64, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_AND_64, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_OR_64, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_XOR_64, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_NAND_64, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_SWAP_64, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); + 
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); } // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion. @@ -374,7 +382,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); - if (X86ScalarSSEf64) { + if (!UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -412,7 +420,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setConvertAction(MVT::f80, MVT::f32, Expand); setConvertAction(MVT::f80, MVT::f64, Expand); } - } else if (X86ScalarSSEf32) { + } else if (!UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -447,7 +455,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Fast) { setConvertAction(MVT::f32, MVT::f64, Expand); setConvertAction(MVT::f32, MVT::f80, Expand); - setConvertAction(MVT::f80, MVT::f32, Expand); + setConvertAction(MVT::f80, MVT::f32, Expand); setConvertAction(MVT::f64, MVT::f32, Expand); // And x87->x87 truncations also. setConvertAction(MVT::f80, MVT::f64, Expand); @@ -457,7 +465,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } - } else { + } else if (!UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, X86::RFP64RegisterClass); @@ -472,7 +480,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // this though and handle it in InstructionSelectPreprocess so that // dagcombine2 can hack on these. if (Fast) { - setConvertAction(MVT::f80, MVT::f32, Expand); + setConvertAction(MVT::f80, MVT::f32, Expand); setConvertAction(MVT::f64, MVT::f32, Expand); setConvertAction(MVT::f80, MVT::f64, Expand); } @@ -492,28 +500,30 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Long double always uses X87. 
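// The f80 block below registers the only long-double immediates the backend
// treats as free to materialize: +0.0 and +1.0 (x87 fldz / fld1, labelled FLD0 /
// FLD1 in the comments), each also in negated form via fchs.  APFloat::convert is
// needed because the literals are written as doubles but must be recorded as
// 80-bit x87 values.  Any other f80 constant ends up in the constant pool, roughly:
//   fld1                       ; 1.0, legal immediate
//   fchs                       ; -1.0, legal immediate
//   fld tbyte ptr [LCPI0_0]    ; any other long double, constant-pool load
// (The assembly here is an illustrative sketch, not compiler output.)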
- addRegisterClass(MVT::f80, X86::RFP80RegisterClass); - setOperationAction(ISD::UNDEF, MVT::f80, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); - { - bool ignored; - APFloat TmpFlt(+0.0); - TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, - &ignored); - addLegalFPImmediate(TmpFlt); // FLD0 - TmpFlt.changeSign(); - addLegalFPImmediate(TmpFlt); // FLD0/FCHS - APFloat TmpFlt2(+1.0); - TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, - &ignored); - addLegalFPImmediate(TmpFlt2); // FLD1 - TmpFlt2.changeSign(); - addLegalFPImmediate(TmpFlt2); // FLD1/FCHS - } - - if (!UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f80 , Expand); - setOperationAction(ISD::FCOS , MVT::f80 , Expand); + if (!UseSoftFloat) { + addRegisterClass(MVT::f80, X86::RFP80RegisterClass); + setOperationAction(ISD::UNDEF, MVT::f80, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); + { + bool ignored; + APFloat TmpFlt(+0.0); + TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &ignored); + addLegalFPImmediate(TmpFlt); // FLD0 + TmpFlt.changeSign(); + addLegalFPImmediate(TmpFlt); // FLD0/FCHS + APFloat TmpFlt2(+1.0); + TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &ignored); + addLegalFPImmediate(TmpFlt2); // FLD1 + TmpFlt2.changeSign(); + addLegalFPImmediate(TmpFlt2); // FLD1/FCHS + } + + if (!UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f80 , Expand); + setOperationAction(ISD::FCOS , MVT::f80 , Expand); + } } // Always use a library call for pow. @@ -577,15 +587,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); } - if (!DisableMMX && Subtarget->hasMMX()) { + // FIXME: In order to prevent SSE instructions being expanded to MMX ones + // with -msoft-float, disable use of MMX as well. + if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { addRegisterClass(MVT::v8i8, X86::VR64RegisterClass); addRegisterClass(MVT::v4i16, X86::VR64RegisterClass); addRegisterClass(MVT::v2i32, X86::VR64RegisterClass); addRegisterClass(MVT::v2f32, X86::VR64RegisterClass); addRegisterClass(MVT::v1i64, X86::VR64RegisterClass); - // FIXME: add MMX packed arithmetics - setOperationAction(ISD::ADD, MVT::v8i8, Legal); setOperationAction(ISD::ADD, MVT::v4i16, Legal); setOperationAction(ISD::ADD, MVT::v2i32, Legal); @@ -650,9 +660,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); + + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand); + setOperationAction(ISD::SELECT, MVT::v8i8, Promote); + setOperationAction(ISD::SELECT, MVT::v4i16, Promote); + setOperationAction(ISD::SELECT, MVT::v2i32, Promote); + setOperationAction(ISD::SELECT, MVT::v1i64, Custom); } - if (Subtarget->hasSSE1()) { + if (!UseSoftFloat && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); @@ -669,8 +686,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); } - if (Subtarget->hasSSE2()) { + if (!UseSoftFloat && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); + + // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM + // registers cannot be used even for integer operations. 
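// Context for the FIXME above: -soft-float and -no-implicit-float are used for
// code such as OS kernels and interrupt handlers that must not touch x87/SSE
// state the environment has not saved.  Even a purely integer function like
//   void copy64(void *d, const void *s) { __builtin_memcpy(d, s, 64); }
// must not be expanded into movaps/movdqa copies in that mode, which is why the
// MMX and SSE register classes in this block are only added when UseSoftFloat is
// off, and why getOptimalMemOpType further below also checks NoImplicitFloat.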
addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); @@ -680,6 +700,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ADD, MVT::v8i16, Legal); setOperationAction(ISD::ADD, MVT::v4i32, Legal); setOperationAction(ISD::ADD, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v4i32, Legal); @@ -713,12 +734,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); + if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); @@ -745,13 +768,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::LOAD, MVT::v2i64, Legal); setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); - + } - + if (Subtarget->hasSSE41()) { // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); - setOperationAction(ISD::MUL, MVT::v2i64, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. f32 vectors are @@ -759,12 +781,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // information. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); if (Subtarget->is64Bit()) { @@ -776,21 +798,34 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->hasSSE42()) { setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); } - + // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - // Add with overflow operations are custom lowered. + // Add/Sub/Mul with overflow operations are custom lowered. 
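// The SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO entries that follow are the DAG forms
// of llvm.*.with.overflow.*.  Custom lowering lets each become one x86 ALU
// operation followed by a flag read (roughly, OF via SETO/JO for the signed
// forms, CF via SETB/JB for the unsigned ones) instead of a separate compare.
// A plain C++ statement of the semantics the lowering has to preserve:
#include <cstdint>
#include <limits>
#include <utility>
// Signed i32 add plus the overflow bit, as SADDO produces them.
static std::pair<int32_t, bool> sadd_overflow(int32_t a, int32_t b) {
  int64_t wide = int64_t(a) + int64_t(b);                     // compute in a wider type
  bool ovf = wide > std::numeric_limits<int32_t>::max() ||
             wide < std::numeric_limits<int32_t>::min();      // the condition that sets OF
  return { int32_t(wide), ovf };
}
// Unsigned i32 add plus the carry bit, as UADDO produces them.
static std::pair<uint32_t, bool> uadd_overflow(uint32_t a, uint32_t b) {
  uint32_t sum = a + b;                                       // wraps mod 2^32
  return { sum, sum < a };                                    // the condition that sets CF
}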
setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::SADDO, MVT::i64, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); setOperationAction(ISD::UADDO, MVT::i64, Custom); + setOperationAction(ISD::SSUBO, MVT::i32, Custom); + setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::USUBO, MVT::i32, Custom); + setOperationAction(ISD::USUBO, MVT::i64, Custom); + setOperationAction(ISD::SMULO, MVT::i32, Custom); + setOperationAction(ISD::SMULO, MVT::i64, Custom); + setOperationAction(ISD::UMULO, MVT::i32, Custom); + setOperationAction(ISD::UMULO, MVT::i64, Custom); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::STORE); + if (Subtarget->is64Bit()) + setTargetDAGCombine(ISD::MUL); computeRegisterProperties(); @@ -804,7 +839,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } -MVT X86TargetLowering::getSetCCResultType(const SDValue &) const { +MVT X86TargetLowering::getSetCCResultType(MVT VT) const { return MVT::i8; } @@ -864,7 +899,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, // FIXME: This turns off use of xmm stores for memset/memcpy on targets like // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. - if (Subtarget->getStackAlignment() >= 16) { + if (!NoImplicitFloat && Subtarget->getStackAlignment() >= 16) { if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) return MVT::v4i32; if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) @@ -875,15 +910,17 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, return MVT::i32; } - /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC /// jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (usesGlobalOffsetTable()) - return DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, getPointerTy()); + return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy()); if (!Subtarget->isPICStyleRIPRel()) - return DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()); + // This doesn't have DebugLoc associated with it, but is not really the + // same as a Register. + return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(), + getPointerTy()); return Table; } @@ -895,14 +932,15 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, /// LowerRET - Lower an ISD::RET node. SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args"); - + SmallVector RVLocs; unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv(); bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs); CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86); - + // If this is the first return lowered for this function, add the regs to the // liveout set for the function. if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { @@ -911,7 +949,7 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); } SDValue Chain = Op.getOperand(0); - + // Handle tail call return. 
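// When the chain feeding this return comes from an X86ISD::TAILCALL (the case
// handled just below), no RET_FLAG is emitted; an X86ISD::TC_RETURN node carries
// the target address and the stack adjustment instead, and after instruction
// selection the epilogue becomes a stack fixup plus a plain jump, e.g.
//   f:                        ; return g(x);  (guaranteed tail call)
//     ...
//     jmp  g                  ; no call/ret pair, g returns straight to f's caller
// This is a sketch of the intended codegen, not output copied from the compiler.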
Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL); if (Chain.getOpcode() == X86ISD::TAILCALL) { @@ -922,7 +960,7 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { (cast(TargetAddress)->getReg() == X86::EAX || cast(TargetAddress)->getReg() == X86::R9)) || TargetAddress.getOpcode() == ISD::TargetExternalSymbol || - TargetAddress.getOpcode() == ISD::TargetGlobalAddress) && + TargetAddress.getOpcode() == ISD::TargetGlobalAddress) && "Expecting an global address, external symbol, or register"); assert(StackAdjustment.getOpcode() == ISD::Constant && "Expecting a const value"); @@ -936,10 +974,10 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) { Operands.push_back(Chain.getOperand(i)); } - return DAG.getNode(X86ISD::TC_RETURN, MVT::Other, &Operands[0], + return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0], Operands.size()); } - + // Regular return. SDValue Flag; @@ -947,27 +985,38 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16)); - + // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue ValToCopy = Op.getOperand(i*2+1); - + // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. - if (RVLocs[i].getLocReg() == X86::ST0 || - RVLocs[i].getLocReg() == X86::ST1) { + if (VA.getLocReg() == X86::ST0 || + VA.getLocReg() == X86::ST1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. - if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) - ValToCopy = DAG.getNode(ISD::FP_EXTEND, MVT::f80, ValToCopy); + if (isScalarFPTypeInSSEReg(VA.getValVT())) + ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } - Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), ValToCopy, Flag); + // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 + // which is returned in RAX / RDX. + if (Subtarget->is64Bit()) { + MVT ValVT = ValToCopy.getValueType(); + if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { + ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) + ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); + } + } + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); } @@ -984,19 +1033,20 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); FuncInfo->setSRetReturnReg(Reg); } - SDValue Val = DAG.getCopyFromReg(Chain, Reg, getPointerTy()); + SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); - Chain = DAG.getCopyToReg(Chain, X86::RAX, Val, Flag); + Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); Flag = Chain.getValue(1); } - + RetOps[0] = Chain; // Update chain. // Add the flag if we have it. 
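// Worked trace of the x86-64 return path added above for a 64-bit MMX value
// (v8i8/v4i16/v2i32; v1i64 keeps going through RAX/RDX as before):
//   v8i8 Val
//     -> BIT_CONVERT       i64   Val64            ; reinterpret the 64 bits
//     -> SCALAR_TO_VECTOR  v2i64 <Val64, undef>   ; only when the location is XMM0/XMM1
//     -> CopyToReg XMM0 (or XMM1 for a second result)
// The SCALAR_TO_VECTOR step gives the copy an XMM-sized type; a bare i64 to XMM
// register-class copy is not directly expressible here.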
if (Flag.getNode()) RetOps.push_back(Flag); - - return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, &RetOps[0], RetOps.size()); + + return DAG.getNode(X86ISD::RET_FLAG, dl, + MVT::Other, &RetOps[0], RetOps.size()); } @@ -1006,50 +1056,77 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { /// being lowered. The returns a SDNode with the same number of values as the /// ISD::CALL. SDNode *X86TargetLowering:: -LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, +LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, unsigned CallingConv, SelectionDAG &DAG) { - + + DebugLoc dl = TheCall->getDebugLoc(); // Assign locations to each value returned by this call. SmallVector RVLocs; bool isVarArg = TheCall->isVarArg(); + bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs); CCInfo.AnalyzeCallResult(TheCall, RetCC_X86); SmallVector ResultVals; - + // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { - MVT CopyVT = RVLocs[i].getValVT(); - + CCValAssign &VA = RVLocs[i]; + MVT CopyVT = VA.getValVT(); + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) { + cerr << "SSE register return with SSE disabled\n"; + exit(1); + } + // If this is a call to a function that returns an fp value on the floating // point stack, but where we prefer to use the value in xmm registers, copy // it out as F80 and use a truncate to move it from fp stack reg to xmm reg. - if ((RVLocs[i].getLocReg() == X86::ST0 || - RVLocs[i].getLocReg() == X86::ST1) && - isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) { + if ((VA.getLocReg() == X86::ST0 || + VA.getLocReg() == X86::ST1) && + isScalarFPTypeInSSEReg(VA.getValVT())) { CopyVT = MVT::f80; } - - Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(), - CopyVT, InFlag).getValue(1); - SDValue Val = Chain.getValue(0); + + SDValue Val; + if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { + // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + MVT::v2i64, InFlag).getValue(1); + Val = Chain.getValue(0); + Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, + Val, DAG.getConstant(0, MVT::i64)); + } else { + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + MVT::i64, InFlag).getValue(1); + Val = Chain.getValue(0); + } + Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); + } else { + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + CopyVT, InFlag).getValue(1); + Val = Chain.getValue(0); + } InFlag = Chain.getValue(2); - if (CopyVT != RVLocs[i].getValVT()) { + if (CopyVT != VA.getValVT()) { // Round the F80 the right size, which also moves to the appropriate xmm // register. - Val = DAG.getNode(ISD::FP_ROUND, RVLocs[i].getValVT(), Val, + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1)); } - + ResultVals.push_back(Val); } // Merge everything together with a MERGE_VALUES node. 
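// Why the f80 detour above: values returned on the x87 stack (ST0/ST1) exist only
// at 80-bit precision, and there is no direct ST0 -> XMM copy.  So when the result
// is wanted in an SSE register, the copy out of ST0 is done as MVT::f80 and then
// narrowed with
//   FP_ROUND f80 -> f32/f64   (trunc-is-exact operand = 1)
// The "exact" flag is justified because the callee already rounded its result to
// f32/f64 before returning, so the extra rounding cannot change the value; in
// practice the sequence becomes an x87 store to a stack slot followed by a scalar
// SSE load.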
ResultVals.push_back(Chain); - return DAG.getMergeValues(TheCall->getVTList(), &ResultVals[0], - ResultVals.size()).getNode(); + return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(), + &ResultVals[0], ResultVals.size()).getNode(); } @@ -1156,9 +1233,9 @@ bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) { /// CallRequiresFnAddressInReg - Check whether the call requires the function /// address to be loaded in a register. -bool +bool X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) { - return !Is64Bit && IsTailCall && + return !Is64Bit && IsTailCall && getTargetMachine().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT(); } @@ -1167,11 +1244,12 @@ X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) { /// by "Src" to address "Dst" with size and alignment information specified by /// the specific parameter attribute. The copy will be passed as a byval /// function parameter. -static SDValue +static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, - ISD::ArgFlagsTy Flags, SelectionDAG &DAG) { + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + DebugLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); - return DAG.getMemcpy(Chain, Dst, Src, SizeNode, Flags.getByValAlign(), + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*AlwaysInline=*/true, NULL, 0, NULL, 0); } @@ -1187,7 +1265,7 @@ SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG, bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); // FIXME: For now, all byval parameter objects are marked mutable. This can be - // changed with more analysis. + // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8, @@ -1195,7 +1273,7 @@ SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG, SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); if (Flags.isByVal()) return FIN; - return DAG.getLoad(VA.getValVT(), Root, FIN, + return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN, PseudoSourceValue::getFixedStack(FI), 0); } @@ -1203,7 +1281,8 @@ SDValue X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); - + DebugLoc dl = Op.getDebugLoc(); + const Function* Fn = MF.getFunction(); if (Fn->hasExternalLinkage() && Subtarget->isTargetCygMing() && @@ -1212,7 +1291,7 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { // Decorate the function name. 
FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op)); - + MachineFrameInfo *MFI = MF.getFrameInfo(); SDValue Root = Op.getOperand(0); bool isVarArg = cast(Op.getOperand(2))->getZExtValue() != 0; @@ -1227,7 +1306,7 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { SmallVector ArgLocs; CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC)); - + SmallVector ArgValues; unsigned LastVal = ~0U; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -1237,10 +1316,10 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { assert(VA.getValNo() != LastVal && "Don't support value assigned to multiple locs yet"); LastVal = VA.getValNo(); - + if (VA.isRegLoc()) { MVT RegVT = VA.getLocVT(); - TargetRegisterClass *RC; + TargetRegisterClass *RC = NULL; if (RegVT == MVT::i32) RC = X86::GR32RegisterClass; else if (Is64Bit && RegVT == MVT::i64) @@ -1271,32 +1350,32 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { } unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC); - SDValue ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT); - + SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT); + // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) - ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue, + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) - ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue, + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); - + if (VA.getLocInfo() != CCValAssign::Full) - ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue); - + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + // Handle MMX values passed in GPRs. 
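// The AssertSext/AssertZext + TRUNCATE sequence just above is how a promoted
// argument is modelled: an i8/i16 parameter arrives in a full 32-bit register, so
// the DAG copies out an i32, asserts which extension the caller performed, and
// truncates back to the declared type.  E.g. for a zeroext i8 argument in the
// first integer register:
//   CopyFromReg (i32, arg register)
//     -> AssertZext i8    ; "the top 24 bits are already zero"
//     -> TRUNCATE   i8    ; the value the function body sees
// The assert nodes emit no code; they only let later combines rely on the high
// bits being clean.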
if (Is64Bit && RegVT != VA.getLocVT()) { if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass) - ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue); + ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue); else if (RC == X86::VR128RegisterClass) { - ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i64, ArgValue, - DAG.getConstant(0, MVT::i64)); - ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue); + ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, + ArgValue, DAG.getConstant(0, MVT::i64)); + ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue); } } - + ArgValues.push_back(ArgValue); } else { assert(VA.isMemLoc()); @@ -1315,8 +1394,8 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); FuncInfo->setSRetReturnReg(Reg); } - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), Reg, ArgValues[0]); - Root = DAG.getNode(ISD::TokenFactor, MVT::Other, Copy, Root); + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]); + Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root); } unsigned StackSize = CCInfo.getNextStackOffset(); @@ -1363,6 +1442,15 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, TotalNumXMMRegs); + assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && + "SSE register cannot be used when SSE is disabled!"); + assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloat) && + "SSE register cannot be used when SSE is disabled!"); + if (UseSoftFloat || NoImplicitFloat || !Subtarget->hasSSE1()) + // Kernel mode asks for SSE to be disabled, so don't push them + // on the stack. + TotalNumXMMRegs = 0; + // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so they // may be loaded by deferencing the result of va_next. @@ -1374,40 +1462,40 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); - SDValue FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN, + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, DAG.getIntPtrConstant(VarArgsGPOffset)); for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs], X86::GR64RegisterClass); - SDValue Val = DAG.getCopyFromReg(Root, VReg, MVT::i64); + SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64); SDValue Store = - DAG.getStore(Val.getValue(1), Val, FIN, + DAG.getStore(Val.getValue(1), dl, Val, FIN, PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getIntPtrConstant(8)); } // Now store the XMM (fp + vector) parameter registers. 
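// Both store loops (the GPR one above and the XMM one that follows) fill the
// System V x86-64 register save area that va_arg later indexes:
//   reg_save_area + 0..47    : RDI, RSI, RDX, RCX, R8, R9   (6 x 8 bytes)
//   reg_save_area + 48..175  : XMM0..XMM7                   (8 x 16 bytes)
// which is why the first loop advances FIN by 8 and the second by 16, starting at
// VarArgsGPOffset and VarArgsFPOffset.  For reference, the va_list element the
// psABI defines on top of this area (shown here only as documentation):
struct va_list_tag_ref {
  unsigned int gp_offset;      // next free GPR slot, 0..48
  unsigned int fp_offset;      // next free XMM slot, 48..176
  void *overflow_arg_area;     // arguments that were passed on the stack
  void *reg_save_area;         // base of the area filled by the loops here
};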
- FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN, + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, DAG.getIntPtrConstant(VarArgsFPOffset)); for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs], X86::VR128RegisterClass); - SDValue Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32); + SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32); SDValue Store = - DAG.getStore(Val.getValue(1), Val, FIN, + DAG.getStore(Val.getValue(1), dl, Val, FIN, PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getIntPtrConstant(16)); } if (!MemOps.empty()) - Root = DAG.getNode(ISD::TokenFactor, MVT::Other, + Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0], MemOps.size()); } } - + ArgValues.push_back(Root); // Some CCs need callee pop. @@ -1418,7 +1506,7 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { BytesToPopOnReturn = 0; // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op)) - BytesToPopOnReturn = 4; + BytesToPopOnReturn = 4; BytesCallerReserves = StackSize; } @@ -1431,8 +1519,8 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); // Return the new list of results. - return DAG.getMergeValues(Op.getNode()->getVTList(), &ArgValues[0], - ArgValues.size()).getValue(Op.getResNo()); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(), + &ArgValues[0], ArgValues.size()).getValue(Op.getResNo()); } SDValue @@ -1441,50 +1529,53 @@ X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, const CCValAssign &VA, SDValue Chain, SDValue Arg, ISD::ArgFlagsTy Flags) { + DebugLoc dl = TheCall->getDebugLoc(); unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); - PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); if (Flags.isByVal()) { - return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG); + return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); } - return DAG.getStore(Chain, Arg, PtrOff, + return DAG.getStore(Chain, dl, Arg, PtrOff, PseudoSourceValue::getStack(), LocMemOffset); } -/// EmitTailCallLoadRetAddr - Emit a load of return adress if tail call +/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call /// optimization is performed and it is required. -SDValue -X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, +SDValue +X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, - SDValue Chain, - bool IsTailCall, - bool Is64Bit, - int FPDiff) { + SDValue Chain, + bool IsTailCall, + bool Is64Bit, + int FPDiff, + DebugLoc dl) { if (!IsTailCall || FPDiff==0) return Chain; // Adjust the Return address stack slot. MVT VT = getPointerTy(); OutRetAddr = getReturnAddressFrameIndex(DAG); + // Load the "old" Return address. - OutRetAddr = DAG.getLoad(VT, Chain,OutRetAddr, NULL, 0); + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0); return SDValue(OutRetAddr.getNode(), 1); } /// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call /// optimization is performed and it is required (FPDiff!=0). 
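// Worked example of the FPDiff bookkeeping used by the helper above and the one
// declared below (x86-64, SlotSize = 8):
//   - this function was entered with 16 bytes of incoming stack arguments
//       -> NumBytesCallerPushed = 16
//   - the function it tail-calls needs 32 bytes of stack arguments
//       -> NumBytes = 32, so FPDiff = 16 - 32 = -16
// The callee's argument area has to extend 16 bytes further down than the caller's
// incoming area, so the return address cannot stay put: the load above reads it
// from its old slot (offset -8 from the incoming-argument base) and the store
// below rewrites it 16 bytes lower, at FPDiff - SlotSize = -24, once the outgoing
// arguments are in place.  When FPDiff == 0 nothing moves and both helpers return
// the chain untouched.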
-static SDValue -EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, +static SDValue +EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, - bool Is64Bit, int FPDiff) { + bool Is64Bit, int FPDiff, DebugLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int SlotSize = Is64Bit ? 8 : 4; - int NewReturnAddrFI = + int NewReturnAddrFI = MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize); MVT VT = Is64Bit ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); - Chain = DAG.getStore(Chain, RetAddrFrIdx, NewRetAddrFrIdx, + Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0); return Chain; } @@ -1500,6 +1591,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { SDValue Callee = TheCall->getCallee(); bool Is64Bit = Subtarget->is64Bit(); bool IsStructRet = CallIsStructReturn(TheCall); + DebugLoc dl = TheCall->getDebugLoc(); assert(!(isVarArg && CC == CallingConv::Fast) && "Var args not supported with calling convention fastcc"); @@ -1508,7 +1600,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { SmallVector ArgLocs; CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC)); - + // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); if (PerformTailCallOpt && CC == CallingConv::Fast) @@ -1517,7 +1609,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { int FPDiff = 0; if (IsTailCall) { // Lower arguments at fp - stackoffset + fpdiff. - unsigned NumBytesCallerPushed = + unsigned NumBytesCallerPushed = MF.getInfo()->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; @@ -1532,7 +1624,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { SDValue RetAddrFrIdx; // Load return adress for tail calls. Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit, - FPDiff); + FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; @@ -1545,22 +1637,22 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { SDValue Arg = TheCall->getArg(i); ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i); bool isByVal = Flags.isByVal(); - + // Promote the value if needed. switch (VA.getLocInfo()) { default: assert(0 && "Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: - Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: - Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::AExt: - Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; } - + if (VA.isRegLoc()) { if (Is64Bit) { MVT RegVT = VA.getLocVT(); @@ -1571,17 +1663,17 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX: case X86::R8: { // Special case: passing MMX values in GPR registers. 
- Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Arg); + Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); break; } case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3: case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: { // Special case: passing MMX values in XMM registers. - Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Arg); - Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Arg); - Arg = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64, - DAG.getNode(ISD::UNDEF, MVT::v2i64), Arg, - getMOVLMask(2, DAG)); + Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); + Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); + Arg = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64, + DAG.getUNDEF(MVT::v2i64), Arg, + getMOVLMask(2, DAG, dl)); break; } } @@ -1591,16 +1683,16 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { if (!IsTailCall || (IsTailCall && isByVal)) { assert(VA.isMemLoc()); if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy()); - + StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); + MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA, Chain, Arg, Flags)); } } } - + if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); // Build a sequence of copy-to-reg nodes chained together with token chain @@ -1610,16 +1702,18 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { // tail call optimization the copies to registers are lowered later. if (!IsTailCall) for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second, - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } // ELF / PIC requires GOT in the EBX register before function calls via PLT - // GOT pointer. + // GOT pointer. if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) { - Chain = DAG.getCopyToReg(Chain, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc::getUnknownLoc(), + getPointerTy()), InFlag); InFlag = Chain.getValue(1); } @@ -1655,8 +1749,10 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); - - Chain = DAG.getCopyToReg(Chain, X86::AL, + assert((Subtarget->hasSSE1() || !NumXMMRegs) + && "SSE registers cannot be used when SSE is disabled"); + + Chain = DAG.getCopyToReg(Chain, dl, X86::AL, DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); InFlag = Chain.getValue(1); } @@ -1685,35 +1781,36 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy()); - Source = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, Source); + StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, + getPointerTy()); + Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain, - Flags, DAG)); + Flags, DAG, dl)); } else { // Store relative to framepointer. 
MemOpChains2.push_back( - DAG.getStore(Chain, Arg, FIN, + DAG.getStore(Chain, dl, Arg, FIN, PseudoSourceValue::getFixedStack(FI), 0)); - } + } } } if (!MemOpChains2.empty()) - Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains2[0], MemOpChains2.size()); // Copy arguments to their registers. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second, - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } InFlag =SDValue(); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, - FPDiff); + FPDiff, dl); } // If the callee is a GlobalAddress node (quite common, every direct call is) @@ -1730,32 +1827,28 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { } else if (IsTailCall) { unsigned Opc = Is64Bit ? X86::R9 : X86::EAX; - Chain = DAG.getCopyToReg(Chain, - DAG.getRegister(Opc, getPointerTy()), + Chain = DAG.getCopyToReg(Chain, dl, + DAG.getRegister(Opc, getPointerTy()), Callee,InFlag); Callee = DAG.getRegister(Opc, getPointerTy()); // Add register as live out. DAG.getMachineFunction().getRegInfo().addLiveOut(Opc); } - + // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); SmallVector Ops; if (IsTailCall) { - Ops.push_back(Chain); - Ops.push_back(DAG.getIntPtrConstant(NumBytes, true)); - Ops.push_back(DAG.getIntPtrConstant(0, true)); - if (InFlag.getNode()) - Ops.push_back(InFlag); - Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); InFlag = Chain.getValue(1); - + // Returns a chain & a flag for retval copy to use. NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); Ops.clear(); } - + Ops.push_back(Chain); Ops.push_back(Callee); @@ -1767,7 +1860,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); - + // Add an implicit use GOT pointer in EBX. if (!IsTailCall && !Is64Bit && getTargetMachine().getRelocationModel() == Reloc::PIC_ && @@ -1782,15 +1875,15 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { Ops.push_back(InFlag); if (IsTailCall) { - assert(InFlag.getNode() && + assert(InFlag.getNode() && "Flag must be set. Depend on flag being set in LowerRET"); - Chain = DAG.getNode(X86ISD::TAILCALL, + Chain = DAG.getNode(X86ISD::TAILCALL, dl, TheCall->getVTList(), &Ops[0], Ops.size()); - + return SDValue(Chain.getNode(), Op.getResNo()); } - Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. @@ -1804,7 +1897,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { NumBytesForCalleeToPush = 4; else NumBytesForCalleeToPush = 0; // Callee pops nothing. - + // Returns a flag for retval copy to use. 
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), @@ -1844,7 +1937,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { // arg1 // arg2 // RETADDR -// [ new RETADDR +// [ new RETADDR // move area ] // (possible EBP) // ESI @@ -1853,13 +1946,13 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned /// for a 16 byte align requirement. -unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, +unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); const TargetFrameInfo &TFI = *TM.getFrameInfo(); unsigned StackAlignment = TFI.getStackAlignment(); - uint64_t AlignMask = StackAlignment - 1; + uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; uint64_t SlotSize = TD->getPointerSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { @@ -1867,7 +1960,7 @@ unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. - Offset = ((~AlignMask) & Offset) + StackAlignment + + Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; @@ -1908,6 +2001,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall, FastISel * X86TargetLowering::createFastISel(MachineFunction &mf, MachineModuleInfo *mmo, + DwarfWriter *dw, DenseMap &vm, DenseMap &bm, @@ -1916,7 +2010,7 @@ X86TargetLowering::createFastISel(MachineFunction &mf, , SmallSet &cil #endif ) { - return X86::createFastISel(mf, mmo, vm, bm, am + return X86::createFastISel(mf, mmo, dw, vm, bm, am #ifndef NDEBUG , cil #endif @@ -1933,10 +2027,10 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); - uint64_t SlotSize = TD->getPointerSize(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. + uint64_t SlotSize = TD->getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -1945,112 +2039,88 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { } -/// translateX86CC - do a one to one translation of a ISD::CondCode to the X86 -/// specific condition code. It returns a false if it cannot do a direct -/// translation. X86CC is the translated CondCode. LHS/RHS are modified as -/// needed. -static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP, - unsigned &X86CC, SDValue &LHS, SDValue &RHS, - SelectionDAG &DAG) { - X86CC = X86::COND_INVALID; +/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 +/// specific condition code, returning the condition code and the LHS/RHS of the +/// comparison to make. +static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, + SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. 
RHS = DAG.getConstant(0, RHS.getValueType()); - X86CC = X86::COND_NS; - return true; + return X86::COND_NS; } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. - X86CC = X86::COND_S; - return true; + return X86::COND_S; } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, RHS.getValueType()); - X86CC = X86::COND_LE; - return true; + return X86::COND_LE; } } switch (SetCCOpcode) { - default: break; - case ISD::SETEQ: X86CC = X86::COND_E; break; - case ISD::SETGT: X86CC = X86::COND_G; break; - case ISD::SETGE: X86CC = X86::COND_GE; break; - case ISD::SETLT: X86CC = X86::COND_L; break; - case ISD::SETLE: X86CC = X86::COND_LE; break; - case ISD::SETNE: X86CC = X86::COND_NE; break; - case ISD::SETULT: X86CC = X86::COND_B; break; - case ISD::SETUGT: X86CC = X86::COND_A; break; - case ISD::SETULE: X86CC = X86::COND_BE; break; - case ISD::SETUGE: X86CC = X86::COND_AE; break; + default: assert(0 && "Invalid integer condition!"); + case ISD::SETEQ: return X86::COND_E; + case ISD::SETGT: return X86::COND_G; + case ISD::SETGE: return X86::COND_GE; + case ISD::SETLT: return X86::COND_L; + case ISD::SETLE: return X86::COND_LE; + case ISD::SETNE: return X86::COND_NE; + case ISD::SETULT: return X86::COND_B; + case ISD::SETUGT: return X86::COND_A; + case ISD::SETULE: return X86::COND_BE; + case ISD::SETUGE: return X86::COND_AE; } - } else { - // First determine if it is required or is profitable to flip the operands. + } - // If LHS is a foldable load, but RHS is not, flip the condition. - if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && - !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { - SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); - std::swap(LHS, RHS); - } + // First determine if it is required or is profitable to flip the operands. - switch (SetCCOpcode) { - default: break; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETUGT: - case ISD::SETUGE: - std::swap(LHS, RHS); - break; - } + // If LHS is a foldable load, but RHS is not, flip the condition. 
+ if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && + !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { + SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); + std::swap(LHS, RHS); + } - // On a floating point condition, the flags are set as follows: - // ZF PF CF op - // 0 | 0 | 0 | X > Y - // 0 | 0 | 1 | X < Y - // 1 | 0 | 0 | X == Y - // 1 | 1 | 1 | unordered - switch (SetCCOpcode) { - default: break; - case ISD::SETUEQ: - case ISD::SETEQ: - X86CC = X86::COND_E; - break; - case ISD::SETOLT: // flipped - case ISD::SETOGT: - case ISD::SETGT: - X86CC = X86::COND_A; - break; - case ISD::SETOLE: // flipped - case ISD::SETOGE: - case ISD::SETGE: - X86CC = X86::COND_AE; - break; - case ISD::SETUGT: // flipped - case ISD::SETULT: - case ISD::SETLT: - X86CC = X86::COND_B; - break; - case ISD::SETUGE: // flipped - case ISD::SETULE: - case ISD::SETLE: - X86CC = X86::COND_BE; - break; - case ISD::SETONE: - case ISD::SETNE: - X86CC = X86::COND_NE; - break; - case ISD::SETUO: - X86CC = X86::COND_P; - break; - case ISD::SETO: - X86CC = X86::COND_NP; - break; - } + switch (SetCCOpcode) { + default: break; + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETUGT: + case ISD::SETUGE: + std::swap(LHS, RHS); + break; } - return X86CC != X86::COND_INVALID; + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (SetCCOpcode) { + default: assert(0 && "Condcode should be pre-legalized away"); + case ISD::SETUEQ: + case ISD::SETEQ: return X86::COND_E; + case ISD::SETOLT: // flipped + case ISD::SETOGT: + case ISD::SETGT: return X86::COND_A; + case ISD::SETOLE: // flipped + case ISD::SETOGE: + case ISD::SETGE: return X86::COND_AE; + case ISD::SETUGT: // flipped + case ISD::SETULT: + case ISD::SETLT: return X86::COND_B; + case ISD::SETUGE: // flipped + case ISD::SETULE: + case ISD::SETLE: return X86::COND_BE; + case ISD::SETONE: + case ISD::SETNE: return X86::COND_NE; + case ISD::SETUO: return X86::COND_P; + case ISD::SETO: return X86::COND_NP; + } } /// hasFPCMov - is there a floating point cmov for the specific X86 condition @@ -2163,7 +2233,8 @@ bool X86::isPSHUFLWMask(SDNode *N) { /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to SHUFP*. -static bool isSHUFPMask(SDOperandPtr Elems, unsigned NumElems) { +template +static bool isSHUFPMask(SDOperand *Elems, unsigned NumElems) { if (NumElems != 2 && NumElems != 4) return false; unsigned Half = NumElems / 2; @@ -2186,7 +2257,8 @@ bool X86::isSHUFPMask(SDNode *N) { /// the reverse of what x86 shuffles want. x86 shuffles requires the lower /// half elements to come from vector 1 (which would equal the dest.) and /// the upper half to come from vector 2. -static bool isCommutedSHUFP(SDOperandPtr Ops, unsigned NumOps) { +template +static bool isCommutedSHUFP(SDOperand *Ops, unsigned NumOps) { if (NumOps != 2 && NumOps != 4) return false; unsigned Half = NumOps / 2; @@ -2280,7 +2352,8 @@ bool X86::isMOVHPMask(SDNode *N) { /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. 
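// "Suitable for UNPCKL" means the mask interleaves the low halves of the two
// inputs, i.e. <0, N, 1, N+1, ...> for vectors of N elements (undef entries
// allowed), which is exactly what the predicate below checks.  Concretely, for
// v4f32 inputs A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3>:
//   unpcklps A, B  ->  <a0, b0, a1, b1>   mask <0, 4, 1, 5>
//   unpckhps A, B  ->  <a2, b2, a3, b3>   mask <2, 6, 3, 7>   (the UNPCKH form)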
-bool static isUNPCKLMask(SDOperandPtr Elts, unsigned NumElts, +template +bool static isUNPCKLMask(SDOperand *Elts, unsigned NumElts, bool V2IsSplat = false) { if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) return false; @@ -2291,7 +2364,7 @@ bool static isUNPCKLMask(SDOperandPtr Elts, unsigned NumElts, if (!isUndefOrEqual(BitI, j)) return false; if (V2IsSplat) { - if (isUndefOrEqual(BitI1, NumElts)) + if (!isUndefOrEqual(BitI1, NumElts)) return false; } else { if (!isUndefOrEqual(BitI1, j + NumElts)) @@ -2309,7 +2382,8 @@ bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) { /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. -bool static isUNPCKHMask(SDOperandPtr Elts, unsigned NumElts, +template +bool static isUNPCKHMask(SDOperand *Elts, unsigned NumElts, bool V2IsSplat = false) { if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) return false; @@ -2385,7 +2459,8 @@ bool X86::isUNPCKH_v_undef_Mask(SDNode *N) { /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. -static bool isMOVLMask(SDOperandPtr Elts, unsigned NumElts) { +template +static bool isMOVLMask(SDOperand *Elts, unsigned NumElts) { if (NumElts != 2 && NumElts != 4) return false; @@ -2408,7 +2483,8 @@ bool X86::isMOVLMask(SDNode *N) { /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. -static bool isCommutedMOVL(SDOperandPtr Ops, unsigned NumOps, +template +static bool isCommutedMOVL(SDOperand *Ops, unsigned NumOps, bool V2IsSplat = false, bool V2IsUndef = false) { if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) @@ -2539,6 +2615,23 @@ static bool isSplatMask(SDNode *N) { return cast(ElementBase)->getZExtValue() < NumElems; } +/// getSplatMaskEltNo - Given a splat mask, return the index to the element +/// we want to splat. +static SDValue getSplatMaskEltNo(SDNode *N) { + assert(isSplatMask(N) && "Not a splat mask"); + unsigned NumElems = N->getNumOperands(); + SDValue ElementBase; + unsigned i = 0; + for (; i != NumElems; ++i) { + SDValue Elt = N->getOperand(i); + if (isa(Elt)) + return Elt; + } + assert(0 && " No splat value found!"); + return SDValue(); +} + + /// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies /// a splat of a single element and it's a 2 or 4 element mask. bool X86::isSplatMask(SDNode *N) { @@ -2606,9 +2699,10 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { for (unsigned i = 7; i >= 4; --i) { unsigned Val = 0; SDValue Arg = N->getOperand(i); - if (Arg.getOpcode() != ISD::UNDEF) + if (Arg.getOpcode() != ISD::UNDEF) { Val = cast(Arg)->getZExtValue(); - Mask |= (Val - 4); + Mask |= (Val - 4); + } if (i != 4) Mask <<= 2; } @@ -2635,38 +2729,6 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { return Mask; } -/// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand -/// specifies a 8 element shuffle that can be broken into a pair of -/// PSHUFHW and PSHUFLW. -static bool isPSHUFHW_PSHUFLWMask(SDNode *N) { - assert(N->getOpcode() == ISD::BUILD_VECTOR); - - if (N->getNumOperands() != 8) - return false; - - // Lower quadword shuffled. 
- for (unsigned i = 0; i != 4; ++i) { - SDValue Arg = N->getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - assert(isa(Arg) && "Invalid VECTOR_SHUFFLE mask!"); - unsigned Val = cast(Arg)->getZExtValue(); - if (Val >= 4) - return false; - } - - // Upper quadword shuffled. - for (unsigned i = 4; i != 8; ++i) { - SDValue Arg = N->getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - assert(isa(Arg) && "Invalid VECTOR_SHUFFLE mask!"); - unsigned Val = cast(Arg)->getZExtValue(); - if (Val < 4 || Val > 7) - return false; - } - - return true; -} - /// CommuteVectorShuffle - Swap vector_shuffle operands as well as /// values in ther permute mask. static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1, @@ -2677,11 +2739,12 @@ static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1, MVT EltVT = MaskVT.getVectorElementType(); unsigned NumElems = Mask.getNumOperands(); SmallVector MaskVec; + DebugLoc dl = Op.getDebugLoc(); for (unsigned i = 0; i != NumElems; ++i) { SDValue Arg = Mask.getOperand(i); if (Arg.getOpcode() == ISD::UNDEF) { - MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT)); + MaskVec.push_back(DAG.getUNDEF(EltVT)); continue; } assert(isa(Arg) && "Invalid VECTOR_SHUFFLE mask!"); @@ -2693,14 +2756,14 @@ static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1, } std::swap(V1, V2); - Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], NumElems); - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask); + Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], NumElems); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, Mask); } /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. static -SDValue CommuteVectorShuffleMask(SDValue Mask, SelectionDAG &DAG) { +SDValue CommuteVectorShuffleMask(SDValue Mask, SelectionDAG &DAG, DebugLoc dl) { MVT MaskVT = Mask.getValueType(); MVT EltVT = MaskVT.getVectorElementType(); unsigned NumElems = Mask.getNumOperands(); @@ -2708,7 +2771,7 @@ SDValue CommuteVectorShuffleMask(SDValue Mask, SelectionDAG &DAG) { for (unsigned i = 0; i != NumElems; ++i) { SDValue Arg = Mask.getOperand(i); if (Arg.getOpcode() == ISD::UNDEF) { - MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT)); + MaskVec.push_back(DAG.getUNDEF(EltVT)); continue; } assert(isa(Arg) && "Invalid VECTOR_SHUFFLE mask!"); @@ -2718,7 +2781,7 @@ SDValue CommuteVectorShuffleMask(SDValue Mask, SelectionDAG &DAG) { else MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], NumElems); + return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], NumElems); } @@ -2837,7 +2900,7 @@ static bool isZeroShuffle(SDNode *N) { SDValue Arg = Mask.getOperand(i); if (Arg.getOpcode() == ISD::UNDEF) continue; - + unsigned Idx = cast(Arg)->getZExtValue(); if (Idx < NumElems) { unsigned Opc = V1.getNode()->getOpcode(); @@ -2860,39 +2923,40 @@ static bool isZeroShuffle(SDNode *N) { /// getZeroVector - Returns a vector of specified type with all zero elements. /// -static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG) { +static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG, + DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - + // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest // type. This ensures they get CSE'd. 
SDValue Vec; if (VT.getSizeInBits() == 64) { // MMX SDValue Cst = DAG.getTargetConstant(0, MVT::i32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); } else if (HasSSE2) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); - Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4f32, Cst, Cst, Cst, Cst); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } - return DAG.getNode(ISD::BIT_CONVERT, VT, Vec); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } /// getOnesVector - Returns a vector of specified type with all bits set. /// -static SDValue getOnesVector(MVT VT, SelectionDAG &DAG) { +static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - + // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest // type. This ensures they get CSE'd. SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; if (VT.getSizeInBits() == 64) // MMX - Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); else // SSE - Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst); - return DAG.getNode(ISD::BIT_CONVERT, VT, Vec); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); } @@ -2917,14 +2981,15 @@ static SDValue NormalizeMask(SDValue Mask, SelectionDAG &DAG) { } if (Changed) - Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(), + Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getDebugLoc(), + Mask.getValueType(), &MaskVec[0], MaskVec.size()); return Mask; } /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. -static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG) { +static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG, DebugLoc dl) { MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems); MVT BaseVT = MaskVT.getVectorElementType(); @@ -2932,12 +2997,14 @@ static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG) { MaskVec.push_back(DAG.getConstant(NumElems, BaseVT)); for (unsigned i = 1; i != NumElems; ++i) MaskVec.push_back(DAG.getConstant(i, BaseVT)); - return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, + &MaskVec[0], MaskVec.size()); } /// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation /// of specified width. -static SDValue getUnpacklMask(unsigned NumElems, SelectionDAG &DAG) { +static SDValue getUnpacklMask(unsigned NumElems, SelectionDAG &DAG, + DebugLoc dl) { MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems); MVT BaseVT = MaskVT.getVectorElementType(); SmallVector MaskVec; @@ -2945,12 +3012,14 @@ static SDValue getUnpacklMask(unsigned NumElems, SelectionDAG &DAG) { MaskVec.push_back(DAG.getConstant(i, BaseVT)); MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, + &MaskVec[0], MaskVec.size()); } /// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation /// of specified width. 
-static SDValue getUnpackhMask(unsigned NumElems, SelectionDAG &DAG) {
+static SDValue getUnpackhMask(unsigned NumElems, SelectionDAG &DAG,
+                              DebugLoc dl) {
   MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
   MVT BaseVT = MaskVT.getVectorElementType();
   unsigned Half = NumElems/2;
@@ -2959,14 +3028,15 @@ static SDValue getUnpackhMask(unsigned NumElems, SelectionDAG &DAG) {
     MaskVec.push_back(DAG.getConstant(i + Half, BaseVT));
     MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
   }
-  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
+                     &MaskVec[0], MaskVec.size());
 }
 
 /// getSwapEltZeroMask - Returns a vector_shuffle mask for a shuffle that swaps
 /// element #0 of a vector with the specified index, leaving the rest of the
 /// elements in place.
 static SDValue getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
-                                  SelectionDAG &DAG) {
+                                  SelectionDAG &DAG, DebugLoc dl) {
   MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
   MVT BaseVT = MaskVT.getVectorElementType();
   SmallVector<SDValue, 8> MaskVec;
@@ -2974,7 +3044,8 @@ static SDValue getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
   MaskVec.push_back(DAG.getConstant(DestElt, BaseVT));
   for (unsigned i = 1; i != NumElems; ++i)
     MaskVec.push_back(DAG.getConstant(i == DestElt ? 0 : i, BaseVT));
-  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
+                     &MaskVec[0], MaskVec.size());
 }
 
 /// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
@@ -2985,21 +3056,33 @@ static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
     return Op;
   SDValue V1 = Op.getOperand(0);
   SDValue Mask = Op.getOperand(2);
-  unsigned NumElems = Mask.getNumOperands();
+  unsigned MaskNumElems = Mask.getNumOperands();
+  unsigned NumElems = MaskNumElems;
+  DebugLoc dl = Op.getDebugLoc();
   // Special handling of v4f32 -> v4i32.
   if (VT != MVT::v4f32) {
-    Mask = getUnpacklMask(NumElems, DAG);
+    // Find which element we want to splat.
+ SDNode* EltNoNode = getSplatMaskEltNo(Mask.getNode()).getNode(); + unsigned EltNo = cast(EltNoNode)->getZExtValue(); + // unpack elements to the correct location while (NumElems > 4) { - V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask); + if (EltNo < NumElems/2) { + Mask = getUnpacklMask(MaskNumElems, DAG, dl); + } else { + Mask = getUnpackhMask(MaskNumElems, DAG, dl); + EltNo -= NumElems/2; + } + V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V1, Mask); NumElems >>= 1; } - Mask = getZeroVector(MVT::v4i32, true, DAG); + SDValue Cst = DAG.getConstant(EltNo, MVT::i32); + Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } - V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1); - SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, PVT, V1, - DAG.getNode(ISD::UNDEF, PVT), Mask); - return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle); + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); + SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1, + DAG.getUNDEF(PVT), Mask); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle); } /// isVectorLoad - Returns true if the node is a vector load, a scalar @@ -3025,21 +3108,23 @@ static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask, MVT VT = Op.getValueType(); if (VT == PVT) return Op; + DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = PVT.getVectorNumElements(); if (NumElems == 2) { SDValue Cst = DAG.getTargetConstant(0, MVT::i32); - Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst); + Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); } else { assert(NumElems == 4); SDValue Cst0 = DAG.getTargetConstant(0, MVT::i32); SDValue Cst1 = DAG.getTargetConstant(1, MVT::i32); - Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst0, Cst1, Cst0, Cst1); + Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Cst0, Cst1, Cst0, Cst1); } - V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1); - SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, PVT, V1, - DAG.getNode(ISD::UNDEF, PVT), Mask); - return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle); + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); + SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1, + DAG.getUNDEF(PVT), Mask); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle); } /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified @@ -3049,9 +3134,10 @@ static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask, static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool isZero, bool HasSSE2, SelectionDAG &DAG) { + DebugLoc dl = V2.getDebugLoc(); MVT VT = V2.getValueType(); SDValue V1 = isZero - ? getZeroVector(VT, HasSSE2, DAG) : DAG.getNode(ISD::UNDEF, VT); + ? 
getZeroVector(VT, HasSSE2, DAG, dl) : DAG.getUNDEF(VT); unsigned NumElems = V2.getValueType().getVectorNumElements(); MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems); MVT EVT = MaskVT.getVectorElementType(); @@ -3061,9 +3147,9 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, MaskVec.push_back(DAG.getConstant(NumElems, EVT)); else MaskVec.push_back(DAG.getConstant(i, EVT)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], MaskVec.size()); - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, Mask); } /// getNumOfConsecutiveZeros - Return the number of elements in a result of @@ -3138,15 +3224,16 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, if (NumNonZero > 8) return SDValue(); + DebugLoc dl = Op.getDebugLoc(); SDValue V(0, 0); bool First = true; for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG); + V = getZeroVector(MVT::v8i16, true, DAG, dl); else - V = DAG.getNode(ISD::UNDEF, MVT::v8i16); + V = DAG.getUNDEF(MVT::v8i16); First = false; } @@ -3154,24 +3241,25 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, SDValue ThisElt(0, 0), LastElt(0, 0); bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; if (LastIsNonZero) { - LastElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i-1)); + LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, + MVT::i16, Op.getOperand(i-1)); } if (ThisIsNonZero) { - ThisElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i)); - ThisElt = DAG.getNode(ISD::SHL, MVT::i16, + ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, DAG.getConstant(8, MVT::i8)); if (LastIsNonZero) - ThisElt = DAG.getNode(ISD::OR, MVT::i16, ThisElt, LastElt); + ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; if (ThisElt.getNode()) - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, ThisElt, + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, DAG.getIntPtrConstant(i/2)); } } - return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V); + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); } /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. @@ -3182,6 +3270,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (NumNonZero > 4) return SDValue(); + DebugLoc dl = Op.getDebugLoc(); SDValue V(0, 0); bool First = true; for (unsigned i = 0; i < 8; ++i) { @@ -3189,12 +3278,13 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (isNonZero) { if (First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG); + V = getZeroVector(MVT::v8i16, true, DAG, dl); else - V = DAG.getNode(ISD::UNDEF, MVT::v8i16); + V = DAG.getUNDEF(MVT::v8i16); First = false; } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, Op.getOperand(i), + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v8i16, V, Op.getOperand(i), DAG.getIntPtrConstant(i)); } } @@ -3206,18 +3296,19 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, /// static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, - const TargetLowering &TLI) { + const TargetLowering &TLI, DebugLoc dl) { bool isMMX = VT.getSizeInBits() == 64; MVT ShVT = isMMX ? 
MVT::v1i64 : MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; - SrcOp = DAG.getNode(ISD::BIT_CONVERT, ShVT, SrcOp); - return DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(Opc, ShVT, SrcOp, + SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + DAG.getNode(Opc, dl, ShVT, SrcOp, DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); // All zero's are handled with pxor, all one's are handled with pcmpeqd. if (ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorAllOnes(Op.getNode())) { @@ -3228,8 +3319,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return Op; if (ISD::isBuildVectorAllOnes(Op.getNode())) - return getOnesVector(Op.getValueType(), DAG); - return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG); + return getOnesVector(Op.getValueType(), DAG, dl); + return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); } MVT VT = Op.getValueType(); @@ -3260,14 +3351,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { if (NumNonZero == 0) { // All undef vector. Return an UNDEF. All zero vectors were handled above. - return DAG.getNode(ISD::UNDEF, VT); + return DAG.getUNDEF(VT); } // Special case for single non-zero, non-undef, element. if (NumNonZero == 1 && NumElems <= 4) { unsigned Idx = CountTrailingZeros_32(NonZeros); SDValue Item = Op.getOperand(Idx); - + // If this is an insertion of an i64 value on x86-32, and if the top bits of // the value are obviously zero, truncate the value to i32 and do the // insertion that way. Only do this if the value is non-constant or if the @@ -3279,27 +3370,27 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { // Handle MMX and SSE both. MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; - + // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). - Item = DAG.getNode(ISD::TRUNCATE, MVT::i32, Item); - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VecVT, Item); + Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), DAG); - + // Now we have our 32-bit value zero extended in the low element of // a vector. If Idx != 0, swizzle it into place. if (Idx != 0) { - SDValue Ops[] = { - Item, DAG.getNode(ISD::UNDEF, Item.getValueType()), - getSwapEltZeroMask(VecElts, Idx, DAG) + SDValue Ops[] = { + Item, DAG.getUNDEF(Item.getValueType()), + getSwapEltZeroMask(VecElts, Idx, DAG, dl) }; - Item = DAG.getNode(ISD::VECTOR_SHUFFLE, VecVT, Ops, 3); + Item = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VecVT, Ops, 3); } - return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Item); + return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); } } - + // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd @@ -3308,7 +3399,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { if (Idx == 0 && // Don't do this for i64 values on x86-32. 
(EVT != MVT::i64 || Subtarget->is64Bit())) { - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget->hasSSE2(), DAG); @@ -3319,10 +3410,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(1)), - NumBits/2, DAG, *this); + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + VT, Op.getOperand(1)), + NumBits/2, DAG, *this, dl); } - + if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); @@ -3332,8 +3424,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item); - + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget->hasSSE2(), DAG); @@ -3342,17 +3434,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { SmallVector MaskVec; for (unsigned i = 0; i < NumElems; i++) MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], MaskVec.size()); - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item, - DAG.getNode(ISD::UNDEF, VT), Mask); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, Item, + DAG.getUNDEF(VT), Mask); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) return SDValue(); - + // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) @@ -3363,7 +3455,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = CountTrailingZeros_32(NonZeros); - SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, + SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget->hasSSE2(), DAG); @@ -3391,9 +3483,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1 << i)); if (isZero) - V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG); + V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); else - V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i)); + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { @@ -3403,16 +3495,16 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { V[i] = V[i*2]; // Must be a zero vector. 
break; case 1: - V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2], - getMOVLMask(NumElems, DAG)); + V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2+1], V[i*2], + getMOVLMask(NumElems, DAG, dl)); break; case 2: - V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1], - getMOVLMask(NumElems, DAG)); + V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1], + getMOVLMask(NumElems, DAG, dl)); break; case 3: - V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1], - getUnpacklMask(NumElems, DAG)); + V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1], + getUnpacklMask(NumElems, DAG, dl)); break; } } @@ -3432,9 +3524,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT)); else MaskVec.push_back(DAG.getConstant(i+NumElems, EVT)); - SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], MaskVec.size()); - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[0], V[1], ShufMask); } if (Values.size() > 2) { @@ -3443,13 +3535,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { // Step 1: unpcklps 0, 2 ==> X: // : unpcklps 1, 3 ==> Y: // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> - SDValue UnpckMask = getUnpacklMask(NumElems, DAG); + SDValue UnpckMask = getUnpacklMask(NumElems, DAG, dl); for (unsigned i = 0; i < NumElems; ++i) - V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i)); + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); NumElems >>= 1; while (NumElems != 0) { for (unsigned i = 0; i < NumElems; ++i) - V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems], + V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i], V[i + NumElems], UnpckMask); NumElems >>= 1; } @@ -3459,258 +3551,409 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +// v8i16 shuffles - Prefer shuffles in the following order: +// 1. [all] pshuflw, pshufhw, optional move +// 2. [ssse3] 1 x pshufb +// 3. [ssse3] 2 x pshufb + 1 x por +// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) static SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2, SDValue PermMask, SelectionDAG &DAG, - TargetLowering &TLI) { - SDValue NewV; - MVT MaskVT = MVT::getIntVectorWithNumElements(8); - MVT MaskEVT = MaskVT.getVectorElementType(); - MVT PtrVT = TLI.getPointerTy(); + X86TargetLowering &TLI, DebugLoc dl) { SmallVector MaskElts(PermMask.getNode()->op_begin(), PermMask.getNode()->op_end()); - - // First record which half of which vector the low elements come from. - SmallVector LowQuad(4); - for (unsigned i = 0; i < 4; ++i) { + SmallVector MaskVals; + + // Determine if more than 1 of the words in each of the low and high quadwords + // of the result come from the same quadword of one of the two inputs. Undef + // mask values count as coming from any quadword, for better codegen. + SmallVector LoQuad(4); + SmallVector HiQuad(4); + BitVector InputQuads(4); + for (unsigned i = 0; i < 8; ++i) { + SmallVectorImpl &Quad = i < 4 ? LoQuad : HiQuad; SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) + int EltIdx = Elt.getOpcode() == ISD::UNDEF ? 
-1 : + cast(Elt)->getZExtValue(); + MaskVals.push_back(EltIdx); + if (EltIdx < 0) { + ++Quad[0]; + ++Quad[1]; + ++Quad[2]; + ++Quad[3]; continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - int QuadIdx = EltIdx / 4; - ++LowQuad[QuadIdx]; + } + ++Quad[EltIdx / 4]; + InputQuads.set(EltIdx / 4); } - int BestLowQuad = -1; + int BestLoQuad = -1; unsigned MaxQuad = 1; for (unsigned i = 0; i < 4; ++i) { - if (LowQuad[i] > MaxQuad) { - BestLowQuad = i; - MaxQuad = LowQuad[i]; + if (LoQuad[i] > MaxQuad) { + BestLoQuad = i; + MaxQuad = LoQuad[i]; } } - // Record which half of which vector the high elements come from. - SmallVector HighQuad(4); - for (unsigned i = 4; i < 8; ++i) { - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) - continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - int QuadIdx = EltIdx / 4; - ++HighQuad[QuadIdx]; - } - - int BestHighQuad = -1; + int BestHiQuad = -1; MaxQuad = 1; for (unsigned i = 0; i < 4; ++i) { - if (HighQuad[i] > MaxQuad) { - BestHighQuad = i; - MaxQuad = HighQuad[i]; + if (HiQuad[i] > MaxQuad) { + BestHiQuad = i; + MaxQuad = HiQuad[i]; } } - // If it's possible to sort parts of either half with PSHUF{H|L}W, then do it. - if (BestLowQuad != -1 || BestHighQuad != -1) { - // First sort the 4 chunks in order using shufpd. - SmallVector MaskVec; - - if (BestLowQuad != -1) - MaskVec.push_back(DAG.getConstant(BestLowQuad, MVT::i32)); - else - MaskVec.push_back(DAG.getConstant(0, MVT::i32)); - - if (BestHighQuad != -1) - MaskVec.push_back(DAG.getConstant(BestHighQuad, MVT::i32)); - else - MaskVec.push_back(DAG.getConstant(1, MVT::i32)); - - SDValue Mask= DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, &MaskVec[0],2); - NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64, - DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V1), - DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V2), Mask); - NewV = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, NewV); - - // Now sort high and low parts separately. - BitVector InOrder(8); - if (BestLowQuad != -1) { - // Sort lower half in order using PSHUFLW. - MaskVec.clear(); - bool AnyOutOrder = false; - - for (unsigned i = 0; i != 4; ++i) { - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) { - MaskVec.push_back(Elt); - InOrder.set(i); - } else { - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx != i) - AnyOutOrder = true; + // For SSSE3, If all 8 words of the result come from only 1 quadword of each + // of the two input vectors, shuffle them into one input vector so only a + // single pshufb instruction is necessary. If There are more than 2 input + // quads, disable the next transformation since it does not help SSSE3. + bool V1Used = InputQuads[0] || InputQuads[1]; + bool V2Used = InputQuads[2] || InputQuads[3]; + if (TLI.getSubtarget()->hasSSSE3()) { + if (InputQuads.count() == 2 && V1Used && V2Used) { + BestLoQuad = InputQuads.find_first(); + BestHiQuad = InputQuads.find_next(BestLoQuad); + } + if (InputQuads.count() > 2) { + BestLoQuad = -1; + BestHiQuad = -1; + } + } - MaskVec.push_back(DAG.getConstant(EltIdx % 4, MaskEVT)); + // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update + // the shuffle mask. If a quad is scored as -1, that means that it contains + // words from all 4 input quadwords. + SDValue NewV; + if (BestLoQuad >= 0 || BestHiQuad >= 0) { + SmallVector MaskV; + MaskV.push_back(DAG.getConstant(BestLoQuad < 0 ? 0 : BestLoQuad, MVT::i64)); + MaskV.push_back(DAG.getConstant(BestHiQuad < 0 ? 
1 : BestHiQuad, MVT::i64)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, &MaskV[0], 2); + + NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask); + NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); + + // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the + // source words for the shuffle, to aid later transformations. + bool AllWordsInNewV = true; + bool InOrder[2] = { true, true }; + for (unsigned i = 0; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx != (int)i) + InOrder[i/4] = false; + if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) + continue; + AllWordsInNewV = false; + break; + } - // If this element is in the right place after this shuffle, then - // remember it. - if ((int)(EltIdx / 4) == BestLowQuad) - InOrder.set(i); - } - } - if (AnyOutOrder) { - for (unsigned i = 4; i != 8; ++i) - MaskVec.push_back(DAG.getConstant(i, MaskEVT)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8); - NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask); + bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; + if (AllWordsInNewV) { + for (int i = 0; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx < 0) + continue; + idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; + if ((idx != i) && idx < 4) + pshufhw = false; + if ((idx != i) && idx > 3) + pshuflw = false; } + V1 = NewV; + V2Used = false; + BestLoQuad = 0; + BestHiQuad = 1; } - if (BestHighQuad != -1) { - // Sort high half in order using PSHUFHW if possible. - MaskVec.clear(); - - for (unsigned i = 0; i != 4; ++i) - MaskVec.push_back(DAG.getConstant(i, MaskEVT)); - - bool AnyOutOrder = false; - for (unsigned i = 4; i != 8; ++i) { - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) { - MaskVec.push_back(Elt); - InOrder.set(i); - } else { - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx != i) - AnyOutOrder = true; - - MaskVec.push_back(DAG.getConstant((EltIdx % 4) + 4, MaskEVT)); - - // If this element is in the right place after this shuffle, then - // remember it. - if ((int)(EltIdx / 4) == BestHighQuad) - InOrder.set(i); - } - } - - if (AnyOutOrder) { - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8); - NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask); - } + // If we've eliminated the use of V2, and the new mask is a pshuflw or + // pshufhw, that's as cheap as it gets. Return the new shuffle. + if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { + MaskV.clear(); + for (unsigned i = 0; i != 8; ++i) + MaskV.push_back((MaskVals[i] < 0) ? DAG.getUNDEF(MVT::i16) + : DAG.getConstant(MaskVals[i], + MVT::i16)); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV, + DAG.getUNDEF(MVT::v8i16), + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, + &MaskV[0], 8)); } - - // The other elements are put in the right place using pextrw and pinsrw. + } + + // If we have SSSE3, and all words of the result are from 1 input vector, + // case 2 is generated, otherwise case 3 is generated. If no SSSE3 + // is present, fall back to case 4. 
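[Illustrative aside - not part of the patch.] The SSSE3 paths described above merge words from two inputs by building one PSHUFB byte mask per source, writing 0x80 into every mask byte that should come from the other source (PSHUFB zeroes such destination bytes), and then OR'ing the two shuffled results. The sketch below is a minimal software model of that byte-select-and-OR idea; pshufb() here is a stand-in written for the example, not an LLVM API or a real intrinsic.

#include <array>
#include <cstdint>
#include <cstdio>

using Bytes = std::array<uint8_t, 16>;

// Software model of PSHUFB: each destination byte is zero when the high bit
// of its mask byte is set, otherwise a byte selected from the source.
static Bytes pshufb(const Bytes &Src, const Bytes &Mask) {
  Bytes Dst{};
  for (int i = 0; i != 16; ++i)
    Dst[i] = (Mask[i] & 0x80) ? 0 : Src[Mask[i] & 0x0F];
  return Dst;
}

int main() {
  Bytes V1, V2;
  for (int i = 0; i != 16; ++i) { V1[i] = i; V2[i] = 16 + i; }

  // Word-level shuffle <0, 9, 2, 11, 4, 13, 6, 15>: even result words from
  // V1, odd result words from V2 (indices 8-15 mean "from V2", as in MaskVals).
  const int WordMask[8] = {0, 9, 2, 11, 4, 13, 6, 15};

  Bytes M1, M2;
  for (int i = 0; i != 8; ++i) {
    int EltIdx = WordMask[i] * 2;
    bool FromV2 = WordMask[i] >= 8;
    // Bytes that belong to the other input are zeroed out (0x80) in each mask.
    M1[2*i]   = FromV2 ? 0x80 : EltIdx;
    M1[2*i+1] = FromV2 ? 0x80 : EltIdx + 1;
    M2[2*i]   = FromV2 ? EltIdx - 16 : 0x80;
    M2[2*i+1] = FromV2 ? EltIdx - 15 : 0x80;
  }

  Bytes A = pshufb(V1, M1), B = pshufb(V2, M2);
  for (int i = 0; i != 16; ++i)
    std::printf("%d ", A[i] | B[i]);   // the POR that combines both halves
  std::printf("\n");
  return 0;
}

The v8i16 and v16i8 paths in the patch differ mainly in whether the byte mask is built per word pair or per element; the zero-out-then-OR structure is the same.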
+ if (TLI.getSubtarget()->hasSSSE3()) { + SmallVector pshufbMask; + + // If we have elements from both input vectors, set the high bit of the + // shuffle mask element to zero out elements that come from V2 in the V1 + // mask, and elements that come from V1 in the V2 mask, so that the two + // results can be OR'd together. + bool TwoInputs = V1Used && V2Used; for (unsigned i = 0; i != 8; ++i) { - if (InOrder[i]) + int EltIdx = MaskVals[i] * 2; + if (TwoInputs && (EltIdx >= 16)) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); continue; - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); + } + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); + V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + if (!TwoInputs) + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + + // Calculate the shuffle mask for the second input, shuffle it, and + // OR it with the first shuffled input. + pshufbMask.clear(); + for (unsigned i = 0; i != 8; ++i) { + int EltIdx = MaskVals[i] * 2; + if (EltIdx < 16) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - SDValue ExtOp = (EltIdx < 8) - ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1, - DAG.getConstant(EltIdx, PtrVT)) - : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2, - DAG.getConstant(EltIdx - 8, PtrVT)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp, - DAG.getConstant(i, PtrVT)); - } - - return NewV; - } - - // PSHUF{H|L}W are not used. Lower into extracts and inserts but try to use as - // few as possible. First, let's find out how many elements are already in the - // right order. - unsigned V1InOrder = 0; - unsigned V1FromV1 = 0; - unsigned V2InOrder = 0; - unsigned V2FromV2 = 0; - SmallVector V1Elts; - SmallVector V2Elts; - for (unsigned i = 0; i < 8; ++i) { - SDValue Elt = MaskElts[i]; - if (Elt.getOpcode() == ISD::UNDEF) { - V1Elts.push_back(Elt); - V2Elts.push_back(Elt); - ++V1InOrder; - ++V2InOrder; - continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); } - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx == i) { - V1Elts.push_back(Elt); - V2Elts.push_back(DAG.getConstant(i+8, MaskEVT)); - ++V1InOrder; - } else if (EltIdx == i+8) { - V1Elts.push_back(Elt); - V2Elts.push_back(DAG.getConstant(i, MaskEVT)); - ++V2InOrder; - } else if (EltIdx < 8) { - V1Elts.push_back(Elt); - ++V1FromV1; - } else { - V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT)); - ++V2FromV2; + V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); + V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + } + + // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, + // and update MaskVals with new element order. 
+ BitVector InOrder(8); + if (BestLoQuad >= 0) { + SmallVector MaskV; + for (int i = 0; i != 4; ++i) { + int idx = MaskVals[i]; + if (idx < 0) { + MaskV.push_back(DAG.getUNDEF(MVT::i16)); + InOrder.set(i); + } else if ((idx / 4) == BestLoQuad) { + MaskV.push_back(DAG.getConstant(idx & 3, MVT::i16)); + InOrder.set(i); + } else { + MaskV.push_back(DAG.getUNDEF(MVT::i16)); + } } + for (unsigned i = 4; i != 8; ++i) + MaskV.push_back(DAG.getConstant(i, MVT::i16)); + NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV, + DAG.getUNDEF(MVT::v8i16), + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v8i16, &MaskV[0], 8)); } - - if (V2InOrder > V1InOrder) { - PermMask = CommuteVectorShuffleMask(PermMask, DAG); - std::swap(V1, V2); - std::swap(V1Elts, V2Elts); - std::swap(V1FromV1, V2FromV2); - } - - if ((V1FromV1 + V1InOrder) != 8) { - // Some elements are from V2. - if (V1FromV1) { - // If there are elements that are from V1 but out of place, - // then first sort them in place - SmallVector MaskVec; - for (unsigned i = 0; i < 8; ++i) { - SDValue Elt = V1Elts[i]; - if (Elt.getOpcode() == ISD::UNDEF) { - MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT)); - continue; - } - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx >= 8) - MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT)); - else - MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT)); + + // If BestHi >= 0, generate a pshufhw to put the high elements in order, + // and update MaskVals with the new element order. + if (BestHiQuad >= 0) { + SmallVector MaskV; + for (unsigned i = 0; i != 4; ++i) + MaskV.push_back(DAG.getConstant(i, MVT::i16)); + for (unsigned i = 4; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx < 0) { + MaskV.push_back(DAG.getUNDEF(MVT::i16)); + InOrder.set(i); + } else if ((idx / 4) == BestHiQuad) { + MaskV.push_back(DAG.getConstant((idx & 3) + 4, MVT::i16)); + InOrder.set(i); + } else { + MaskV.push_back(DAG.getUNDEF(MVT::i16)); } - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8); - V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V1, Mask); } - + NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV, + DAG.getUNDEF(MVT::v8i16), + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v8i16, &MaskV[0], 8)); + } + + // In case BestHi & BestLo were both -1, which means each quadword has a word + // from each of the four input quadwords, calculate the InOrder bitvector now + // before falling through to the insert/extract cleanup. + if (BestLoQuad == -1 && BestHiQuad == -1) { NewV = V1; - for (unsigned i = 0; i < 8; ++i) { - SDValue Elt = V1Elts[i]; - if (Elt.getOpcode() == ISD::UNDEF) - continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - if (EltIdx < 8) + for (int i = 0; i != 8; ++i) + if (MaskVals[i] < 0 || MaskVals[i] == i) + InOrder.set(i); + } + + // The other elements are put in the right place using pextrw and pinsrw. + for (unsigned i = 0; i != 8; ++i) { + if (InOrder[i]) + continue; + int EltIdx = MaskVals[i]; + if (EltIdx < 0) + continue; + SDValue ExtOp = (EltIdx < 8) + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, + DAG.getIntPtrConstant(EltIdx)) + : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, + DAG.getIntPtrConstant(EltIdx - 8)); + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, + DAG.getIntPtrConstant(i)); + } + return NewV; +} + +// v16i8 shuffles - Prefer shuffles in the following order: +// 1. [ssse3] 1 x pshufb +// 2. [ssse3] 2 x pshufb + 1 x por +// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw +static +SDValue LowerVECTOR_SHUFFLEv16i8(SDValue V1, SDValue V2, + SDValue PermMask, SelectionDAG &DAG, + X86TargetLowering &TLI, DebugLoc dl) { + SmallVector MaskElts(PermMask.getNode()->op_begin(), + PermMask.getNode()->op_end()); + SmallVector MaskVals; + + // If we have SSSE3, case 1 is generated when all result bytes come from + // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is + // present, fall back to case 3. + // FIXME: kill V2Only once shuffles are canonizalized by getNode. + bool V1Only = true; + bool V2Only = true; + for (unsigned i = 0; i < 16; ++i) { + SDValue Elt = MaskElts[i]; + int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 : + cast(Elt)->getZExtValue(); + MaskVals.push_back(EltIdx); + if (EltIdx < 0) + continue; + if (EltIdx < 16) + V2Only = false; + else + V1Only = false; + } + + // If SSSE3, use 1 pshufb instruction per vector with elements in the result. + if (TLI.getSubtarget()->hasSSSE3()) { + SmallVector pshufbMask; + + // If all result elements are from one input vector, then only translate + // undef mask values to 0x80 (zero out result) in the pshufb mask. + // + // Otherwise, we have elements from both input vectors, and must zero out + // elements that come from V2 in the first mask, and V1 in the second mask + // so that we can OR them together. + bool TwoInputs = !(V1Only || V2Only); + for (unsigned i = 0; i != 16; ++i) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); continue; - SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2, - DAG.getConstant(EltIdx - 8, PtrVT)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp, - DAG.getConstant(i, PtrVT)); + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } - return NewV; - } else { - // All elements are from V1. - NewV = V1; - for (unsigned i = 0; i < 8; ++i) { - SDValue Elt = V1Elts[i]; - if (Elt.getOpcode() == ISD::UNDEF) + // If all the elements are from V2, assign it to V1 and return after + // building the first pshufb. + if (V2Only) + V1 = V2; + V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + if (!TwoInputs) + return V1; + + // Calculate the shuffle mask for the second input, shuffle it, and + // OR it with the first shuffled input. + pshufbMask.clear(); + for (unsigned i = 0; i != 16; ++i) { + int EltIdx = MaskVals[i]; + if (EltIdx < 16) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); continue; - unsigned EltIdx = cast(Elt)->getZExtValue(); - SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1, - DAG.getConstant(EltIdx, PtrVT)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp, - DAG.getConstant(i, PtrVT)); + } + pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); } - return NewV; + V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); } + + // No SSSE3 - Calculate in place words and then fix all out of place words + // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from + // the 16 different words that comprise the two doublequadword input vectors. + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); + SDValue NewV = V2Only ? 
V2 : V1; + for (int i = 0; i != 8; ++i) { + int Elt0 = MaskVals[i*2]; + int Elt1 = MaskVals[i*2+1]; + + // This word of the result is all undef, skip it. + if (Elt0 < 0 && Elt1 < 0) + continue; + + // This word of the result is already in the correct place, skip it. + if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) + continue; + if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) + continue; + + SDValue Elt0Src = Elt0 < 16 ? V1 : V2; + SDValue Elt1Src = Elt1 < 16 ? V1 : V2; + SDValue InsElt; + + // If Elt0 and Elt1 are defined, are consecutive, and can be load + // using a single extract together, load it and store it. + if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { + InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, + DAG.getIntPtrConstant(Elt1 / 2)); + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, + DAG.getIntPtrConstant(i)); + continue; + } + + // If Elt1 is defined, extract it from the appropriate source. If the + // source byte is not also odd, shift the extracted word left 8 bits + // otherwise clear the bottom 8 bits if we need to do an or. + if (Elt1 >= 0) { + InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, + DAG.getIntPtrConstant(Elt1 / 2)); + if ((Elt1 & 1) == 0) + InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, + DAG.getConstant(8, TLI.getShiftAmountTy())); + else if (Elt0 >= 0) + InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, + DAG.getConstant(0xFF00, MVT::i16)); + } + // If Elt0 is defined, extract it from the appropriate source. If the + // source byte is not also even, shift the extracted word right 8 bits. If + // Elt1 was also defined, OR the extracted values together before + // inserting them in the result. + if (Elt0 >= 0) { + SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); + if ((Elt0 & 1) != 0) + InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, + DAG.getConstant(8, TLI.getShiftAmountTy())); + else if (Elt1 >= 0) + InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, + DAG.getConstant(0x00FF, MVT::i16)); + InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) + : InsElt0; + } + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, + DAG.getIntPtrConstant(i)); + } + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); } /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide @@ -3722,7 +3965,7 @@ static SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2, MVT VT, SDValue PermMask, SelectionDAG &DAG, - TargetLowering &TLI) { + TargetLowering &TLI, DebugLoc dl) { unsigned NumElems = PermMask.getNumOperands(); unsigned NewWidth = (NumElems == 4) ? 
2 : 4; MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); @@ -3757,15 +4000,15 @@ SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2, return SDValue(); } if (StartIdx == ~0U) - MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEltVT)); + MaskVec.push_back(DAG.getUNDEF(MaskEltVT)); else MaskVec.push_back(DAG.getConstant(StartIdx / Scale, MaskEltVT)); } - V1 = DAG.getNode(ISD::BIT_CONVERT, NewVT, V1); - V2 = DAG.getNode(ISD::BIT_CONVERT, NewVT, V2); - return DAG.getNode(ISD::VECTOR_SHUFFLE, NewVT, V1, V2, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, NewVT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], MaskVec.size())); } @@ -3773,7 +4016,7 @@ SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2, /// static SDValue getVZextMovL(MVT VT, MVT OpVT, SDValue SrcOp, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget *Subtarget, DebugLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { LoadSDNode *LD = NULL; if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) @@ -3788,30 +4031,33 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT, SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) { // PR2108 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; - return DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(X86ISD::VZEXT_MOVL, OpVT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, OpVT, + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + OpVT, SrcOp.getOperand(0) .getOperand(0)))); } } } - return DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(X86ISD::VZEXT_MOVL, OpVT, - DAG.getNode(ISD::BIT_CONVERT, OpVT, SrcOp))); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, + DAG.getNode(ISD::BIT_CONVERT, dl, + OpVT, SrcOp))); } /// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of /// shuffles. static SDValue LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2, - SDValue PermMask, MVT VT, SelectionDAG &DAG) { + SDValue PermMask, MVT VT, SelectionDAG &DAG, + DebugLoc dl) { MVT MaskVT = PermMask.getValueType(); MVT MaskEVT = MaskVT.getVectorElementType(); SmallVector, 8> Locs; Locs.resize(4); - SmallVector Mask1(4, DAG.getNode(ISD::UNDEF, MaskEVT)); + SmallVector Mask1(4, DAG.getUNDEF(MaskEVT)); unsigned NumHi = 0; unsigned NumLo = 0; for (unsigned i = 0; i != 4; ++i) { @@ -3839,11 +4085,11 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2, // implemented with two shuffles. First shuffle gather the elements. // The second shuffle, which takes the first shuffle as both of its // vector operands, put the elements into the right order. 
- V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &Mask1[0], Mask1.size())); - SmallVector Mask2(4, DAG.getNode(ISD::UNDEF, MaskEVT)); + SmallVector Mask2(4, DAG.getUNDEF(MaskEVT)); for (unsigned i = 0; i != 4; ++i) { if (Locs[i].first == -1) continue; @@ -3854,8 +4100,8 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2, } } - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &Mask2[0], Mask2.size())); } else if (NumLo == 3 || NumHi == 3) { // Otherwise, we must have three elements from one vector, call it X, and @@ -3867,7 +4113,7 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2, // from X. if (NumHi == 3) { // Normalize it so the 3 elements come from V1. - PermMask = CommuteVectorShuffleMask(PermMask, DAG); + PermMask = CommuteVectorShuffleMask(PermMask, DAG, dl); std::swap(V1, V2); } @@ -3883,19 +4129,20 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2, } Mask1[0] = PermMask.getOperand(HiIndex); - Mask1[1] = DAG.getNode(ISD::UNDEF, MaskEVT); + Mask1[1] = DAG.getUNDEF(MaskEVT); Mask1[2] = PermMask.getOperand(HiIndex^1); - Mask1[3] = DAG.getNode(ISD::UNDEF, MaskEVT); - V2 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4)); + Mask1[3] = DAG.getUNDEF(MaskEVT); + V2 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &Mask1[0], 4)); if (HiIndex >= 2) { Mask1[0] = PermMask.getOperand(0); Mask1[1] = PermMask.getOperand(1); Mask1[2] = DAG.getConstant(HiIndex & 1 ? 6 : 4, MaskEVT); Mask1[3] = DAG.getConstant(HiIndex & 1 ? 4 : 6, MaskEVT); - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4)); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MaskVT, &Mask1[0], 4)); } else { Mask1[0] = DAG.getConstant(HiIndex & 1 ? 2 : 0, MaskEVT); Mask1[1] = DAG.getConstant(HiIndex & 1 ? 0 : 2, MaskEVT); @@ -3909,15 +4156,16 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2, Mask1[3] = DAG.getConstant(cast(Mask1[3])->getZExtValue()+4, MaskEVT); - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V2, V1, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4)); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V2, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MaskVT, &Mask1[0], 4)); } } // Break it into (shuffle shuffle_hi, shuffle_lo). 
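[Illustrative aside - not part of the patch.] The code that follows handles the general 4-wide case by splitting one two-input shuffle into three: a shuffle that gathers the elements destined for the low half of the result, one that gathers the elements for the high half, and a final shuffle of those two intermediate values. Below is a small self-contained model of that idea; shuffle4() and the simplified lane bookkeeping are stand-ins for the Locs/LoMask/HiMask logic, not the actual SelectionDAG calls.

#include <array>
#include <cstdio>

using Vec4 = std::array<int, 4>;

// Generic two-input, 4-element shuffle: mask indices 0-3 select from A,
// 4-7 select from B (the VECTOR_SHUFFLE convention used throughout).
static Vec4 shuffle4(const Vec4 &A, const Vec4 &B, const std::array<int, 4> &M) {
  Vec4 R{};
  for (int i = 0; i != 4; ++i)
    R[i] = M[i] < 4 ? A[M[i]] : B[M[i] - 4];
  return R;
}

int main() {
  Vec4 V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  // Desired permutation <2, 5, 1, 6>: two elements from each input.
  std::array<int, 4> Want = {2, 5, 1, 6};

  // Pass 1: gather the elements needed for the low half of the result into
  // lanes 0-1 of one intermediate, and those needed for the high half into
  // lanes 0-1 of another (this is what LoMask/HiMask accumulate).
  std::array<int, 4> LoMask = {Want[0], Want[1], 0, 0};
  std::array<int, 4> HiMask = {Want[2], Want[3], 0, 0};
  Vec4 Lo = shuffle4(V1, V2, LoMask);
  Vec4 Hi = shuffle4(V1, V2, HiMask);

  // Pass 2: one final shuffle of the two intermediates puts every element in
  // its final position (the role of the MaskOps built from Locs).
  std::array<int, 4> Final = {0, 1, 4, 5};
  Vec4 R = shuffle4(Lo, Hi, Final);

  for (int v : R) std::printf("%d ", v);   // prints: 12 21 11 22
  std::printf("\n");
  return 0;
}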
Locs.clear(); - SmallVector LoMask(4, DAG.getNode(ISD::UNDEF, MaskEVT)); - SmallVector HiMask(4, DAG.getNode(ISD::UNDEF, MaskEVT)); + SmallVector LoMask(4, DAG.getUNDEF(MaskEVT)); + SmallVector HiMask(4, DAG.getUNDEF(MaskEVT)); SmallVector *MaskPtr = &LoMask; unsigned MaskIdx = 0; unsigned LoIdx = 0; @@ -3943,23 +4191,23 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2, } } - SDValue LoShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + SDValue LoShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &LoMask[0], LoMask.size())); - SDValue HiShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + SDValue HiShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &HiMask[0], HiMask.size())); SmallVector MaskOps; for (unsigned i = 0; i != 4; ++i) { if (Locs[i].first == -1) { - MaskOps.push_back(DAG.getNode(ISD::UNDEF, MaskEVT)); + MaskOps.push_back(DAG.getUNDEF(MaskEVT)); } else { unsigned Idx = Locs[i].first * 4 + Locs[i].second; MaskOps.push_back(DAG.getConstant(Idx, MaskEVT)); } } - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, LoShuffle, HiShuffle, - DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, LoShuffle, HiShuffle, + DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskOps[0], MaskOps.size())); } @@ -3969,6 +4217,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue V2 = Op.getOperand(1); SDValue PermMask = Op.getOperand(2); MVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = PermMask.getNumOperands(); bool isMMX = VT.getSizeInBits() == 64; bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; @@ -3976,11 +4225,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { bool V1IsSplat = false; bool V2IsSplat = false; + // FIXME: Check for legal shuffle and return? + if (isUndefShuffle(Op.getNode())) - return DAG.getNode(ISD::UNDEF, VT); + return DAG.getUNDEF(VT); if (isZeroShuffle(Op.getNode())) - return getZeroVector(VT, Subtarget->hasSSE2(), DAG); + return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); if (isIdentityMask(PermMask.getNode())) return V1; @@ -4002,30 +4253,33 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! if (VT == MVT::v8i16 || VT == MVT::v16i8) { - SDValue NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this); + SDValue NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, + *this, dl); if (NewOp.getNode()) - return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG)); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + LowerVECTOR_SHUFFLE(NewOp, DAG)); } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, - DAG, *this); + DAG, *this, dl); if (NewOp.getNode()) { SDValue NewV1 = NewOp.getOperand(0); SDValue NewV2 = NewOp.getOperand(1); SDValue NewMask = NewOp.getOperand(2); if (isCommutedMOVL(NewMask.getNode(), true, false)) { NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG); - return getVZextMovL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget); + return getVZextMovL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget, + dl); } } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, - DAG, *this); + DAG, *this, dl); if (NewOp.getNode() && X86::isMOVLMask(NewOp.getOperand(2).getNode())) return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), - DAG, Subtarget); + DAG, Subtarget, dl); } } @@ -4035,18 +4289,18 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue ShVal; bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { - // If the shifted value has multiple uses, it may be cheaper to use + // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. MVT EVT = VT.getVectorElementType(); ShAmt *= EVT.getSizeInBits(); - return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this); + return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } if (X86::isMOVLMask(PermMask.getNode())) { if (V1IsUndef) return V2; if (ISD::isBuildVectorAllZeros(V1.getNode())) - return getVZextMovL(VT, VT, V2, DAG, Subtarget); + return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); if (!isMMX) return Op; } @@ -4066,7 +4320,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // No better options. Use a vshl / vsrl. MVT EVT = VT.getVectorElementType(); ShAmt *= EVT.getSizeInBits(); - return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this); + return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } bool Commuted = false; @@ -4074,7 +4328,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // 1,1,1,1 -> v8i16 though. V1IsSplat = isSplatVector(V1.getNode()); V2IsSplat = isSplatVector(V2.getNode()); - + // Canonicalize the splat or undef, if present, to be on the RHS. if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG); @@ -4091,9 +4345,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // V2 is a splat, so the mask may be malformed. That is, it may point // to any V2 element. The instruction selectior won't like this. Get // a corrected mask and commute to form a proper MOVS{S|D}. - SDValue NewMask = getMOVLMask(NumElems, DAG); + SDValue NewMask = getMOVLMask(NumElems, DAG, dl); if (NewMask.getNode() != PermMask.getNode()) - Op = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask); + Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask); } return Op; } @@ -4110,12 +4364,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // new vector_shuffle with the corrected mask. 
SDValue NewMask = NormalizeMask(PermMask, DAG); if (NewMask.getNode() != PermMask.getNode()) { - if (X86::isUNPCKLMask(PermMask.getNode(), true)) { - SDValue NewMask = getUnpacklMask(NumElems, DAG); - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask); - } else if (X86::isUNPCKHMask(PermMask.getNode(), true)) { - SDValue NewMask = getUnpackhMask(NumElems, DAG); - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask); + if (X86::isUNPCKLMask(NewMask.getNode(), true)) { + SDValue NewMask = getUnpacklMask(NumElems, DAG, dl); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask); + } else if (X86::isUNPCKHMask(NewMask.getNode(), true)) { + SDValue NewMask = getUnpackhMask(NumElems, DAG, dl); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask); } } } @@ -4134,13 +4388,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return Op; } + // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. // Try PSHUF* first, then SHUFP*. // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented. if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.getNode())) { if (V2.getOpcode() != ISD::UNDEF) - return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, - DAG.getNode(ISD::UNDEF, VT), PermMask); + return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, + DAG.getUNDEF(VT), PermMask); return Op; } @@ -4152,14 +4407,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { MVT RVT = VT; if (VT == MVT::v4f32) { RVT = MVT::v4i32; - Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, - DAG.getNode(ISD::BIT_CONVERT, RVT, V1), - DAG.getNode(ISD::UNDEF, RVT), PermMask); + Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, RVT, + DAG.getNode(ISD::BIT_CONVERT, dl, RVT, V1), + DAG.getUNDEF(RVT), PermMask); } else if (V2.getOpcode() != ISD::UNDEF) - Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, V1, - DAG.getNode(ISD::UNDEF, RVT), PermMask); + Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, RVT, V1, + DAG.getUNDEF(RVT), PermMask); if (RVT != VT) - Op = DAG.getNode(ISD::BIT_CONVERT, VT, Op); + Op = DAG.getNode(ISD::BIT_CONVERT, dl, VT, Op); return Op; } @@ -4171,14 +4426,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // Handle v8i16 specifically since SSE can do byte extraction and insertion. if (VT == MVT::v8i16) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this); + SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this, dl); if (NewOp.getNode()) return NewOp; } + if (VT == MVT::v16i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(V1, V2, PermMask, DAG, *this, dl); + if (NewOp.getNode()) + return NewOp; + } + // Handle all 4 wide cases with a number of shuffles except for MMX. 
if (NumElems == 4 && !isMMX) - return LowerVECTOR_SHUFFLE_4wide(V1, V2, PermMask, VT, DAG); + return LowerVECTOR_SHUFFLE_4wide(V1, V2, PermMask, VT, DAG, dl); return SDValue(); } @@ -4187,18 +4448,28 @@ SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); if (VT.getSizeInBits() == 8) { - SDValue Extract = DAG.getNode(X86ISD::PEXTRB, MVT::i32, + SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); - SDValue Assert = DAG.getNode(ISD::AssertZext, MVT::i32, Extract, + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); - return DAG.getNode(ISD::TRUNCATE, VT, Assert); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } else if (VT.getSizeInBits() == 16) { - SDValue Extract = DAG.getNode(X86ISD::PEXTRW, MVT::i32, + unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); + // If Idx is 0, it's cheaper to do a move instead of a pextrw. + if (Idx == 0) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getNode(ISD::BIT_CONVERT, dl, + MVT::v4i32, + Op.getOperand(0)), + Op.getOperand(1))); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); - SDValue Assert = DAG.getNode(ISD::AssertZext, MVT::i32, Extract, + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); - return DAG.getNode(ISD::TRUNCATE, VT, Assert); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } else if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the @@ -4214,10 +4485,15 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, (User->getOpcode() != ISD::BIT_CONVERT || User->getValueType(0) != MVT::i32)) return SDValue(); - SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Op.getOperand(0)), - Op.getOperand(1)); - return DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Extract); + SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, + Op.getOperand(0)), + Op.getOperand(1)); + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); + } else if (VT == MVT::i32) { + // ExtractPS works with constant index. + if (isa(Op.getOperand(1))) + return Op; } return SDValue(); } @@ -4235,22 +4511,24 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { } MVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); if (Idx == 0) - return DAG.getNode(ISD::TRUNCATE, MVT::i16, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Vec), + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getNode(ISD::BIT_CONVERT, dl, + MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. 
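On the Idx == 0 shortcut above: lane 0 of a v8i16 can be read with a plain MOVD plus a truncation, whereas any other lane needs a PEXTRW. A small intrinsics sketch (my own illustration, not from the sources):

#include <emmintrin.h>

// Lane 0: MOVD the low 32 bits out and truncate -- cheaper than PEXTRW.
static unsigned short extract_lane0(__m128i v) {
  return (unsigned short)_mm_cvtsi128_si32(v);
}
// Any other lane goes through PEXTRW with an immediate index.
static unsigned short extract_lane3(__m128i v) {
  return (unsigned short)_mm_extract_epi16(v, 3);
}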
MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1); - SDValue Extract = DAG.getNode(X86ISD::PEXTRW, EVT, + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT, Op.getOperand(0), Op.getOperand(1)); - SDValue Assert = DAG.getNode(ISD::AssertZext, EVT, Extract, + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract, DAG.getValueType(VT)); - return DAG.getNode(ISD::TRUNCATE, VT, Assert); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } else if (VT.getSizeInBits() == 32) { unsigned Idx = cast(Op.getOperand(1))->getZExtValue(); if (Idx == 0) @@ -4261,17 +4539,17 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { IdxVec. push_back(DAG.getConstant(Idx, MaskVT.getVectorElementType())); IdxVec. - push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType())); + push_back(DAG.getUNDEF(MaskVT.getVectorElementType())); IdxVec. - push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType())); + push_back(DAG.getUNDEF(MaskVT.getVectorElementType())); IdxVec. - push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType())); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + push_back(DAG.getUNDEF(MaskVT.getVectorElementType())); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &IdxVec[0], IdxVec.size()); SDValue Vec = Op.getOperand(0); - Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(), - Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec, + Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, Vec.getValueType(), + Vec, DAG.getUNDEF(Vec.getValueType()), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0)); } else if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b @@ -4288,13 +4566,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SmallVector IdxVec; IdxVec.push_back(DAG.getConstant(1, MaskVT.getVectorElementType())); IdxVec. - push_back(DAG.getNode(ISD::UNDEF, MaskVT.getVectorElementType())); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + push_back(DAG.getUNDEF(MaskVT.getVectorElementType())); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &IdxVec[0], IdxVec.size()); SDValue Vec = Op.getOperand(0); - Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(), - Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec, + Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, Vec.getValueType(), + Vec, DAG.getUNDEF(Vec.getValueType()), + Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0)); } @@ -4305,6 +4584,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ MVT VT = Op.getValueType(); MVT EVT = VT.getVectorElementType(); + DebugLoc dl = Op.getDebugLoc(); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); @@ -4313,25 +4593,29 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) && isa(N2)) { unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB - : X86ISD::PINSRW; + : X86ISD::PINSRW; // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. 
if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1); + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(cast(N2)->getZExtValue()); - return DAG.getNode(Opc, VT, N0, N1, N2); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); } else if (EVT == MVT::f32 && isa(N2)) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into these // bits. For example (insert (extract, 3), 2) could be matched by putting // the '3' into bits [7:6] of X86ISD::INSERTPS. - // Bits [5:4] of the constant are the destination select. This is the + // Bits [5:4] of the constant are the destination select. This is the // value of the incoming immediate. - // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. N2 = DAG.getIntPtrConstant(cast(N2)->getZExtValue() << 4); - return DAG.getNode(X86ISD::INSERTPS, VT, N0, N1, N2); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + } else if (EVT == MVT::i32) { + // InsertPS works with constant index. + if (isa(N2)) + return Op; } return SDValue(); } @@ -4347,6 +4631,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { if (EVT == MVT::i8) return SDValue(); + DebugLoc dl = Op.getDebugLoc(); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); @@ -4355,23 +4640,24 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { // Transform it so it match pinsrw which expects a 16-bit value in a GR32 // as its second argument. 
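Referring back to the INSERTPS immediate layout spelled out in the comment above (source select in bits [7:6], destination select in [5:4], zero mask in [3:0]), a tiny illustrative packer -- the helper name is mine, not LLVM's -- shows why the lowering shifts N2 left by 4:

// Illustrative only: assemble the 8-bit INSERTPS immediate from its fields.
static unsigned char insertps_imm(unsigned SrcSel, unsigned DstSel,
                                  unsigned ZeroMask) {
  return (unsigned char)(((SrcSel  & 0x3) << 6) |  // bits [7:6]: source element select
                         ((DstSel  & 0x3) << 4) |  // bits [5:4]: destination element select
                         ( ZeroMask & 0xF));       // bits [3:0]: lanes forced to 0.0f
}
// The lowering above only supplies DstSel (the N2 << 4); the DAG combiner may
// later fold an extract index into [7:6] or a zeroing AND into [3:0].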
if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1); + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(cast(N2)->getZExtValue()); - return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2); + return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); } return SDValue(); } SDValue X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); if (Op.getValueType() == MVT::v2f32) - return DAG.getNode(ISD::BIT_CONVERT, MVT::v2f32, - DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::i32, + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op.getOperand(0)))); - SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0)); + SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); MVT VT = MVT::v2i32; switch (Op.getValueType().getSimpleVT()) { default: break; @@ -4380,8 +4666,8 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { VT = MVT::v4i32; break; } - return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), - DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, AnyExt)); + return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as @@ -4393,15 +4679,18 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { ConstantPoolSDNode *CP = cast(Op); - SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), - getPointerTy(), - CP->getAlignment()); - Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result); + // FIXME there isn't really any debug info here, should come from the parent + DebugLoc dl = CP->getDebugLoc(); + SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), + CP->getAlignment()); + Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && !Subtarget->isPICStyleRIPRel()) { - Result = DAG.getNode(ISD::ADD, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc::getUnknownLoc(), + getPointerTy()), Result); } @@ -4409,7 +4698,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { } SDValue -X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, +X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, int64_t Offset, SelectionDAG &DAG) const { bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_; @@ -4424,28 +4713,28 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, Offset = 0; } else Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0); - Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result); + Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. 
if (IsPic && !Subtarget->isPICStyleRIPRel()) { - Result = DAG.getNode(ISD::ADD, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), Result); } - + // For Darwin & Mingw32, external and weak symbols are indirect, so we want to // load the value at address GV, not the value of GV itself. This means that // the GlobalAddress must be in the base or index register of the address, not // the GV offset field. Platform check is inside GVRequiresExtraLoad() call // The same applies for external symbols during PIC codegen if (ExtraLoadRequired) - Result = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), Result, + Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, PseudoSourceValue::getGOT(), 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. if (Offset != 0) - Result = DAG.getNode(ISD::ADD, getPointerTy(), Result, + Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, DAG.getConstant(Offset, getPointerTy())); return Result; @@ -4455,7 +4744,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { const GlobalValue *GV = cast(Op)->getGlobal(); int64_t Offset = cast(Op)->getOffset(); - return LowerGlobalAddress(GV, Offset, DAG); + return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit @@ -4463,8 +4752,10 @@ static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const MVT PtrVT) { SDValue InFlag; - SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), X86::EBX, + DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc::getUnknownLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); @@ -4474,13 +4765,13 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, GA->getValueType(0), GA->getOffset()); SDValue Ops[] = { Chain, TGA, InFlag }; - SDValue Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 3); + SDValue Result = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); InFlag = Result.getValue(2); Chain = Result.getValue(1); // call ___tls_get_addr. This function receives its argument in // the register EAX. - Chain = DAG.getCopyToReg(Chain, X86::EAX, Result, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Result, InFlag); InFlag = Chain.getValue(1); NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); @@ -4490,10 +4781,10 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, DAG.getRegister(X86::EAX, PtrVT), DAG.getRegister(X86::EBX, PtrVT), InFlag }; - Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 5); + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops1, 5); InFlag = Chain.getValue(1); - return DAG.getCopyFromReg(Chain, X86::EAX, PtrVT, InFlag); + return DAG.getCopyFromReg(Chain, dl, X86::EAX, PtrVT, InFlag); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit @@ -4501,6 +4792,7 @@ static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const MVT PtrVT) { SDValue InFlag, Chain; + DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better // emit leaq symbol@TLSGD(%rip), %rdi SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag); @@ -4508,13 +4800,13 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, GA->getValueType(0), GA->getOffset()); SDValue Ops[] = { DAG.getEntryNode(), TGA}; - SDValue Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 2); + SDValue Result = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); Chain = Result.getValue(1); InFlag = Result.getValue(2); // call __tls_get_addr. This function receives its argument in // the register RDI. - Chain = DAG.getCopyToReg(Chain, X86::RDI, Result, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, X86::RDI, Result, InFlag); InFlag = Chain.getValue(1); NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); @@ -4523,32 +4815,34 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, PtrVT), DAG.getRegister(X86::RDI, PtrVT), InFlag }; - Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 4); + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops1, 4); InFlag = Chain.getValue(1); - return DAG.getCopyFromReg(Chain, X86::RAX, PtrVT, InFlag); + return DAG.getCopyFromReg(Chain, dl, X86::RAX, PtrVT, InFlag); } // Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or // "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, - const MVT PtrVT) { + const MVT PtrVT, TLSModel::Model model) { + DebugLoc dl = GA->getDebugLoc(); // Get the Thread Pointer - SDValue ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, PtrVT); + SDValue ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, + DebugLoc::getUnknownLoc(), PtrVT); // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial // exec) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), GA->getOffset()); - SDValue Offset = DAG.getNode(X86ISD::Wrapper, PtrVT, TGA); + SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); - if (GA->getGlobal()->isDeclaration()) // initial exec TLS model - Offset = DAG.getLoad(PtrVT, DAG.getEntryNode(), Offset, + if (model == TLSModel::InitialExec) + Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, PseudoSourceValue::getGOT(), 0); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. 
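The exec-model handling above boils down to a single addition: in local-exec the offset is a link-time constant (x@ntpoff), in initial-exec it is first loaded from the GOT entry x@indntpoff, and either way the variable's address is the thread pointer plus that offset. A rough model, assuming IA-32 ELF TLS:

// Hedged sketch of the final address computation (the ISD::ADD just below).
static char *tls_exec_address(char *thread_pointer, long offset) {
  return thread_pointer + offset;  // offset is a constant (local exec) or GOT-loaded (initial exec)
}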
- return DAG.getNode(ISD::ADD, PtrVT, ThreadPointer, Offset); + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue @@ -4558,28 +4852,48 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { assert(Subtarget->isTargetELF() && "TLS not implemented for non-ELF targets"); GlobalAddressSDNode *GA = cast(Op); - // If the relocation model is PIC, use the "General Dynamic" TLS Model, - // otherwise use the "Local Exec"TLS Model + GlobalValue *GV = GA->getGlobal(); + TLSModel::Model model = + getTLSModel (GV, getTargetMachine().getRelocationModel()); if (Subtarget->is64Bit()) { - return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); + switch (model) { + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: // not implemented + case TLSModel::InitialExec: // not implemented + case TLSModel::LocalExec: // not implemented + return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); + default: + assert (0 && "Unknown TLS model"); + } } else { - if (getTargetMachine().getRelocationModel() == Reloc::PIC_) + switch (model) { + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: // not implemented return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); - else - return LowerToTLSExecModel(GA, DAG, getPointerTy()); + + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModel(GA, DAG, getPointerTy(), model); + default: + assert (0 && "Unknown TLS model"); + } } } SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { + // FIXME there isn't really any debug info here + DebugLoc dl = Op.getDebugLoc(); const char *Sym = cast(Op)->getSymbol(); SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy()); - Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result); + Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && !Subtarget->isPICStyleRIPRel()) { - Result = DAG.getNode(ISD::ADD, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc::getUnknownLoc(), + getPointerTy()), Result); } @@ -4588,13 +4902,17 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { JumpTableSDNode *JT = cast(Op); + // FIXME there isn't really any debug into here + DebugLoc dl = JT->getDebugLoc(); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()); - Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result); + Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && !Subtarget->isPICStyleRIPRel()) { - Result = DAG.getNode(ISD::ADD, getPointerTy(), - DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc::getUnknownLoc(), + getPointerTy()), Result); } @@ -4602,31 +4920,33 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { } /// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and -/// take a 2 x i32 value to shift plus a shift amount. +/// take a 2 x i32 value to shift plus a shift amount. 
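Before the DAG construction that follows, here is a scalar model of the SHL_PARTS case (a sketch of mine, assuming 32-bit parts): the SHLD result is used while the masked count stays below 32; once bit 5 of the count is set, the shifted low word simply becomes the new high word. The SRL/SRA_PARTS cases mirror this with SHRD.

#include <cstdint>

static uint64_t shl_parts(uint32_t lo, uint32_t hi, unsigned amt) {
  unsigned k    = amt & 31;                              // x86 masks 32-bit shift counts to 5 bits
  uint32_t shld = (hi << k) | (k ? lo >> (32 - k) : 0);  // SHLD hi, lo, k
  uint32_t shl  = lo << k;                               // SHL  lo, k
  uint32_t Hi, Lo;
  if (amt & 32) { Hi = shl;  Lo = 0;   }                 // count >= 32: low word feeds the high word
  else          { Hi = shld; Lo = shl; }
  return ((uint64_t)Hi << 32) | Lo;
}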
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue Tmp1 = isSRA ? - DAG.getNode(ISD::SRA, VT, ShOpHi, DAG.getConstant(VTBits - 1, MVT::i8)) : + DAG.getNode(ISD::SRA, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, MVT::i8)) : DAG.getConstant(0, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { - Tmp2 = DAG.getNode(X86ISD::SHLD, VT, ShOpHi, ShOpLo, ShAmt); - Tmp3 = DAG.getNode(ISD::SHL, VT, ShOpLo, ShAmt); + Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); + Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); } else { - Tmp2 = DAG.getNode(X86ISD::SHRD, VT, ShOpLo, ShOpHi, ShAmt); - Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, VT, ShOpHi, ShAmt); + Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); + Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); } - SDValue AndNode = DAG.getNode(ISD::AND, MVT::i8, ShAmt, + SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, MVT::i8)); - SDValue Cond = DAG.getNode(X86ISD::CMP, VT, + SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, AndNode, DAG.getConstant(0, MVT::i8)); SDValue Hi, Lo; @@ -4635,36 +4955,37 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { - Hi = DAG.getNode(X86ISD::CMOV, VT, Ops0, 4); - Lo = DAG.getNode(X86ISD::CMOV, VT, Ops1, 4); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); } else { - Lo = DAG.getNode(X86ISD::CMOV, VT, Ops0, 4); - Hi = DAG.getNode(X86ISD::CMOV, VT, Ops1, 4); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); } SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2); + return DAG.getMergeValues(Ops, 2, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getValueType(); assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); - + // These are really Legal; caller falls through into that case. if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) return SDValue(); - if (SrcVT == MVT::i64 && Op.getValueType() != MVT::f80 && + if (SrcVT == MVT::i64 && Op.getValueType() != MVT::f80 && Subtarget->is64Bit()) return SDValue(); - + + DebugLoc dl = Op.getDebugLoc(); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - SDValue Chain = DAG.getStore(DAG.getEntryNode(), Op.getOperand(0), - StackSlot, - PseudoSourceValue::getFixedStack(SSFI), 0); + SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, + PseudoSourceValue::getFixedStack(SSFI), 0); // Build the FILD SDVTList Tys; @@ -4677,7 +4998,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { Ops.push_back(Chain); Ops.push_back(StackSlot); Ops.push_back(DAG.getValueType(SrcVT)); - SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, + SDValue Result = DAG.getNode(useSSE ? 
X86ISD::FILD_FLAG : X86ISD::FILD, dl, Tys, &Ops[0], Ops.size()); if (useSSE) { @@ -4697,119 +5018,189 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { Ops.push_back(StackSlot); Ops.push_back(DAG.getValueType(Op.getValueType())); Ops.push_back(InFlag); - Chain = DAG.getNode(X86ISD::FST, Tys, &Ops[0], Ops.size()); - Result = DAG.getLoad(Op.getValueType(), Chain, StackSlot, + Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); + Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, PseudoSourceValue::getFixedStack(SSFI), 0); } return Result; } -SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { - MVT SrcVT = Op.getOperand(0).getValueType(); - assert(SrcVT.getSimpleVT() == MVT::i64 && "Unknown UINT_TO_FP to lower!"); - - // We only handle SSE2 f64 target here; caller can handle the rest. - if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) - return SDValue(); - - // This algorithm is not obvious. Here it is in C code, more or less: -/* - double uint64_to_double( uint32_t hi, uint32_t lo ) - { - static const __m128i exp = { 0x4330000045300000ULL, 0 }; - static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; - - // copy ints to xmm registers - __m128i xh = _mm_cvtsi32_si128( hi ); - __m128i xl = _mm_cvtsi32_si128( lo ); - - // combine into low half of a single xmm register - __m128i x = _mm_unpacklo_epi32( xh, xl ); - __m128d d; - double sd; - - // merge in appropriate exponents to give the integer bits the - // right magnitude - x = _mm_unpacklo_epi32( x, exp ); - - // subtract away the biases to deal with the IEEE-754 double precision - // implicit 1 - d = _mm_sub_pd( (__m128d) x, bias ); - - // All conversions up to here are exact. The correctly rounded result is - // calculated using the - // current rounding mode using the following horizontal add. - d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); - _mm_store_sd( &sd, d ); //since we are returning doubles in XMM, this - // store doesn't really need to be here (except maybe to zero the other - // double) - return sd; - } -*/ +// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. +SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { + // This algorithm is not obvious. Here it is in C code, more or less: + /* + double uint64_to_double( uint32_t hi, uint32_t lo ) { + static const __m128i exp = { 0x4330000045300000ULL, 0 }; + static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; + + // Copy ints to xmm registers. + __m128i xh = _mm_cvtsi32_si128( hi ); + __m128i xl = _mm_cvtsi32_si128( lo ); + + // Combine into low half of a single xmm register. + __m128i x = _mm_unpacklo_epi32( xh, xl ); + __m128d d; + double sd; + + // Merge in appropriate exponents to give the integer bits the right + // magnitude. + x = _mm_unpacklo_epi32( x, exp ); + + // Subtract away the biases to deal with the IEEE-754 double precision + // implicit 1. + d = _mm_sub_pd( (__m128d) x, bias ); + + // All conversions up to here are exact. The correctly rounded result is + // calculated using the current rounding mode using the following + // horizontal add. + d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); + _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this + // store doesn't really need to be here (except + // maybe to zero the other double) + return sd; + } + */ + + DebugLoc dl = Op.getDebugLoc(); // Build some magic constants. 
- std::vectorCV0; + std::vector CV0; CV0.push_back(ConstantInt::get(APInt(32, 0x45300000))); CV0.push_back(ConstantInt::get(APInt(32, 0x43300000))); CV0.push_back(ConstantInt::get(APInt(32, 0))); CV0.push_back(ConstantInt::get(APInt(32, 0))); Constant *C0 = ConstantVector::get(CV0); - SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 4); + SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); - std::vectorCV1; + std::vector CV1; CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL)))); CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); - SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 4); + SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); SmallVector MaskVec; MaskVec.push_back(DAG.getConstant(0, MVT::i32)); MaskVec.push_back(DAG.getConstant(4, MVT::i32)); MaskVec.push_back(DAG.getConstant(1, MVT::i32)); MaskVec.push_back(DAG.getConstant(5, MVT::i32)); - SDValue UnpcklMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, &MaskVec[0], - MaskVec.size()); + SDValue UnpcklMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + &MaskVec[0], MaskVec.size()); SmallVector MaskVec2; MaskVec2.push_back(DAG.getConstant(1, MVT::i32)); MaskVec2.push_back(DAG.getConstant(0, MVT::i32)); - SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, &MaskVec2[0], - MaskVec2.size()); + SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, + &MaskVec2[0], MaskVec2.size()); - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, + SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, + DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(0), DAG.getIntPtrConstant(1))); - SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, + SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, + DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(0), DAG.getIntPtrConstant(0))); - SDValue Unpck1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, + SDValue Unpck1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v4i32, XR1, XR2, UnpcklMask); - SDValue CLod0 = DAG.getLoad(MVT::v4i32, DAG.getEntryNode(), CPIdx0, - PseudoSourceValue::getConstantPool(), 0, false, 16); - SDValue Unpck2 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, - Unpck1, CLod0, UnpcklMask); - SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64, Unpck2); - SDValue CLod1 = DAG.getLoad(MVT::v2f64, CLod0.getValue(1), CPIdx1, - PseudoSourceValue::getConstantPool(), 0, false, 16); - SDValue Sub = DAG.getNode(ISD::FSUB, MVT::v2f64, XR2F, CLod1); + SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + PseudoSourceValue::getConstantPool(), 0, + false, 16); + SDValue Unpck2 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v4i32, + Unpck1, CLod0, UnpcklMask); + SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); + SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + PseudoSourceValue::getConstantPool(), 0, + false, 16); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + // Add the halves; easiest way is to swap them into another reg first. 
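A quick standalone check of the two bias constants built above (not part of the lowering itself): reinterpreted as IEEE-754 doubles, the bit patterns are exactly 2^84 and 2^52, matching the 0x1.0p84 / 0x1.0p52 values in the C sketch earlier.

#include <cstdint>
#include <cstdio>
#include <cstring>

static double bits_to_double(uint64_t bits) {
  double d;
  std::memcpy(&d, &bits, sizeof d);
  return d;
}

int main() {
  std::printf("%a\n", bits_to_double(0x4530000000000000ULL));  // prints 0x1p+84
  std::printf("%a\n", bits_to_double(0x4330000000000000ULL));  // prints 0x1p+52
}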
- SDValue Shuf = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2f64, + SDValue Shuf = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2f64, Sub, Sub, ShufMask); - SDValue Add = DAG.getNode(ISD::FADD, MVT::v2f64, Shuf, Sub); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::f64, Add, + SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, DAG.getIntPtrConstant(0)); } +// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. +SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + // FP constant to bias correct the final result. + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), + MVT::f64); + + // Load the 32-bit value into an XMM register. + SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, + DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Op.getOperand(0), + DAG.getIntPtrConstant(0))); + + Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), + DAG.getIntPtrConstant(0)); + + // Or the load with the bias. + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, Load)), + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, Bias))); + Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), + DAG.getIntPtrConstant(0)); + + // Subtract the bias. + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); + + // Handle final rounding. + MVT DestVT = Op.getValueType(); + + if (DestVT.bitsLT(MVT::f64)) { + return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, + DAG.getIntPtrConstant(0)); + } else if (DestVT.bitsGT(MVT::f64)) { + return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); + } + + // Handle final rounding. + return Sub; +} + +SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + + // Now not UINT_TO_FP is legal (it's marked custom), dag combiner won't + // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform + // the optimization here. + if (DAG.SignBitIsZero(N0)) + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); + + MVT SrcVT = N0.getValueType(); + if (SrcVT == MVT::i64) { + // We only handle SSE2 f64 target here; caller can handle the rest. + if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) + return SDValue(); + + return LowerUINT_TO_FP_i64(Op, DAG); + } else if (SrcVT == MVT::i32) { + return LowerUINT_TO_FP_i32(Op, DAG); + } + + assert(0 && "Unknown UINT_TO_FP to lower!"); + return SDValue(); +} + std::pair X86TargetLowering:: FP_TO_SINTHelper(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); assert(Op.getValueType().getSimpleVT() <= MVT::i64 && Op.getValueType().getSimpleVT() >= MVT::i16 && "Unknown FP_TO_SINT to lower!"); // These are really Legal. 
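Back to LowerUINT_TO_FP_i32 above: the OR-with-bias / subtract-bias pair is the classic 2^52 splice. A scalar model of the idea (my own sketch, not the DAG code):

#include <cstdint>
#include <cstring>

// Splice the 32-bit value into the mantissa of 2^52, then subtract 2^52.
// The subtraction is exact, so the result is exactly (double)x; the final
// FP_ROUND/FP_EXTEND above only adjusts the destination type.
static double u32_to_double(uint32_t x) {
  const double two52 = 4503599627370496.0;      // 2^52, the same bits as the Bias constant
  uint64_t bits = 0x4330000000000000ULL | x;    // the ISD::OR of the loaded value and Bias
  double d;
  std::memcpy(&d, &bits, sizeof d);
  return d - two52;                              // the ISD::FSUB
}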
- if (Op.getValueType() == MVT::i32 && + if (Op.getValueType() == MVT::i32 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); if (Subtarget->is64Bit() && @@ -4835,13 +5226,13 @@ FP_TO_SINTHelper(SDValue Op, SelectionDAG &DAG) { SDValue Value = Op.getOperand(0); if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!"); - Chain = DAG.getStore(Chain, Value, StackSlot, + Chain = DAG.getStore(Chain, dl, Value, StackSlot, PseudoSourceValue::getFixedStack(SSFI), 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) }; - Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3); + Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); @@ -4849,7 +5240,7 @@ FP_TO_SINTHelper(SDValue Op, SelectionDAG &DAG) { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getNode(Opc, MVT::Other, Ops, 3); + SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); return std::make_pair(FIST, StackSlot); } @@ -4858,28 +5249,14 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { std::pair Vals = FP_TO_SINTHelper(Op, DAG); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode() == 0) return SDValue(); - - // Load the result. - return DAG.getLoad(Op.getValueType(), FIST, StackSlot, NULL, 0); -} - -SDNode *X86TargetLowering::ExpandFP_TO_SINT(SDNode *N, SelectionDAG &DAG) { - std::pair Vals = FP_TO_SINTHelper(SDValue(N, 0), DAG); - SDValue FIST = Vals.first, StackSlot = Vals.second; - if (FIST.getNode() == 0) return 0; - - MVT VT = N->getValueType(0); - - // Return a load from the stack slot. - SDValue Res = DAG.getLoad(VT, FIST, StackSlot, NULL, 0); - // Use MERGE_VALUES to drop the chain result value and get a node with one - // result. This requires turning off getMergeValues simplification, since - // otherwise it will give us Res back. - return DAG.getMergeValues(&Res, 1, false).getNode(); + // Load the result. 
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, NULL, 0); } SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); MVT VT = Op.getValueType(); MVT EltVT = VT; if (VT.isVector()) @@ -4897,14 +5274,15 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { CV.push_back(C); } Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4); - SDValue Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, PseudoSourceValue::getConstantPool(), 0, false, 16); - return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask); + return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); MVT VT = Op.getValueType(); MVT EltVT = VT; unsigned EltNum = 1; @@ -4925,34 +5303,36 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { CV.push_back(C); } Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4); - SDValue Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, PseudoSourceValue::getConstantPool(), 0, false, 16); if (VT.isVector()) { - return DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(ISD::XOR, MVT::v2i64, - DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Op.getOperand(0)), - DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Mask))); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + DAG.getNode(ISD::XOR, dl, MVT::v2i64, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + Op.getOperand(0)), + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); } else { - return DAG.getNode(X86ISD::FXOR, VT, Op.getOperand(0), Mask); + return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } } SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); MVT VT = Op.getValueType(); MVT SrcVT = Op1.getValueType(); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { - Op1 = DAG.getNode(ISD::FP_EXTEND, VT, Op1); + Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); SrcVT = VT; } // And if it is bigger, shrink it first. if (SrcVT.bitsGT(VT)) { - Op1 = DAG.getNode(ISD::FP_ROUND, VT, Op1, DAG.getIntPtrConstant(1)); + Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); SrcVT = VT; } @@ -4971,20 +5351,20 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); } Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 4); - SDValue Mask1 = DAG.getLoad(SrcVT, DAG.getEntryNode(), CPIdx, + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, PseudoSourceValue::getConstantPool(), 0, false, 16); - SDValue SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1); + SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); // Shift sign bit right or left if the two operands have different types. if (SrcVT.bitsGT(VT)) { // Op0 is MVT::f32, Op1 is MVT::f64. 
- SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2f64, SignBit); - SignBit = DAG.getNode(X86ISD::FSRL, MVT::v2f64, SignBit, + SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); + SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, DAG.getConstant(32, MVT::i32)); - SignBit = DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32, SignBit); - SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::f32, SignBit, + SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); + SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, DAG.getIntPtrConstant(0)); } @@ -5000,34 +5380,189 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); } C = ConstantVector::get(CV); - CPIdx = DAG.getConstantPool(C, getPointerTy(), 4); - SDValue Mask2 = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, + CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, PseudoSourceValue::getConstantPool(), 0, false, 16); - SDValue Val = DAG.getNode(X86ISD::FAND, VT, Op0, Mask2); + SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); // Or the value with the sign bit. - return DAG.getNode(X86ISD::FOR, VT, Val, SignBit); + return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); +} + +/// Emit nodes that will be selected as "test Op0,Op0", or something +/// equivalent. +SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, + SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + + // CF and OF aren't always set the way we want. Determine which + // of these we need. + bool NeedCF = false; + bool NeedOF = false; + switch (X86CC) { + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + NeedCF = true; + break; + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: + NeedOF = true; + break; + default: break; + } + + // See if we can use the EFLAGS value from the operand instead of + // doing a separate TEST. TEST always sets OF and CF to 0, so unless + // we prove that the arithmetic won't overflow, we can't use OF or CF. + if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { + unsigned Opcode = 0; + unsigned NumOperands = 0; + switch (Op.getNode()->getOpcode()) { + case ISD::ADD: + // Due to an isel shortcoming, be conservative if this add is likely to + // be selected as part of a load-modify-store instruction. When the root + // node in a match is a store, isel doesn't know how to remap non-chain + // non-flag uses of other nodes in the match, such as the ADD in this + // case. This leads to the ADD being left around and reselected, with + // the result being two adds in the output. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() == ISD::STORE) + goto default_case; + if (ConstantSDNode *C = + dyn_cast(Op.getNode()->getOperand(1))) { + // An add of one will be selected as an INC. + if (C->getAPIntValue() == 1) { + Opcode = X86ISD::INC; + NumOperands = 1; + break; + } + // An add of negative one (subtract of one) will be selected as a DEC. + if (C->getAPIntValue().isAllOnesValue()) { + Opcode = X86ISD::DEC; + NumOperands = 1; + break; + } + } + // Otherwise use a regular EFLAGS-setting add. 
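Stepping back to LowerFCOPYSIGN above, before the EmitTest helper continues below: the FAND/FAND/FOR sequence is an ordinary bit-mask copysign. A scalar model for the same-type f64 case (the mixed f32/f64 paths above additionally shift the sign bit into place):

#include <cstdint>
#include <cstring>

static double copysign_f64(double mag, double sgn) {
  uint64_t um, us;
  std::memcpy(&um, &mag, 8);
  std::memcpy(&us, &sgn, 8);
  uint64_t sign = us & 0x8000000000000000ULL;   // FAND with the sign-bit mask (Mask1)
  uint64_t val  = um & 0x7FFFFFFFFFFFFFFFULL;   // FAND with the magnitude mask (Mask2)
  uint64_t bits = val | sign;                   // FOR
  double r;
  std::memcpy(&r, &bits, 8);
  return r;
}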
+ Opcode = X86ISD::ADD; + NumOperands = 2; + break; + case ISD::SUB: + // Due to the ISEL shortcoming noted above, be conservative if this sub is + // likely to be selected as part of a load-modify-store instruction. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() == ISD::STORE) + goto default_case; + // Otherwise use a regular EFLAGS-setting sub. + Opcode = X86ISD::SUB; + NumOperands = 2; + break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::INC: + case X86ISD::DEC: + return SDValue(Op.getNode(), 1); + default: + default_case: + break; + } + if (Opcode != 0) { + const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(), MVT::i32); + SmallVector Ops; + for (unsigned i = 0; i != NumOperands; ++i) + Ops.push_back(Op.getOperand(i)); + SDValue New = DAG.getNode(Opcode, dl, VTs, 2, &Ops[0], NumOperands); + DAG.ReplaceAllUsesWith(Op, New); + return SDValue(New.getNode(), 1); + } + } + + // Otherwise just emit a CMP with 0, which is the TEST pattern. + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, Op.getValueType())); +} + +/// Emit nodes that will be selected as "cmp Op0,Op1", or something +/// equivalent. +SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + SelectionDAG &DAG) { + if (ConstantSDNode *C = dyn_cast(Op1)) + if (C->getAPIntValue() == 0) + return EmitTest(Op0, X86CC, DAG); + + DebugLoc dl = Op0.getDebugLoc(); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); - SDValue Cond; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - SDValue CC = Op.getOperand(2); - bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); - unsigned X86CC; + DebugLoc dl = Op.getDebugLoc(); + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + + // Lower (X & (1 << N)) == 0 to BT(X, N). + // Lower ((X >>u N) & 1) != 0 to BT(X, N). + // Lower ((X >>s N) & 1) != 0 to BT(X, N). + if (Op0.getOpcode() == ISD::AND && + Op0.hasOneUse() && + Op1.getOpcode() == ISD::Constant && + cast(Op1)->getZExtValue() == 0 && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + SDValue LHS, RHS; + if (Op0.getOperand(1).getOpcode() == ISD::SHL) { + if (ConstantSDNode *Op010C = + dyn_cast(Op0.getOperand(1).getOperand(0))) + if (Op010C->getZExtValue() == 1) { + LHS = Op0.getOperand(0); + RHS = Op0.getOperand(1).getOperand(1); + } + } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { + if (ConstantSDNode *Op000C = + dyn_cast(Op0.getOperand(0).getOperand(0))) + if (Op000C->getZExtValue() == 1) { + LHS = Op0.getOperand(1); + RHS = Op0.getOperand(0).getOperand(1); + } + } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { + ConstantSDNode *AndRHS = cast(Op0.getOperand(1)); + SDValue AndLHS = Op0.getOperand(0); + if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { + LHS = AndLHS.getOperand(0); + RHS = AndLHS.getOperand(1); + } + } - if (translateX86CC(cast(CC)->get(), isFP, X86CC, - Op0, Op1, DAG)) { - Cond = DAG.getNode(X86ISD::CMP, MVT::i32, Op0, Op1); - return DAG.getNode(X86ISD::SETCC, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), Cond); + if (LHS.getNode()) { + // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT + // instruction. Since the shift amount is in-range-or-undefined, we know + // that doing a bittest on the i16 value is ok. 
We extend to i32 because + // the encoding for the i16 version is larger than the i32 version. + if (LHS.getValueType() == MVT::i8) + LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); + + // If the operand types disagree, extend the shift amount to match. Since + // BT ignores high bits (like shifts) we can use anyextend. + if (LHS.getValueType() != RHS.getValueType()) + RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); + + SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); + unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(Cond, MVT::i8), BT); + } } - assert(0 && "Illegal SetCC!"); - return SDValue(); + bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); + unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + + SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), Cond); } SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { @@ -5038,6 +5573,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); + DebugLoc dl = Op.getDebugLoc(); if (isFP) { unsigned SSECC = 8; @@ -5050,7 +5586,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { default: break; case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; - case ISD::SETOGT: + case ISD::SETOGT: case ISD::SETGT: Swap = true; // Fallthrough case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; @@ -5074,28 +5610,28 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (SSECC == 8) { if (SetCCOpcode == ISD::SETUEQ) { SDValue UNORD, EQ; - UNORD = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); - EQ = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); - return DAG.getNode(ISD::OR, VT, UNORD, EQ); + UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); + EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); + return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); } else if (SetCCOpcode == ISD::SETONE) { SDValue ORD, NEQ; - ORD = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); - NEQ = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); - return DAG.getNode(ISD::AND, VT, ORD, NEQ); + ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); + NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); } assert(0 && "Illegal FP comparison"); } // Handle all other FP comparisons here. - return DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); + return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); } - + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. 
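One of the "multiple operations" the comment above refers to is the unsigned case: SSE2 only provides signed PCMPGT, so when FlipSigns is set below both operands are XORed with a splat of the sign bit, turning the unsigned comparison into a signed one. A scalar sketch of the identity:

#include <cstdint>

// a <u b  is equivalent to  (a ^ 0x80000000) <s (b ^ 0x80000000)
static bool ult_via_signed(uint32_t a, uint32_t b) {
  int32_t sa = (int32_t)(a ^ 0x80000000u);   // the XOR with the SignBit splat
  int32_t sb = (int32_t)(b ^ 0x80000000u);
  return sa < sb;                            // now a plain signed (PCMPGT-style) compare
}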
unsigned Opc = 0, EQOpc = 0, GTOpc = 0; bool Swap = false, Invert = false, FlipSigns = false; - + switch (VT.getSimpleVT()) { default: break; case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; @@ -5103,7 +5639,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; } - + switch (SetCCOpcode) { default: break; case ISD::SETNE: Invert = true; @@ -5119,36 +5655,50 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { } if (Swap) std::swap(Op0, Op1); - + // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); - SDValue SignBit = DAG.getConstant(EltVT.getIntegerVTSignBit(), EltVT); + SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), + EltVT); std::vector SignBits(VT.getVectorNumElements(), SignBit); - SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, VT, &SignBits[0], + SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], SignBits.size()); - Op0 = DAG.getNode(ISD::XOR, VT, Op0, SignVec); - Op1 = DAG.getNode(ISD::XOR, VT, Op1, SignVec); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); } - - SDValue Result = DAG.getNode(Opc, VT, Op0, Op1); + + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. - if (Invert) { - MVT EltVT = VT.getVectorElementType(); - SDValue NegOne = DAG.getConstant(EltVT.getIntegerVTBitMask(), EltVT); - std::vector NegOnes(VT.getVectorNumElements(), NegOne); - SDValue NegOneV = DAG.getNode(ISD::BUILD_VECTOR, VT, &NegOnes[0], - NegOnes.size()); - Result = DAG.getNode(ISD::XOR, VT, Result, NegOneV); - } + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + return Result; } +// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. +static bool isX86LogicalCmp(SDValue Op) { + unsigned Opc = Op.getNode()->getOpcode(); + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) + return true; + if (Op.getResNo() == 1 && + (Opc == X86ISD::ADD || + Opc == X86ISD::SUB || + Opc == X86ISD::SMUL || + Opc == X86ISD::UMUL || + Opc == X86ISD::INC || + Opc == X86ISD::DEC)) + return true; + + return false; +} + SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { bool addTest = true; SDValue Cond = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); SDValue CC; if (Cond.getOpcode() == ISD::SETCC) @@ -5162,15 +5712,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getValueType(); - + bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? 
IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); - - if ((Opc == X86ISD::CMP || - Opc == X86ISD::COMI || - Opc == X86ISD::UCOMI) && !IllegalFPCMov) { + + if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || + Opc == X86ISD::BT) { // FIXME Cond = Cmp; addTest = false; } @@ -5178,7 +5727,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond= DAG.getNode(X86ISD::CMP, MVT::i32, Cond, DAG.getConstant(0, MVT::i8)); + Cond = EmitTest(Cond, X86::COND_NE, DAG); } const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(), @@ -5190,7 +5739,33 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { Ops.push_back(Op.getOperand(1)); Ops.push_back(CC); Ops.push_back(Cond); - return DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::CMOV, dl, VTs, 2, &Ops[0], Ops.size()); +} + +// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or +// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart +// from the AND / OR. +static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { + Opc = Op.getOpcode(); + if (Opc != ISD::OR && Opc != ISD::AND) + return false; + return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse() && + Op.getOperand(1).getOpcode() == X86ISD::SETCC && + Op.getOperand(1).hasOneUse()); +} + +// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and +// 1 and that the SETCC node has a single use. +static bool isXor1OfSetCC(SDValue Op) { + if (Op.getOpcode() != ISD::XOR) + return false; + ConstantSDNode *N1C = dyn_cast(Op.getOperand(1)); + if (N1C && N1C->getAPIntValue() == 1) { + return Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse(); + } + return false; } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { @@ -5198,10 +5773,19 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); SDValue CC; if (Cond.getOpcode() == ISD::SETCC) Cond = LowerSETCC(Cond, DAG); +#if 0 + // FIXME: LowerXALUO doesn't handle these!! + else if (Cond.getOpcode() == X86ISD::ADD || + Cond.getOpcode() == X86ISD::SUB || + Cond.getOpcode() == X86ISD::SMUL || + Cond.getOpcode() == X86ISD::UMUL) + Cond = LowerXALUO(Cond, DAG); +#endif // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. @@ -5210,84 +5794,92 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); - if (Opc == X86ISD::CMP || - Opc == X86ISD::COMI || - Opc == X86ISD::UCOMI) { - Cond = Cmp; - addTest = false; - } - // Also, recognize the pattern generated by an FCMP_UNE. We can emit - // two branches instead of an explicit OR instruction with a - // separate test. 
- } else if (Cond.getOpcode() == ISD::OR && - Cond.hasOneUse() && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC && - Cond.getOperand(0).hasOneUse() && - Cond.getOperand(1).getOpcode() == X86ISD::SETCC && - Cond.getOperand(1).hasOneUse()) { - SDValue Cmp = Cond.getOperand(0).getOperand(1); - unsigned Opc = Cmp.getOpcode(); - if (Cmp == Cond.getOperand(1).getOperand(1) && - (Opc == X86ISD::CMP || - Opc == X86ISD::COMI || - Opc == X86ISD::UCOMI)) { - CC = Cond.getOperand(0).getOperand(0); - Chain = DAG.getNode(X86ISD::BRCOND, Op.getValueType(), - Chain, Dest, CC, Cmp); - CC = Cond.getOperand(1).getOperand(0); + // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? + if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { Cond = Cmp; addTest = false; + } else { + switch (cast(CC)->getZExtValue()) { + default: break; + case X86::COND_O: + case X86::COND_B: + // These can only come from an arithmetic instruction with overflow, + // e.g. SADDO, UADDO. + Cond = Cond.getNode()->getOperand(1); + addTest = false; + break; + } } - // Also, recognize the pattern generated by an FCMP_OEQ. We can emit - // two branches instead of an explicit AND instruction with a - // separate test. However, we only do this if this block doesn't - // have a fall-through edge, because this requires an explicit - // jmp when the condition is false. - } else if (Cond.getOpcode() == ISD::AND && - Cond.hasOneUse() && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC && - Cond.getOperand(0).hasOneUse() && - Cond.getOperand(1).getOpcode() == X86ISD::SETCC && - Cond.getOperand(1).hasOneUse()) { - SDValue Cmp = Cond.getOperand(0).getOperand(1); - unsigned Opc = Cmp.getOpcode(); - if (Cmp == Cond.getOperand(1).getOperand(1) && - (Opc == X86ISD::CMP || - Opc == X86ISD::COMI || - Opc == X86ISD::UCOMI) && - Op.getNode()->hasOneUse()) { + } else { + unsigned CondOpc; + if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { + SDValue Cmp = Cond.getOperand(0).getOperand(1); + if (CondOpc == ISD::OR) { + // Also, recognize the pattern generated by an FCMP_UNE. We can emit + // two branches instead of an explicit OR instruction with a + // separate test. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp)) { + CC = Cond.getOperand(0).getOperand(0); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = Cond.getOperand(1).getOperand(0); + Cond = Cmp; + addTest = false; + } + } else { // ISD::AND + // Also, recognize the pattern generated by an FCMP_OEQ. We can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp) && + Op.getNode()->hasOneUse()) { + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, MVT::i8); + SDValue User = SDValue(*Op.getNode()->use_begin(), 0); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. 
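For context on the FCMP_UNE / FCMP_OEQ rewiring above (a hedged sketch of mine, based on the standard UCOMISS flag encoding): unordered shows up as PF=1 and inequality as ZF=0, so "a != b" can branch to the true block on either flag with no explicit OR, while "a == b" needs both branches to target the false block, which is why the successors are reversed and a fall-through edge cannot be relied on.

void branch_shapes(float a, float b) {
  if (a != b) { /* FCMP_UNE: ucomiss; jne true_bb; jp true_bb */ }
  if (a == b) { /* FCMP_OEQ: ucomiss; jne false_bb; jp false_bb; then fall into true_bb */ }
}
// The same predicates written out from the UCOMISS flag bits:
static bool fcmp_une(bool ZF, bool PF) { return PF || !ZF; }   // "or"  -> two branches to the true block
static bool fcmp_oeq(bool ZF, bool PF) { return !PF && ZF; }   // "and" -> two branches to the false block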
+ if (User.getOpcode() == ISD::BR) { + SDValue FalseBB = User.getOperand(1); + SDValue NewBR = + DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); + assert(NewBR == User); + Dest = FalseBB; + + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } + } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { + // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. + // It should be transformed during dag combiner except when the condition + // is set by a arithmetics with overflow node. X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, MVT::i8); - SDValue User = SDValue(*Op.getNode()->use_begin(), 0); - // Look for an unconditional branch following this conditional branch. - // We need this because we need to reverse the successors in order - // to implement FCMP_OEQ. - if (User.getOpcode() == ISD::BR) { - SDValue FalseBB = User.getOperand(1); - SDValue NewBR = - DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); - assert(NewBR == User); - Dest = FalseBB; - - Chain = DAG.getNode(X86ISD::BRCOND, Op.getValueType(), - Chain, Dest, CC, Cmp); - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, MVT::i8); - Cond = Cmp; - addTest = false; - } + Cond = Cond.getOperand(0).getOperand(1); + addTest = false; } } if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond= DAG.getNode(X86ISD::CMP, MVT::i32, Cond, DAG.getConstant(0, MVT::i8)); + Cond = EmitTest(Cond, X86::COND_NE, DAG); } - return DAG.getNode(X86ISD::BRCOND, Op.getValueType(), + return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } @@ -5302,6 +5894,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) { assert(Subtarget->isTargetCygMing() && "This should be used only on Cygwin/Mingw targets"); + DebugLoc dl = Op.getDebugLoc(); // Get the inputs. 
SDValue Chain = Op.getOperand(0); @@ -5315,7 +5908,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); - Chain = DAG.getCopyToReg(Chain, X86::EAX, Size, Flag); + Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); Flag = Chain.getValue(1); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); @@ -5324,7 +5917,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, DAG.getRegister(X86::EAX, IntPtr), DAG.getRegister(X86StackPtr, SPTy), Flag }; - Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops, 5); + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); Flag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, @@ -5332,14 +5925,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, DAG.getIntPtrConstant(0, true), Flag); - Chain = DAG.getCopyFromReg(Chain, X86StackPtr, SPTy).getValue(1); + Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); SDValue Ops1[2] = { Chain.getValue(0), Chain }; - return DAG.getMergeValues(Ops1, 2); + return DAG.getMergeValues(Ops1, 2, dl); } SDValue -X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, +X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, @@ -5363,7 +5956,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { MVT IntPtr = getPointerTy(); const Type *IntPtrTy = TD->getIntPtrType(); - TargetLowering::ArgListTy Args; + TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; Entry.Ty = IntPtrTy; @@ -5371,9 +5964,9 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, Entry.Node = Size; Args.push_back(Entry); std::pair CallResult = - LowerCallTo(Chain, Type::VoidTy, false, false, false, false, - CallingConv::C, false, - DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG); + LowerCallTo(Chain, Type::VoidTy, false, false, false, false, + CallingConv::C, false, + DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); return CallResult.second; } @@ -5423,20 +6016,22 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, BytesLeft = SizeVal % UBytes; } - Chain = DAG.getCopyToReg(Chain, ValReg, DAG.getConstant(Val, AVT), + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), InFlag); InFlag = Chain.getValue(1); } else { AVT = MVT::i8; Count = DAG.getIntPtrConstant(SizeVal); - Chain = DAG.getCopyToReg(Chain, X86::AL, Src, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); InFlag = Chain.getValue(1); } - Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX, + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + X86::ECX, Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI, + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? 
X86::RDI : + X86::EDI, Dst, InFlag); InFlag = Chain.getValue(1); @@ -5445,15 +6040,16 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, Ops.push_back(Chain); Ops.push_back(DAG.getValueType(AVT)); Ops.push_back(InFlag); - Chain = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size()); + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); if (TwoRepStos) { InFlag = Chain.getValue(1); Count = Size; MVT CVT = Count.getValueType(); - SDValue Left = DAG.getNode(ISD::AND, CVT, Count, + SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); - Chain = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX, + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : + X86::ECX, Left, InFlag); InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Flag); @@ -5461,15 +6057,15 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, Ops.push_back(Chain); Ops.push_back(DAG.getValueType(MVT::i8)); Ops.push_back(InFlag); - Chain = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size()); + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); } else if (BytesLeft) { // Handle the last 1 - 7 bytes. unsigned Offset = SizeVal - BytesLeft; MVT AddrVT = Dst.getValueType(); MVT SizeVT = Size.getValueType(); - Chain = DAG.getMemset(Chain, - DAG.getNode(ISD::ADD, AddrVT, Dst, + Chain = DAG.getMemset(Chain, dl, + DAG.getNode(ISD::ADD, dl, AddrVT, Dst, DAG.getConstant(Offset, AddrVT)), Src, DAG.getConstant(BytesLeft, SizeVT), @@ -5481,12 +6077,12 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, } SDValue -X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, +X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool AlwaysInline, const Value *DstSV, uint64_t DstSVOff, - const Value *SrcSV, uint64_t SrcSVOff) { + const Value *SrcSV, uint64_t SrcSVOff) { // This requires the copy size to be a constant, preferrably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast(Size); @@ -5511,13 +6107,16 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag(0, 0); - Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX, + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + X86::ECX, Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI, + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : + X86::EDI, Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RSI : X86::ESI, + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? 
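The memset lowering above emits one wide rep stos for SizeVal / UBytes iterations and then patches the remaining SizeVal % UBytes bytes with a second, byte-sized memset at Dst + Offset. A scalar sketch of that split (illustrative only; the real code drives RCX/RDI and REP_STOS nodes, and the function name here is hypothetical):

#include <cstdint>
#include <cstring>

void memsetSplit(uint8_t *dst, uint8_t val, size_t size, size_t ubytes /* 4 or 8 */) {
  size_t count = size / ubytes;       // iterations of the wide rep stos
  size_t bytesLeft = size % ubytes;   // 1..ubytes-1 trailing bytes, if any
  std::memset(dst, val, count * ubytes);        // bulk, one element per iteration
  size_t offset = size - bytesLeft;
  std::memset(dst + offset, val, bytesLeft);    // byte-sized tail at Dst + Offset
}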
X86::RSI : + X86::ESI, Src, InFlag); InFlag = Chain.getValue(1); @@ -5526,7 +6125,7 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, Ops.push_back(Chain); Ops.push_back(DAG.getValueType(AVT)); Ops.push_back(InFlag); - SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size()); + SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size()); SmallVector Results; Results.push_back(RepMovs); @@ -5536,10 +6135,10 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, MVT DstVT = Dst.getValueType(); MVT SrcVT = Src.getValueType(); MVT SizeVT = Size.getValueType(); - Results.push_back(DAG.getMemcpy(Chain, - DAG.getNode(ISD::ADD, DstVT, Dst, + Results.push_back(DAG.getMemcpy(Chain, dl, + DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, DstVT)), - DAG.getNode(ISD::ADD, SrcVT, Src, + DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, SrcVT)), DAG.getConstant(BytesLeft, SizeVT), Align, AlwaysInline, @@ -5547,47 +6146,19 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SrcSV, SrcSVOff + Offset)); } - return DAG.getNode(ISD::TokenFactor, MVT::Other, &Results[0], Results.size()); -} - -/// Expand the result of: i64,outchain = READCYCLECOUNTER inchain -SDNode *X86TargetLowering::ExpandREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG){ - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue TheChain = N->getOperand(0); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheChain, 1); - if (Subtarget->is64Bit()) { - SDValue rax = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1)); - SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), X86::RDX, - MVT::i64, rax.getValue(2)); - SDValue Tmp = DAG.getNode(ISD::SHL, MVT::i64, rdx, - DAG.getConstant(32, MVT::i8)); - SDValue Ops[] = { - DAG.getNode(ISD::OR, MVT::i64, rax, Tmp), rdx.getValue(1) - }; - - return DAG.getMergeValues(Ops, 2).getNode(); - } - - SDValue eax = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), X86::EDX, - MVT::i32, eax.getValue(2)); - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { eax, edx }; - Ops[0] = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Ops, 2); - - // Use a MERGE_VALUES to return the value and chain. - Ops[1] = edx.getValue(1); - return DAG.getMergeValues(Ops, 2).getNode(); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &Results[0], Results.size()); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { const Value *SV = cast(Op.getOperand(2))->getValue(); + DebugLoc dl = Op.getDebugLoc(); if (!Subtarget->is64Bit()) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. 
SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); - return DAG.getStore(Op.getOperand(0), FR,Op.getOperand(1), SV, 0); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); } // __va_list_tag: @@ -5598,30 +6169,34 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset - SDValue Store = DAG.getStore(Op.getOperand(0), + SDValue Store = DAG.getStore(Op.getOperand(0), dl, DAG.getConstant(VarArgsGPOffset, MVT::i32), FIN, SV, 0); MemOps.push_back(Store); // Store fp_offset - FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, DAG.getIntPtrConstant(4)); - Store = DAG.getStore(Op.getOperand(0), + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), + FIN, DAG.getIntPtrConstant(4)); + Store = DAG.getStore(Op.getOperand(0), dl, DAG.getConstant(VarArgsFPOffset, MVT::i32), FIN, SV, 0); MemOps.push_back(Store); // Store ptr to overflow_arg_area - FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, DAG.getIntPtrConstant(4)); + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), + FIN, DAG.getIntPtrConstant(4)); SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); - Store = DAG.getStore(Op.getOperand(0), OVFIN, FIN, SV, 0); + Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); MemOps.push_back(Store); // Store ptr to reg_save_area. - FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, DAG.getIntPtrConstant(8)); + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), + FIN, DAG.getIntPtrConstant(8)); SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); - Store = DAG.getStore(Op.getOperand(0), RSFIN, FIN, SV, 0); + Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); MemOps.push_back(Store); - return DAG.getNode(ISD::TokenFactor, MVT::Other, &MemOps[0], MemOps.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { @@ -5644,14 +6219,16 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); + DebugLoc dl = Op.getDebugLoc(); - return DAG.getMemcpy(Chain, DstPtr, SrcPtr, + return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, DAG.getIntPtrConstant(24), 8, false, DstSV, 0, SrcSV, 0); } SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
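The four stores emitted by LowerVASTART above fill in the SysV x86-64 __va_list_tag at byte offsets 0, 4, 8 and 16. A compile-time check of that layout on an LP64 target (the struct shape is the ABI's; the type name here is only descriptive):

#include <cstddef>

struct VaListTag {            // __va_list_tag
  unsigned gp_offset;         // offset 0
  unsigned fp_offset;         // offset 4
  void *overflow_arg_area;    // offset 8
  void *reg_save_area;        // offset 16
};

static_assert(offsetof(VaListTag, fp_offset) == 4, "fp_offset store is at +4");
static_assert(offsetof(VaListTag, overflow_arg_area) == 8, "overflow area ptr is at +8");
static_assert(offsetof(VaListTag, reg_save_area) == 16, "reg save area ptr is at +16");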
@@ -5746,15 +6323,13 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { break; } - unsigned X86CC; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - translateX86CC(CC, true, X86CC, LHS, RHS, DAG); - - SDValue Cond = DAG.getNode(Opc, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, MVT::i8, + unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); + SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), Cond); - return DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, SetCC); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } // Fix vector shift instructions where the last operand is a non-immediate @@ -5839,9 +6414,9 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } } MVT VT = Op.getValueType(); - ShAmt = DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, ShAmtVT, ShAmt)); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT, + ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt)); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(NewIntNo, MVT::i32), Op.getOperand(1), ShAmt); } @@ -5849,24 +6424,36 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { - // Depths > 0 not supported yet! - if (cast(Op.getOperand(0))->getZExtValue() > 0) - return SDValue(); - - // Just load the return address + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + DebugLoc dl = Op.getDebugLoc(); + + if (Depth > 0) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = + DAG.getConstant(TD->getPointerSize(), + Subtarget->is64Bit() ? MVT::i64 : MVT::i32); + return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, getPointerTy(), + FrameAddr, Offset), + NULL, 0); + } + + // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); - return DAG.getLoad(getPointerTy(), DAG.getEntryNode(), RetAddrFI, NULL, 0); + return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), + RetAddrFI, NULL, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); MVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), FrameReg, VT); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) - FrameAddr = DAG.getLoad(VT, DAG.getEntryNode(), FrameAddr, NULL, 0); + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); return FrameAddr; } @@ -5881,19 +6468,20 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, getPointerTy()); unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
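LowerRETURNADDR above now handles Depth > 0 by loading through the frame-pointer chain: when frame pointers are kept, each frame stores the caller's frame pointer at the frame address and the return address one pointer-size above it. A hedged illustration of that layout (assumes -fno-omit-frame-pointer and the usual x86 frame shape; not portable C++, and the names are hypothetical):

struct Frame {            // what RBP points at when a frame pointer is kept
  Frame *callerFrame;     // saved RBP of the caller       [rbp + 0]
  void  *returnAddress;   // address to return to          [rbp + ptr size]
};

void *returnAddressAtDepth(Frame *fp, unsigned depth) {
  while (depth--)               // LowerFRAMEADDR's loop: one load per level
    fp = fp->callerFrame;
  return fp->returnAddress;     // the load at FrameAddr + TD->getPointerSize()
}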
X86::RCX : X86::ECX); - SDValue StoreAddr = DAG.getNode(ISD::SUB, getPointerTy(), Frame, + SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, DAG.getIntPtrConstant(-TD->getPointerSize())); - StoreAddr = DAG.getNode(ISD::ADD, getPointerTy(), StoreAddr, Offset); - Chain = DAG.getStore(Chain, Handler, StoreAddr, NULL, 0); - Chain = DAG.getCopyToReg(Chain, StoreAddrReg, StoreAddr); + StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); + Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); + Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); MF.getRegInfo().addLiveOut(StoreAddrReg); - return DAG.getNode(X86ISD::EH_RETURN, + return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); } @@ -5904,6 +6492,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value + DebugLoc dl = Op.getDebugLoc(); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); @@ -5926,36 +6515,41 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, // Load the pointer to the nested function into R11. unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; - OutChains[0] = DAG.getStore(Root, DAG.getConstant(OpCode, MVT::i16), Addr, - TrmpAddr, 0); + OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, TrmpAddr, 0); - Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(2, MVT::i64)); - OutChains[1] = DAG.getStore(Root, FPtr, Addr, TrmpAddr, 2, false, 2); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(2, MVT::i64)); + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); // Load the 'nest' parameter value into R10. // R10 is specified in X86CallingConv.td OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 - Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(10, MVT::i64)); - OutChains[2] = DAG.getStore(Root, DAG.getConstant(OpCode, MVT::i16), Addr, - TrmpAddr, 10); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(10, MVT::i64)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, TrmpAddr, 10); - Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(12, MVT::i64)); - OutChains[3] = DAG.getStore(Root, Nest, Addr, TrmpAddr, 12, false, 2); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(12, MVT::i64)); + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
- Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(20, MVT::i64)); - OutChains[4] = DAG.getStore(Root, DAG.getConstant(OpCode, MVT::i16), Addr, - TrmpAddr, 20); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(20, MVT::i64)); + OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, TrmpAddr, 20); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 - Addr = DAG.getNode(ISD::ADD, MVT::i64, Trmp, DAG.getConstant(22, MVT::i64)); - OutChains[5] = DAG.getStore(Root, DAG.getConstant(ModRM, MVT::i8), Addr, + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(22, MVT::i64)); + OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, TrmpAddr, 22); SDValue Ops[] = - { Trmp, DAG.getNode(ISD::TokenFactor, MVT::Other, OutChains, 6) }; - return DAG.getMergeValues(Ops, 2); + { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; + return DAG.getMergeValues(Ops, 2, dl); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); @@ -6003,28 +6597,33 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, SDValue OutChains[4]; SDValue Addr, Disp; - Addr = DAG.getNode(ISD::ADD, MVT::i32, Trmp, DAG.getConstant(10, MVT::i32)); - Disp = DAG.getNode(ISD::SUB, MVT::i32, FPtr, Addr); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(10, MVT::i32)); + Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); - OutChains[0] = DAG.getStore(Root, DAG.getConstant(MOV32ri|N86Reg, MVT::i8), + OutChains[0] = DAG.getStore(Root, dl, + DAG.getConstant(MOV32ri|N86Reg, MVT::i8), Trmp, TrmpAddr, 0); - Addr = DAG.getNode(ISD::ADD, MVT::i32, Trmp, DAG.getConstant(1, MVT::i32)); - OutChains[1] = DAG.getStore(Root, Nest, Addr, TrmpAddr, 1, false, 1); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(1, MVT::i32)); + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); - Addr = DAG.getNode(ISD::ADD, MVT::i32, Trmp, DAG.getConstant(5, MVT::i32)); - OutChains[2] = DAG.getStore(Root, DAG.getConstant(JMP, MVT::i8), Addr, + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(5, MVT::i32)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, TrmpAddr, 5, false, 1); - Addr = DAG.getNode(ISD::ADD, MVT::i32, Trmp, DAG.getConstant(6, MVT::i32)); - OutChains[3] = DAG.getStore(Root, Disp, Addr, TrmpAddr, 6, false, 1); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(6, MVT::i32)); + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); SDValue Ops[] = - { Trmp, DAG.getNode(ISD::TokenFactor, MVT::Other, OutChains, 4) }; - return DAG.getMergeValues(Ops, 2); + { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; + return DAG.getMergeValues(Ops, 2, dl); } } @@ -6053,56 +6652,58 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { const TargetFrameInfo &TFI = *TM.getFrameInfo(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, MVT::Other, + SDValue Chain = 
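The 64-bit trampoline assembled above is 23 bytes written at fixed offsets: two movabsq immediates followed by an indirect jump. An offset map of what each store lays down (the opcode byte values themselves come from getBaseOpcodeFor and are intentionally omitted here):

struct TrampolineField { unsigned offset; unsigned size; const char *contents; };

static const TrampolineField X86_64Trampoline[] = {
  {  0, 2, "REX.WB + movabsq opcode targeting %r11" },
  {  2, 8, "64-bit address of the nested function"  },
  { 10, 2, "REX.WB + movabsq opcode targeting %r10" },
  { 12, 8, "64-bit 'nest' parameter value"          },
  { 20, 2, "REX.WB + jmpq opcode"                   },
  { 22, 1, "ModRM byte selecting an indirect jump through %r11" },
};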
DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, DAG.getEntryNode(), StackSlot); // Load FP Control Word from stack slot - SDValue CWD = DAG.getLoad(MVT::i16, Chain, StackSlot, NULL, 0); + SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); // Transform as necessary SDValue CWD1 = - DAG.getNode(ISD::SRL, MVT::i16, - DAG.getNode(ISD::AND, MVT::i16, + DAG.getNode(ISD::SRL, dl, MVT::i16, + DAG.getNode(ISD::AND, dl, MVT::i16, CWD, DAG.getConstant(0x800, MVT::i16)), DAG.getConstant(11, MVT::i8)); SDValue CWD2 = - DAG.getNode(ISD::SRL, MVT::i16, - DAG.getNode(ISD::AND, MVT::i16, + DAG.getNode(ISD::SRL, dl, MVT::i16, + DAG.getNode(ISD::AND, dl, MVT::i16, CWD, DAG.getConstant(0x400, MVT::i16)), DAG.getConstant(9, MVT::i8)); SDValue RetVal = - DAG.getNode(ISD::AND, MVT::i16, - DAG.getNode(ISD::ADD, MVT::i16, - DAG.getNode(ISD::OR, MVT::i16, CWD1, CWD2), + DAG.getNode(ISD::AND, dl, MVT::i16, + DAG.getNode(ISD::ADD, dl, MVT::i16, + DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), DAG.getConstant(1, MVT::i16)), DAG.getConstant(3, MVT::i16)); return DAG.getNode((VT.getSizeInBits() < 16 ? - ISD::TRUNCATE : ISD::ZERO_EXTEND), VT, RetVal); + ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); } SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; - Op = DAG.getNode(ISD::ZERO_EXTEND, OpVT, Op); + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); - Op = DAG.getNode(X86ISD::BSR, VTs, Op); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); // If src is zero (i.e. bsr sets ZF), returns NumBits. SmallVector Ops; @@ -6110,13 +6711,13 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT)); Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); Ops.push_back(Op.getValue(1)); - Op = DAG.getNode(X86ISD::CMOV, OpVT, &Ops[0], 4); + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); // Finally xor with NumBits-1. - Op = DAG.getNode(ISD::XOR, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); if (VT == MVT::i8) - Op = DAG.getNode(ISD::TRUNCATE, MVT::i8, Op); + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } @@ -6124,16 +6725,17 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); Op = Op.getOperand(0); if (VT == MVT::i8) { OpVT = MVT::i32; - Op = DAG.getNode(ISD::ZERO_EXTEND, OpVT, Op); + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); - Op = DAG.getNode(X86ISD::BSF, VTs, Op); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); // If src is zero (i.e. bsf sets ZF), returns NumBits. 
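The SRL/AND/ADD sequence above converts the x87 control word's rounding-control field (bits 10-11 of the word stored by FNSTCW) into the FLT_ROUNDS encoding. The same arithmetic in scalar form, with the mapping it produces spelled out:

#include <cassert>

unsigned fltRoundsFromFPCW(unsigned cwd) {   // name is illustrative only
  unsigned hiBit = (cwd & 0x800) >> 11;      // RC bit 11 -> bit 0
  unsigned loBit = (cwd & 0x400) >> 9;       // RC bit 10 -> bit 1
  return ((hiBit | loBit) + 1) & 3;
}

int main() {
  assert(fltRoundsFromFPCW(0u << 10) == 1);  // RC=00 round to nearest  -> FLT_ROUNDS 1
  assert(fltRoundsFromFPCW(1u << 10) == 3);  // RC=01 round toward -inf -> FLT_ROUNDS 3
  assert(fltRoundsFromFPCW(2u << 10) == 2);  // RC=10 round toward +inf -> FLT_ROUNDS 2
  assert(fltRoundsFromFPCW(3u << 10) == 0);  // RC=11 round toward zero -> FLT_ROUNDS 0
}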
SmallVector Ops; @@ -6141,36 +6743,143 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { Ops.push_back(DAG.getConstant(NumBits, OpVT)); Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); Ops.push_back(Op.getValue(1)); - Op = DAG.getNode(X86ISD::CMOV, OpVT, &Ops[0], 4); + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); if (VT == MVT::i8) - Op = DAG.getNode(ISD::TRUNCATE, MVT::i8, Op); + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } -SDValue X86TargetLowering::LowerXADDO(SDValue Op, SelectionDAG &DAG, - ISD::NodeType NTy) { - return SDValue(); +SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType(); + assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); + DebugLoc dl = Op.getDebugLoc(); + + // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); + // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); + // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); + // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); + // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); + // + // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); + // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); + // return AloBlo + AloBhi + AhiBlo; + + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + + SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + A, DAG.getConstant(32, MVT::i32)); + SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + B, DAG.getConstant(32, MVT::i32)); + SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), + A, B); + SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), + A, Bhi); + SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), + Ahi, B); + AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + AloBhi, DAG.getConstant(32, MVT::i32)); + AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + AhiBlo, DAG.getConstant(32, MVT::i32)); + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); + Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); + return Res; } -SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { - MVT T = Op.getValueType(); - unsigned Reg = 0; - unsigned size = 0; - switch(T.getSimpleVT()) { - default: - assert(false && "Invalid value type!"); - case MVT::i8: Reg = X86::AL; size = 1; break; + +SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { + // Lower the "add/sub/mul with overflow" instruction into a regular ins plus + // a "setcc" instruction that checks the overflow flag. The "brcond" lowering + // looks for this combo and may remove the "setcc" instruction if the "setcc" + // has only one use. + SDNode *N = Op.getNode(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + unsigned BaseOp = 0; + unsigned Cond = 0; + DebugLoc dl = Op.getDebugLoc(); + + switch (Op.getOpcode()) { + default: assert(0 && "Unknown ovf instruction!"); + case ISD::SADDO: + // A subtract of one will be selected as a INC. Note that INC doesn't + // set CF, so we can't do this for UADDO. 
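LowerMUL_V2I64 above builds each 64-bit lane product out of three 32x32->64 pmuludq multiplies plus shifts; the high*high term is dropped because it only contributes above bit 64. The same decomposition applied to a single scalar lane, which can be checked against an ordinary 64-bit multiply:

#include <cassert>
#include <cstdint>

uint64_t mul64ViaPmuludq(uint64_t a, uint64_t b) {
  uint64_t ahi = a >> 32, bhi = b >> 32;                     // psrlq by 32
  uint64_t aloBlo = (uint32_t)a   * (uint64_t)(uint32_t)b;   // pmuludq(a, b)
  uint64_t aloBhi = (uint32_t)a   * (uint64_t)(uint32_t)bhi; // pmuludq(a, Bhi)
  uint64_t ahiBlo = (uint32_t)ahi * (uint64_t)(uint32_t)b;   // pmuludq(Ahi, b)
  return aloBlo + (aloBhi << 32) + (ahiBlo << 32);           // psllq by 32, then adds
}

int main() {
  uint64_t a = 0x123456789abcdef0ULL, b = 0x0fedcba987654321ULL;
  assert(mul64ViaPmuludq(a, b) == a * b);   // equal modulo 2^64
}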
+ if (ConstantSDNode *C = dyn_cast(Op)) + if (C->getAPIntValue() == 1) { + BaseOp = X86ISD::INC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::ADD; + Cond = X86::COND_O; + break; + case ISD::UADDO: + BaseOp = X86ISD::ADD; + Cond = X86::COND_B; + break; + case ISD::SSUBO: + // A subtract of one will be selected as a DEC. Note that DEC doesn't + // set CF, so we can't do this for USUBO. + if (ConstantSDNode *C = dyn_cast(Op)) + if (C->getAPIntValue() == 1) { + BaseOp = X86ISD::DEC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::SUB; + Cond = X86::COND_O; + break; + case ISD::USUBO: + BaseOp = X86ISD::SUB; + Cond = X86::COND_B; + break; + case ISD::SMULO: + BaseOp = X86ISD::SMUL; + Cond = X86::COND_O; + break; + case ISD::UMULO: + BaseOp = X86ISD::UMUL; + Cond = X86::COND_B; + break; + } + + // Also sets EFLAGS. + SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); + SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), + DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); + return Sum; +} + +SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { + MVT T = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned Reg = 0; + unsigned size = 0; + switch(T.getSimpleVT()) { + default: + assert(false && "Invalid value type!"); + case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; - case MVT::i64: - if (Subtarget->is64Bit()) { - Reg = X86::RAX; size = 8; - } else //Should go away when LegalizeType stuff lands - return SDValue(ExpandATOMIC_CMP_SWAP(Op.getNode(), DAG), 0); + case MVT::i64: + assert(Subtarget->is64Bit() && "Node not type legal!"); + Reg = X86::RAX; size = 8; break; - }; - SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), Reg, + } + SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), @@ -6178,86 +6887,39 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { DAG.getTargetConstant(size, MVT::i8), cpIn.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, Tys, Ops, 5); - SDValue cpOut = - DAG.getCopyFromReg(Result.getValue(0), Reg, T, Result.getValue(1)); + SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); + SDValue cpOut = + DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); return cpOut; } -SDNode* X86TargetLowering::ExpandATOMIC_CMP_SWAP(SDNode* Op, +SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) { - MVT T = Op->getValueType(0); - assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); - SDValue cpInL, cpInH; - cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(2), - DAG.getConstant(0, MVT::i32)); - cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(2), - DAG.getConstant(1, MVT::i32)); - cpInL = DAG.getCopyToReg(Op->getOperand(0), X86::EAX, - cpInL, SDValue()); - cpInH = DAG.getCopyToReg(cpInL.getValue(0), X86::EDX, - cpInH, cpInL.getValue(1)); - SDValue swapInL, swapInH; - swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(3), - DAG.getConstant(0, MVT::i32)); - swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(3), - DAG.getConstant(1, MVT::i32)); - swapInL = 
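LowerXALUO above selects COND_O (the overflow flag) for the signed forms and COND_B (the carry flag) for the unsigned forms. A portable sketch of what those two conditions mean for addition; the wraparound is done in unsigned arithmetic so the C++ stays defined, and the function names are only illustrative:

#include <cassert>
#include <cstdint>
#include <climits>

bool uaddOverflows(uint32_t a, uint32_t b) {   // carry flag, i.e. X86::COND_B
  return (uint32_t)(a + b) < a;
}

bool saddOverflows(int32_t a, int32_t b) {     // overflow flag, i.e. X86::COND_O
  uint32_t ua = (uint32_t)a, ub = (uint32_t)b, r = ua + ub;
  // Signed overflow iff both operands share a sign and the result's sign differs.
  return (int32_t)(~(ua ^ ub) & (ua ^ r)) < 0;
}

int main() {
  assert(uaddOverflows(UINT32_MAX, 1) && !uaddOverflows(1, 2));
  assert(saddOverflows(INT32_MAX, 1) && !saddOverflows(-1, 1));
}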
DAG.getCopyToReg(cpInH.getValue(0), X86::EBX, - swapInL, cpInH.getValue(1)); - swapInH = DAG.getCopyToReg(swapInL.getValue(0), X86::ECX, - swapInH, swapInL.getValue(1)); - SDValue Ops[] = { swapInH.getValue(0), - Op->getOperand(1), - swapInH.getValue(1) }; + assert(Subtarget->is64Bit() && "Result not type legalized?"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, Tys, Ops, 3); - SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), X86::EAX, MVT::i32, - Result.getValue(1)); - SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), X86::EDX, MVT::i32, - cpOutL.getValue(2)); - SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; - SDValue ResultVal = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, OpsF, 2); - SDValue Vals[2] = { ResultVal, cpOutH.getValue(1) }; - return DAG.getMergeValues(Vals, 2).getNode(); -} - -SDValue X86TargetLowering::LowerATOMIC_BINARY_64(SDValue Op, - SelectionDAG &DAG, - unsigned NewOp) { - SDNode *Node = Op.getNode(); - MVT T = Node->getValueType(0); - assert (T == MVT::i64 && "Only know how to expand i64 atomics"); - - SDValue Chain = Node->getOperand(0); - SDValue In1 = Node->getOperand(1); - SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(0)); - SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(1)); - // This is a generalized SDNode, not an AtomicSDNode, so it doesn't - // have a MemOperand. Pass the info through as a normal operand. - SDValue LSI = DAG.getMemOperand(cast(Node)->getMemOperand()); - SDValue Ops[] = { Chain, In1, In2L, In2H, LSI }; - SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); - SDValue Result = DAG.getNode(NewOp, Tys, Ops, 5); - SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - SDValue ResultVal = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, OpsF, 2); - SDValue Vals[2] = { ResultVal, Result.getValue(2) }; - return SDValue(DAG.getMergeValues(Vals, 2).getNode(), 0); + SDValue TheChain = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); + SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); + SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, + rax.getValue(2)); + SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, + DAG.getConstant(32, MVT::i8)); + SDValue Ops[] = { + DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), + rdx.getValue(1) + }; + return DAG.getMergeValues(Ops, 2, dl); } SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); MVT T = Node->getValueType(0); - SDValue negOp = DAG.getNode(ISD::SUB, T, - DAG.getConstant(0, T), Node->getOperand(2)); - return DAG.getAtomic((Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_8 ? - ISD::ATOMIC_LOAD_ADD_8 : - Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_16 ? - ISD::ATOMIC_LOAD_ADD_16 : - Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_32 ? 
- ISD::ATOMIC_LOAD_ADD_32 : - ISD::ATOMIC_LOAD_ADD_64), + SDValue negOp = DAG.getNode(ISD::SUB, dl, T, + DAG.getConstant(0, T), Node->getOperand(2)); + return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, + cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), negOp, cast(Node)->getSrcValue(), @@ -6269,29 +6931,8 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { switch (Op.getOpcode()) { default: assert(0 && "Should not custom lower this!"); - case ISD::ATOMIC_CMP_SWAP_8: - case ISD::ATOMIC_CMP_SWAP_16: - case ISD::ATOMIC_CMP_SWAP_32: - case ISD::ATOMIC_CMP_SWAP_64: return LowerCMP_SWAP(Op,DAG); - case ISD::ATOMIC_LOAD_SUB_8: - case ISD::ATOMIC_LOAD_SUB_16: - case ISD::ATOMIC_LOAD_SUB_32: return LowerLOAD_SUB(Op,DAG); - case ISD::ATOMIC_LOAD_SUB_64: return (Subtarget->is64Bit()) ? - LowerLOAD_SUB(Op,DAG) : - LowerATOMIC_BINARY_64(Op,DAG, - X86ISD::ATOMSUB64_DAG); - case ISD::ATOMIC_LOAD_AND_64: return LowerATOMIC_BINARY_64(Op,DAG, - X86ISD::ATOMAND64_DAG); - case ISD::ATOMIC_LOAD_OR_64: return LowerATOMIC_BINARY_64(Op, DAG, - X86ISD::ATOMOR64_DAG); - case ISD::ATOMIC_LOAD_XOR_64: return LowerATOMIC_BINARY_64(Op,DAG, - X86ISD::ATOMXOR64_DAG); - case ISD::ATOMIC_LOAD_NAND_64:return LowerATOMIC_BINARY_64(Op,DAG, - X86ISD::ATOMNAND64_DAG); - case ISD::ATOMIC_LOAD_ADD_64: return LowerATOMIC_BINARY_64(Op,DAG, - X86ISD::ATOMADD64_DAG); - case ISD::ATOMIC_SWAP_64: return LowerATOMIC_BINARY_64(Op,DAG, - X86ISD::ATOMSWAP64_DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); + case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); @@ -6332,24 +6973,130 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); - case ISD::SADDO: return LowerXADDO(Op, DAG, ISD::SADDO); - case ISD::UADDO: return LowerXADDO(Op, DAG, ISD::UADDO); - - // FIXME: REMOVE THIS WHEN LegalizeDAGTypes lands. - case ISD::READCYCLECOUNTER: - return SDValue(ExpandREADCYCLECOUNTER(Op.getNode(), DAG), 0); + case ISD::MUL: return LowerMUL_V2I64(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: return LowerXALUO(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); } } +void X86TargetLowering:: +ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl&Results, + SelectionDAG &DAG, unsigned NewOp) { + MVT T = Node->getValueType(0); + DebugLoc dl = Node->getDebugLoc(); + assert (T == MVT::i64 && "Only know how to expand i64 atomics"); + + SDValue Chain = Node->getOperand(0); + SDValue In1 = Node->getOperand(1); + SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(2), DAG.getIntPtrConstant(0)); + SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(2), DAG.getIntPtrConstant(1)); + // This is a generalized SDNode, not an AtomicSDNode, so it doesn't + // have a MemOperand. Pass the info through as a normal operand. 
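LowerLOAD_SUB above rewrites an atomic subtract as an atomic add of the negated operand, since x86 has lock xadd but no subtracting equivalent. The identity it relies on, in scalar form with std::atomic:

#include <atomic>
#include <cassert>
#include <cstdint>

uint32_t fetchSubViaAdd(std::atomic<uint32_t> &mem, uint32_t val) {
  return mem.fetch_add(0u - val);   // add the two's-complement negation (0 - val)
}

int main() {
  std::atomic<uint32_t> x(10);
  assert(fetchSubViaAdd(x, 3) == 10 && x.load() == 7);
}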
+ SDValue LSI = DAG.getMemOperand(cast(Node)->getMemOperand()); + SDValue Ops[] = { Chain, In1, In2L, In2H, LSI }; + SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); + SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5); + SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(Result.getValue(2)); +} + /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. -SDNode *X86TargetLowering::ReplaceNodeResults(SDNode *N, SelectionDAG &DAG) { +void X86TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl&Results, + SelectionDAG &DAG) { + DebugLoc dl = N->getDebugLoc(); switch (N->getOpcode()) { default: - return X86TargetLowering::LowerOperation(SDValue (N, 0), DAG).getNode(); - case ISD::FP_TO_SINT: return ExpandFP_TO_SINT(N, DAG); - case ISD::READCYCLECOUNTER: return ExpandREADCYCLECOUNTER(N, DAG); - case ISD::ATOMIC_CMP_SWAP_64: return ExpandATOMIC_CMP_SWAP(N, DAG); + assert(false && "Do not know how to custom type legalize this operation!"); + return; + case ISD::FP_TO_SINT: { + std::pair Vals = FP_TO_SINTHelper(SDValue(N, 0), DAG); + SDValue FIST = Vals.first, StackSlot = Vals.second; + if (FIST.getNode() != 0) { + MVT VT = N->getValueType(0); + // Return a load from the stack slot. + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); + } + return; + } + case ISD::READCYCLECOUNTER: { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDValue TheChain = N->getOperand(0); + SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); + SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, + rd.getValue(1)); + SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, + eax.getValue(2)); + // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
+ SDValue Ops[] = { eax, edx }; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); + Results.push_back(edx.getValue(1)); + return; + } + case ISD::ATOMIC_CMP_SWAP: { + MVT T = N->getValueType(0); + assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); + SDValue cpInL, cpInH; + cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), + DAG.getConstant(0, MVT::i32)); + cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), + DAG.getConstant(1, MVT::i32)); + cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); + cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, + cpInL.getValue(1)); + SDValue swapInL, swapInH; + swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), + DAG.getConstant(0, MVT::i32)); + swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), + DAG.getConstant(1, MVT::i32)); + swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, + cpInH.getValue(1)); + swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, + swapInL.getValue(1)); + SDValue Ops[] = { swapInH.getValue(0), + N->getOperand(1), + swapInH.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, + MVT::i32, Result.getValue(1)); + SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, + MVT::i32, cpOutL.getValue(2)); + SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(cpOutH.getValue(1)); + return; + } + case ISD::ATOMIC_LOAD_ADD: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); + return; + case ISD::ATOMIC_LOAD_AND: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); + return; + case ISD::ATOMIC_LOAD_NAND: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); + return; + case ISD::ATOMIC_LOAD_OR: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); + return; + case ISD::ATOMIC_LOAD_SUB: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); + return; + case ISD::ATOMIC_LOAD_XOR: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); + return; + case ISD::ATOMIC_SWAP: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); + return; } } @@ -6374,6 +7121,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::TAILCALL: return "X86ISD::TAILCALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; @@ -6390,6 +7138,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; @@ -6421,23 +7170,33 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; case X86ISD::PCMPGTQ: return 
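The i64 ATOMIC_CMP_SWAP expansion above feeds cmpxchg8b by splitting each 64-bit operand into 32-bit halves (EXTRACT_ELEMENT 0/1 into EDX:EAX and ECX:EBX) and rebuilding the 64-bit result with BUILD_PAIR. The split/rebuild arithmetic on its own:

#include <cassert>
#include <cstdint>

uint32_t lo32(uint64_t v) { return (uint32_t)v; }          // EXTRACT_ELEMENT 0
uint32_t hi32(uint64_t v) { return (uint32_t)(v >> 32); }  // EXTRACT_ELEMENT 1
uint64_t buildPair(uint32_t lo, uint32_t hi) {             // BUILD_PAIR
  return ((uint64_t)hi << 32) | lo;
}

int main() {
  uint64_t v = 0x0123456789abcdefULL;
  assert(buildPair(lo32(v), hi32(v)) == v);
}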
"X86ISD::PCMPGTQ"; + case X86ISD::ADD: return "X86ISD::ADD"; + case X86ISD::SUB: return "X86ISD::SUB"; + case X86ISD::SMUL: return "X86ISD::SMUL"; + case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::INC: return "X86ISD::INC"; + case X86ISD::DEC: return "X86ISD::DEC"; } } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. -bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, +bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const { // X86 supports extremely general addressing modes. - + // X86 allows a sign-extended 32-bit immediate field as a displacement. if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1) return false; - + if (AM.BaseGV) { // We can only fold this if we don't need an extra load. if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false)) return false; + // If BaseGV requires a register, we cannot also have a BaseReg. + if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) && + AM.HasBaseReg) + return false; // X86-64 only supports addr of globals in small code model. if (Subtarget->is64Bit()) { @@ -6448,7 +7207,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, return false; } } - + switch (AM.Scale) { case 0: case 1: @@ -6468,7 +7227,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, default: // Other stuff never works. return false; } - + return true; } @@ -6500,12 +7259,14 @@ bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const { bool X86TargetLowering::isShuffleMaskLegal(SDValue Mask, MVT VT) const { // Only do shuffles on 128-bit vector types for now. + // FIXME: pshufb, blends if (VT.getSizeInBits() == 64) return false; return (Mask.getNode()->getNumOperands() <= 4 || isIdentityMask(Mask.getNode()) || isIdentityMask(Mask.getNode(), true) || isSplatMask(Mask.getNode()) || - isPSHUFHW_PSHUFLWMask(Mask.getNode()) || + X86::isPSHUFHWMask(Mask.getNode()) || + X86::isPSHUFLWMask(Mask.getNode()) || X86::isUNPCKLMask(Mask.getNode()) || X86::isUNPCKHMask(Mask.getNode()) || X86::isUNPCKL_v_undef_Mask(Mask.getNode()) || @@ -6522,7 +7283,7 @@ X86TargetLowering::isVectorClearMaskLegal(const std::vector &BVOps, if (NumElts == 4) { return (isMOVLMask(&BVOps[0], 4) || isCommutedMOVL(&BVOps[0], 4, true) || - isSHUFPMask(&BVOps[0], 4) || + isSHUFPMask(&BVOps[0], 4) || isCommutedSHUFP(&BVOps[0], 4)); } return false; @@ -6544,7 +7305,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, unsigned notOpc, unsigned EAXreg, TargetRegisterClass *RC, - bool invSrc) { + bool invSrc) const { // For the atomic bitwise operator, we generate // thisMBB: // newMBB: @@ -6558,7 +7319,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction::iterator MBBIter = MBB; ++MBBIter; - + /// First build the CFG MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; @@ -6566,39 +7327,41 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - + // Move all successors to thisMBB to nextMBB nextMBB->transferSuccessors(thisMBB); - + // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); - + // newMBB jumps to itself and fall through to nextMBB 
newMBB->addSuccessor(nextMBB); newMBB->addSuccessor(newMBB); - + // Insert instructions into newMBB based on incoming instruction - assert(bInstr->getNumOperands() < 8 && "unexpected number of operands"); + assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && + "unexpected number of operands"); + DebugLoc dl = bInstr->getDebugLoc(); MachineOperand& destOper = bInstr->getOperand(0); - MachineOperand* argOpers[6]; + MachineOperand* argOpers[2 + X86AddrNumOperands]; int numArgs = bInstr->getNumOperands() - 1; for (int i=0; i < numArgs; ++i) argOpers[i] = &bInstr->getOperand(i+1); // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = 3; // [0,3] - int valArgIndx = 4; - + int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] + int valArgIndx = lastAddrIndx + 1; + unsigned t1 = F->getRegInfo().createVirtualRegister(RC); - MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(LoadOpc), t1); + MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); unsigned tt = F->getRegInfo().createVirtualRegister(RC); if (invSrc) { - MIB = BuildMI(newMBB, TII->get(notOpc), tt).addReg(t1); + MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); } - else + else tt = t1; unsigned t2 = F->getRegInfo().createVirtualRegister(RC); @@ -6606,27 +7369,27 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, argOpers[valArgIndx]->isImm()) && "invalid operand"); if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, TII->get(regOpc), t2); + MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); else - MIB = BuildMI(newMBB, TII->get(immOpc), t2); + MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); MIB.addReg(tt); (*MIB).addOperand(*argOpers[valArgIndx]); - MIB = BuildMI(newMBB, TII->get(copyOpc), EAXreg); + MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); MIB.addReg(t1); - - MIB = BuildMI(newMBB, TII->get(CXchgOpc)); + + MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); MIB.addReg(t2); assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); (*MIB).addMemOperand(*F, *bInstr->memoperands_begin()); - MIB = BuildMI(newMBB, TII->get(copyOpc), destOper.getReg()); + MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); MIB.addReg(EAXreg); - + // insert branch - BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB); + BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 
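The pseudo-instruction expanded above becomes a one-block retry loop: load the old value, apply the bitwise operation, attempt lock cmpxchg against EAX, and branch back on failure (JNE). The equivalent control flow with std::atomic, using AND as the sample operation; this sketches the loop shape, not the MachineInstr construction:

#include <atomic>
#include <cstdint>

uint32_t atomicAnd(std::atomic<uint32_t> &mem, uint32_t val) {
  uint32_t oldVal = mem.load();
  // compare_exchange_weak reloads oldVal on failure, mirroring the
  // load / op / cmpxchg / jne loop built in EmitAtomicBitwiseWithCustomInserter.
  while (!mem.compare_exchange_weak(oldVal, oldVal & val))
    ;
  return oldVal;   // the prior memory value, which the pseudo returns via EAX
}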
return nextMBB; @@ -6640,7 +7403,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, unsigned regOpcH, unsigned immOpcL, unsigned immOpcH, - bool invSrc) { + bool invSrc) const { // For the atomic bitwise operator, we generate // thisMBB (instructions are in pairs, except cmpxchg8b) // ld t1,t2 = [bitinstr.addr] @@ -6664,7 +7427,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction::iterator MBBIter = MBB; ++MBBIter; - + /// First build the CFG MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; @@ -6672,35 +7435,37 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - + // Move all successors to thisMBB to nextMBB nextMBB->transferSuccessors(thisMBB); - + // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); - + // newMBB jumps to itself and fall through to nextMBB newMBB->addSuccessor(nextMBB); newMBB->addSuccessor(newMBB); - + + DebugLoc dl = bInstr->getDebugLoc(); // Insert instructions into newMBB based on incoming instruction // There are 8 "real" operands plus 9 implicit def/uses, ignored here. - assert(bInstr->getNumOperands() < 18 && "unexpected number of operands"); + assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && + "unexpected number of operands"); MachineOperand& dest1Oper = bInstr->getOperand(0); MachineOperand& dest2Oper = bInstr->getOperand(1); - MachineOperand* argOpers[6]; - for (int i=0; i < 6; ++i) + MachineOperand* argOpers[2 + X86AddrNumOperands]; + for (int i=0; i < 2 + X86AddrNumOperands; ++i) argOpers[i] = &bInstr->getOperand(i+2); // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = 3; // [0,3] - + int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] + unsigned t1 = F->getRegInfo().createVirtualRegister(RC); - MachineInstrBuilder MIB = BuildMI(thisMBB, TII->get(LoadOpc), t1); + MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); unsigned t2 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(thisMBB, TII->get(LoadOpc), t2); + MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); // add 4 to displacement. 
for (int i=0; i <= lastAddrIndx-1; ++i) (*MIB).addOperand(*argOpers[i]); @@ -6714,66 +7479,70 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, // t3/4 are defined later, at the bottom of the loop unsigned t3 = F->getRegInfo().createVirtualRegister(RC); unsigned t4 = F->getRegInfo().createVirtualRegister(RC); - BuildMI(newMBB, TII->get(X86::PHI), dest1Oper.getReg()) + BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); - BuildMI(newMBB, TII->get(X86::PHI), dest2Oper.getReg()) + BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); unsigned tt1 = F->getRegInfo().createVirtualRegister(RC); unsigned tt2 = F->getRegInfo().createVirtualRegister(RC); - if (invSrc) { - MIB = BuildMI(newMBB, TII->get(NotOpc), tt1).addReg(t1); - MIB = BuildMI(newMBB, TII->get(NotOpc), tt2).addReg(t2); + if (invSrc) { + MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1); + MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2); } else { tt1 = t1; tt2 = t2; } - assert((argOpers[4]->isReg() || argOpers[4]->isImm()) && + int valArgIndx = lastAddrIndx + 1; + assert((argOpers[valArgIndx]->isReg() || + argOpers[valArgIndx]->isImm()) && "invalid operand"); unsigned t5 = F->getRegInfo().createVirtualRegister(RC); unsigned t6 = F->getRegInfo().createVirtualRegister(RC); - if (argOpers[4]->isReg()) - MIB = BuildMI(newMBB, TII->get(regOpcL), t5); + if (argOpers[valArgIndx]->isReg()) + MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); else - MIB = BuildMI(newMBB, TII->get(immOpcL), t5); + MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); if (regOpcL != X86::MOV32rr) MIB.addReg(tt1); - (*MIB).addOperand(*argOpers[4]); - assert(argOpers[5]->isReg() == argOpers[4]->isReg()); - assert(argOpers[5]->isImm() == argOpers[4]->isImm()); - if (argOpers[5]->isReg()) - MIB = BuildMI(newMBB, TII->get(regOpcH), t6); + (*MIB).addOperand(*argOpers[valArgIndx]); + assert(argOpers[valArgIndx + 1]->isReg() == + argOpers[valArgIndx]->isReg()); + assert(argOpers[valArgIndx + 1]->isImm() == + argOpers[valArgIndx]->isImm()); + if (argOpers[valArgIndx + 1]->isReg()) + MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); else - MIB = BuildMI(newMBB, TII->get(immOpcH), t6); + MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); if (regOpcH != X86::MOV32rr) MIB.addReg(tt2); - (*MIB).addOperand(*argOpers[5]); + (*MIB).addOperand(*argOpers[valArgIndx + 1]); - MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EAX); + MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); MIB.addReg(t1); - MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EDX); + MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); MIB.addReg(t2); - MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EBX); + MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); MIB.addReg(t5); - MIB = BuildMI(newMBB, TII->get(copyOpc), X86::ECX); + MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); MIB.addReg(t6); - - MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG8B)); + + MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); (*MIB).addMemOperand(*F, *bInstr->memoperands_begin()); - MIB = BuildMI(newMBB, TII->get(copyOpc), t3); + MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); MIB.addReg(X86::EAX); - MIB = BuildMI(newMBB, TII->get(copyOpc), t4); + MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 
MIB.addReg(X86::EDX); - + // insert branch - BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB); + BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. return nextMBB; @@ -6783,12 +7552,12 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, MachineBasicBlock * X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock *MBB, - unsigned cmovOpc) { + unsigned cmovOpc) const { // For the atomic min/max operator, we generate // thisMBB: // newMBB: // ld t1 = [min/max.addr] - // mov t2 = [min/max.val] + // mov t2 = [min/max.val] // cmp t1, t2 // cmov[cond] t2 = t1 // mov EAX = t1 @@ -6800,7 +7569,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction::iterator MBBIter = MBB; ++MBBIter; - + /// First build the CFG MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; @@ -6808,31 +7577,33 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - + // Move all successors to thisMBB to nextMBB nextMBB->transferSuccessors(thisMBB); - + // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); - + // newMBB jumps to newMBB and fall through to nextMBB newMBB->addSuccessor(nextMBB); newMBB->addSuccessor(newMBB); - + + DebugLoc dl = mInstr->getDebugLoc(); // Insert instructions into newMBB based on incoming instruction - assert(mInstr->getNumOperands() < 8 && "unexpected number of operands"); + assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 && + "unexpected number of operands"); MachineOperand& destOper = mInstr->getOperand(0); - MachineOperand* argOpers[6]; + MachineOperand* argOpers[2 + X86AddrNumOperands]; int numArgs = mInstr->getNumOperands() - 1; for (int i=0; i < numArgs; ++i) argOpers[i] = &mInstr->getOperand(i+1); - + // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = 3; // [0,3] - int valArgIndx = 4; - + int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] + int valArgIndx = lastAddrIndx + 1; + unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); - MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(X86::MOV32rm), t1); + MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); @@ -6840,40 +7611,40 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, assert((argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm()) && "invalid operand"); - - unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + + unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t2); - else - MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t2); + MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); + else + MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); (*MIB).addOperand(*argOpers[valArgIndx]); - MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), X86::EAX); + MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX); MIB.addReg(t1); - MIB = BuildMI(newMBB, TII->get(X86::CMP32rr)); + MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); MIB.addReg(t1); MIB.addReg(t2); // Generate movc unsigned t3 = 
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); - MIB = BuildMI(newMBB, TII->get(cmovOpc),t3); + MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); MIB.addReg(t2); MIB.addReg(t1); // Cmp and exchange if none has modified the memory location - MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG32)); + MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); MIB.addReg(t3); assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); (*MIB).addMemOperand(*F, *mInstr->memoperands_begin()); - - MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), destOper.getReg()); + + MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); MIB.addReg(X86::EAX); - + // insert branch - BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB); + BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. return nextMBB; @@ -6882,10 +7653,12 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) { + MachineBasicBlock *BB) const { + DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); switch (MI->getOpcode()) { default: assert(false && "Unexpected instr type to insert"); + case X86::CMOV_V1I64: case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_V4F32: @@ -6911,7 +7684,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); unsigned Opc = X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); - BuildMI(BB, TII->get(Opc)).addMBB(sinkMBB); + BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // Update machine-CFG edges by transferring all successors of the current @@ -6934,7 +7707,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, TII->get(X86::PHI), MI->getOperand(0).getReg()) + BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); @@ -6955,22 +7728,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // mode when truncating to an integer value. MachineFunction *F = BB->getParent(); int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2); - addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx); + addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx); // Load the old value of the high byte of the control word... unsigned OldCW = F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); - addFrameReference(BuildMI(BB, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); + addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW), + CWFrameIdx); // Set the high part to be round to zero... - addFrameReference(BuildMI(BB, TII->get(X86::MOV16mi)), CWFrameIdx) + addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx) .addImm(0xC7F); // Reload the modified control word now... 
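For reference, the FNSTCW/FLDCW pair emitted in this block is the classic x87 dance for C-style truncation: FIST honours the current rounding mode, so the control word is saved, switched to round-toward-zero (0xC7F) for the store, and then restored. A portable C++ sketch of the same save/modify/restore pattern, using <cfenv> rather than raw control-word stores:

  #include <cfenv>
  #include <cmath>

  // Truncate 'x' the way the emitted FNSTCW/FLDCW/FIST sequence does:
  // remember the rounding mode, force round-toward-zero, convert, then put
  // the original mode back so later FP code is unaffected.
  long long truncate_like_fist(double x) {
    const int oldMode = fegetround();   // FNSTCW: save the current control word
    fesetround(FE_TOWARDZERO);          // FLDCW 0xC7F: rounding control = trunc
    long long r = llrint(x);            // FIST: convert using the current mode
    fesetround(oldMode);                // FLDCW: restore the original word
    return r;
  }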
- addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx); + addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); // Restore the memory image of control word to original value - addFrameReference(BuildMI(BB, TII->get(X86::MOV16mr)), CWFrameIdx) + addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx) .addReg(OldCW); // Get the X86 opcode to use. @@ -7009,30 +7783,30 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, } else { AM.Disp = Op.getImm(); } - addFullAddress(BuildMI(BB, TII->get(Opc)), AM) + addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM) .addReg(MI->getOperand(4).getReg()); // Reload the original control word now. - addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx); + addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. return BB; } case X86::ATOMAND32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, - X86::AND32ri, X86::MOV32rm, + X86::AND32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::MOV32rr, X86::NOT32r, X86::EAX, X86::GR32RegisterClass); case X86::ATOMOR32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, - X86::OR32ri, X86::MOV32rm, + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, + X86::OR32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::MOV32rr, X86::NOT32r, X86::EAX, X86::GR32RegisterClass); case X86::ATOMXOR32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, - X86::XOR32ri, X86::MOV32rm, + X86::XOR32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::MOV32rr, X86::NOT32r, X86::EAX, X86::GR32RegisterClass); @@ -7058,7 +7832,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::NOT16r, X86::AX, X86::GR16RegisterClass); case X86::ATOMOR16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, X86::OR16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::MOV16rr, X86::NOT16r, X86::AX, @@ -7091,7 +7865,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::NOT8r, X86::AL, X86::GR8RegisterClass); case X86::ATOMOR8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, X86::OR8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::MOV8rr, X86::NOT8r, X86::AL, @@ -7112,19 +7886,19 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // This group is for 64-bit host. case X86::ATOMAND64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, - X86::AND64ri32, X86::MOV64rm, + X86::AND64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::MOV64rr, X86::NOT64r, X86::RAX, X86::GR64RegisterClass); case X86::ATOMOR64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, - X86::OR64ri32, X86::MOV64rm, + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, + X86::OR64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::MOV64rr, X86::NOT64r, X86::RAX, X86::GR64RegisterClass); case X86::ATOMXOR64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, - X86::XOR64ri32, X86::MOV64rm, + X86::XOR64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::MOV64rr, X86::NOT64r, X86::RAX, X86::GR64RegisterClass); @@ -7145,37 +7919,37 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // This group does 64-bit operations on a 32-bit host. 
case X86::ATOMAND6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, + return EmitAtomicBit6432WithCustomInserter(MI, BB, X86::AND32rr, X86::AND32rr, X86::AND32ri, X86::AND32ri, false); case X86::ATOMOR6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, + return EmitAtomicBit6432WithCustomInserter(MI, BB, X86::OR32rr, X86::OR32rr, X86::OR32ri, X86::OR32ri, false); case X86::ATOMXOR6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, + return EmitAtomicBit6432WithCustomInserter(MI, BB, X86::XOR32rr, X86::XOR32rr, X86::XOR32ri, X86::XOR32ri, false); case X86::ATOMNAND6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, + return EmitAtomicBit6432WithCustomInserter(MI, BB, X86::AND32rr, X86::AND32rr, X86::AND32ri, X86::AND32ri, true); case X86::ATOMADD6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, + return EmitAtomicBit6432WithCustomInserter(MI, BB, X86::ADD32rr, X86::ADC32rr, X86::ADD32ri, X86::ADC32ri, false); case X86::ATOMSUB6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, + return EmitAtomicBit6432WithCustomInserter(MI, BB, X86::SUB32rr, X86::SBB32rr, X86::SUB32ri, X86::SBB32ri, false); case X86::ATOMSWAP6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, + return EmitAtomicBit6432WithCustomInserter(MI, BB, X86::MOV32rr, X86::MOV32rr, X86::MOV32ri, X86::MOV32ri, false); @@ -7203,6 +7977,16 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. switch (Opc) { default: break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::SMUL: + case X86ISD::UMUL: + case X86ISD::INC: + case X86ISD::DEC: + // These nodes' second result is a boolean. + if (Op.getResNo() == 0) + break; + // Fallthrough case X86ISD::SETCC: KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), Mask.getBitWidth() - 1); @@ -7275,6 +8059,7 @@ static bool EltsFromConsecutiveLoads(SDNode *N, SDValue PermMask, static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + DebugLoc dl = N->getDebugLoc(); MVT VT = N->getValueType(0); MVT EVT = VT.getVectorElementType(); SDValue PermMask = N->getOperand(2); @@ -7286,18 +8071,21 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, LoadSDNode *LD = cast(Base); if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI)) - return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), - LD->getSrcValueOffset(), LD->isVolatile()); - return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), - LD->getSrcValueOffset(), LD->isVolatile(), - LD->getAlignment()); + return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), + LD->getSrcValue(), LD->getSrcValueOffset(), + LD->isVolatile()); + return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), + LD->getSrcValue(), LD->getSrcValueOffset(), + LD->isVolatile(), LD->getAlignment()); } /// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd. static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget, const TargetLowering &TLI) { unsigned NumOps = N->getNumOperands(); + DebugLoc dl = N->getDebugLoc(); // Ignore single operand BUILD_VECTOR. if (NumOps == 1) @@ -7327,100 +8115,478 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG, // Transform it into VZEXT_LOAD addr. 
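The build_vector 0,(load i64/f64) pattern being matched here is exactly what a single movq load produces: the low 64 bits come from memory and the upper lanes are zeroed. A minimal SSE2 intrinsics sketch of the replacement node's semantics (assuming <emmintrin.h> is available):

  #include <emmintrin.h>
  #include <cstdint>

  // VZEXT_LOAD semantics: one 64-bit load zero-extended into a 128-bit
  // vector, i.e. the same thing as movq xmm, m64 -- no separate zero vector
  // and no insert/shuffle are needed.
  __m128i vzext_load64(const uint64_t *p) {
    return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(p));
  }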
LoadSDNode *LD = cast(Base); - + // Load must not be an extload. if (LD->getExtensionType() != ISD::NON_EXTLOAD) return SDValue(); - + + // Load type should legal type so we don't have to legalize it. + if (!TLI.isTypeLegal(VT)) + return SDValue(); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; - SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, Tys, Ops, 2); - DAG.ReplaceAllUsesOfValueWith(SDValue(Base, 1), ResNode.getValue(1)); + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + TargetLowering::TargetLoweringOpt TLO(DAG); + TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1)); + DCI.CommitTargetLoweringOpt(TLO); return ResNode; -} +} /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget *Subtarget) { + DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); - + // Get the LHS/RHS of the select. + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + // If we have SSE[12] support, try to form min/max nodes. if (Subtarget->hasSSE2() && - (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) { - if (Cond.getOpcode() == ISD::SETCC) { - // Get the LHS/RHS of the select. - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - ISD::CondCode CC = cast(Cond.getOperand(2))->get(); - - unsigned Opcode = 0; - if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { - switch (CC) { - default: break; - case ISD::SETOLE: // (X <= Y) ? X : Y -> min - case ISD::SETULE: - case ISD::SETLE: - if (!UnsafeFPMath) break; - // FALL THROUGH. - case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min - case ISD::SETLT: - Opcode = X86ISD::FMIN; - break; + (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && + Cond.getOpcode() == ISD::SETCC) { + ISD::CondCode CC = cast(Cond.getOperand(2))->get(); - case ISD::SETOGT: // (X > Y) ? X : Y -> max - case ISD::SETUGT: - case ISD::SETGT: - if (!UnsafeFPMath) break; - // FALL THROUGH. - case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max - case ISD::SETGE: - Opcode = X86ISD::FMAX; - break; - } - } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { - switch (CC) { - default: break; - case ISD::SETOGT: // (X > Y) ? Y : X -> min - case ISD::SETUGT: - case ISD::SETGT: - if (!UnsafeFPMath) break; - // FALL THROUGH. - case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min - case ISD::SETGE: - Opcode = X86ISD::FMIN; - break; + unsigned Opcode = 0; + if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { + switch (CC) { + default: break; + case ISD::SETOLE: // (X <= Y) ? X : Y -> min + case ISD::SETULE: + case ISD::SETLE: + if (!UnsafeFPMath) break; + // FALL THROUGH. + case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min + case ISD::SETLT: + Opcode = X86ISD::FMIN; + break; - case ISD::SETOLE: // (X <= Y) ? Y : X -> max - case ISD::SETULE: - case ISD::SETLE: - if (!UnsafeFPMath) break; - // FALL THROUGH. - case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max - case ISD::SETLT: - Opcode = X86ISD::FMAX; - break; + case ISD::SETOGT: // (X > Y) ? X : Y -> max + case ISD::SETUGT: + case ISD::SETGT: + if (!UnsafeFPMath) break; + // FALL THROUGH. + case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max + case ISD::SETGE: + Opcode = X86ISD::FMAX; + break; + } + } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { + switch (CC) { + default: break; + case ISD::SETOGT: // (X > Y) ? 
Y : X -> min + case ISD::SETUGT: + case ISD::SETGT: + if (!UnsafeFPMath) break; + // FALL THROUGH. + case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min + case ISD::SETGE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETOLE: // (X <= Y) ? Y : X -> max + case ISD::SETULE: + case ISD::SETLE: + if (!UnsafeFPMath) break; + // FALL THROUGH. + case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max + case ISD::SETLT: + Opcode = X86ISD::FMAX; + break; + } + } + + if (Opcode) + return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); + } + + // If this is a select between two integer constants, try to do some + // optimizations. + if (ConstantSDNode *TrueC = dyn_cast(LHS)) { + if (ConstantSDNode *FalseC = dyn_cast(RHS)) + // Don't do this for crazy integer types. + if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { + // If this is efficiently invertible, canonicalize the LHSC/RHSC values + // so that TrueC (the true value) is larger than FalseC. + bool NeedsCondInvert = false; + + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && + // Efficiently invertible. + (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. + (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. + isa(Cond.getOperand(1))))) { + NeedsCondInvert = true; + std::swap(TrueC, FalseC); + } + + // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. + if (FalseC->getAPIntValue() == 0 && + TrueC->getAPIntValue().isPowerOf2()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, + DAG.getConstant(ShAmt, MVT::i8)); } + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); + return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). + if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, Cond.getValueType())); + + // Zero extend the condition if needed. 
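The two constant-select cases handled above are plain integer identities, shown here on uint32_t for concreteness (a sketch of the transformation's arithmetic, not of the DAG code): when the false value is 0 and the true value is a power of two, the select is a zero-extended condition shifted left; when the constants differ by one, it is the condition added to the smaller constant. The canonicalization above guarantees the true value is the larger one, inverting the condition first if needed.

  #include <cstdint>

  // cond ? 8 : 0    ==>  zext(cond) << 3      (any power of two vs. zero)
  uint32_t sel_pow2(bool cond) { return uint32_t(cond) << 3; }

  // cond ? c+1 : c  ==>  zext(cond) + c       (works for any c, any width)
  uint32_t sel_adjacent(bool cond, uint32_t c) { return uint32_t(cond) + c; }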
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + return Cond; + } + } } + } + + return SDValue(); +} - if (Opcode) - return DAG.getNode(Opcode, N->getValueType(0), LHS, RHS); +/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] +static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + DebugLoc DL = N->getDebugLoc(); + + // If the flag operand isn't dead, don't touch this CMOV. + if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) + return SDValue(); + + // If this is a select between two integer constants, try to do some + // optimizations. Note that the operands are ordered the opposite of SELECT + // operands. + if (ConstantSDNode *TrueC = dyn_cast(N->getOperand(1))) { + if (ConstantSDNode *FalseC = dyn_cast(N->getOperand(0))) { + // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is + // larger than FalseC (the false value). + X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); + + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { + CC = X86::GetOppositeBranchCondition(CC); + std::swap(TrueC, FalseC); + } + + // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. + // This is efficient for any integer data type (including i8/i16) and + // shift amount. + if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { + SDValue Cond = N->getOperand(3); + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, MVT::i8), Cond); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, + DAG.getConstant(ShAmt, MVT::i8)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient + // for any integer data type, including i8/i16. + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + SDValue Cond = N->getOperand(3); + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, MVT::i8), Cond); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
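The case being set up here relies on another identity: for cond ? a : b, the result is b + cond*(a-b), and when a-b is 1, 2, 3, 4, 5, 8 or 9 that multiply folds into the base/index/scale forms a single lea can encode. A sketch of the arithmetic (values illustrative):

  #include <cstdint>

  // cond ? a : b  ==>  b + zext(cond) * (a - b), with a > b after
  // canonicalization.  For a - b in {1,2,3,4,5,8,9} the multiply becomes one
  // lea, e.g. diff == 5:  lea result, [base + cond + cond*4].
  uint64_t sel_lea(bool cond, uint64_t a, uint64_t b) {
    uint64_t diff = a - b;            // assumed to be one of the cheap scales
    return b + uint64_t(cond) * diff;
  }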
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + SDValue Cond = N->getOperand(3); + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, MVT::i8), Cond); + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + } } + } + return SDValue(); +} + + +/// PerformMulCombine - Optimize a single multiply with constant into two +/// in order to implement it with two cheaper instructions, e.g. +/// LEA + SHL, LEA + LEA. +static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (DAG.getMachineFunction(). + getFunction()->hasFnAttr(Attribute::OptimizeForSize)) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + MVT VT = N->getValueType(0); + if (VT != MVT::i64) + return SDValue(); + + ConstantSDNode *C = dyn_cast(N->getOperand(1)); + if (!C) + return SDValue(); + uint64_t MulAmt = C->getZExtValue(); + if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) + return SDValue(); + + uint64_t MulAmt1 = 0; + uint64_t MulAmt2 = 0; + if ((MulAmt % 9) == 0) { + MulAmt1 = 9; + MulAmt2 = MulAmt / 9; + } else if ((MulAmt % 5) == 0) { + MulAmt1 = 5; + MulAmt2 = MulAmt / 5; + } else if ((MulAmt % 3) == 0) { + MulAmt1 = 3; + MulAmt2 = MulAmt / 3; + } + if (MulAmt2 && + (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ + DebugLoc DL = N->getDebugLoc(); + + if (isPowerOf2_64(MulAmt2) && + !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) + // If second multiplifer is pow2, issue it first. We want the multiply by + // 3, 5, or 9 to be folded into the addressing mode unless the lone use + // is an add. + std::swap(MulAmt1, MulAmt2); + + SDValue NewMul; + if (isPowerOf2_64(MulAmt1)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); + else + NewMul = DAG.getNode(ISD::MUL, DL, VT, N->getOperand(0), + DAG.getConstant(MulAmt1, VT)); + + if (isPowerOf2_64(MulAmt2)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, + DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); + else + NewMul = DAG.getNode(ISD::MUL, DL, VT, NewMul, + DAG.getConstant(MulAmt2, VT)); + // Do not add new nodes to DAG combiner worklist. 
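The splitting logic above can be summarised as: write the constant as m1*m2 where m1 is 3, 5 or 9 and m2 is either another cheap multiplier or a power of two, so the single multiply becomes an lea plus an lea or shl. A standalone sketch of that decomposition (mirrors the modulo tests above; the helper name is made up):

  #include <cstdint>

  // Decompose amt into m1 * m2 with m1 in {9,5,3}; returns false if it does
  // not split.  The caller still has to check that m2 is itself cheap (a
  // power of two or 3/5/9), matching the guard in the combine.
  bool split_mul_amount(uint64_t amt, uint64_t &m1, uint64_t &m2) {
    if (amt % 9 == 0) { m1 = 9; m2 = amt / 9; return true; }
    if (amt % 5 == 0) { m1 = 5; m2 = amt / 5; return true; }
    if (amt % 3 == 0) { m1 = 3; m2 = amt / 3; return true; }
    return false;
  }

  // Example: x * 45 == (x * 9) * 5  -> two leas;   x * 24 == (x * 3) << 3.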
+ DCI.CombineTo(N, NewMul, false); } + return SDValue(); +} + + +/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts +/// when possible. +static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // On X86 with SSE2 support, we can transform this to a vector shift if + // all elements are shifted by the same amount. We can't do this in legalize + // because the a constant vector is typically transformed to a constant pool + // so we have no knowledge of the shift amount. + if (!Subtarget->hasSSE2()) + return SDValue(); + + MVT VT = N->getValueType(0); + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + return SDValue(); + + SDValue ShAmtOp = N->getOperand(1); + MVT EltVT = VT.getVectorElementType(); + DebugLoc DL = N->getDebugLoc(); + SDValue BaseShAmt; + if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned i = 0; + for (; i != NumElts; ++i) { + SDValue Arg = ShAmtOp.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + BaseShAmt = Arg; + break; + } + for (; i != NumElts; ++i) { + SDValue Arg = ShAmtOp.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + if (Arg != BaseShAmt) { + return SDValue(); + } + } + } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && + isSplatMask(ShAmtOp.getOperand(2).getNode())) { + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, + DAG.getIntPtrConstant(0)); + } else + return SDValue(); + + if (EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); + else if (EltVT.bitsLT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt); + // The shift amount is identical so we can do a vector shift. + SDValue ValOp = N->getOperand(0); + switch (N->getOpcode()) { + default: + assert(0 && "Unknown shift opcode!"); + break; + case ISD::SHL: + if (VT == MVT::v2i64) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v4i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v8i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + ValOp, BaseShAmt); + break; + case ISD::SRA: + if (VT == MVT::v4i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v8i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), + ValOp, BaseShAmt); + break; + case ISD::SRL: + if (VT == MVT::v2i64) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v4i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v8i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), + ValOp, BaseShAmt); + break; + } return SDValue(); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + const X86Subtarget *Subtarget) { // Turn load->store of MMX types into GPR load/stores. 
This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right // places to insert EMMS. This qualifies as a quick hack. + + // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. StoreSDNode *St = cast(N); - if (St->getValue().getValueType().isVector() && - St->getValue().getValueType().getSizeInBits() == 64 && + MVT VT = St->getValue().getValueType(); + if (VT.getSizeInBits() != 64) + return SDValue(); + + bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2(); + if ((VT.isVector() || + (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa(St->getValue()) && !cast(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { @@ -7444,59 +8610,72 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Ops.push_back(ChainVal->getOperand(i)); } } - if (Ld) { - // If we are a 64-bit capable x86, lower to a single movq load/store pair. - if (Subtarget->is64Bit()) { - SDValue NewLd = DAG.getLoad(MVT::i64, Ld->getChain(), - Ld->getBasePtr(), Ld->getSrcValue(), - Ld->getSrcValueOffset(), Ld->isVolatile(), - Ld->getAlignment()); - SDValue NewChain = NewLd.getValue(1); - if (TokenFactorIndex != -1) { - Ops.push_back(NewChain); - NewChain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Ops[0], - Ops.size()); - } - return DAG.getStore(NewChain, NewLd, St->getBasePtr(), - St->getSrcValue(), St->getSrcValueOffset(), - St->isVolatile(), St->getAlignment()); - } - // Otherwise, lower to two 32-bit copies. - SDValue LoAddr = Ld->getBasePtr(); - SDValue HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr, - DAG.getConstant(4, MVT::i32)); + if (!Ld || !ISD::isNormalLoad(Ld)) + return SDValue(); - SDValue LoLd = DAG.getLoad(MVT::i32, Ld->getChain(), LoAddr, - Ld->getSrcValue(), Ld->getSrcValueOffset(), - Ld->isVolatile(), Ld->getAlignment()); - SDValue HiLd = DAG.getLoad(MVT::i32, Ld->getChain(), HiAddr, - Ld->getSrcValue(), Ld->getSrcValueOffset()+4, - Ld->isVolatile(), - MinAlign(Ld->getAlignment(), 4)); + // If this is not the MMX case, i.e. we are just turning i64 load/store + // into f64 load/store, avoid the transformation if there are multiple + // uses of the loaded value. + if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) + return SDValue(); - SDValue NewChain = LoLd.getValue(1); + DebugLoc LdDL = Ld->getDebugLoc(); + DebugLoc StDL = N->getDebugLoc(); + // If we are a 64-bit capable x86, lower to a single movq load/store pair. + // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store + // pair instead. + if (Subtarget->is64Bit() || F64IsLegal) { + MVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; + SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), + Ld->getBasePtr(), Ld->getSrcValue(), + Ld->getSrcValueOffset(), Ld->isVolatile(), + Ld->getAlignment()); + SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { - Ops.push_back(LoLd); - Ops.push_back(HiLd); - NewChain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Ops[0], + Ops.push_back(NewChain); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], Ops.size()); } - - LoAddr = St->getBasePtr(); - HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr, - DAG.getConstant(4, MVT::i32)); - - SDValue LoSt = DAG.getStore(NewChain, LoLd, LoAddr, + return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), St->getSrcValue(), St->getSrcValueOffset(), St->isVolatile(), St->getAlignment()); - SDValue HiSt = DAG.getStore(NewChain, HiLd, HiAddr, - St->getSrcValue(), - St->getSrcValueOffset() + 4, - St->isVolatile(), - MinAlign(St->getAlignment(), 4)); - return DAG.getNode(ISD::TokenFactor, MVT::Other, LoSt, HiSt); } + + // Otherwise, lower to two pairs of 32-bit loads / stores. + SDValue LoAddr = Ld->getBasePtr(); + SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, + DAG.getConstant(4, MVT::i32)); + + SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, + Ld->getSrcValue(), Ld->getSrcValueOffset(), + Ld->isVolatile(), Ld->getAlignment()); + SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, + Ld->getSrcValue(), Ld->getSrcValueOffset()+4, + Ld->isVolatile(), + MinAlign(Ld->getAlignment(), 4)); + + SDValue NewChain = LoLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(LoLd); + Ops.push_back(HiLd); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], + Ops.size()); + } + + LoAddr = St->getBasePtr(); + HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, + DAG.getConstant(4, MVT::i32)); + + SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, + St->getSrcValue(), St->getSrcValueOffset(), + St->isVolatile(), St->getAlignment()); + SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, + St->getSrcValue(), + St->getSrcValueOffset() + 4, + St->isVolatile(), + MinAlign(St->getAlignment(), 4)); + return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } return SDValue(); } @@ -7529,6 +8708,23 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformBTCombine(SDNode *N, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // BT ignores high bits in the bit index operand. 
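Going back to the store combine above: when neither a 64-bit integer move nor an f64 SSE move is available, the i64 copy is lowered to two independent 32-bit load/store pairs at offsets 0 and 4, with the high half's alignment clamped by MinAlign. A scalar C++ sketch of that memory access pattern (illustrative only, and not atomic):

  #include <cstdint>
  #include <cstring>

  // Copy a 64-bit value as two 32-bit halves:  [dst]   = [src]
  //                                            [dst+4] = [src+4]
  // This mirrors the LoLd/HiLd + LoSt/HiSt pairs emitted when no single
  // 64-bit move is legal; the two halves are separate accesses.
  void copy_i64_as_two_i32(const void *src, void *dst) {
    uint32_t lo, hi;
    std::memcpy(&lo, static_cast<const char *>(src), 4);
    std::memcpy(&hi, static_cast<const char *>(src) + 4, 4);
    std::memcpy(static_cast<char *>(dst), &lo, 4);
    std::memcpy(static_cast<char *>(dst) + 4, &hi, 4);
  }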
+ SDValue Op1 = N->getOperand(1); + if (Op1.hasOneUse()) { + unsigned BitWidth = Op1.getValueSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG); + TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || + TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); + } + return SDValue(); +} SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -7537,12 +8733,18 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, default: break; case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); case ISD::BUILD_VECTOR: - return PerformBuildVectorCombine(N, DAG, Subtarget, *this); + return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this); case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); + case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); + case ISD::MUL: return PerformMulCombine(N, DAG, DCI); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); + case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); } return SDValue(); @@ -7570,6 +8772,9 @@ X86TargetLowering::getConstraintType(const std::string &Constraint) const { case 'y': case 'Y': return C_RegisterClass; + case 'e': + case 'Z': + return C_Other; default: break; } @@ -7590,7 +8795,7 @@ LowerXConstraint(MVT ConstraintVT) const { if (Subtarget->hasSSE1()) return "x"; } - + return TargetLowering::LowerXConstraint(ConstraintVT); } @@ -7602,7 +8807,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); - + switch (Constraint) { default: break; case 'I': @@ -7629,10 +8834,38 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } } return; + case 'e': { + // 32-bit signed value + if (ConstantSDNode *C = dyn_cast(Op)) { + const ConstantInt *CI = C->getConstantIntValue(); + if (CI->isValueValidForType(Type::Int32Ty, C->getSExtValue())) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); + break; + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + } + return; + } + case 'Z': { + // 32-bit unsigned value + if (ConstantSDNode *C = dyn_cast(Op)) { + const ConstantInt *CI = C->getConstantIntValue(); + if (CI->isValueValidForType(Type::Int32Ty, C->getZExtValue())) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + return; + } case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast(Op)) { - Result = DAG.getTargetConstant(CST->getZExtValue(), Op.getValueType()); + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); break; } @@ -7640,7 +8873,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // an optional displacement) to be used with 'i'. 
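The new 'e' and 'Z' cases above correspond to GCC's x86 machine constraints: 'e' accepts a constant that fits in a sign-extended 32-bit immediate (what most 64-bit instructions take), and 'Z' one that fits in a zero-extended 32-bit immediate. A small usage sketch with GCC/Clang-style inline asm (operands are illustrative):

  #include <cstdint>

  // 'e': imm32 sign-extended to 64 bits -- legal for addq/andq-style encodings.
  // 'Z': imm32 zero-extended to 64 bits -- legal e.g. for a movl that zero-
  //      extends into a 64-bit register.
  uint64_t constraint_demo(uint64_t v) {
    asm("addq %1, %0" : "+r"(v) : "e"(0x12345678));   // fits in signed 32 bits
    asm("andq %1, %0" : "+r"(v) : "e"(-16));          // negative imm32 is fine
    asm("movl %1, %k0" : "=r"(v) : "Z"(0xDEADBEEFu)); // unsigned 32-bit imm
    return v;
  }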
GlobalAddressSDNode *GA = dyn_cast(Op); int64_t Offset = 0; - + // Match either (GA) or (GA+C) if (GA) { Offset = GA->getOffset(); @@ -7658,10 +8891,11 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, C = 0, GA = 0; } } - + if (GA) { - if (hasMemory) - Op = LowerGlobalAddress(GA->getGlobal(), Offset, DAG); + if (hasMemory) + Op = LowerGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), + Offset, DAG); else Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), Offset); @@ -7673,7 +8907,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; } } - + if (Result.getNode()) { Ops.push_back(Result); return; @@ -7723,7 +8957,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (VT == MVT::i16) return std::make_pair(0U, X86::GR16RegisterClass); if (VT == MVT::i32 || !Subtarget->is64Bit()) - return std::make_pair(0U, X86::GR32RegisterClass); + return std::make_pair(0U, X86::GR32RegisterClass); return std::make_pair(0U, X86::GR64RegisterClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the @@ -7763,7 +8997,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, break; } } - + // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; @@ -7871,28 +9105,28 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, /// When and where to widen is target dependent based on the cost of /// scalarizing vs using the wider vector type. -MVT X86TargetLowering::getWidenVectorType(MVT VT) { +MVT X86TargetLowering::getWidenVectorType(MVT VT) const { assert(VT.isVector()); if (isTypeLegal(VT)) return VT; - + // TODO: In computeRegisterProperty, we can compute the list of legal vector // type based on element type. This would speed up our search (though // it may not be worth it since the size of the list is relatively // small). MVT EltVT = VT.getVectorElementType(); unsigned NElts = VT.getVectorNumElements(); - + // On X86, it make sense to widen any vector wider than 1 if (NElts <= 1) return MVT::Other; - - for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE; + + for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { MVT SVT = (MVT::SimpleValueType)nVT; - - if (isTypeLegal(SVT) && - SVT.getVectorElementType() == EltVT && + + if (isTypeLegal(SVT) && + SVT.getVectorElementType() == EltVT && SVT.getVectorNumElements() > NElts) return SVT; }
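The widening search at the end can be read as: keep the element type and pick any legal vector type with more lanes, e.g. a <2 x float> operation is widened to the SSE type <4 x float> and only the low lanes carry data. A small sketch of that selection under the assumption that legal vectors are exactly the 128-bit SSE types (the table is illustrative, not the target's real computation):

  // Pick the lane count to widen to: the legal vector of the same element
  // size with at least as many elements, or 0 if nothing fits in 128 bits.
  // E.g. 2 x f32 (4-byte elements) -> 4 lanes, i.e. <4 x float> in one xmm.
  unsigned widened_lanes(unsigned eltBytes, unsigned numElts) {
    const unsigned vectorBytes = 16;             // one SSE register (assumption)
    unsigned maxLanes = vectorBytes / eltBytes;  // legal lane count for this elt
    return (numElts > 1 && numElts <= maxLanes) ? maxLanes : 0;
  }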