X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=bdb7ea6869cd845ad4667e3a06f7e4b2aac77a5d;hb=23d1d5eb566dbd10a81d9ce2dc67ad1548110b08;hp=2b6ff3602a488cdeaacc47ab599390d7afe67f26;hpb=5141d97d3ee9afca936bc870e67c53e1ed05f790;p=oota-llvm.git diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2b6ff3602a4..bdb7ea6869c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -85,6 +85,11 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) * ElemsPerChunk); + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, + Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -158,10 +163,28 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) Subtarget = &TM.getSubtarget(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - RegInfo = TM.getRegisterInfo(); TD = getDataLayout(); + resetOperationActions(); +} + +void X86TargetLowering::resetOperationActions() { + const TargetMachine &TM = getTargetMachine(); + static bool FirstTimeThrough = true; + + // If none of the target options have changed, then we don't need to reset the + // operation actions. + if (!FirstTimeThrough && TO == TM.Options) return; + + if (!FirstTimeThrough) { + // Reinitialize the actions. + initActions(); + FirstTimeThrough = false; + } + + TO = TM.Options; + // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -181,9 +204,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass i32 with i8 on Atom when compiling with O2 - if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) + // Bypass expensive divides on Atom when compiling with O2 + if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) { addBypassSlowDiv(32, 8); + if (Subtarget->is64Bit()) + addBypassSlowDiv(64, 16); + } if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { // Setup Windows compiler runtime calls. 
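The addBypassSlowDiv(32, 8) / addBypassSlowDiv(64, 16) calls in the hunk above enable the generic slow-division bypass for Atom: a wide divide is guarded by a cheaper narrow one when both operands happen to fit. A rough C++ sketch of the 32-by-8 case (illustrative only; the pass emits equivalent IR around the original division, not this helper):

// If both operands fit in 8 bits, use the much faster 8-bit divide;
// otherwise fall back to the full 32-bit divide. This is the runtime
// check the bypass wraps around divisions on in-order Atom cores.
static unsigned divideWithBypass(unsigned a, unsigned b) {
  if (((a | b) & ~0xFFu) == 0)
    return (unsigned)((unsigned char)a / (unsigned char)b);
  return a / b;
}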
@@ -368,7 +394,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); - setOperationAction(ISD::BR_CC , MVT::Other, Expand); + setOperationAction(ISD::BR_CC , MVT::f32, Expand); + setOperationAction(ISD::BR_CC , MVT::f64, Expand); + setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::i8, Expand); + setOperationAction(ISD::BR_CC , MVT::i16, Expand); + setOperationAction(ISD::BR_CC , MVT::i32, Expand); + setOperationAction(ISD::BR_CC , MVT::i64, Expand); setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); @@ -456,7 +488,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); - // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intened to support + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build @@ -494,16 +526,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); - setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); - // On X86 and X86-64, atomic operations are lowered to locked instructions. - // Locked instructions, in turn, have implicit fence semantics (all memory - // operations are flushed before issuing the locked instruction, and they - // are not buffered), so we can fold away the common pattern of - // fence-atomic-fence. - setShouldFoldAtomicFences(true); - // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; @@ -605,10 +629,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , MVT::f64, Expand); - setOperationAction(ISD::FCOS , MVT::f64, Expand); - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Expand FP immediates into loads from the stack, except for the special // cases we handle. @@ -633,8 +659,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. 
addLegalFPImmediate(APFloat(+0.0f)); // xorps @@ -644,8 +671,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f64 , Expand); - setOperationAction(ISD::FCOS , MVT::f64 , Expand); + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. @@ -659,10 +687,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f32 , Expand); - setOperationAction(ISD::FSIN , MVT::f64 , Expand); - setOperationAction(ISD::FCOS , MVT::f32 , Expand); - setOperationAction(ISD::FCOS , MVT::f64 , Expand); + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 @@ -699,8 +729,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f80 , Expand); - setOperationAction(ISD::FCOS , MVT::f80 , Expand); + setOperationAction(ISD::FSIN , MVT::f80, Expand); + setOperationAction(ISD::FCOS , MVT::f80, Expand); + setOperationAction(ISD::FSINCOS, MVT::f80, Expand); } setOperationAction(ISD::FFLOOR, MVT::f80, Expand); @@ -748,7 +779,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FPOWI, VT, Expand); @@ -1030,23 +1063,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v8i16, Custom); setOperationAction(ISD::SRA, MVT::v16i8, Custom); - if (Subtarget->hasInt256()) { - setOperationAction(ISD::SRL, MVT::v2i64, Legal); - setOperationAction(ISD::SRL, MVT::v4i32, Legal); - - setOperationAction(ISD::SHL, MVT::v2i64, Legal); - setOperationAction(ISD::SHL, MVT::v4i32, Legal); + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. 
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); - setOperationAction(ISD::SRA, MVT::v4i32, Legal); - } else { - setOperationAction(ISD::SRL, MVT::v2i64, Custom); - setOperationAction(ISD::SRL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); - setOperationAction(ISD::SHL, MVT::v2i64, Custom); - setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SRA, MVT::v4i32, Custom); - setOperationAction(ISD::SRA, MVT::v4i32, Custom); - } setOperationAction(ISD::SDIV, MVT::v8i16, Custom); setOperationAction(ISD::SDIV, MVT::v4i32, Custom); } @@ -1095,6 +1121,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); @@ -1163,14 +1190,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - setOperationAction(ISD::SRL, MVT::v4i64, Legal); - setOperationAction(ISD::SRL, MVT::v8i32, Legal); - - setOperationAction(ISD::SHL, MVT::v4i64, Legal); - setOperationAction(ISD::SHL, MVT::v8i32, Legal); - - setOperationAction(ISD::SRA, MVT::v8i32, Legal); - setOperationAction(ISD::SDIV, MVT::v8i32, Custom); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); @@ -1187,15 +1206,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); // Don't lower v32i8 because there is no 128-bit byte mul + } - setOperationAction(ISD::SRL, MVT::v4i64, Custom); - setOperationAction(ISD::SRL, MVT::v8i32, Custom); + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); - setOperationAction(ISD::SHL, MVT::v4i64, Custom); - setOperationAction(ISD::SHL, MVT::v8i32, Custom); + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); - setOperationAction(ISD::SRA, MVT::v8i32, Custom); - } + setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. for (int i = MVT::FIRST_VECTOR_VALUETYPE; @@ -1281,6 +1302,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setLibcallName(RTLIB::SRA_I128, 0); } + // Combine sin / cos into one node or libcall if possible. + if (Subtarget->hasSinCos()) { + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + if (Subtarget->isTargetDarwin()) { + // For MacOSX, we don't want to the normal expansion of a libcall to + // sincos. We want to issue a libcall to __sincos_stret to avoid memory + // traffic. 
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } + } + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -1301,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -1312,22 +1347,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // On Darwin, -Os means optimize for size without hurting performance, // do not reduce the limit. - maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; - maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; - maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores - maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores + MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; + MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores + MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores + MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(4); // 2^4 bytes. - benefitFromCodePlacementOpt = true; // Predictable cmov don't hurt on atom because it's in-order. - predictableSelectIsExpensive = !Subtarget->isAtom(); + PredictableSelectIsExpensive = !Subtarget->isAtom(); setPrefFunctionAlignment(4); // 2^4 bytes. } -EVT X86TargetLowering::getSetCCResultType(EVT VT) const { +EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i8; return VT.changeVectorElementTypeToInteger(); } @@ -1568,14 +1602,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); - // Add the regs to the liveout set for the function. - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) - MRI.addLiveOut(RVLocs[i].getLocReg()); - SDValue Flag; - SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop @@ -1644,14 +1671,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. We saved the argument into - // a virtual register in the entry block, so now we copy the value out - // and into %rax. - if (Subtarget->is64Bit() && - DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Win32 requires us to put the sret argument to %eax as well. 
+ // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && + (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); unsigned Reg = FuncInfo->getSRetReturnReg(); @@ -1659,11 +1688,14 @@ X86TargetLowering::LowerReturn(SDValue Chain, "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); - Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); + unsigned RetValReg + = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? + X86::RAX : X86::EAX; + Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); - // RAX now acts like a return value. - MRI.addLiveOut(X86::RAX); + // RAX/EAX now acts like a return value. + RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); } RetOps[0] = Chain; // Update chain. @@ -1763,7 +1795,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; SDValue Ops[] = { Chain, InFlag }; Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, - MVT::Other, MVT::Glue, Ops, 2), 1); + MVT::Other, MVT::Glue, Ops), 1); Val = Chain.getValue(0); // Round the f80 to the right size, which also moves it to the appropriate @@ -2015,14 +2047,18 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(ArgValue); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. Save the argument into - // a virtual register so that we can access it from the return points. - if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Win32 requires us to put the sret argument to %eax as well. + // Save the argument into a virtual register so that we can access it + // from the return points. + if (MF.getFunction()->hasStructRetAttr() && + (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { X86MachineFunctionInfo *FuncInfo = MF.getInfo(); unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); + MVT PtrTy = getPointerTy(); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); @@ -2636,8 +2672,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. 
- return DAG.getNode(X86ISD::TC_RETURN, dl, - NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); } Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); @@ -2795,7 +2830,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, - SelectionDAG& DAG) const { + SelectionDAG &DAG) const { if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; @@ -2834,7 +2869,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // An stdcall caller is expected to clean up its arguments; the callee // isn't going to do that. - if (!CCMatch && CallerCC==CallingConv::X86_StdCall) + if (!CCMatch && CallerCC == CallingConv::X86_StdCall) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via @@ -2954,9 +2989,15 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget->is64Bit() && - !isa(Callee) && - !isa(Callee)) { + ((!isa(Callee) && + !isa(Callee)) || + getTargetMachine().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; + // In PIC we need an extra register to formulate the address computation + // for the callee. + unsigned MaxInRegs = + (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) @@ -2965,7 +3006,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: - if (++NumInRegs == 3) + if (++NumInRegs == MaxInRegs) return false; break; } @@ -3001,7 +3042,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: @@ -3051,7 +3092,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, @@ -3586,7 +3627,7 @@ static bool isMOVLHPSMask(ArrayRef Mask, EVT VT) { static SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); DebugLoc dl = SVOp->getDebugLoc(); if (VT != MVT::v8i32 && VT != MVT::v8f32) @@ -3837,7 +3878,7 @@ static bool isVPERM2X128Mask(ArrayRef Mask, EVT VT, bool HasFp256) { /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned HalfSize = VT.getVectorNumElements()/2; @@ -4040,7 +4081,7 @@ bool X86::isVINSERTF128Index(SDNode *N) { /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. /// Handles 128-bit and 256-bit. 
static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for PSHUF/SHUFP"); @@ -4070,7 +4111,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); @@ -4094,7 +4135,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); @@ -4118,7 +4159,7 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); @@ -4149,8 +4190,8 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { uint64_t Index = cast(N->getOperand(1).getNode())->getZExtValue(); - EVT VecVT = N->getOperand(0).getValueType(); - EVT ElVT = VecVT.getVectorElementType(); + MVT VecVT = N->getOperand(0).getValueType().getSimpleVT(); + MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; @@ -4166,8 +4207,8 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { uint64_t Index = cast(N->getOperand(2).getNode())->getZExtValue(); - EVT VecVT = N->getValueType(0); - EVT ElVT = VecVT.getVectorElementType(); + MVT VecVT = N->getValueType(0).getSimpleVT(); + MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; @@ -4177,7 +4218,7 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. /// Handles 256-bit. static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); unsigned NumElts = VT.getVectorNumElements(); @@ -4197,17 +4238,18 @@ static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { - return ((isa(Elt) && - cast(Elt)->isNullValue()) || - (isa(Elt) && - cast(Elt)->getValueAPF().isPosZero())); + if (ConstantSDNode *CN = dyn_cast(Elt)) + return CN->isNullValue(); + if (ConstantFPSDNode *CFP = dyn_cast(Elt)) + return CFP->getValueAPF().isPosZero(); + return false; } /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in /// their permute mask. 
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned NumElems = VT.getVectorNumElements(); SmallVector MaskVec; @@ -4372,13 +4414,15 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, + array_lengthof(Ops)); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, + array_lengthof(Ops)); } } else llvm_unreachable("Unexpected vector type"); @@ -4390,7 +4434,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. -static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG, +static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -4399,7 +4443,8 @@ static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG, if (VT.is256BitVector()) { if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, + array_lengthof(Ops)); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); @@ -4589,6 +4634,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); break; + case X86ISD::PALIGNR: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePALIGNRMask(VT, cast(ImmN)->getZExtValue(), Mask); + break; case X86ISD::PSHUFD: case X86ISD::VPERMILP: ImmN = N->getOperand(N->getNumOperands()-1); @@ -4632,7 +4681,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLPS: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: - case X86ISD::PALIGN: // Not yet implemented return false; default: llvm_unreachable("unknown target shuffle node"); @@ -4708,19 +4756,27 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The /// search can start in two different directions, from left or right. -static -unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, - bool ZerosFromLeft, SelectionDAG &DAG) { - unsigned i; - for (i = 0; i != NumElems; ++i) { - unsigned Index = ZerosFromLeft ? i : NumElems-i-1; +/// We count undefs as zeros until PreferredNum is reached. 
+static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, + unsigned NumElems, bool ZerosFromLeft, + SelectionDAG &DAG, + unsigned PreferredNum = -1U) { + unsigned NumZeros = 0; + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Index = ZerosFromLeft ? i : NumElems - i - 1; SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); - if (!(Elt.getNode() && - (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) + if (!Elt.getNode()) + break; + + if (X86::isZeroNode(Elt)) + ++NumZeros; + else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. + NumZeros = std::min(NumZeros + 1, PreferredNum); + else break; } - return i; + return NumZeros; } /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) @@ -4758,8 +4814,9 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, - false /* check zeros from right */, DAG); + unsigned NumZeros = getNumOfConsecutiveZeros( + SVOp, NumElems, false /* check zeros from right */, DAG, + SVOp->getMaskElt(0)); unsigned OpSrc; if (!NumZeros) @@ -4791,8 +4848,9 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, - true /* check zeros from left */, DAG); + unsigned NumZeros = getNumOfConsecutiveZeros( + SVOp, NumElems, true /* check zeros from left */, DAG, + NumElems - SVOp->getMaskElt(NumElems - 1) - 1); unsigned OpSrc; if (!NumZeros) @@ -4927,7 +4985,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, DAG.getConstant(NumBits, - TLI.getShiftAmountTy(SrcOp.getValueType())))); + TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); } SDValue @@ -5051,22 +5109,35 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. 
if (LastLoadedElt == NumElems - 1) { + SDValue NewLd = SDValue(); if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) - return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), 0); - return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), LDBase->getAlignment()); + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), 0); + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), LDBase->getAlignment()); + + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + } + + return NewLd; } if (NumElems == 4 && LastLoadedElt == 1 && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, + array_lengthof(Ops), MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, @@ -5100,7 +5171,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->hasFp256()) return SDValue(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); assert((VT.is128BitVector() || VT.is256BitVector()) && @@ -5298,8 +5369,8 @@ SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT ExtVT = VT.getVectorElementType(); + MVT VT = Op.getValueType().getSimpleVT(); + MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Vectors containing all zeros can be matched by pxor and xorps later @@ -5315,7 +5386,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. - if (ISD::isBuildVectorAllOnes(Op.getNode())) { + if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) return Op; @@ -5630,7 +5701,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // to create 256-bit vectors from two other 128-bit ones. 
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); - EVT ResVT = Op.getValueType(); + MVT ResVT = Op.getValueType().getSimpleVT(); assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); @@ -5656,8 +5727,8 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); - EVT EltVT = VT.getVectorElementType(); + MVT VT = SVOp->getValueType(0).getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); if (!Subtarget->hasSSE41() || EltVT == MVT::i8) @@ -5668,41 +5739,40 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, // Check the mask for BLEND and build the value. unsigned MaskValue = 0; // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. - unsigned NumLanes = (NumElems-1)/8 + 1; + unsigned NumLanes = (NumElems-1)/8 + 1; unsigned NumElemsInLane = NumElems / NumLanes; // Blend for v16i16 should be symetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { - int SndLaneEltIdx = (NumLanes == 2) ? + int SndLaneEltIdx = (NumLanes == 2) ? SVOp->getMaskElt(i + NumElemsInLane) : -1; int EltIdx = SVOp->getMaskElt(i); - if ((EltIdx == -1 || EltIdx == (int)i) && - (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane))) + if ((EltIdx < 0 || EltIdx == (int)i) && + (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) continue; - if (((unsigned)EltIdx == (i + NumElems)) && - (SndLaneEltIdx == -1 || + if (((unsigned)EltIdx == (i + NumElems)) && + (SndLaneEltIdx < 0 || (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) MaskValue |= (1<hasInt256())) { - BlendVT = EVT::getVectorVT(*DAG.getContext(), - EVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); + BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); } - - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, - DAG.getConstant(MaskValue, MVT::i32)); + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, + DAG.getConstant(MaskValue, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, VT, Ret); } @@ -5837,6 +5907,11 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, } } + // Promote splats to a larger type which usually leads to more efficient code. + // FIXME: Is this true if pshufb is available? + if (SVOp->isSplat()) + return PromoteSplat(SVOp, DAG); + // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. @@ -5970,6 +6045,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef MaskVals = SVOp->getMask(); + // Promote splats to a larger type which usually leads to more efficient code. + // FIXME: Is this true if pshufb is available? + if (SVOp->isSplat()) + return PromoteSplat(SVOp, DAG); + // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. 
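LowerVECTOR_SHUFFLEtoBlend above turns a shuffle into BLENDI by setting bit i of the immediate whenever lane i is taken from the second source (plus the extra symmetry check across the two 128-bit lanes of v16i16). A minimal single-lane sketch of that mapping, assuming the usual blend semantics where a set immediate bit selects the second operand; the helper name is made up for illustration:

// Bit i of the blend immediate is set when mask element i addresses the
// second vector (values >= NumElems). Returns -1 if the mask is not a blend.
static int buildBlendImmediate(const int *Mask, unsigned NumElems) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != NumElems; ++i) {
    int M = Mask[i];
    if (M < 0 || M == (int)i)
      continue;                    // undef, or lane kept from the first source
    if (M == (int)(i + NumElems))
      Imm |= 1u << i;              // lane taken from the second source
    else
      return -1;                   // not expressible as a blend
  }
  return (int)Imm;
}
// Example: a v4f32 mask <0, 5, 2, 7> gives 0b1010 -- lanes 1 and 3 come from V2.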
@@ -6088,7 +6168,7 @@ static SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -6135,8 +6215,9 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, DebugLoc dl) { + SelectionDAG &DAG) { MVT VT = SVOp->getValueType(0).getSimpleVT(); + DebugLoc dl = SVOp->getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); MVT NewVT; unsigned Scale; @@ -6172,7 +6253,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, /// getVZextMovL - Return a zero-extending vector move low node. /// -static SDValue getVZextMovL(EVT VT, EVT OpVT, +static SDValue getVZextMovL(MVT VT, EVT OpVT, SDValue SrcOp, SelectionDAG &DAG, const X86Subtarget *Subtarget, DebugLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { @@ -6214,14 +6295,14 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { if (NewOp.getNode()) return NewOp; - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned NumElems = VT.getVectorNumElements(); unsigned NumLaneElems = NumElems / 2; DebugLoc dl = SVOp->getDebugLoc(); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); + MVT EltVT = VT.getVectorElementType(); + MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); SDValue Output[2]; SmallVector Mask; @@ -6326,7 +6407,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); assert(VT.is128BitVector() && "Unsupported vector size"); @@ -6580,7 +6661,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { // Reduce a vector shuffle to zext. SDValue -X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { +X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { // PMOVZX is only available from SSE41. if (!Subtarget->hasSSE41()) return SDValue(); @@ -6624,9 +6705,10 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } + LLVMContext *Context = DAG.getContext(); unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; - EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift); + EVT NeVT = EVT::getIntegerVT(*Context, NBits); + EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift); if (!isTypeLegal(NVT)) return SDValue(); @@ -6645,8 +6727,21 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { // If it's foldable, i.e. normal load with single use, we will let code // selection to fold it. Otherwise, we will short the conversion sequence. if (CIdx && CIdx->getZExtValue() == 0 && - (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) + (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { + if (V.getValueSizeInBits() > V1.getValueSizeInBits()) { + // The "ext_vec_elt" node is wider than the result node. + // In this case we should extract subvector from V. 
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). + unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits(); + EVT FullVT = V.getValueType(); + EVT SubVecVT = EVT::getVectorVT(*Context, + FullVT.getVectorElementType(), + FullVT.getVectorNumElements()/Ratio); + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, + DAG.getIntPtrConstant(0)); + } V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); + } } return DAG.getNode(ISD::BITCAST, DL, VT, @@ -6656,7 +6751,7 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast(Op); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -6666,24 +6761,14 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // Handle splat operations if (SVOp->isSplat()) { - unsigned NumElem = VT.getVectorNumElements(); - // Use vbroadcast whenever the splat comes from a foldable load SDValue Broadcast = LowerVectorBroadcast(Op, DAG); if (Broadcast.getNode()) return Broadcast; - - // Handle splats by matching through known shuffle masks - if ((VT.is128BitVector() && NumElem <= 4) || - (VT.is256BitVector() && NumElem <= 8)) - return SDValue(); - - // All remaning splats are promoted to target supported vector shuffles. - return PromoteSplat(SVOp, DAG); } // Check integer expanding shuffles. - SDValue NewOp = lowerVectorIntExtend(Op, DAG); + SDValue NewOp = LowerVectorIntExtend(Op, DAG); if (NewOp.getNode()) return NewOp; @@ -6691,7 +6776,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // do it! if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || VT == MVT::v32i8) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || @@ -6699,18 +6784,18 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
if (ISD::isBuildVectorAllZeros(V2.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { - EVT NewVT = NewOp.getValueType(); + MVT NewVT = NewOp.getValueType().getSimpleVT(); if (isCommutedMOVLMask(cast(NewOp)->getMask(), NewVT, true, false)) return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { - EVT NewVT = NewOp.getValueType(); + MVT NewVT = NewOp.getValueType().getSimpleVT(); if (isMOVLMask(cast(NewOp)->getMask(), NewVT)) return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, dl); @@ -6725,7 +6810,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; @@ -6808,6 +6893,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { TargetMask, DAG); } + if (isPALIGNRMask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, + getShufflePALIGNRImmediate(SVOp), + DAG); + // Check if this can be converted into a logical shift. bool isLeft = false; unsigned ShAmt = 0; @@ -6816,7 +6906,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } @@ -6855,7 +6945,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (isShift) { // No better options. Use a vshldq / vsrldq. - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } @@ -6925,11 +7015,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore. 
- if (isPALIGNRMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, - getShufflePALIGNRImmediate(SVOp), - DAG); - if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { if (VT == MVT::v2f64 || VT == MVT::v2i64) @@ -7035,13 +7120,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue -X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); +static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); - if (!Op.getOperand(0).getValueType().is128BitVector()) + if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { @@ -7106,7 +7189,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); SDValue Vec = Op.getOperand(0); - EVT VecVT = Vec.getValueType(); + MVT VecVT = Vec.getValueType().getSimpleVT(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. @@ -7133,7 +7216,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return Res; } - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { @@ -7146,7 +7229,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. - EVT EltVT = MVT::i32; + MVT EltVT = MVT::i32; SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, @@ -7161,7 +7244,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast(Idx), -1, -1, -1 }; - EVT VVT = Op.getOperand(0).getValueType(); + MVT VVT = Op.getOperand(0).getValueType().getSimpleVT(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, @@ -7180,7 +7263,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. 
int Mask[2] = { 1, -1 }; - EVT VVT = Op.getOperand(0).getValueType(); + MVT VVT = Op.getOperand(0).getValueType().getSimpleVT(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, @@ -7190,11 +7273,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } -SDValue -X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - EVT EltVT = VT.getVectorElementType(); +static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); DebugLoc dl = Op.getDebugLoc(); SDValue N0 = Op.getOperand(0); @@ -7247,8 +7328,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - EVT EltVT = VT.getVectorElementType(); + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); DebugLoc dl = Op.getDebugLoc(); SDValue N0 = Op.getOperand(0); @@ -7296,7 +7377,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT OpVT = Op.getValueType(); + MVT OpVT = Op.getValueType().getSimpleVT(); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. @@ -7511,8 +7592,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, - int64_t Offset, - SelectionDAG &DAG) const { + int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. unsigned char OpFlags = @@ -7580,10 +7660,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. @@ -7732,7 +7812,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), - getTargetMachine().getRelocationModel() == Reloc::PIC_); + getTargetMachine().getRelocationModel() == Reloc::PIC_); } llvm_unreachable("Unknown TLS model."); } @@ -7782,7 +7862,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Chain.getValue(1)); } - if (Subtarget->isTargetWindows()) { + if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) { // Just use the implicit TLS architecture // Need to generate someting similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -7802,18 +7882,19 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or - // %gs:0x58 (64-bit). + // %gs:0x58 (64-bit). 
On MinGW, __tls_array is not available, so directly + // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); - SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, - Subtarget->is64Bit() - ? DAG.getIntPtrConstant(0x58) - : DAG.getExternalSymbol("_tls_array", - getPointerTy()), + SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) : + (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) : + DAG.getExternalSymbol("_tls_array", getPointerTy())); + + SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, false, false, 0); @@ -7892,7 +7973,7 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ } SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, @@ -8015,9 +8096,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SmallVector CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); @@ -8111,7 +8194,8 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SVT == MVT::v8i8 || SVT == MVT::v8i16) && "Custom UINT_TO_FP is not supported!"); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements()); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + SVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); } @@ -8172,13 +8256,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; - SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, - MVT::i64, MMO); + SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, + array_lengthof(Ops), MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. 
- SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), + SDValue SignSet = DAG.getSetCC(dl, + getSetCCResultType(*DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, MVT::i64), ISD::SETLT); @@ -8204,8 +8289,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); } -std::pair X86TargetLowering:: -FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { +std::pair +X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { DebugLoc DL = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); @@ -8264,8 +8350,8 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, - DstTy, MMO); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, + array_lengthof(Ops), DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); @@ -8279,7 +8365,8 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, 3, DstTy, MMO); + Ops, array_lengthof(Ops), DstTy, + MMO); return std::make_pair(FIST, StackSlot); } else { SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, @@ -8291,17 +8378,17 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co MVT::i32, eax.getValue(2)); SDValue Ops[] = { eax, edx }; SDValue pair = IsReplace - ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) - : DAG.getMergeValues(Ops, 2, DL); + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops)) + : DAG.getMergeValues(Ops, array_lengthof(Ops), DL); return std::make_pair(pair, SDValue()); } } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { - EVT VT = Op->getValueType(0); + MVT VT = Op->getValueType(0).getSimpleVT(); SDValue In = Op->getOperand(0); - EVT InVT = In.getValueType(); + MVT InVT = In.getValueType().getSimpleVT(); DebugLoc dl = Op->getDebugLoc(); // Optimize vectors in AVX mode: @@ -8330,7 +8417,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? 
ZeroVec : Undef); - EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); @@ -8352,9 +8439,9 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); @@ -8382,11 +8469,11 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi); } -SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. @@ -8501,9 +8588,10 @@ SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) { - if (Op.getValueType() == MVT::v8i16) - return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(), + MVT VT = Op.getValueType().getSimpleVT(); + if (VT.isVector()) { + if (VT == MVT::v8i16) + return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT, DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), MVT::v8i32, Op.getOperand(0))); return SDValue(); @@ -8542,12 +8630,11 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return FIST; } -SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -8559,8 +8646,8 @@ SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT EltVT = VT; + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 
2 : 4; if (VT.isVector()) { EltVT = VT.getVectorElementType(); @@ -8568,9 +8655,11 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { } Constant *C; if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, ~(1ULL << 63)))); else - C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, + APInt(32, ~(1U << 31)))); C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast(CPIdx)->getAlignment(); @@ -8591,8 +8680,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT EltVT = VT; + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; if (VT.isVector()) { EltVT = VT.getVectorElementType(); @@ -8600,9 +8689,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { } Constant *C; if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 1ULL << 63))); else - C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, + APInt(32, 1U << 31))); C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast(CPIdx)->getAlignment(); @@ -8626,8 +8717,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT SrcVT = Op1.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); + MVT SrcVT = Op1.getValueType().getSimpleVT(); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -8646,13 +8737,15 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // First get the sign bit of second operand. SmallVector CV; if (SrcVT == MVT::f64) { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); + const fltSemantics &Sem = APFloat::IEEEdouble; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); } else { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + const fltSemantics &Sem = APFloat::IEEEsingle; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8675,13 +8768,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Clear first operand sign bit. 
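Aside on the constants built just above: LowerFABS, LowerFNEG, and LowerFCOPYSIGN all operate on the raw IEEE-754 bit patterns, so FABS is an AND with ~(1 << 63) (or ~(1 << 31) for f32), FNEG is an XOR with the sign bit, and FCOPYSIGN combines the cleared magnitude of the first operand with the isolated sign bit of the second. A minimal scalar C++ sketch of the same identities; the helper names (toBits, copySignBits, etc.) are illustrative only, not part of the patch:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // Reinterpret a double as its 64-bit pattern and back (pre-C++20 bit_cast).
    static uint64_t toBits(double D) { uint64_t B; std::memcpy(&B, &D, 8); return B; }
    static double fromBits(uint64_t B) { double D; std::memcpy(&D, &B, 8); return D; }

    // fabs: clear the sign bit, mirroring the ~(1ULL << 63) constant-pool mask.
    static double absBits(double X) { return fromBits(toBits(X) & ~(1ULL << 63)); }
    // fneg: flip the sign bit, mirroring the (1ULL << 63) mask.
    static double negBits(double X) { return fromBits(toBits(X) ^ (1ULL << 63)); }
    // copysign: magnitude of X, sign of Y.
    static double copySignBits(double X, double Y) {
      return fromBits((toBits(X) & ~(1ULL << 63)) | (toBits(Y) & (1ULL << 63)));
    }

    int main() {
      assert(absBits(-2.5) == 2.5);
      assert(negBits(2.5) == -2.5);
      assert(copySignBits(3.0, -1.0) == -3.0);
      assert(std::signbit(copySignBits(0.0, -1.0))); // sign propagates even to zero
      return 0;
    }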
CV.clear(); if (VT == MVT::f64) { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); + const fltSemantics &Sem = APFloat::IEEEdouble; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, + APInt(64, ~(1ULL << 63))))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); } else { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + const fltSemantics &Sem = APFloat::IEEEsingle; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, + APInt(32, ~(1U << 31))))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8697,7 +8794,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, @@ -8707,7 +8804,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. // -SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, + SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget->hasSSE41()) @@ -9105,14 +9203,6 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } if (LHS.getNode()) { - // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip - // the condition code later. - bool Invert = false; - if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) { - Invert = true; - LHS = LHS.getOperand(0); - } - // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because @@ -9129,9 +9219,6 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; - // Flip the condition if the LHS was a not instruction - if (Invert) - Cond = X86::GetOppositeBranchCondition(Cond); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, MVT::i8), BT); } @@ -9139,65 +9226,10 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, return SDValue(); } -SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - - if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); - - assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - - // Optimize to BT if possible. - // Lower (X & (1 << N)) == 0 to BT(X, N). 
- // Lower ((X >>u N) & 1) != 0 to BT(X, N). - // Lower ((X >>s N) & 1) != 0 to BT(X, N). - if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast(Op1)->isNullValue() && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) - return NewSetCC; - } - - // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of - // these. - if (Op1.getOpcode() == ISD::Constant && - (cast(Op1)->getZExtValue() == 1 || - cast(Op1)->isNullValue()) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - - // If the input is a setcc, then reuse the input setcc or use a new one with - // the inverted condition. - if (Op0.getOpcode() == X86ISD::SETCC) { - X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast(Op1)->isNullValue(); - if (!Invert) return Op0; - - CCode = X86::GetOppositeBranchCondition(CCode); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); - } - } - - bool isFP = Op1.getValueType().isFloatingPoint(); - unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); - if (X86CC == X86::COND_INVALID) - return SDValue(); - - SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); - EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), EFLAGS); -} - // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); @@ -9217,26 +9249,27 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } -SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); ISD::CondCode SetCCOpcode = cast(CC)->get(); - bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); + bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint(); DebugLoc dl = Op.getDebugLoc(); if (isFP) { #ifndef NDEBUG - EVT EltVT = Op0.getValueType().getVectorElementType(); + MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif @@ -9329,14 +9362,54 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). 
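Lower256IntVSETCC above follows the usual AVX1 pattern for integer operations that only exist at 128 bits: extract the low and high halves of each operand, run the compare per half, and CONCAT_VECTORS the two results. A rough stand-alone analogue of that split, using plain arrays in place of the vector types (sketch only, with invented helper names):

    #include <array>
    #include <cassert>
    #include <cstdint>

    using V4 = std::array<uint32_t, 4>;
    using V8 = std::array<uint32_t, 8>;

    // "128-bit" compare: one all-ones/all-zeros mask per lane, SSE style.
    static V4 cmpEq4(const V4 &A, const V4 &B) {
      V4 R{};
      for (unsigned i = 0; i != 4; ++i)
        R[i] = (A[i] == B[i]) ? 0xFFFFFFFFu : 0;
      return R;
    }

    // "256-bit" compare: extract halves, compare each, concatenate the results,
    // mirroring Lower256IntVSETCC's extract / compare / CONCAT_VECTORS shape.
    static V8 cmpEq8(const V8 &A, const V8 &B) {
      V4 LoA, HiA, LoB, HiB;
      for (unsigned i = 0; i != 4; ++i) {
        LoA[i] = A[i]; HiA[i] = A[i + 4];
        LoB[i] = B[i]; HiB[i] = B[i + 4];
      }
      V4 Lo = cmpEq4(LoA, LoB), Hi = cmpEq4(HiA, HiB);
      V8 R{};
      for (unsigned i = 0; i != 4; ++i) { R[i] = Lo[i]; R[i + 4] = Hi[i]; }
      return R;
    }

    int main() {
      V8 A{1, 2, 3, 4, 5, 6, 7, 8}, B{1, 0, 3, 0, 5, 0, 7, 0};
      V8 R = cmpEq8(A, B);
      assert(R[0] == 0xFFFFFFFFu && R[1] == 0 && R[4] == 0xFFFFFFFFu && R[5] == 0);
      return 0;
    }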
if (VT == MVT::v2i64) { - if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) - return SDValue(); + if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { + assert(Subtarget->hasSSE2() && "Don't know how to lower!"); + + // First cast everything to the right type. + Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); + Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. The lower + // compare is always unsigned. + SDValue SB; + if (FlipSigns) { + SB = DAG.getConstant(0x80000000U, MVT::v4i32); + } else { + SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); + SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); + SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Sign, Zero, Sign, Zero); + } + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); + + // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); + + // Create masks for only the low parts/high parts of the 64 bit integers. + const int MaskHi[] = { 1, 1, 3, 3 }; + const int MaskLo[] = { 0, 0, 2, 2 }; + SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); + SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); + SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); + Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); - // First cast everything to the right type, + // First cast everything to the right type. Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); @@ -9355,17 +9428,13 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { } } - // Since SSE has no unsigned integer comparisons, we need to flip the sign + // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. 
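The hunk above emulates a 64-bit signed greater-than on plain SSE2, which has no PCMPGTQ: the 32-bit low halves get their sign bits flipped (so the signed PCMPGTD behaves as an unsigned compare on them, and the high halves too when the original compare was unsigned), and the result is assembled as (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)). A minimal scalar sketch of that identity, checked against a native 64-bit compare; the function name is illustrative and the narrowing casts assume two's-complement behaviour:

    #include <cassert>
    #include <cstdint>

    // Signed 64-bit A > B using only 32-bit compares: high halves signed,
    // low halves unsigned, combined as (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)).
    static bool sgt64Via32(int64_t A, int64_t B) {
      int32_t AH = int32_t(uint64_t(A) >> 32), BH = int32_t(uint64_t(B) >> 32);
      // Flipping the sign bit turns a signed 32-bit compare into an unsigned one,
      // which is how the DAG code reuses PCMPGTD for the low halves.
      int32_t AL = int32_t(uint32_t(A)) ^ INT32_MIN;
      int32_t BL = int32_t(uint32_t(B)) ^ INT32_MIN;
      return (AH > BH) || ((AH == BH) && (AL > BL));
    }

    int main() {
      const int64_t Samples[] = {0, 1, -1, INT64_MAX, INT64_MIN,
                                 0x00000001FFFFFFFFLL, -0x00000001FFFFFFFFLL, 42};
      for (int64_t A : Samples)
        for (int64_t B : Samples)
          assert(sgt64Via32(A, B) == (A > B));
      return 0;
    }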
if (FlipSigns) { EVT EltVT = VT.getVectorElementType(); - SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), - EltVT); - std::vector SignBits(VT.getVectorNumElements(), SignBit); - SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], - SignBits.size()); - Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); - Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); + SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); @@ -9377,6 +9446,63 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { return Result; } +SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getValueType().getSimpleVT(); + + if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); + + assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + + // Optimize to BT if possible. + // Lower (X & (1 << N)) == 0 to BT(X, N). + // Lower ((X >>u N) & 1) != 0 to BT(X, N). + // Lower ((X >>s N) & 1) != 0 to BT(X, N). + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && + Op1.getOpcode() == ISD::Constant && + cast(Op1)->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); + if (NewSetCC.getNode()) + return NewSetCC; + } + + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of + // these. + if (Op1.getOpcode() == ISD::Constant && + (cast(Op1)->getZExtValue() == 1 || + cast(Op1)->isNullValue()) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + // If the input is a setcc, then reuse the input setcc or use a new one with + // the inverted condition. + if (Op0.getOpcode() == X86ISD::SETCC) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ + cast(Op1)->isNullValue(); + if (!Invert) return Op0; + + CCode = X86::GetOppositeBranchCondition(CCode); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + } + } + + bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint(); + unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + if (X86CC == X86::COND_INVALID) + return SDValue(); + + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), EFLAGS); +} + // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
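The relocated LowerSETCC above still routes (X & (1 << N)) == 0 and ((X >> N) & 1) != 0 patterns through LowerToBT, since a single BT plus a carry-flag SETCC avoids materializing the mask. The equivalences it relies on are easy to state in scalar form; this is a sketch with an invented helper name, not the lowering itself:

    #include <cassert>
    #include <cstdint>

    // All of the recognized patterns reduce to "test bit N of X", which is
    // exactly what BT computes into the carry flag.
    static bool bitTest(uint32_t X, unsigned N) { return (X >> N) & 1u; }

    int main() {
      for (uint32_t X : {0u, 1u, 0x80000000u, 0xdeadbeefu})
        for (unsigned N = 0; N != 32; ++N) {
          bool ViaMask  = (X & (1u << N)) != 0;   // (X & (1 << N)) != 0
          bool ViaShift = ((X >> N) & 1u) != 0;   // ((X >>u N) & 1) != 0
          assert(ViaMask == bitTest(X, N) && ViaShift == bitTest(X, N));
        }
      return 0;
    }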
static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); @@ -9499,7 +9625,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && @@ -9610,9 +9736,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op->getValueType(0); + MVT VT = Op->getValueType(0).getSimpleVT(); SDValue In = Op->getOperand(0); - EVT InVT = In.getValueType(); + MVT InVT = In.getValueType().getSimpleVT(); DebugLoc dl = Op->getDebugLoc(); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && @@ -9646,7 +9772,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); @@ -10155,7 +10281,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } -// getTargetVShiftNOde - Handle vector element shifts where the shift amount +// getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, SDValue SrcOp, SDValue ShAmt, @@ -10851,28 +10977,47 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. - // RDRAND intrinsics. + // RDRAND/RDSEED intrinsics. case Intrinsic::x86_rdrand_16: case Intrinsic::x86_rdrand_32: - case Intrinsic::x86_rdrand_64: { + case Intrinsic::x86_rdrand_64: + case Intrinsic::x86_rdseed_16: + case Intrinsic::x86_rdseed_32: + case Intrinsic::x86_rdseed_64: { + unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 || + IntNo == Intrinsic::x86_rdseed_32 || + IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED : + X86ISD::RDRAND; // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); - SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0)); + SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0)); - // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise - // return the value from Rand, which is always 0, casted to i32. + // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. + // Otherwise return the value from Rand, which is always 0, casted to i32. SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), DAG.getConstant(1, Op->getValueType(1)), DAG.getConstant(X86::COND_B, MVT::i32), SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, DAG.getVTList(Op->getValueType(1), MVT::Glue), - Ops, 4); + Ops, array_lengthof(Ops)); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } + + // XTEST intrinsics. 
+ case Intrinsic::x86_xtest: { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_NE, MVT::i8), + InTrans); + SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), + Ret, SDValue(InTrans.getNode(), 1)); + } } } @@ -10908,7 +11053,10 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; + unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, @@ -10928,21 +11076,23 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Handler = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, - Subtarget->is64Bit() ? X86::RBP : X86::EBP, - getPointerTy()); - unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); - - SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, - DAG.getIntPtrConstant(RegInfo->getSlotSize())); - StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); + EVT PtrVT = getPointerTy(); + unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || + (FrameReg == X86::EBP && PtrVT == MVT::i32)) && + "Invalid Frame Register!"); + SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); + unsigned StoreAddrReg = (PtrVT == MVT::i64) ? 
X86::RCX : X86::ECX; + + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, + DAG.getIntPtrConstant(RegInfo->getSlotSize())); + StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); - return DAG.getNode(X86ISD::EH_RETURN, dl, - MVT::Other, - Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); + return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, + DAG.getRegister(StoreAddrReg, PtrVT)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, @@ -11153,7 +11303,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), - Ops, 2, MVT::i16, MMO); + Ops, array_lengthof(Ops), MVT::i16, + MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, @@ -11420,16 +11571,12 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { - +static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - LLVMContext *Context = DAG.getContext(); - - if (!Subtarget->hasSSE2()) - return SDValue(); // Optimize shl/srl/sra with constant shift amount. if (isSplatVector(Amt.getNode())) { @@ -11541,19 +11688,229 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { } } + // Special case in 32-bit mode, where i64 is expanded into high and low parts. + if (!Subtarget->is64Bit() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Amt.getOpcode() == ISD::BITCAST && + Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getValueType().getVectorNumElements() / + VT.getVectorNumElements(); + unsigned RatioInLog2 = Log2_32_Ceil(Ratio); + uint64_t ShiftAmt = 0; + for (unsigned i = 0; i != Ratio; ++i) { + ConstantSDNode *C = dyn_cast(Amt.getOperand(i)); + if (C == 0) + return SDValue(); + // 6 == Log2(64) + ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); + } + // Check remaining shift amounts. 
+ for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = + dyn_cast(Amt.getOperand(i + j)); + if (C == 0) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) + return SDValue(); + } + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + return DAG.getNode(X86ISD::VSHLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + case ISD::SRL: + return DAG.getNode(X86ISD::VSRLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + case ISD::SRA: + return DAG.getNode(X86ISD::VSRAI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + } + } + + return SDValue(); +} + +static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || + VT == MVT::v4i32 || VT == MVT::v8i16 || + (Subtarget->hasInt256() && + ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || + VT == MVT::v8i32 || VT == MVT::v16i16))) { + SDValue BaseShAmt; + EVT EltVT = VT.getVectorElementType(); + + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned i, j; + for (i = 0; i != NumElts; ++i) { + if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) + continue; + break; + } + for (j = i; j != NumElts; ++j) { + SDValue Arg = Amt.getOperand(j); + if (Arg.getOpcode() == ISD::UNDEF) continue; + if (Arg != Amt.getOperand(i)) + break; + } + if (i != NumElts && j == NumElts) + BaseShAmt = Amt.getOperand(i); + } else { + if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) + Amt = Amt.getOperand(0); + if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && + cast(Amt)->isSplat()) { + SDValue InVec = Amt.getOperand(0); + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = InVec.getValueType().getVectorNumElements(); + unsigned i = 0; + for (; i != NumElts; ++i) { + SDValue Arg = InVec.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + BaseShAmt = Arg; + break; + } + } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { + if (ConstantSDNode *C = + dyn_cast(InVec.getOperand(2))) { + unsigned SplatIdx = + cast(Amt)->getSplatIndex(); + if (C->getZExtValue() == SplatIdx) + BaseShAmt = InVec.getOperand(1); + } + } + if (BaseShAmt.getNode() == 0) + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, + DAG.getIntPtrConstant(0)); + } + } + + if (BaseShAmt.getNode()) { + if (EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); + else if (EltVT.bitsLT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); + + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); + } + case ISD::SRA: + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v4i32: + case MVT::v8i16: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); + } + case ISD::SRL: + switch (VT.getSimpleVT().SimpleTy) { + default: return 
SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); + } + } + } + } + + // Special case in 32-bit mode, where i64 is expanded into high and low parts. + if (!Subtarget->is64Bit() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Amt.getOpcode() == ISD::BITCAST && + Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getValueType().getVectorNumElements() / + VT.getVectorNumElements(); + std::vector Vals(Ratio); + for (unsigned i = 0; i != Ratio; ++i) + Vals[i] = Amt.getOperand(i); + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + for (unsigned j = 0; j != Ratio; ++j) + if (Vals[j] != Amt.getOperand(i + j)) + return SDValue(); + } + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + case ISD::SHL: + return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); + case ISD::SRL: + return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); + case ISD::SRA: + return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); + } + } + + return SDValue(); +} + +SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + SDValue V; + + if (!Subtarget->hasSSE2()) + return SDValue(); + + V = LowerScalarImmediateShift(Op, DAG, Subtarget); + if (V.getNode()) + return V; + + V = LowerScalarVariableShift(Op, DAG, Subtarget); + if (V.getNode()) + return V; + + // AVX2 has VPSLLV/VPSRAV/VPSRLV. + if (Subtarget->hasInt256()) { + if (Op.getOpcode() == ISD::SRL && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v4i64 || VT == MVT::v8i32)) + return Op; + if (Op.getOpcode() == ISD::SHL && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v4i64 || VT == MVT::v8i32)) + return Op; + if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) + return Op; + } + // Lower SHL with variable shift amount. 
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { - Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), - DAG.getConstant(23, MVT::i32)); - - const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; - Constant *C = ConstantDataVector::get(*Context, CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); @@ -11562,8 +11919,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); // a = a << 5; - Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), - DAG.getConstant(5, MVT::i32)); + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); // Turn 'a' into a mask suitable for VSELECT @@ -11766,59 +12122,28 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, // fall through case MVT::v4i32: case MVT::v8i16: { - SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, - Op.getOperand(0), ShAmt, DAG); + // (sext (vzext x)) -> (vsext x) + SDValue Op0 = Op.getOperand(0); + SDValue Op00 = Op0.getOperand(0); + SDValue Tmp1; + // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. + if (Op0.getOpcode() == ISD::BITCAST && + Op00.getOpcode() == ISD::VECTOR_SHUFFLE) + Tmp1 = LowerVectorIntExtend(Op00, DAG); + if (Tmp1.getNode()) { + SDValue Tmp1Op0 = Tmp1.getOperand(0); + assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && + "This optimization is invalid without a VZEXT."); + return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); + } + + // If the above didn't work, then just use Shift-Left + Shift-Right. + Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG); return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); } } } -static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); - - // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. - // There isn't any reason to disable it if the target processor supports it. - if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { - SDValue Chain = Op.getOperand(0); - SDValue Zero = DAG.getConstant(0, MVT::i32); - SDValue Ops[] = { - DAG.getRegister(X86::ESP, MVT::i32), // Base - DAG.getTargetConstant(1, MVT::i8), // Scale - DAG.getRegister(0, MVT::i32), // Index - DAG.getTargetConstant(0, MVT::i32), // Disp - DAG.getRegister(0, MVT::i32), // Segment. 
- Zero, - Chain - }; - SDNode *Res = - DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, - array_lengthof(Ops)); - return SDValue(Res, 0); - } - - unsigned isDev = cast(Op.getOperand(5))->getZExtValue(); - if (!isDev) - return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); - - unsigned Op1 = cast(Op.getOperand(1))->getZExtValue(); - unsigned Op2 = cast(Op.getOperand(2))->getZExtValue(); - unsigned Op3 = cast(Op.getOperand(3))->getZExtValue(); - unsigned Op4 = cast(Op.getOperand(4))->getZExtValue(); - - // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; - if (!Op1 && !Op2 && !Op3 && Op4) - return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); - - // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; - if (Op1 && !Op2 && !Op3 && !Op4) - return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); - - // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), - // (MFENCE)>; - return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); -} - static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); @@ -11847,9 +12172,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, Zero, Chain }; - SDNode *Res = - DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, - array_lengthof(Ops)); + SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); return SDValue(Res, 0); } @@ -11883,7 +12206,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, - Ops, 5, T, MMO); + Ops, array_lengthof(Ops), T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); return cpOut; @@ -11905,7 +12228,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), rdx.getValue(1) }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); } SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { @@ -11995,13 +12318,63 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { Op.getOperand(1), Op.getOperand(2)); } +SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); + + // For MacOSX, we want to call an alternative entry point: __sincos_stret, + // which returns the values as { float, float } (in XMM0) or + // { double, double } (which is returned in XMM0, XMM1). + DebugLoc dl = Op.getDebugLoc(); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + bool isF64 = ArgVT == MVT::f64; + // Only optimize x86_64 for now. i386 is a bit messy. For f32, + // the small struct {f32, f32} is returned in (eax, edx). For f64, + // the results are returned via SRet in memory. + const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + Type *RetTy = isF64 + ? 
(Type*)StructType::get(ArgTy, ArgTy, NULL) + : (Type*)VectorType::get(ArgTy, 4); + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, + false, false, false, false, 0, + CallingConv::C, /*isTaillCall=*/false, + /*doesNotRet=*/false, /*isReturnValueUsed*/true, + Callee, Args, DAG, dl); + std::pair CallResult = LowerCallTo(CLI); + + if (isF64) + // Returned in xmm0 and xmm1. + return CallResult.first; + + // Returned in bits 0:31 and 32:64 xmm0. + SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, + CallResult.first, DAG.getIntPtrConstant(0)); + SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, + CallResult.first, DAG.getIntPtrConstant(1)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); - case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); @@ -12024,13 +12397,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); - case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); + case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); - case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -12077,6 +12450,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); } } @@ -12117,7 +12491,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl&Results, SDValue Ops[] = { Chain, In1, In2L, In2H }; SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64, cast(Node)->getMemOperand()); SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); @@ -12164,7 +12538,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::UINT_TO_FP: { - if (N->getOperand(0).getValueType() != MVT::v2i32 && + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (N->getOperand(0).getValueType() != MVT::v2i32 || N->getValueType(0) != MVT::v2f32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, @@ -12196,7 +12571,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, 
eax.getValue(2)); // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { eax, edx }; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, + array_lengthof(Ops))); Results.push_back(edx.getValue(1)); return; } @@ -12235,7 +12611,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, - Ops, 3, T, MMO); + Ops, array_lengthof(Ops), T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -12416,7 +12792,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; - case X86ISD::PALIGN: return "X86ISD::PALIGN"; + case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; @@ -12447,6 +12823,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; + case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; @@ -12455,6 +12832,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; + case X86ISD::XTEST: return "X86ISD::XTEST"; } } @@ -12806,13 +13184,16 @@ static unsigned getPseudoCMOVOpc(EVT VT) { // to // // ... -// EAX = LOAD MI.addr +// t1 = LOAD MI.addr // loop: -// t1 = OP MI.val, EAX -// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] +// t4 = phi(t1, t3 / loop) +// t2 = OP MI.val, t4 +// EAX = t4 +// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined] +// t3 = EAX // JNE loop // sink: -// dst = EAX +// dst = t3 // ... 
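The block comment above describes the compare-exchange loop that EmitAtomicLoadArith now builds with virtual registers and PHI nodes rather than keeping EAX live across the loop. The control flow it emits is the familiar read-modify-CAS retry loop; roughly, in C++ terms (a sketch of the shape only, not of the MachineInstr output, using an atomic-max as the sample operation):

    #include <atomic>
    #include <cassert>

    // Shape of the expansion: load once, then loop {compute, cmpxchg} until the
    // exchange succeeds. The value observed by a failing cmpxchg feeds the next
    // iteration, which is what the "t4 = phi(t1, t3)" in the comment models.
    static int atomicMax(std::atomic<int> &Addr, int Val) {
      int Old = Addr.load();                       // t1 = LOAD MI.addr
      while (true) {
        int New = Val > Old ? Val : Old;           // t2 = OP MI.val, t4
        if (Addr.compare_exchange_weak(Old, New))  // LCMPXCHG; on failure Old is refreshed (t3)
          return Old;                              // dst: value seen before the update
      }
    }

    int main() {
      std::atomic<int> A(5);
      assert(atomicMax(A, 9) == 5 && A.load() == 9);
      assert(atomicMax(A, 3) == 9 && A.load() == 9);
      return 0;
    }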
MachineBasicBlock * X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, @@ -12827,7 +13208,7 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, MachineFunction::iterator I = MBB; ++I; - assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 && + assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && "Unexpected number of operands"); assert(MI->hasOneMemOperand() && @@ -12849,7 +13230,11 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, const TargetRegisterClass *RC = MRI.getRegClass(DstReg); MVT::SimpleValueType VT = *RC->vt_begin(); - unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT); + unsigned t1 = MRI.createVirtualRegister(RC); + unsigned t2 = MRI.createVirtualRegister(RC); + unsigned t3 = MRI.createVirtualRegister(RC); + unsigned t4 = MRI.createVirtualRegister(RC); + unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT); unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); unsigned LOADOpc = getLoadOpcode(VT); @@ -12857,12 +13242,16 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, // For the atomic load-arith operator, we generate // // thisMBB: - // EAX = LOAD [MI.addr] + // t1 = LOAD [MI.addr] // mainMBB: + // t4 = phi(t1 / thisMBB, t3 / mainMBB) // t1 = OP MI.val, EAX + // EAX = t4 // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] + // t3 = EAX // JNE mainMBB // sinkMBB: + // dst = t3 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); @@ -12878,23 +13267,34 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } + for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { + unsigned flags = (*MMOI)->getFlags(); + flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; + MachineMemOperand *MMO = + MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, + (*MMOI)->getSize(), + (*MMOI)->getBaseAlignment(), + (*MMOI)->getTBAAInfo(), + (*MMOI)->getRanges()); + MIB.addMemOperand(MMO); + } thisMBB->addSuccessor(mainMBB); // mainMBB: MachineBasicBlock *origMainMBB = mainMBB; - mainMBB->addLiveIn(AccPhyReg); - // Copy AccPhyReg as it is used more than once. - unsigned AccReg = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg) - .addReg(AccPhyReg); + // Add a PHI. 
+ MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4) + .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - unsigned t1 = MRI.createVirtualRegister(RC); unsigned Opc = MI->getOpcode(); switch (Opc) { default: @@ -12912,20 +13312,20 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, case X86::ATOMXOR32: case X86::ATOMXOR64: { unsigned ARITHOpc = getNonAtomicOpcode(Opc); - BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg) - .addReg(AccReg); + BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg) + .addReg(t4); break; } case X86::ATOMNAND8: case X86::ATOMNAND16: case X86::ATOMNAND32: case X86::ATOMNAND64: { - unsigned t2 = MRI.createVirtualRegister(RC); + unsigned Tmp = MRI.createVirtualRegister(RC); unsigned NOTOpc; unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); - BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg) - .addReg(AccReg); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2); + BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg) + .addReg(t4); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp); break; } case X86::ATOMMAX8: @@ -12949,20 +13349,22 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, BuildMI(mainMBB, DL, TII->get(CMPOpc)) .addReg(SrcReg) - .addReg(AccReg); + .addReg(t4); if (Subtarget->hasCMov()) { if (VT != MVT::i8) { // Native support - BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1) + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) .addReg(SrcReg) - .addReg(AccReg); + .addReg(t4); } else { // Promote i8 to i32 to use CMOV32 - const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32); + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterClass *RC32 = + TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); unsigned SrcReg32 = MRI.createVirtualRegister(RC32); unsigned AccReg32 = MRI.createVirtualRegister(RC32); - unsigned t2 = MRI.createVirtualRegister(RC32); + unsigned Tmp = MRI.createVirtualRegister(RC32); unsigned Undef = MRI.createVirtualRegister(RC32); BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); @@ -12973,15 +13375,15 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, .addImm(X86::sub_8bit); BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) .addReg(Undef) - .addReg(AccReg) + .addReg(t4) .addImm(X86::sub_8bit); - BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) + BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp) .addReg(SrcReg32) .addReg(AccReg32); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1) - .addReg(t2, 0, X86::sub_8bit); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2) + .addReg(Tmp, 0, X86::sub_8bit); } } else { // Use pseudo select and lower them. @@ -12990,36 +13392,47 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, unsigned SelOpc = getPseudoCMOVOpc(VT); X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); - MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1) - .addReg(SrcReg).addReg(AccReg) + MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2) + .addReg(SrcReg).addReg(t4) .addImm(CC); mainMBB = EmitLoweredSelect(MIB, mainMBB); + // Replace the original PHI node as mainMBB is changed after CMOV + // lowering. + BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4) + .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); + Phi->eraseFromParent(); } break; } } - // Copy AccPhyReg back from virtual register. 
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg) - .addReg(AccReg); + // Copy PhyReg back from virtual register. + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg) + .addReg(t4); MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); - MIB.addReg(t1); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } + MIB.addReg(t2); MIB.setMemRefs(MMOBegin, MMOEnd); + // Copy PhyReg back to virtual register. + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3) + .addReg(PhyReg); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); mainMBB->addSuccessor(origMainMBB); mainMBB->addSuccessor(sinkMBB); // sinkMBB: - sinkMBB->addLiveIn(AccPhyReg); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(AccPhyReg); + .addReg(t3); MI->eraseFromParent(); return sinkMBB; @@ -13036,15 +13449,24 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, // to // // ... -// EAX = LOAD [MI.addr + 0] -// EDX = LOAD [MI.addr + 4] +// t1L = LOAD [MI.addr + 0] +// t1H = LOAD [MI.addr + 4] // loop: -// EBX = OP MI.val.lo, EAX -// ECX = OP MI.val.hi, EDX +// t4L = phi(t1L, t3L / loop) +// t4H = phi(t1H, t3H / loop) +// t2L = OP MI.val.lo, t4L +// t2H = OP MI.val.hi, t4H +// EAX = t4L +// EDX = t4H +// EBX = t2L +// ECX = t2H // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] +// t3L = EAX +// t3H = EDX // JNE loop // sink: -// dst = EDX:EAX +// dstL = t3L +// dstH = t3H // ... MachineBasicBlock * X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, @@ -13059,7 +13481,7 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, MachineFunction::iterator I = MBB; ++I; - assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && + assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && "Unexpected number of operands"); assert(MI->hasOneMemOperand() && @@ -13085,20 +13507,37 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, const TargetRegisterClass *RC = &X86::GR32RegClass; const TargetRegisterClass *RC8 = &X86::GR8RegClass; + unsigned t1L = MRI.createVirtualRegister(RC); + unsigned t1H = MRI.createVirtualRegister(RC); + unsigned t2L = MRI.createVirtualRegister(RC); + unsigned t2H = MRI.createVirtualRegister(RC); + unsigned t3L = MRI.createVirtualRegister(RC); + unsigned t3H = MRI.createVirtualRegister(RC); + unsigned t4L = MRI.createVirtualRegister(RC); + unsigned t4H = MRI.createVirtualRegister(RC); + unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; unsigned LOADOpc = X86::MOV32rm; // For the atomic load-arith operator, we generate // // thisMBB: - // EAX = LOAD [MI.addr + 0] - // EDX = LOAD [MI.addr + 4] + // t1L = LOAD [MI.addr + 0] + // t1H = LOAD [MI.addr + 4] // mainMBB: - // EBX = OP MI.vallo, EAX - // ECX = OP MI.valhi, EDX + // t4L = phi(t1L / thisMBB, t3L / mainMBB) + // t4H = phi(t1H / thisMBB, t3H / mainMBB) + // t2L = OP MI.val.lo, t4L + // t2H = OP MI.val.hi, t4H + // EBX = t2L + // ECX = t2H // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] - // JNE mainMBB + // t3L = EAX + // t3H = EDX + // JNE loop // sinkMBB: + // dstL = t3L + // dstH = t3H MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); @@ -13115,35 +13554,50 @@ 
X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, // thisMBB: // Lo - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } + for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { + unsigned flags = (*MMOI)->getFlags(); + flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; + MachineMemOperand *MMO = + MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, + (*MMOI)->getSize(), + (*MMOI)->getBaseAlignment(), + (*MMOI)->getTBAAInfo(), + (*MMOI)->getRanges()); + MIB.addMemOperand(MMO); + }; + MachineInstr *LowMI = MIB; + // Hi - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX); + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - if (i == X86::AddrDisp) + if (i == X86::AddrDisp) { MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) - else - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } else { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } } - MIB.setMemRefs(MMOBegin, MMOEnd); + MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end()); thisMBB->addSuccessor(mainMBB); // mainMBB: MachineBasicBlock *origMainMBB = mainMBB; - mainMBB->addLiveIn(X86::EAX); - mainMBB->addLiveIn(X86::EDX); - - // Copy EDX:EAX as they are used more than once. - unsigned LoReg = MRI.createVirtualRegister(RC); - unsigned HiReg = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX); - unsigned t1L = MRI.createVirtualRegister(RC); - unsigned t1H = MRI.createVirtualRegister(RC); + // Add PHIs. 
+ MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) + .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); + MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) + .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); unsigned Opc = MI->getOpcode(); switch (Opc) { @@ -13156,19 +13610,23 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, case X86::ATOMSUB6432: { unsigned HiOpc; unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg); + BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) + .addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) + .addReg(SrcHiReg); break; } case X86::ATOMNAND6432: { unsigned HiOpc, NOTOpc; unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); - unsigned t2L = MRI.createVirtualRegister(RC); - unsigned t2H = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H); + unsigned TmpL = MRI.createVirtualRegister(RC); + unsigned TmpH = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) + .addReg(t4L); + BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) + .addReg(t4H); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); break; } case X86::ATOMMAX6432: @@ -13184,12 +13642,12 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, unsigned cc = MRI.createVirtualRegister(RC); // cl := cmp src_lo, lo BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcLoReg).addReg(LoReg); + .addReg(SrcLoReg).addReg(t4L); BuildMI(mainMBB, DL, TII->get(LoOpc), cL); BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); // ch := cmp src_hi, hi BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcHiReg).addReg(HiReg); + .addReg(SrcHiReg).addReg(t4H); BuildMI(mainMBB, DL, TII->get(HiOpc), cH); BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); // cc := if (src_hi == hi) ? cl : ch; @@ -13204,58 +13662,74 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, } BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L) - .addReg(SrcLoReg).addReg(LoReg); - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H) - .addReg(SrcHiReg).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) + .addReg(SrcLoReg).addReg(t4L); + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) + .addReg(SrcHiReg).addReg(t4H); } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L) - .addReg(SrcLoReg).addReg(LoReg) + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) + .addReg(SrcLoReg).addReg(t4L) .addImm(X86::COND_NE); mainMBB = EmitLoweredSelect(MIB, mainMBB); - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H) - .addReg(SrcHiReg).addReg(HiReg) + // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the + // 2nd CMOV lowering. 
+ mainMBB->addLiveIn(X86::EFLAGS); + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) + .addReg(SrcHiReg).addReg(t4H) .addImm(X86::COND_NE); mainMBB = EmitLoweredSelect(MIB, mainMBB); + // Replace the original PHI node as mainMBB is changed after CMOV + // lowering. + BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L) + .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); + BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H) + .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); + PhiL->eraseFromParent(); + PhiH->eraseFromParent(); } break; } case X86::ATOMSWAP6432: { unsigned HiOpc; unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg); + BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg); break; } } // Copy EDX:EAX back from HiReg:LoReg - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H); // Copy ECX:EBX from t1H:t1L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H); MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); + if (NewMO.isReg()) + NewMO.setIsKill(false); + MIB.addOperand(NewMO); + } MIB.setMemRefs(MMOBegin, MMOEnd); + // Copy EDX:EAX back to t3H:t3L + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); mainMBB->addSuccessor(origMainMBB); mainMBB->addSuccessor(sinkMBB); // sinkMBB: - sinkMBB->addLiveIn(X86::EAX); - sinkMBB->addLiveIn(X86::EDX); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), DstLoReg) - .addReg(X86::EAX); + .addReg(t3L); BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), DstHiReg) - .addReg(X86::EDX); + .addReg(t3H); MI->eraseFromParent(); return sinkMBB; @@ -14624,7 +15098,8 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + array_lengthof(Ops), Ld->getMemoryVT(), Ld->getPointerInfo(), Ld->getAlignment(), @@ -15290,13 +15765,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { APInt A = cast(OpRHS.getOperand(0))->getAPIntValue(); - if (CondRHS.getConstantOperandVal(0) == -A-1) { - SmallVector V(VT.getVectorNumElements(), - DAG.getConstant(-A, VT.getScalarType())); + if (CondRHS.getConstantOperandVal(0) == -A-1) 
return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, - DAG.getNode(ISD::BUILD_VECTOR, DL, VT, - V.data(), V.size())); - } + DAG.getConstant(-A, VT)); } // Another special case: If C was a sign bit, the sub has been @@ -15320,6 +15791,51 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget)) return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS); + // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. + if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT && + Cond.getOpcode() == ISD::SETCC) { + + assert(Cond.getValueType().isVector() && + "vector select expects a vector selector!"); + + EVT IntVT = Cond.getValueType(); + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + if (!TValIsAllOnes && !FValIsAllZeros) { + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); + + if (TValIsAllZeros || FValIsAllOnes) { + SDValue CC = Cond.getOperand(2); + ISD::CondCode NewCC = + ISD::getSetCCInverse(cast(CC)->get(), + Cond.getOperand(0).getValueType().isInteger()); + Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); + std::swap(LHS, RHS); + TValIsAllOnes = FValIsAllOnes; + FValIsAllZeros = TValIsAllZeros; + } + } + + if (TValIsAllOnes || FValIsAllZeros) { + SDValue Ret; + + if (TValIsAllOnes && FValIsAllZeros) + Ret = Cond; + else if (TValIsAllOnes) + Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond, + DAG.getNode(ISD::BITCAST, DL, IntVT, RHS)); + else if (FValIsAllZeros) + Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond, + DAG.getNode(ISD::BITCAST, DL, IntVT, LHS)); + + return DAG.getNode(ISD::BITCAST, DL, VT, Ret); + } + } + // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits @@ -15380,6 +15896,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SDValue SetCC; const ConstantSDNode* C = 0; bool needOppositeCond = (CC == X86::COND_E); + bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast(Op1))) SetCC = Op2; @@ -15388,17 +15905,46 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { else // Quit if all operands are not constants. return SDValue(); - if (C->getZExtValue() == 1) + if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; - else if (C->getZExtValue() != 0) + checkAgainstTrue = true; + } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 or 1. return SDValue(); - // Skip 'zext' node. - if (SetCC.getOpcode() == ISD::ZERO_EXTEND) - SetCC = SetCC.getOperand(0); + bool truncatedToBoolWithAnd = false; + // Skip (zext $x), (trunc $x), or (and $x, 1) node. 
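The vector-select simplification added above relies on the selector coming from a vector compare, so every lane is either all ones or all zeros; select(mask, ~0, x) is then just (or mask, x) and select(mask, x, 0) is (and mask, x). A small SSE4.1 intrinsics sketch of the same identity (build with SSE4.1 enabled; the values and the check in main are illustrative only):

#include <immintrin.h>
#include <cstdio>
#include <cstring>

int main() {
  __m128i a    = _mm_set_epi32(4, 3, 2, 1);
  __m128i b    = _mm_set_epi32(1, 5, 2, 7);
  __m128i x    = _mm_set_epi32(11, 22, 33, 44);
  __m128i mask = _mm_cmpgt_epi32(a, b);               // each lane is 0 or 0xFFFFFFFF

  // select(mask, ~0, x): blendv picks ~0 where mask is set, x elsewhere ...
  __m128i sel = _mm_blendv_epi8(x, _mm_set1_epi32(-1), mask);
  __m128i orv = _mm_or_si128(mask, x);                 // ... which is just mask | x
  // (the select(mask, x, 0) case folds to mask & x the same way)

  printf("or form matches blend form: %d\n", !memcmp(&sel, &orv, sizeof sel)); // prints 1
  return 0;
}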
+ while (SetCC.getOpcode() == ISD::ZERO_EXTEND || + SetCC.getOpcode() == ISD::TRUNCATE || + SetCC.getOpcode() == ISD::AND) { + if (SetCC.getOpcode() == ISD::AND) { + int OpIdx = -1; + ConstantSDNode *CS; + if ((CS = dyn_cast(SetCC.getOperand(0))) && + CS->getZExtValue() == 1) + OpIdx = 1; + if ((CS = dyn_cast(SetCC.getOperand(1))) && + CS->getZExtValue() == 1) + OpIdx = 0; + if (OpIdx == -1) + break; + SetCC = SetCC.getOperand(OpIdx); + truncatedToBoolWithAnd = true; + } else + SetCC = SetCC.getOperand(0); + } switch (SetCC.getOpcode()) { + case X86ISD::SETCC_CARRY: + // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to + // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, + // i.e. it's a comparison against true but the result of SETCC_CARRY is not + // truncated to i1 using 'and'. + if (checkAgainstTrue && !truncatedToBoolWithAnd) + break; + assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && + "Invalid use of SETCC_CARRY!"); + // FALL THROUGH case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); @@ -15414,9 +15960,15 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { return SDValue(); // Quit if false value is not a constant. if (!FVal) { - // A special case for rdrand, where 0 is set if false cond is found. SDValue Op = SetCC.getOperand(0); - if (Op.getOpcode() != X86ISD::RDRAND) + // Skip 'zext' or 'trunc' node. + if (Op.getOpcode() == ISD::ZERO_EXTEND || + Op.getOpcode() == ISD::TRUNCATE) + Op = Op.getOperand(0); + // A special case for rdrand/rdseed, where 0 is set if false cond is + // found. + if ((Op.getOpcode() != X86ISD::RDRAND && + Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. @@ -15596,7 +16148,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, ConstantSDNode *CmpAgainst = 0; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast(Cond.getOperand(1))) && - dyn_cast(Cond.getOperand(0)) == 0) { + !isa(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast(FalseOp)) { @@ -15723,129 +16275,16 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts -/// when possible. +/// PerformShiftCombine - Combine shifts. static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); if (N->getOpcode() == ISD::SHL) { SDValue V = PerformSHLCombine(N, DAG); if (V.getNode()) return V; } - // On X86 with SSE2 support, we can transform this to a vector shift if - // all elements are shifted by the same amount. We can't do this in legalize - // because the a constant vector is typically transformed to a constant pool - // so we have no knowledge of the shift amount. 
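The SETCC_CARRY guard added above exists because that node materializes 0 or ~0 rather than 0 or 1, so testing it against "true" is only sound once it has been truncated to a single bit, i.e. once the (and $x, 1) the loop now looks through has been applied. A two-line plain C++ illustration of the hazard (the variable name is made up):

#include <cstdio>

int main() {
  int carryMask = -1;                    // what SETCC_CARRY produces when CF is set: ~0, not 1
  printf("%d\n", carryMask == 1);        // 0: comparing the raw mask against 'true' misfires
  printf("%d\n", (carryMask & 1) == 1);  // 1: masking down to one bit first makes it sound
  return 0;
}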
- if (!Subtarget->hasSSE2()) - return SDValue(); - - if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && - (!Subtarget->hasInt256() || - (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) - return SDValue(); - - SDValue ShAmtOp = N->getOperand(1); - EVT EltVT = VT.getVectorElementType(); - DebugLoc DL = N->getDebugLoc(); - SDValue BaseShAmt = SDValue(); - if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned i = 0; - for (; i != NumElts; ++i) { - SDValue Arg = ShAmtOp.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - BaseShAmt = Arg; - break; - } - // Handle the case where the build_vector is all undef - // FIXME: Should DAG allow this? - if (i == NumElts) - return SDValue(); - - for (; i != NumElts; ++i) { - SDValue Arg = ShAmtOp.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - if (Arg != BaseShAmt) { - return SDValue(); - } - } - } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && - cast(ShAmtOp)->isSplat()) { - SDValue InVec = ShAmtOp.getOperand(0); - if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = InVec.getValueType().getVectorNumElements(); - unsigned i = 0; - for (; i != NumElts; ++i) { - SDValue Arg = InVec.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - BaseShAmt = Arg; - break; - } - } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { - if (ConstantSDNode *C = dyn_cast(InVec.getOperand(2))) { - unsigned SplatIdx= cast(ShAmtOp)->getSplatIndex(); - if (C->getZExtValue() == SplatIdx) - BaseShAmt = InVec.getOperand(1); - } - } - if (BaseShAmt.getNode() == 0) { - // Don't create instructions with illegal types after legalize - // types has run. - if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) && - !DCI.isBeforeLegalize()) - return SDValue(); - - BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, - DAG.getIntPtrConstant(0)); - } - } else - return SDValue(); - - // The shift amount is an i32. - if (EltVT.bitsGT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); - else if (EltVT.bitsLT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); - - // The shift amount is identical so we can do a vector shift. - SDValue ValOp = N->getOperand(0); - switch (N->getOpcode()) { - default: - llvm_unreachable("Unknown shift opcode!"); - case ISD::SHL: - switch (VT.getSimpleVT().SimpleTy) { - default: return SDValue(); - case MVT::v2i64: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v4i64: - case MVT::v8i32: - case MVT::v16i16: - return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG); - } - case ISD::SRA: - switch (VT.getSimpleVT().SimpleTy) { - default: return SDValue(); - case MVT::v4i32: - case MVT::v8i16: - case MVT::v8i32: - case MVT::v16i16: - return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG); - } - case ISD::SRL: - switch (VT.getSimpleVT().SimpleTy) { - default: return SDValue(); - case MVT::v2i64: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v4i64: - case MVT::v8i32: - case MVT::v16i16: - return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG); - } - } + return SDValue(); } // CMPEQCombine - Recognize the distinctive (AND (setcc ...) 
(setcc ..)) @@ -15876,8 +16315,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, if (VT == MVT::f32 || VT == MVT::f64) { bool ExpectingFlags = false; // Check for any users that want flags: - for (SDNode::use_iterator UI = N->use_begin(), - UE = N->use_end(); + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); !ExpectingFlags && UI != UE; ++UI) switch (UI->getOpcode()) { default: @@ -16157,23 +16595,24 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, // Validate that the Mask operand is a vector sra node. // FIXME: what to do for bytes, since there is a psignb/pblendvb, but // there is no psrai.b - if (Mask.getOpcode() != X86ISD::VSRAI) - return SDValue(); - - // Check that the SRA is all signbits. - SDValue SraC = Mask.getOperand(1); - unsigned SraAmt = cast(SraC)->getZExtValue(); unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + unsigned SraAmt = ~0; + if (Mask.getOpcode() == ISD::SRA) { + SDValue Amt = Mask.getOperand(1); + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast(SclrAmt)) + SraAmt = C->getZExtValue(); + } + } else if (Mask.getOpcode() == X86ISD::VSRAI) { + SDValue SraC = Mask.getOperand(1); + SraAmt = cast(SraC)->getZExtValue(); + } if ((SraAmt + 1) != EltBits) return SDValue(); DebugLoc DL = N->getDebugLoc(); - // We are going to replace the AND, OR, NAND with either BLEND - // or PSIGN, which only look at the MSB. The VSRAI instruction - // does not affect the highest bit, so we can get rid of it. - Mask = Mask.getOperand(0); - // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. // psign = x.type == y.type == mask.type && y = sub(0, x); @@ -16182,7 +16621,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Unsupported VT for PSIGN"); - Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } // PBLENDVB only available on SSE 4.1 @@ -16340,8 +16779,41 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, EVT MemVT = Ld->getMemoryVT(); DebugLoc dl = Ld->getDebugLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + // On Sandybridge unaligned 256bit loads are inefficient. 
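The load combine that follows splits such loads in the DAG; an AVX intrinsics sketch of the equivalent source-level rewrite on pre-AVX2 parts (build with AVX enabled; the function names and float element type are illustrative only):

#include <immintrin.h>

// One unaligned 256-bit load ...
__m256 load_whole(const float *p) {
  return _mm256_loadu_ps(p);
}

// ... versus the split form the combine produces: two 128-bit loads glued
// back together with an insertf128, which Sandy Bridge handles better.
__m256 load_split(const float *p) {
  __m128 lo = _mm_loadu_ps(p);
  __m128 hi = _mm_loadu_ps(p + 4);
  return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
}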
ISD::LoadExtType Ext = Ld->getExtensionType(); + unsigned Alignment = Ld->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; + if (RegVT.is256BitVector() && !Subtarget->hasInt256() && + !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + unsigned NumElems = RegVT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + NumElems/2); + SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Alignment); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + std::min(16U, Alignment)); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load1.getValue(1), + Load2.getValue(1)); + + SDValue NewVec = DAG.getUNDEF(RegVT); + NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); + NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); + return DCI.CombineTo(N, NewVec, TF, true); + } // If this is a vector EXT Load then attempt to optimize it using a // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the @@ -16356,7 +16828,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); - unsigned RegSz = RegVT.getSizeInBits(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); @@ -16400,8 +16871,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegZize/MemVT.getScalarType().getSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && @@ -16470,10 +16941,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Build the arithmetic shift. unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - MemVT.getVectorElementType().getSizeInBits(); - SmallVector C(NumElems, - DAG.getConstant(Amt, RegVT.getScalarType())); - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size()); - Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV); + Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, + DAG.getConstant(Amt, RegVT)); return DCI.CombineTo(N, Shuff, TF, true); } @@ -16511,11 +16980,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // On Sandy Bridge, 256-bit memory operations are executed by two // 128-bit ports. However, on Haswell it is better to issue a single 256-bit // memory operation. 
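The store-side counterpart below follows the same reasoning for unaligned 256-bit stores when AVX2 is not available; a hedged intrinsics sketch of the rewritten form (function names are illustrative only):

#include <immintrin.h>

// Unsplit store ...
void store_whole(float *p, __m256 v) {
  _mm256_storeu_ps(p, v);
}

// ... and the pair of 128-bit stores the combine now emits instead,
// which suits Sandy Bridge's two 128-bit memory ports.
void store_split(float *p, __m256 v) {
  _mm_storeu_ps(p,     _mm256_castps256_ps128(v));    // low half
  _mm_storeu_ps(p + 4, _mm256_extractf128_ps(v, 1));  // high half
}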
+ unsigned Alignment = St->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; if (VT.is256BitVector() && !Subtarget->hasInt256() && - StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && - StoredVal.getNumOperands() == 2) { - SDValue Value0 = StoredVal.getOperand(0); - SDValue Value1 = StoredVal.getOperand(1); + StVT == VT && !IsAligned) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); + SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); SDValue Ptr0 = St->getBasePtr(); @@ -16523,10 +16997,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->isNonTemporal(), Alignment); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->isNonTemporal(), + std::min(16U, Alignment)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } @@ -16961,41 +17436,52 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (!VT.isVector()) return SDValue(); - SDValue In = N->getOperand(0); - EVT InVT = In.getValueType(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT ExtraVT = cast(N1)->getVT(); DebugLoc dl = N->getDebugLoc(); - unsigned ExtendedEltSize = VT.getVectorElementType().getSizeInBits(); - // Split SIGN_EXTEND operation to use vmovsx instruction when possible - if (InVT == MVT::v8i8) { - if (ExtendedEltSize > 16 && !Subtarget->hasInt256()) - In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, In); - if (ExtendedEltSize > 32) - In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i32, In); - return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, In); - } + // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the + // both SSE and AVX2 since there is no sign-extended shift right + // operation on a vector with 64-bit elements. + //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> + // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) + if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || + N0.getOpcode() == ISD::SIGN_EXTEND)) { + SDValue N00 = N0.getOperand(0); + + // EXTLOAD has a better solution on AVX2, + // it may be replaced with X86ISD::VSEXT node. 
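The v4i64 SIGN_EXTEND_INREG rewrite in this combine works around the lack of a 64-bit arithmetic shift right in SSE/AVX2 by doing the in-register sign extension in 32-bit lanes first and only widening afterwards. The same trick written with intrinsics, here sign-extending an i8 carried in each 32-bit lane out to i64; the lane width of 8 and the use of the AVX2 widening instruction are just for brevity, the DAG combine itself is not limited to that case:

#include <immintrin.h>

// There is no 64-bit vector arithmetic shift right before AVX-512, so
// sign-extend inside the 32-bit lanes (shift left, then arithmetic shift
// right) and widen to 64-bit lanes afterwards.
__m256i sext_i8_in_lane_to_i64(__m128i v) {
  __m128i in32 = _mm_srai_epi32(_mm_slli_epi32(v, 24), 24);  // sext_in_reg in v4i32
  return _mm256_cvtepi32_epi64(in32);                        // v4i32 -> v4i64 sign extend
}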
+ if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) + if (!ISD::isNormalLoad(N00.getNode())) + return SDValue(); - if ((InVT == MVT::v4i8 || InVT == MVT::v4i16) && - ExtendedEltSize > 32 && !Subtarget->hasInt256()) { - In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In); - return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, In); + if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { + SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, + N00, N1); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); + } } + return SDValue(); +} +static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget->hasFp256()) return SDValue(); - if (VT.is256BitVector()) { + EVT VT = N->getValueType(0); + if (VT.isVector() && VT.getSizeInBits() == 256) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) return R; @@ -17105,8 +17591,8 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Helper function of PerformSETCCCombine. It is to materialize "setb reg" -// as "sbb reg,reg", since it can be extended without zext and produces +// Helper function of PerformSETCCCombine. It is to materialize "setb reg" +// as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, MVT::i8, @@ -17124,13 +17610,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, SDValue EFLAGS = N->getOperand(1); if (CC == X86::COND_A) { - // Try to convert COND_A into COND_B in an attempt to facilitate + // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. 
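MaterializeSETB above leans on the classic trick that, after a compare, sbb reg,reg turns the carry flag directly into 0 or all-ones, which is often more useful than the 0/1 a plain setb yields. A hedged sketch of that idiom in GCC/Clang extended inline asm for x86 (the helper name is made up):

#include <cstdio>

// Returns ~0u when x < y (unsigned), 0 otherwise: cmp sets CF, sbb r,r gives -CF.
static unsigned ltMask(unsigned x, unsigned y) {
  unsigned m;
  __asm__("cmpl %2, %1\n\t"   // compute x - y, setting CF when x < y
          "sbbl %0, %0"       // m = m - m - CF = -CF, i.e. 0 or 0xFFFFFFFF
          : "=r"(m)
          : "r"(x), "r"(y)
          : "cc");
  return m;
}

int main() {
  printf("%08x %08x\n", ltMask(1, 2), ltMask(2, 1)); // ffffffff 00000000
  return 0;
}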
// - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(), @@ -17338,7 +17824,8 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, if (In.getOpcode() != X86ISD::VZEXT) return SDValue(); - return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0)); + return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), + In.getOperand(0)); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, @@ -17376,13 +17863,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); + case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: @@ -17565,7 +18053,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (AsmPieces.size() == 4 && AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && @@ -17583,7 +18071,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (AsmPieces.size() == 4 && AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && @@ -18063,7 +18551,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // really want an 8-bit or 32-bit register, map to the appropriate register // class and return the appropriate register. if (Res.second == &X86::GR16RegClass) { - if (VT == MVT::i8) { + if (VT == MVT::i8 || VT == MVT::i1) { unsigned DestReg = 0; switch (Res.first) { default: break; @@ -18076,7 +18564,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res.first = DestReg; Res.second = &X86::GR8RegClass; } - } else if (VT == MVT::i32) { + } else if (VT == MVT::i32 || VT == MVT::f32) { unsigned DestReg = 0; switch (Res.first) { default: break; @@ -18093,7 +18581,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res.first = DestReg; Res.second = &X86::GR32RegClass; } - } else if (VT == MVT::i64) { + } else if (VT == MVT::i64 || VT == MVT::f64) { unsigned DestReg = 0; switch (Res.first) { default: break;