cl::desc("Enable an experimental vector shuffle lowering code path."),
cl::Hidden);
+static cl::opt<int> ReciprocalEstimateRefinementSteps(
+ "x86-recip-refinement-steps", cl::init(1),
+ cl::desc("Specify the number of Newton-Raphson iterations applied to the "
+ "result of the hardware reciprocal estimate instruction."),
+ cl::NotHidden);
+
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+ makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
- VecIdx);
-
- return Result;
-
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
+
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes
+/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, SDLoc dl) {
* ElemsPerChunk);
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
- VecIdx);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
+
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
-/// we want. It need not be aligned to a 128-bit bounday. That makes
+/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG,SDLoc dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl) {
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
- if (TT.isOSBinFormatMachO()) {
- if (TT.getArch() == Triple::x86_64)
- return new X86_64MachoTargetObjectFile();
- return new TargetLoweringObjectFileMachO();
- }
-
- if (TT.isOSLinux())
- return new X86LinuxTargetObjectFile();
- if (TT.isOSBinFormatELF())
- return new TargetLoweringObjectFileELF();
- if (TT.isKnownWindowsMSVCEnvironment())
- return new X86WindowsTargetObjectFile();
- if (TT.isOSBinFormatCOFF())
- return new TargetLoweringObjectFileCOFF();
- llvm_unreachable("unknown subtarget type");
-}
-
// FIXME: This should stop caching the target machine as soon as
// we can remove resetOperationActions et al.
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
- : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+ : TargetLowering(TM) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
- // X86 is weird, it always uses i8 for shift amounts and setcc results.
+ // X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- // For 64-bit since we have so many registers use the ILP scheduler, for
- // 32-bit code use the register pressure specific scheduling.
+ // For 64-bit, since we have so many registers, use the ILP scheduler.
+ // For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget->isAtom())
setSchedulingPreference(Sched::ILP);
TM.getSubtarget<X86Subtarget>().getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
- // Bypass expensive divides on Atom when compiling with O2
- if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
- addBypassSlowDiv(32, 8);
- if (Subtarget->is64Bit())
+ // Bypass expensive divides on Atom when compiling with O2.
+ if (TM.getOptLevel() >= CodeGenOpt::Default) {
+ if (Subtarget->hasSlowDivide32())
+ addBypassSlowDiv(32, 8);
+ if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
addBypassSlowDiv(64, 16);
}
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ // Only provide customized ctpop vector bit twiddling for vector types we
+ // know to perform better than using the popcnt instructions on each vector
+ // element. If popcnt isn't supported, always provide the custom version.
+ if (!Subtarget->hasPOPCNT()) {
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
+ }
+
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+ // Only provide customized ctpop vector bit twiddling for vector types we
+ // know to perform better than using the popcnt instructions on each
+ // vector element. If popcnt isn't supported, always provide the custom
+ // version.
+ if (!Subtarget->hasPOPCNT())
+ setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
+
+ // Custom CTPOP always performs better on natively supported v8i32
+ setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
MVT VT = (MVT::SimpleValueType)i;
+ if (VT.getScalarSizeInBits() >= 32) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
- if (VT.is128BitVector())
+ if (VT.is128BitVector()) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+ }
// Do not attempt to custom lower other non-256-bit vectors
if (!VT.is256BitVector())
continue;
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
// Extract subvector is special because the value type
// (result) is 256/128-bit but the source is 512-bit wide.
- if (VT.is128BitVector() || VT.is256BitVector())
+ if (VT.is128BitVector() || VT.is256BitVector()) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+ }
if (VT.getVectorElementType() == MVT::i1)
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
}
}
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
- // Do not attempt to promote non-256-bit vectors
+ // Do not attempt to promote non-256-bit vectors.
if (!VT.is512BitVector())
continue;
setOperationAction(ISD::LOAD, MVT::v64i8, Legal);
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v64i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v32i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v64i8, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i16, Legal);
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
const MVT VT = (MVT::SimpleValueType)i;
const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- // Do not attempt to promote non-256-bit vectors
+ // Do not attempt to promote non-256-bit vectors.
if (!VT.is512BitVector())
continue;
- if ( EltSize < 32) {
+ if (EltSize < 32) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Legal);
}
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
+
+ setOperationAction(ISD::AND, MVT::v8i32, Legal);
+ setOperationAction(ISD::OR, MVT::v8i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v8i32, Legal);
+ setOperationAction(ISD::AND, MVT::v4i32, Legal);
+ setOperationAction(ISD::OR, MVT::v4i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v4i32, Legal);
}
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
setLibcallName(RTLIB::SINCOS_F32, "sincosf");
setLibcallName(RTLIB::SINCOS_F64, "sincos");
if (Subtarget->isTargetDarwin()) {
- // For MacOSX, we don't want to the normal expansion of a libcall to
- // sincos. We want to issue a libcall to __sincos_stret to avoid memory
- // traffic.
+ // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+ // We want to issue a libcall to __sincos_stret to avoid memory traffic.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
// Predictable cmov don't hurt on atom because it's in-order.
PredictableSelectIsExpensive = !Subtarget->isAtom();
-
+ EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
verifyIntrinsicTables();
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
- return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
- Subtarget->is64Bit();
+ return Subtarget->isTargetMachO() && Subtarget->is64Bit();
}
TargetLoweringBase::LegalizeTypeAction
return VT.changeVectorElementTypeToInteger();
}
-/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (MaxAlign == 16)
}
}
-/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
return Align;
}
-/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero that means it's safe to destination
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
return true;
}
-/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
-/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
-/// jumptable.
+/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget->is64Bit())
return Table;
}
-/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
-/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
-/// MCExpr.
+/// This returns the relocation base for the given PIC jumptable,
+/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
SDValue ValToCopy = OutVals[i];
EVT ValVT = ValToCopy.getValueType();
- // Promote values to the appropriate types
+ // Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
- "Unexpected FP-extend for return value.");
+ "Unexpected FP-extend for return value.");
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
return VT.bitsLT(MinVT) ? MinVT : VT;
}
-/// LowerCallResult - Lower the result values of a call into the
+/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
return StackStructReturn;
}
-/// ArgsAreStructReturn - Determines whether a function uses struct
-/// return semantics.
+/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
if (Ins.empty())
return StackStructReturn;
}
-/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
-/// by "Src" to address "Dst" with size and alignment information specified by
-/// the specific parameter attribute. The copy will be passed as a byval
-/// function parameter.
+/// Make a copy of an aggregate at address specified by "Src" to address
+/// "Dst" with size and alignment information specified by the specific
+/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
MachinePointerInfo(), MachinePointerInfo());
}
-/// IsTailCallConvention - Return true if the calling convention is one that
+/// Return true if the calling convention is one that
/// supports tail call optimization.
static bool IsTailCallConvention(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
return true;
}
-/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
bool GuaranteedTailCallOpt) {
MFI->CreateFixedObject(1, StackSize, true));
}
+ // Figure out if XMM registers are in use.
+ assert(!(MF.getTarget().Options.UseSoftFloat &&
+ Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
// 64-bit calling conventions support varargs and register parameters, so we
- // have to do extra work to spill them in the prologue or forward them to
- // musttail calls.
- if (Is64Bit && isVarArg &&
- (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
+ // have to do extra work to spill them in the prologue.
+ if (Is64Bit && isVarArg && MFI->hasVAStart()) {
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
}
}
- // Store them to the va_list returned by va_start.
- if (MFI->hasVAStart()) {
- if (IsWin64) {
- const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
- // Get to the caller-allocated home save location. Add 8 to account
- // for the return address.
- int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
- FuncInfo->setRegSaveFrameIndex(
+ if (IsWin64) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
- // Fixup to set vararg frame on shadow area (4 x i64).
- if (NumIntRegs < 4)
- FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
- } else {
- // For X86-64, if there are vararg parameters that are passed via
- // registers, then we must store them to their spots on the stack so
- // they may be loaded by deferencing the result of va_next.
- FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
- FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
- FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
- ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
- }
-
- // Store the integer parameter registers.
- SmallVector<SDValue, 8> MemOps;
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
- unsigned Offset = FuncInfo->getVarArgsGPOffset();
- for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(Offset));
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(
- FuncInfo->getRegSaveFrameIndex(), Offset),
- false, false, 0);
- MemOps.push_back(Store);
- Offset += 8;
- }
-
- if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
- // Now store the XMM (fp + vector) parameter registers.
- SmallVector<SDValue, 12> SaveXMMOps;
- SaveXMMOps.push_back(Chain);
- SaveXMMOps.push_back(ALVal);
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getRegSaveFrameIndex()));
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getVarArgsFPOffset()));
- SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
- LiveXMMRegs.end());
- MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other, SaveXMMOps));
- }
-
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
- // Add all GPRs, al, and XMMs to the list of forwards. We will add then
- // to the liveout set on a musttail call.
- assert(MFI->hasMustTailInVarArgFunc());
- auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
- typedef X86MachineFunctionInfo::Forward Forward;
-
- for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
- unsigned VReg =
- MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
- Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
- }
-
- if (!ArgXMMs.empty()) {
- unsigned ALVReg =
- MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
- Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
-
- for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
- unsigned VReg =
- MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
- Forwards.push_back(
- Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
- }
- }
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by deferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy());
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(Offset));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ FuncInfo->getRegSaveFrameIndex(), Offset),
+ false, false, 0);
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
+ // Now store the XMM (fp + vector) parameter registers.
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getRegSaveFrameIndex()));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getVarArgsFPOffset()));
+ SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+ LiveXMMRegs.end());
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ }
+
+ if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget->hasAVX512() &&
+ (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget->hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget->hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Conservatively forward AL on x86_64, since it might be used for varargs.
+ if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+ unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &F : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+ F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+ Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
}
}
false, false, 0);
}
-/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
+/// Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
return SDValue(OutRetAddr.getNode(), 1);
}
-/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
DAG.getConstant(NumXMMRegs, MVT::i8)));
}
- if (Is64Bit && isVarArg && IsMustTail) {
+ if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs =
- (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+ (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
// For kernel code model we know that all object resist in the negative half
// of 32bits address space. We may not accept negative offsets, since they may
// be just off and we may accept pretty large positive ones.
- if (M == CodeModel::Kernel && Offset > 0)
+ if (M == CodeModel::Kernel && Offset >= 0)
return true;
return false;
return true;
}
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+ return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+ // Speculate cttz only if we can directly use TZCNT.
+ return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+ // Speculate ctlz only if we can directly use LZCNT.
+ return Subtarget->hasLZCNT();
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (L, L+Pos]. or is undef.
+/// sequential range (Low, Low+Size]. or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
unsigned Pos, unsigned Size, int Low) {
for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
}
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
-/// the second operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
- if (VT == MVT::v4f32 || VT == MVT::v4i32 )
- return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
- if (VT == MVT::v2f64 || VT == MVT::v2i64)
- return (Mask[0] < 2 && Mask[1] < 2);
- return false;
+/// is suitable for input to PSHUFD. That is, it doesn't reference the other
+/// operand - by default will match for first operand.
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
+ bool TestSecondOperand = false) {
+ if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
+ VT != MVT::v2f64 && VT != MVT::v2i64)
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned Lo = TestSecondOperand ? NumElems : 0;
+ unsigned Hi = Lo + NumElems;
+
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
+ return false;
+
+ return true;
}
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
SDValue Vec;
if (VT.is128BitVector()) { // SSE
if (Subtarget->hasSSE2()) { // SSE2
- SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
} else { // SSE1
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
} else if (VT.is256BitVector()) { // AVX
if (Subtarget->hasInt256()) { // AVX2
- SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else {
// 256-bit logic and arithmetic instructions in AVX are all
// floating-point, no support for integer ops. Emit fp zeroed vectors.
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
}
} else if (VT.is512BitVector()) { // AVX-512
- SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
} else if (VT.getScalarType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
- SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+ SDValue Cst = DAG.getConstant(0, MVT::i1);
SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
} else
SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDValue Cst = DAG.getConstant(~0U, MVT::i32);
SDValue Vec;
if (VT.is256BitVector()) {
if (HasInt256) { // AVX2
}
/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
-static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
- unsigned NonZeros, unsigned NumNonZero,
- unsigned NumZero, SelectionDAG &DAG,
+static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
- // We know there's at least one non-zero element
- unsigned FirstNonZeroIdx = 0;
- SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
- while (FirstNonZero.getOpcode() == ISD::UNDEF ||
- X86::isZeroNode(FirstNonZero)) {
- ++FirstNonZeroIdx;
- FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+ // Find all zeroable elements.
+ bool Zeroable[4];
+ for (int i=0; i < 4; ++i) {
+ SDValue Elt = Op->getOperand(i);
+ Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
+ }
+ assert(std::count_if(&Zeroable[0], &Zeroable[4],
+ [](bool M) { return !M; }) > 1 &&
+ "We expect at least two non-zero elements!");
+
+ // We only know how to deal with build_vector nodes where elements are either
+ // zeroable or extract_vector_elt with constant index.
+ SDValue FirstNonZero;
+ unsigned FirstNonZeroIdx;
+ for (unsigned i=0; i < 4; ++i) {
+ if (Zeroable[i])
+ continue;
+ SDValue Elt = Op->getOperand(i);
+ if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Elt.getOperand(1)))
+ return SDValue();
+ // Make sure that this node is extracting from a 128-bit vector.
+ MVT VT = Elt.getOperand(0).getSimpleValueType();
+ if (!VT.is128BitVector())
+ return SDValue();
+ if (!FirstNonZero.getNode()) {
+ FirstNonZero = Elt;
+ FirstNonZeroIdx = i;
+ }
}
- if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
- return SDValue();
+ assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
+ SDValue V1 = FirstNonZero.getOperand(0);
+ MVT VT = V1.getSimpleValueType();
- SDValue V = FirstNonZero.getOperand(0);
- MVT VVT = V.getSimpleValueType();
- if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
- return SDValue();
+ // See if this build_vector can be lowered as a blend with zero.
+ SDValue Elt;
+ unsigned EltMaskIdx, EltIdx;
+ int Mask[4];
+ for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
+ if (Zeroable[EltIdx]) {
+ // The zero vector will be on the right hand side.
+ Mask[EltIdx] = EltIdx+4;
+ continue;
+ }
- unsigned FirstNonZeroDst =
- cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
- unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
- unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
- unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+ Elt = Op->getOperand(EltIdx);
+ // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
+ EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+ if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
+ break;
+ Mask[EltIdx] = EltIdx;
+ }
- for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
- SDValue Elem = Op.getOperand(Idx);
- if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
- continue;
+ if (EltIdx == 4) {
+ // Let the shuffle legalizer deal with blend operations.
+ SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+ if (V1.getSimpleValueType() != VT)
+ V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
+ }
- // TODO: What else can be here? Deal with it.
- if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
+ // See if we can lower this build_vector to a INSERTPS.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
- // TODO: Some optimizations are still possible here
- // ex: Getting one element from a vector, and the rest from another.
- if (Elem.getOperand(0) != V)
- return SDValue();
+ SDValue V2 = Elt.getOperand(0);
+ if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
+ V1 = SDValue();
- unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
- if (Dst == Idx)
- ++CorrectIdx;
- else if (IncorrectIdx == -1U) {
- IncorrectIdx = Idx;
- IncorrectDst = Dst;
- } else
- // There was already one element with an incorrect index.
- // We can't optimize this case to an insertps.
- return SDValue();
+ bool CanFold = true;
+ for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
+ if (Zeroable[i])
+ continue;
+
+ SDValue Current = Op->getOperand(i);
+ SDValue SrcVector = Current->getOperand(0);
+ if (!V1.getNode())
+ V1 = SrcVector;
+ CanFold = SrcVector == V1 &&
+ cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
}
- if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
- SDLoc dl(Op);
- EVT VT = Op.getSimpleValueType();
- unsigned ElementMoveMask = 0;
- if (IncorrectIdx == -1U)
- ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
- else
- ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+ if (!CanFold)
+ return SDValue();
- SDValue InsertpsMask =
- DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
- }
+ assert(V1.getNode() && "Expected at least two non-zero elements!");
+ if (V1.getSimpleValueType() != MVT::v4f32)
+ V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
+ if (V2.getSimpleValueType() != MVT::v4f32)
+ V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
- return SDValue();
+ // Ok, we can emit an INSERTPS instruction.
+ unsigned ZMask = 0;
+ for (int i = 0; i < 4; ++i)
+ if (Zeroable[i])
+ ZMask |= 1 << i;
+
+ unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
+ DAG.getIntPtrConstant(InsertPSMask));
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
}
/// getVShift - Return a vector logical shift node.
SDValue NewLd = SDValue();
- if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
- NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), 0);
NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), LDBase->getAlignment());
+ LDBase->getPointerInfo(), LDBase->isVolatile(),
+ LDBase->isNonTemporal(), LDBase->isInvariant(),
+ LDBase->getAlignment());
if (LDBase->hasAnyUseOfValue(1)) {
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
return NewLd;
}
- if (NumElems == 4 && LastLoadedElt == 1 &&
+
+ //TODO: The code below fires only for for loading the low v2i32 / v2f32
+ //of a v4i32 / v4f32. It's probably worth generalizing.
+ if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
if (!IsLoad)
return SDValue();
- if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (Subtarget->hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
AllContants = false;
NonConstIdx = idx;
NumNonConsts++;
- }
- else {
+ } else {
NumConsts++;
if (cast<ConstantSDNode>(In)->getZExtValue())
Immediate |= (1ULL << idx);
MVT::getIntegerVT(VT.getSizeInBits()));
DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
}
- else
+ else
DstVec = DAG.getUNDEF(VT);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(NonConstIdx),
/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
-///
+///
/// This is a helper function of PerformBUILD_VECTORCombine.
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
"Invalid Vector in input!");
-
+
bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
bool CanFold = true;
unsigned ExpectedVExtractIdx = BaseIdx;
}
/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
-/// a concat_vector.
+/// a concat_vector.
///
/// This is a helper function of PerformBUILD_VECTORCombine.
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
-/// horizontal binary operations.
+/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
// Try to match an SSE3 float HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
-
+
if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
} else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
// Try to match an SSSE3 integer HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
-
+
if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
}
-
+
if (!Subtarget->hasAVX())
return SDValue();
// Do this only if the target has AVX2.
if (Subtarget->hasAVX2())
return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
-
+
// Do not try to expand this build_vector into a pair of horizontal
// add/sub if we can emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
if (IsAllConstants)
return SDValue();
- // For AVX-length vectors, build the individual 128-bit pieces and use
+ // For AVX-length vectors, see if we can use a vector load to get all of the
+ // elements, otherwise build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> V;
for (unsigned i = 0; i != NumElems; ++i)
V.push_back(Op.getOperand(i));
+ // Check for a build vector of consecutive loads.
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
+ return LD;
+
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4) {
- SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
- NumZero, DAG, Subtarget, *this);
+ SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
if (V.getNode())
return V;
}
/// \brief Try to lower a vector shuffle as a byte rotation.
///
-/// We have a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of the concatenation of two vectors. This routine will
-/// try to generically lower a vector shuffle through such an instruction. It
-/// does not check for the availability of PALIGNR-based lowerings, only the
-/// applicability of this strategy to the given mask. This matches shuffle
-/// vectors that look like:
-///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such an pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
-///
+///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
else if (!Hi)
Hi = Lo;
- // Cast the inputs to v16i8 to match PALIGNR.
- Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
- Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
-
assert(VT.getSizeInBits() == 128 &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
+
// The actual rotate instruction rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector.
int Scale = 16 / Mask.size();
+ // SSSE3 targets can use the palignr instruction
+ if (Subtarget->hasSSSE3()) {
+ // Cast the inputs to v16i8 to match PALIGNR.
+ Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
+ DAG.getConstant(Rotation * Scale, MVT::i8)));
+ }
+
+ // Default SSE2 implementation
+ int LoByteShift = 16 - Rotation * Scale;
+ int HiByteShift = Rotation * Scale;
+
+ // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
+ Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
+
+ SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
+ DAG.getConstant(8 * LoByteShift, MVT::i8));
+ SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
+ DAG.getConstant(8 * HiByteShift, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
- DAG.getConstant(Rotation * Scale, MVT::i8)));
+ DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
}
/// \brief Compute whether each element of a shuffle is zeroable.
return Zeroable;
}
+/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// byte-shift instructions. The mask must consist of a shifted sequential
+/// shuffle from one of the input vectors and zeroable elements for the
+/// remaining 'shifted in' elements.
+///
+/// Note that this only handles 128-bit vector widths currently.
+static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ int Size = Mask.size();
+ int Scale = 16 / Size;
+
+ for (int Shift = 1; Shift < Size; Shift++) {
+ int ByteShift = Shift * Scale;
+
+ // PSRLDQ : (little-endian) right byte shift
+ // [ 5, 6, 7, zz, zz, zz, zz, zz]
+ // [ -1, 5, 6, 7, zz, zz, zz, zz]
+ // [ 1, 2, -1, -1, -1, -1, zz, zz]
+ bool ZeroableRight = true;
+ for (int i = Size - Shift; i < Size; i++) {
+ ZeroableRight &= Zeroable[i];
+ }
+
+ if (ZeroableRight) {
+ bool ValidShiftRight1 =
+ isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
+ bool ValidShiftRight2 =
+ isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
+
+ if (ValidShiftRight1 || ValidShiftRight2) {
+ // Cast the inputs to v2i64 to match PSRLDQ.
+ SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
+ SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
+ DAG.getConstant(ByteShift * 8, MVT::i8));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
+ }
+ }
+
+ // PSLLDQ : (little-endian) left byte shift
+ // [ zz, 0, 1, 2, 3, 4, 5, 6]
+ // [ zz, zz, -1, -1, 2, 3, 4, -1]
+ // [ zz, zz, zz, zz, zz, zz, -1, 1]
+ bool ZeroableLeft = true;
+ for (int i = 0; i < Shift; i++) {
+ ZeroableLeft &= Zeroable[i];
+ }
+
+ if (ZeroableLeft) {
+ bool ValidShiftLeft1 =
+ isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
+ bool ValidShiftLeft2 =
+ isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
+
+ if (ValidShiftLeft1 || ValidShiftLeft2) {
+ // Cast the inputs to v2i64 to match PSLLDQ.
+ SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
+ SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
+ DAG.getConstant(ByteShift * 8, MVT::i8));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
}
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return Shift;
+
// If we have a single input from V2 insert that into V1 if we can do so
// cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
Subtarget, DAG))
return Blend;
- // Try to use rotation instructions if available.
+ // Try to use byte rotation instructions.
+ // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v2i64, V1, V2, Mask, DAG))
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// We implement this with SHUFPD which is pretty lame because it will likely
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Shift;
+
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
Subtarget, DAG))
return Blend;
- // Try to use rotation instructions if available.
+ // Try to use byte rotation instructions.
+ // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v4i32, V1, V2, Mask, DAG))
+ DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// We implement this with SHUFPS because it can blend from two vectors.
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v8i16, V, V, Mask, DAG))
+ return Shift;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
- // Try to use rotation instructions if available.
- if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V, V, Mask, DAG))
- return Rotate;
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
+ return Rotate;
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
"to be V1-input shuffles.");
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Shift;
+
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
Subtarget, DAG))
return Blend;
- // Try to use rotation instructions if available.
- if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V1, V2, Mask, DAG))
- return Rotate;
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
ArrayRef<int> OrigMask = SVOp->getMask();
assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Try to use rotation instructions if available.
- if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, OrigMask, DAG))
- return Rotate;
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ return Rotate;
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
+ if (M >= NumElements + SplitNumElements)
+ UseHiV2 = true;
+ else
+ UseLoV2 = true;
V2BlendMask.push_back(M - NumElements);
V1BlendMask.push_back(-1);
BlendMask.push_back(SplitNumElements + i);
} else if (M >= 0) {
+ if (M >= SplitNumElements)
+ UseHiV1 = true;
+ else
+ UseLoV1 = true;
V2BlendMask.push_back(-1);
V1BlendMask.push_back(M);
BlendMask.push_back(i);
BlendMask.push_back(-1);
}
}
- SDValue V1Blend =
+
+ // Because the lowering happens after all combining takes place, we need to
+ // manually combine these blend masks as much as possible so that we create
+ // a minimal number of high-level vector shuffle nodes.
+
+ // First try just blending the halves of V1 or V2.
+ if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
+ return DAG.getUNDEF(SplitVT);
+ if (!UseLoV2 && !UseHiV2)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ if (!UseLoV1 && !UseHiV1)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+
+ SDValue V1Blend, V2Blend;
+ if (UseLoV1 && UseHiV1) {
+ V1Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
- SDValue V2Blend =
+ } else {
+ // We only use half of V1 so map the usage down into the final blend mask.
+ V1Blend = UseLoV1 ? LoV1 : HiV1;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
+ BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
+ }
+ if (UseLoV2 && UseHiV2) {
+ V2Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ } else {
+ // We only use half of V2 so map the usage down into the final blend mask.
+ V2Blend = UseLoV2 ? LoV2 : HiV2;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= SplitNumElements)
+ BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
+ }
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
SDValue Lo = HalfBlend(LoMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
+/// \brief Either split a vector in halves or decompose the shuffles and the
+/// blend.
+///
+/// This is provided as a good fallback for many lowerings of non-single-input
+/// shuffles with more than one 128-bit lane. In those cases, we want to select
+/// between splitting the shuffle into 128-bit components and stitching those
+/// back together vs. extracting the single-input shuffles and blending those
+/// results.
+static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
+ "lower single-input shuffles as it "
+ "could then recurse on itself.");
+ int Size = Mask.size();
+
+ // If this can be modeled as a broadcast of two elements followed by a blend,
+ // prefer that lowering. This is especially important because broadcasts can
+ // often fold with memory operands.
+ auto DoBothBroadcast = [&] {
+ int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
+ for (int M : Mask)
+ if (M >= Size) {
+ if (V2BroadcastIdx == -1)
+ V2BroadcastIdx = M - Size;
+ else if (M - Size != V2BroadcastIdx)
+ return false;
+ } else if (M >= 0) {
+ if (V1BroadcastIdx == -1)
+ V1BroadcastIdx = M;
+ else if (M != V1BroadcastIdx)
+ return false;
+ }
+ return true;
+ };
+ if (DoBothBroadcast())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+ DAG);
+
+ // If the inputs all stem from a single 128-bit lane of each input, then we
+ // split them rather than blending because the split will decompose to
+ // unusually few instructions.
+ int LaneCount = VT.getSizeInBits() / 128;
+ int LaneSize = Size / LaneCount;
+ SmallBitVector LaneInputs[2];
+ LaneInputs[0].resize(LaneCount, false);
+ LaneInputs[1].resize(LaneCount, false);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
+ if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ // Otherwise, just fall back to decomposed shuffles and a blend. This requires
+ // that the decomposed single-input shuffles don't end up here.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
DAG.getConstant(PermMask, MVT::i8));
}
+/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This will only succeed when the result of fixing the 128-bit lanes results
+/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
+/// each 128-bit lanes. This handles many cases where we can quickly blend away
+/// the lane crosses early and then use simpler shuffles within each lane.
+///
+/// FIXME: It might be worthwhile at some point to support this without
+/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
+/// in x86 only floating point has interesting non-repeating shuffles, and even
+/// those are still *marginally* more expensive.
+static SDValue lowerVectorShuffleByMerging128BitLanes(
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This is only useful with multiple inputs.");
+
+ int Size = Mask.size();
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ int NumLanes = Size / LaneSize;
+ assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+
+ // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
+ // check whether the in-128-bit lane shuffles share a repeating pattern.
+ SmallVector<int, 4> Lanes;
+ Lanes.resize(NumLanes, -1);
+ SmallVector<int, 4> InLaneMask;
+ InLaneMask.resize(LaneSize, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ int j = i / LaneSize;
+
+ if (Lanes[j] < 0) {
+ // First entry we've seen for this lane.
+ Lanes[j] = Mask[i] / LaneSize;
+ } else if (Lanes[j] != Mask[i] / LaneSize) {
+ // This doesn't match the lane selected previously!
+ return SDValue();
+ }
+
+ // Check that within each lane we have a consistent shuffle mask.
+ int k = i % LaneSize;
+ if (InLaneMask[k] < 0) {
+ InLaneMask[k] = Mask[i] % LaneSize;
+ } else if (InLaneMask[k] != Mask[i] % LaneSize) {
+ // This doesn't fit a repeating in-lane mask.
+ return SDValue();
+ }
+ }
+
+ // First shuffle the lanes into place.
+ MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
+ VT.getSizeInBits() / 64);
+ SmallVector<int, 8> LaneMask;
+ LaneMask.resize(NumLanes * 2, -1);
+ for (int i = 0; i < NumLanes; ++i)
+ if (Lanes[i] >= 0) {
+ LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
+ LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
+ }
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
+ SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+
+ // Cast it back to the type we actually want.
+ LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
+
+ // Now do a simple shuffle that isn't lane crossing.
+ SmallVector<int, 8> NewMask;
+ NewMask.resize(Size, -1);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
+ assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
+ "Must not introduce lane crosses at this point!");
+
+ return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+}
+
+/// \brief Test whether the specified input (0 or 1) is in-place blended by the
+/// given mask.
+///
+/// This returns true if the elements from a particular input are already in the
+/// slot required by the given mask and require no permutation.
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+ assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
+ return false;
+
+ return true;
+}
+
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
DAG.getConstant(SHUFPDMask, MVT::i8));
}
- // Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
- Mask, DAG);
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle. However, if we have AVX2 and either inputs are already in place,
+ // we will be able to shuffle even across lanes the other input in a single
+ // instruction so skip this pattern.
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // If we have AVX2 then we always want to lower with a blend because an v4 we
+ // can fully permute the elements.
+ if (Subtarget->hasAVX2())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
+ Mask, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle. However, if we have AVX2 and either inputs are already in place,
+ // we will be able to shuffle even across lanes the other input in a single
+ // instruction so skip this pattern.
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
Mask, DAG);
DAG);
}
- // Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
- Mask, DAG);
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // If we have AVX2 then we always want to lower with a blend because at v8 we
+ // can fully permute the elements.
+ if (Subtarget->hasAVX2())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
}
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
Mask, DAG);
Mask, Subtarget, DAG))
return Broadcast;
- // There are no generalized cross-lane shuffle operations available on i16
- // element types.
- if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG);
-
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Blend;
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
if (isSingleInputShuffleMask(Mask)) {
+ // There are no generalized cross-lane shuffle operations available on i16
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+ Mask, DAG);
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
}
- // Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG);
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
Mask, Subtarget, DAG))
return Broadcast;
- // There are no generalized cross-lane shuffle operations available on i8
- // element types.
- if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
- Mask, DAG);
-
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Blend;
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
if (isSingleInputShuffleMask(Mask)) {
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
+ Mask, DAG);
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 32; ++i)
PSHUFBMask[i] =
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
}
- // Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v32i8, V1, V2,
- Mask, DAG);
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
// When the number of V1 and V2 elements are the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum
- // indices for V2.
+ // indices for V2. When those are equal, try to ensure that the number of odd
+ // indices for V1 is lower than the number of odd indices for V2.
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : SVOp->getMask().slice(0, NumElements / 2))
SumV2Indices += i;
else if (SVOp->getMask()[i] >= 0)
SumV1Indices += i;
- if (SumV2Indices < SumV1Indices)
+ if (SumV2Indices < SumV1Indices) {
return DAG.getCommutedVectorShuffle(*SVOp);
+ } else if (SumV2Indices == SumV1Indices) {
+ int NumV1OddIndices = 0, NumV2OddIndices = 0;
+ for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+ if (SVOp->getMask()[i] >= NumElements)
+ NumV2OddIndices += i % 2;
+ else if (SVOp->getMask()[i] >= 0)
+ NumV1OddIndices += i % 2;
+ if (NumV2OddIndices < NumV1OddIndices)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+ }
}
}
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
-SDValue
+SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
// insert element and then truncate the result.
MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
- SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+ SDLoc dl(Op);
- if (SrcVT.isVector())
+ if (SrcVT.isVector()) {
+ if (SrcVT.getVectorElementType() == MVT::i1) {
+ MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
+ Op.getOperand(0)));
+ }
return SDValue();
+ }
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
return Op;
}
- SDLoc dl(Op);
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
EVT VecIntVT = V.getValueType();
bool Is128 = VecIntVT == MVT::v4i32;
EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+ // If we convert to something else than the supported type, e.g., to v4f64,
+ // abort early.
+ if (VecFloatVT != Op->getValueType(0))
+ return SDValue();
+
unsigned NumElts = VecIntVT.getVectorNumElements();
assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
"Unsupported custom type");
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
}
-
+
SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
SDValue CP = DAG.getConstantPool(C, getPointerTy());
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
}
-
+
unsigned EltBits = EltVT.getSizeInBits();
LLVMContext *Context = DAG.getContext();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
}
-
+
// If not vector, then scalar.
unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
- // First get the sign bit of second operand.
- SmallVector<Constant*,4> CV;
- if (SrcVT == MVT::f64) {
- const fltSemantics &Sem = APFloat::IEEEdouble;
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
- } else {
- const fltSemantics &Sem = APFloat::IEEEsingle;
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- }
+ const fltSemantics &Sem =
+ VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
+ const unsigned SizeInBits = VT.getSizeInBits();
+
+ SmallVector<Constant *, 4> CV(
+ VT == MVT::f64 ? 2 : 4,
+ ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
+
+ // First, clear all bits but the sign bit from the second operand (sign).
+ CV[0] = ConstantFP::get(*Context,
+ APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
Constant *C = ConstantVector::get(CV);
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
false, false, false, 16);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
- // Shift sign bit right or left if the two operands have different types.
- if (SrcVT.bitsGT(VT)) {
- // Op0 is MVT::f32, Op1 is MVT::f64.
- SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
- SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
- DAG.getConstant(32, MVT::i32));
- SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
- SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
- DAG.getIntPtrConstant(0));
- }
-
- // Clear first operand sign bit.
- CV.clear();
- if (VT == MVT::f64) {
- const fltSemantics &Sem = APFloat::IEEEdouble;
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
- APInt(64, ~(1ULL << 63)))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
- } else {
- const fltSemantics &Sem = APFloat::IEEEsingle;
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
- APInt(32, ~(1U << 31)))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
- }
+ // Next, clear the sign bit from the first operand (magnitude).
+ CV[0] = ConstantFP::get(
+ *Context, APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
false, false, false, 16);
SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
- // Or the value with the sign bit.
+ // OR the magnitude value with the sign bit.
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}
if (Op0.getValueType() == MVT::i1)
llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
}
-
+
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
- // Do the comparison at i32 if it's smaller, besides the Atom case.
- // This avoids subregister aliasing issues. Keep the smaller reference
- // if we're optimizing for size, however, as that'll allow better folding
+ // Do the comparison at i32 if it's smaller, besides the Atom case.
+ // This avoids subregister aliasing issues. Keep the smaller reference
+ // if we're optimizing for size, however, as that'll allow better folding
// of memory operations.
if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
!DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
return SDValue();
EVT VT = Op.getValueType();
-
+
// SSE1 has rsqrtss and rsqrtps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// significant digits in the divisor.
if (!Subtarget->useReciprocalEst())
return SDValue();
-
+
EVT VT = Op.getValueType();
-
+
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// along with FMA, this could be a throughput win.
if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
(Subtarget->hasAVX() && VT == MVT::v8f32)) {
- // TODO: Expose this as a user-configurable parameter to allow for
- // speed vs. accuracy flexibility.
- RefinementSteps = 1;
+ RefinementSteps = ReciprocalEstimateRefinementSteps;
return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
}
return SDValue();
cast<ConstantSDNode>(Op1)->isNullValue() &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
- if (NewSetCC.getNode())
+ if (NewSetCC.getNode()) {
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
return NewSetCC;
+ }
}
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
((Subtarget->hasDQI() && Subtarget->hasVLX() &&
VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
-
+
((Subtarget->hasDQI() && VT.is512BitVector() &&
VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
-
+
unsigned int NumElts = VT.getVectorNumElements();
if (NumElts != 8 && NumElts != 16)
return SDValue();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+ if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
+ return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ }
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
- bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+ bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
SplitStack;
SDLoc dl(Op);
static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
SelectionDAG &DAG) {
- assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
+ MVT SVT = ShAmt.getSimpleValueType();
+ assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
// Catch shift-by-constant.
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
}
- // Need to build a vector containing shift amount
- // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0
- SDValue ShOps[4];
- ShOps[0] = ShAmt;
- ShOps[1] = DAG.getConstant(0, MVT::i32);
- ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
+ const X86Subtarget &Subtarget =
+ DAG.getTarget().getSubtarget<X86Subtarget>();
+ if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+ ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
+ // Let the shuffle legalizer expand this shift amount node.
+ SDValue Op0 = ShAmt.getOperand(0);
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
+ ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
+ } else {
+ // Need to build a vector containing shift amount.
+ // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
+ SmallVector<SDValue, 4> ShOps;
+ ShOps.push_back(ShAmt);
+ if (SVT == MVT::i32) {
+ ShOps.push_back(DAG.getConstant(0, SVT));
+ ShOps.push_back(DAG.getUNDEF(SVT));
+ }
+ ShOps.push_back(DAG.getUNDEF(SVT));
+
+ MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
+ }
// The return type has to be a 128-bit type with the same element
// type as the input type.
return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
}
+/// \brief Creates an SDNode for a predicated scalar operation.
+/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
+/// The mask is comming as MVT::i8 and it should be truncated
+/// to MVT::i1 while lowering masking intrinsics.
+/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
+/// "X86select" instead of "vselect". We just can't create the "vselect" node for
+/// a scalar instruction.
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ if (isAllOnes(Mask))
+ return Op;
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ // The mask should be of type MVT::i1
+ SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
+}
+
static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
RoundingMode),
Mask, Src0, Subtarget, DAG);
}
-
+ case INTR_TYPE_SCALAR_MASK_RM: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src0 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ SDValue RoundingMode = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ RoundingMode),
+ Mask, Src0, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK: {
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+ Op.getOperand(2)),
+ Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+ }
case CMP_MASK:
case CMP_MASK_CC: {
// Comparison intrinsics with masks.
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
+ case VSHIFT_MASK:
+ return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
+ Op.getSimpleValueType(),
+ Op.getOperand(1),
+ Op.getOperand(2), DAG),
+ Op.getOperand(4), Op.getOperand(3), Subtarget,
+ DAG);
+ case COMPRESS_EXPAND_IN_REG: {
+ SDValue Mask = Op.getOperand(3);
+ SDValue DataToCompress = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ if (isAllOnes(Mask)) // return data as is
+ return Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDLoc dl(Op);
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
+ PassThru);
+ }
+ case BLEND: {
+ SDValue Mask = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDLoc dl(Op);
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
+ Op.getOperand(2));
+ }
+ case FMA_OP_MASK:
+ {
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Op.getOperand(1),
+ Op.getOperand(2),
+ Op.getOperand(3)),
+ Op.getOperand(4), Op.getOperand(1),
+ Subtarget, DAG);
+ }
default:
break;
}
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
- // Arithmetic intrinsics.
- case Intrinsic::x86_sse2_pmulu_dq:
- case Intrinsic::x86_avx2_pmulu_dq:
- return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse41_pmuldq:
- case Intrinsic::x86_avx2_pmul_dq:
- return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pmulhu_w:
- case Intrinsic::x86_avx2_pmulhu_w:
- return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pmulh_w:
- case Intrinsic::x86_avx2_pmulh_w:
- return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- // SSE/SSE2/AVX floating point max/min intrinsics.
- case Intrinsic::x86_sse_max_ps:
- case Intrinsic::x86_sse2_max_pd:
- case Intrinsic::x86_avx_max_ps_256:
- case Intrinsic::x86_avx_max_pd_256:
- case Intrinsic::x86_sse_min_ps:
- case Intrinsic::x86_sse2_min_pd:
- case Intrinsic::x86_avx_min_ps_256:
- case Intrinsic::x86_avx_min_pd_256: {
- unsigned Opcode;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_sse_max_ps:
- case Intrinsic::x86_sse2_max_pd:
- case Intrinsic::x86_avx_max_ps_256:
- case Intrinsic::x86_avx_max_pd_256:
- Opcode = X86ISD::FMAX;
- break;
- case Intrinsic::x86_sse_min_ps:
- case Intrinsic::x86_sse2_min_pd:
- case Intrinsic::x86_avx_min_ps_256:
- case Intrinsic::x86_avx_min_pd_256:
- Opcode = X86ISD::FMIN;
- break;
- }
- return DAG.getNode(Opcode, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
- }
-
- // AVX2 variable shift intrinsics
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256: {
- unsigned Opcode;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q_256:
- Opcode = ISD::SHL;
- break;
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q_256:
- Opcode = ISD::SRL;
- break;
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- Opcode = ISD::SRA;
- break;
- }
- return DAG.getNode(Opcode, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
- }
-
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packsswb:
- return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_packuswb_128:
- case Intrinsic::x86_sse41_packusdw:
- case Intrinsic::x86_avx2_packuswb:
- case Intrinsic::x86_avx2_packusdw:
- return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_ssse3_pshuf_b_128:
- case Intrinsic::x86_avx2_pshuf_b:
- return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pshuf_d:
- return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pshufl_w:
- return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_sse2_pshufh_w:
- return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_ssse3_psign_b_128:
- case Intrinsic::x86_ssse3_psign_w_128:
- case Intrinsic::x86_ssse3_psign_d_128:
- case Intrinsic::x86_avx2_psign_b:
- case Intrinsic::x86_avx2_psign_w:
- case Intrinsic::x86_avx2_psign_d:
- return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::x86_avx2_permd:
- case Intrinsic::x86_avx2_permps:
- // Operands intentionally swapped. Mask is last operand to intrinsic,
- // but second operand for node/instruction.
- return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
- Op.getOperand(2), Op.getOperand(1));
-
case Intrinsic::x86_avx512_mask_valign_q_512:
case Intrinsic::x86_avx512_mask_valign_d_512:
// Vector source operands are swapped.
switch(IntrData->Type) {
default:
llvm_unreachable("Unknown Intrinsic Type");
- break;
+ break;
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
Results.push_back(Store);
return DAG.getMergeValues(Results, dl);
}
+ case COMPRESS_TO_MEM: {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+ SDValue DataToCompress = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+
+ if (isAllOnes(Mask)) // return just a store
+ return DAG.getStore(Chain, dl, DataToCompress, Addr,
+ MachinePointerInfo(), false, false, 0);
+
+ EVT VT = DataToCompress.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+
+ SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask,
+ DataToCompress, DAG.getUNDEF(VT));
+ return DAG.getStore(Chain, dl, Compressed, Addr,
+ MachinePointerInfo(), false, false, 0);
+ }
+ case EXPAND_FROM_MEM: {
+ SDLoc dl(Op);
+ SDValue Mask = Op.getOperand(4);
+ SDValue PathThru = Op.getOperand(3);
+ SDValue Addr = Op.getOperand(2);
+ SDValue Chain = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+
+ if (isAllOnes(Mask)) // return just a load
+ return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
+ false, 0);
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+
+ SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
+ false, false, false, 0);
+
+ SmallVector<SDValue, 2> Results;
+ Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
+ PathThru));
+ Results.push_back(Chain);
+ return DAG.getMergeValues(Results, dl);
+ }
}
}
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
DAG.getSubtarget().getRegisterInfo());
- unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
+ DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue BaseShAmt;
EVT EltVT = VT.getVectorElementType();
- if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
- unsigned NumElts = VT.getVectorNumElements();
- unsigned i, j;
- for (i = 0; i != NumElts; ++i) {
- if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
- continue;
- break;
- }
- for (j = i; j != NumElts; ++j) {
- SDValue Arg = Amt.getOperand(j);
- if (Arg.getOpcode() == ISD::UNDEF) continue;
- if (Arg != Amt.getOperand(i))
- break;
- }
- if (i != NumElts && j == NumElts)
- BaseShAmt = Amt.getOperand(i);
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
+ // Check if this build_vector node is doing a splat.
+ // If so, then set BaseShAmt equal to the splat value.
+ BaseShAmt = BV->getSplatValue();
+ if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
+ BaseShAmt = SDValue();
} else {
if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
Amt = Amt.getOperand(0);
- if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
- cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
+
+ ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
+ if (SVN && SVN->isSplat()) {
+ unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
SDValue InVec = Amt.getOperand(0);
if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
- unsigned NumElts = InVec.getValueType().getVectorNumElements();
- unsigned i = 0;
- for (; i != NumElts; ++i) {
- SDValue Arg = InVec.getOperand(i);
- if (Arg.getOpcode() == ISD::UNDEF) continue;
- BaseShAmt = Arg;
- break;
- }
+ assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
+ "Unexpected shuffle index found!");
+ BaseShAmt = InVec.getOperand(SplatIdx);
} else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
- unsigned SplatIdx =
- cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
if (C->getZExtValue() == SplatIdx)
BaseShAmt = InVec.getOperand(1);
}
}
- if (!BaseShAmt.getNode())
- BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
- DAG.getIntPtrConstant(0));
+
+ if (!BaseShAmt)
+ // Avoid introducing an extract element from a shuffle.
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
+ DAG.getIntPtrConstant(SplatIdx));
}
}
if (BaseShAmt.getNode()) {
- if (EltVT.bitsGT(MVT::i32))
- BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+ assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
+ if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// Do this only if the vector shift count is a constant build_vector.
- if (Op.getOpcode() == ISD::SHL &&
+ if (Op.getOpcode() == ISD::SHL &&
(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget->hasInt256() && VT == MVT::v16i16)) &&
ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
CanBeSimplified = Amt2 == Amt->getOperand(j);
}
}
-
+
if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
isa<ConstantSDNode>(Amt2)) {
// Replace this node with two shifts followed by a MOVSS/MOVSD.
EVT CastVT = MVT::v4i32;
- SDValue Splat1 =
+ SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
- SDValue Splat2 =
+ SDValue Splat2 =
DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
if (TargetOpcode == X86ISD::MOVSD)
return SDValue();
}
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+
+ Op = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "CTPOP lowering only implemented for 128/256-bit wide vector types");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ unsigned Len = EltVT.getSizeInBits();
+
+ // This is the vectorized version of the "best" algorithm from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ // with a minor tweak to use a series of adds + shifts instead of vector
+ // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
+ //
+ // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
+ // v8i32 => Always profitable
+ //
+ // FIXME: There a couple of possible improvements:
+ //
+ // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
+ // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
+ //
+ assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
+ "CTPOP not implemented for this vector element type.");
+
+ // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
+ // extra legalization.
+ bool NeedsBitcast = EltVT == MVT::i32;
+ MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+
+ SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
+ SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
+ SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
+ SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
+ if (NeedsBitcast)
+ Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+
+ SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
+ SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
+ if (NeedsBitcast)
+ M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
+
+ SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
+ SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
+ SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
+ SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
+
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
+ if (NeedsBitcast) {
+ Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+ M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
+ Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
+ }
+
+ SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
+ SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
+ if (VT != AndRHS.getValueType()) {
+ AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
+ AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
+ }
+ SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
+ SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
+ Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+
+ SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
+ SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
+ if (NeedsBitcast) {
+ Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+ M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
+ }
+ And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+ // The algorithm mentioned above uses:
+ // v = (v * 0x01010101...) >> (Len - 8)
+ //
+ // Change it to use vector adds + vector shifts which yield faster results on
+ // Haswell than using vector integer multiplication.
+ //
+ // For i32 elements:
+ // v = v + (v >> 8)
+ // v = v + (v >> 16)
+ //
+ // For i64 elements:
+ // v = v + (v >> 8)
+ // v = v + (v >> 16)
+ // v = v + (v >> 32)
+ //
+ Add = And;
+ SmallVector<SDValue, 8> Csts;
+ for (unsigned i = 8; i <= Len/2; i *= 2) {
+ Csts.assign(NumElts, DAG.getConstant(i, EltVT));
+ SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
+ Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+ Csts.clear();
+ }
+
+ // The result is on the least significant 6-bits on i32 and 7-bits on i64.
+ SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
+ SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
+ SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
+ if (NeedsBitcast) {
+ Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+ M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
+ }
+ And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+ return And;
+}
+
static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
SDLoc dl(Node);
SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
Type *RetTy = isF64
- ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
+ ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
: (Type*)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
+ case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
+ case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
+ case X86ISD::EXPAND: return "X86ISD::EXPAND";
+ case X86ISD::SELECT: return "X86ISD::SELECT";
}
}
return (SVT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, SVT) ||
+ isCommutedMOVLMask(M, SVT) ||
isMOVHLPSMask(M, SVT) ||
isSHUFPMask(M, SVT) ||
+ isSHUFPMask(M, SVT, /* Commuted */ true) ||
isPSHUFDMask(M, SVT) ||
+ isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
isPALIGNRMask(M, SVT, Subtarget) ||
const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
- assert(!Subtarget->isTargetMacho());
+ assert(!Subtarget->isTargetMachO());
// The lowering is pretty easy: we're just emitting the call to _alloca. The
// non-trivial part is impdef of ESP.
.addReg(X86::RAX);
}
} else {
- const char *StackProbeSymbol =
- Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
+ const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
+ Subtarget->isTargetWindowsItanium())
+ ? "_chkstk"
+ : "_alloca";
BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol(StackProbeSymbol)
// v = phi(main, restore)
//
// restoreMBB:
+ // if base pointer being used, load it from frame
// v_restore = 1
MachineBasicBlock *thisMBB = MBB;
.addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB:
+ if (RegInfo->hasBasePointer(*MF)) {
+ const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
+ const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ X86FI->setRestoreBasePointer(MF);
+ unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+ unsigned BasePtr = RegInfo->getBaseRegister();
+ unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+ addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+ FramePtr, true, X86FI->getRestoreBasePointerOffset())
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
// Replace 213-type (isel default) FMA3 instructions with 231-type for
// accumulator loops. Writing back to the accumulator allows the coalescer
-// to remove extra copies in the loop.
+// to remove extra copies in the loop.
MachineBasicBlock *
X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
MachineBasicBlock *MBB) const {
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
+ case TargetOpcode::STATEPOINT:
+ // As an implementation detail, STATEPOINT shares the STACKMAP format at
+ // this point in the process. We diverge later.
+ return emitPatchPoint(MI, BB);
+
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
EVT SVT = BC0.getValueType();
unsigned Opcode = BC0.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
-
+
if (BC0.hasOneUse() && SVT.isVector() &&
SVT.getVectorNumElements() * 2 == NumElts &&
TLI.isOperationLegal(Opcode, VT)) {
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
-/// to a simple store and scalar loads to extract the elements.
+/// into a somewhat faster sequence. For i686, the best sequence is apparently
+/// storing the value and loading scalars back, while for x64 we should
+/// use 64-bit extracts and shifts.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
return SDValue();
// Ok, we've now decided to do the transformation.
+ // If 64-bit shifts are legal, use the extract-shift sequence,
+ // otherwise bounce the vector off the cache.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Vals[4];
SDLoc dl(InputVector);
- // Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
- SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
- MachinePointerInfo(), false, false, 0);
+ if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
+ SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
+ EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+ SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+ DAG.getConstant(0, VecIdxTy));
+ SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+ DAG.getConstant(1, VecIdxTy));
+
+ SDValue ShAmt = DAG.getConstant(32,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
+ Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+ Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
+ Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
+ Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+ DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
+ } else {
+ // Store the value to a temporary stack slot.
+ SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+ MachinePointerInfo(), false, false, 0);
- // Replace each use (extract) with a load of the appropriate element.
- for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
- UE = Uses.end(); UI != UE; ++UI) {
- SDNode *Extract = *UI;
+ EVT ElementType = InputVector.getValueType().getVectorElementType();
+ unsigned EltSize = ElementType.getSizeInBits() / 8;
- // cOMpute the element's address.
- SDValue Idx = Extract->getOperand(1);
- unsigned EltSize =
- InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
- uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+ // Replace each use (extract) with a load of the appropriate element.
+ for (unsigned i = 0; i < 4; ++i) {
+ uint64_t Offset = EltSize * i;
+ SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
- SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
- StackPtr, OffsetVal);
+ SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+ StackPtr, OffsetVal);
- // Load the scalar.
- SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
- ScalarAddr, MachinePointerInfo(),
- false, false, false, 0);
+ // Load the scalar.
+ Vals[i] = DAG.getLoad(ElementType, dl, Ch,
+ ScalarAddr, MachinePointerInfo(),
+ false, false, false, 0);
+
+ }
+ }
+
+ // Replace the extracts
+ for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+ UE = Uses.end(); UI != UE; ++UI) {
+ SDNode *Extract = *UI;
- // Replace the exact with the load.
- DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+ SDValue Idx = Extract->getOperand(1);
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}
// The replacement was made in place; don't return anything.
bool NeedSplit = false;
switch (VT.getSimpleVT().SimpleTy) {
default: return std::make_pair(0, false);
+ case MVT::v4i64:
+ case MVT::v2i64:
+ if (!Subtarget->hasVLX())
+ return std::make_pair(0, false);
+ break;
+ case MVT::v64i8:
+ case MVT::v32i16:
+ if (!Subtarget->hasBWI())
+ return std::make_pair(0, false);
+ break;
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (!Subtarget->hasAVX512())
+ return std::make_pair(0, false);
+ break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
}
static SDValue
-TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDLoc dl(N);
SDValue Cond = N->getOperand(0);
Cond = CondSrc->getOperand(0);
}
- MVT VT = N->getSimpleValueType(0);
- MVT EltVT = VT.getVectorElementType();
- unsigned NumElems = VT.getVectorNumElements();
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return SDValue();
-
- if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
- return SDValue();
- if (!Subtarget->hasInt256() && VT == MVT::v16i16)
- return SDValue();
-
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
return SDValue();
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
// Be sure we emit undef where we can.
ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
}
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+ return SDValue();
return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
}
}
}
- // Try to fold this VSELECT into a MOVSS/MOVSD
- if (N->getOpcode() == ISD::VSELECT &&
- Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
- if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
- (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
- bool CanFold = false;
- unsigned NumElems = Cond.getNumOperands();
- SDValue A = LHS;
- SDValue B = RHS;
-
- if (isZero(Cond.getOperand(0))) {
- CanFold = true;
-
- // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
- // fold (vselect <0,-1> -> (movsd A, B)
- for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
- CanFold = isAllOnes(Cond.getOperand(i));
- } else if (isAllOnes(Cond.getOperand(0))) {
- CanFold = true;
- std::swap(A, B);
-
- // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
- // fold (vselect <-1,0> -> (movsd B, A)
- for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
- CanFold = isZero(Cond.getOperand(i));
- }
-
- if (CanFold) {
- if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
- return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
- }
-
- if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
- // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
- // (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
- // (v2i64 (bitcast B)))))
- //
- // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
- // (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
- // (v2f64 (bitcast B)))))
- //
- // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
- // (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
- // (v2i64 (bitcast A)))))
- //
- // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
- // (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
- // (v2f64 (bitcast A)))))
-
- CanFold = (isZero(Cond.getOperand(0)) &&
- isZero(Cond.getOperand(1)) &&
- isAllOnes(Cond.getOperand(2)) &&
- isAllOnes(Cond.getOperand(3)));
-
- if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
- isAllOnes(Cond.getOperand(1)) &&
- isZero(Cond.getOperand(2)) &&
- isZero(Cond.getOperand(3))) {
- CanFold = true;
- std::swap(LHS, RHS);
- }
-
- if (CanFold) {
- EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
- SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
- SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
- SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
- NewB, DAG);
- return DAG.getNode(ISD::BITCAST, DL, VT, Select);
- }
- }
- }
- }
-
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
if ((N->getOpcode() == ISD::VSELECT ||
N->getOpcode() == X86ISD::SHRUNKBLEND) &&
!DCI.isBeforeLegalize()) {
- SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
if (Shuffle.getNode())
return Shuffle;
}
// fold (blend A, B, allOnes) -> B
if (ISD::isBuildVectorAllOnes(Mask.getNode()))
return Op1;
-
+
// Simplify the case where the mask is a constant i32 value.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
if (C->isNullValue())
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // On Sandybridge unaligned 256bit loads are inefficient.
+ // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+ // into two 16-byte operations.
ISD::LoadExtType Ext = Ld->getExtensionType();
unsigned Alignment = Ld->getAlignment();
bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
- if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+ if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
!DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If we are saving a concatenation of two XMM registers, perform two stores.
- // On Sandy Bridge, 256-bit memory operations are executed by two
- // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
- // memory operation.
+ // If we are saving a concatenation of two XMM registers and 32-byte stores
+ // are slow, such as on Sandy Bridge, perform two 16-byte stores.
unsigned Alignment = St->getAlignment();
bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
- if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+ if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
StVT == VT && !IsAligned) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
}
-/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
return true;
}
-/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+/// Do target-specific dag combines on floating point adds.
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
return SDValue();
}
-/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+/// Do target-specific dag combines on floating point subs.
static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
return SDValue();
}
-/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
-/// X86ISD::FXOR nodes.
+/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
return SDValue();
}
-/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
-/// X86ISD::FMAX nodes.
+/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
N->getOperand(0), N->getOperand(1));
}
-/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
// FAND(x, 0.0) -> 0.0
return SDValue();
}
-/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
+/// Do target-specific dag combines on X86ISD::FANDN nodes
static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
// FANDN(x, 0.0) -> 0.0
// FANDN(0.0, x) -> x
// "load" ports instead of the dedicated "store" port.
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
- // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+ // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
if (isLegalAddressingMode(AM, Ty))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.