setOperationAction(ISD::ADD, MVT::v8i16, Legal);
setOperationAction(ISD::ADD, MVT::v4i32, Legal);
setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::SUB, MVT::v16i8, Legal);
setOperationAction(ISD::SUB, MVT::v8i16, Legal);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
-
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
setPrefFunctionAlignment(4); // 2^4 bytes.
}
-
/// getSetCCResultType - Pick the value type produced by a SETCC whose
/// operands have type VT. A vector comparison yields VT with its element
/// type changed to integer; every non-vector comparison yields MVT::i8.
EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();
  return MVT::i8;
}
-
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
/// lowering. If DstAlign is zero that means the destination
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against alignment requirement,
-/// probably because the source does not need to be loaded. If
-/// 'IsZeroVal' is true, that means it's safe to return a
-/// non-scalar-integer type, e.g. empty string source, constant, or loaded
-/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
-/// constant so it does not need to be loaded.
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
- bool IsZeroVal,
+ bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
MachineFunction &MF) const {
const Function *F = MF.getFunction();
- if (IsZeroVal &&
- !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
+ if ((!IsMemset || ZeroMemset) &&
+ !F->getFnAttributes().hasAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
return MVT::i32;
}
+/// isSafeMemOpType - Report whether VT may be used as a memory operation
+/// type. f32 and f64 are gated on the X86ScalarSSEf32 / X86ScalarSSEf64
+/// flags respectively; every other type is reported as safe.
+/// NOTE(review): presumably the flags record whether scalar f32/f64 values
+/// are handled in SSE registers -- confirm against their definition.
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+  switch (VT.SimpleTy) {
+  case MVT::f32:
+    return X86ScalarSSEf32;
+  case MVT::f64:
+    return X86ScalarSSEf64;
+  default:
+    return true;
+  }
+}
+
bool
X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
if (Fast)
// FIXME: Why this routine is here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(EVT VT) const{
+X86TargetLowering::findRepresentativeClass(MVT VT) const{
const TargetRegisterClass *RRC = 0;
uint8_t Cost = 1;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default:
return TargetLowering::findRepresentativeClass(VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
return true;
}
-
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
return true;
}
-EVT
-X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
+MVT
+X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
ISD::NodeType ExtendKind) const {
MVT ReturnMVT;
// TODO: Is this also valid on 32-bit?
else
ReturnMVT = MVT::i32;
- EVT MinVT = getRegisterType(Context, ReturnMVT);
+ MVT MinVT = getRegisterType(ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
}
return Chain;
}
-
//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
TotalNumIntRegs);
bool NoImplicitFloatOps = Fn->getFnAttributes().
- hasAttribute(Attributes::NoImplicitFloat);
+ hasAttribute(Attribute::NoImplicitFloat);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
} else if (Subtarget->isPICStyleRIPRel() &&
isa<Function>(GV) &&
cast<Function>(GV)->getFnAttributes().
- hasAttribute(Attributes::NonLazyBind)) {
+ hasAttribute(Attribute::NonLazyBind)) {
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
// at the cost of eager binding (and one extra byte of encoding).
Ins, dl, DAG, InVals);
}
-
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
return X86::createFastISel(funcInfo, libInfo);
}
-
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}
-
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement) {
// Offset should fit into 32 bit immediate field.
bool HasInt256 = Subtarget->hasInt256();
MachineFunction &MF = DAG.getMachineFunction();
bool OptForSize = MF.getFunction()->getFnAttributes().
- hasAttribute(Attributes::OptimizeForSize);
+ hasAttribute(Attribute::OptimizeForSize);
assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
getShuffleCLImmediate(SVOp), DAG);
-
//===--------------------------------------------------------------------===//
// Since no target specific shuffle was selected for this generic one,
// lower it into other known shuffles. FIXME: this isn't true yet, but
return SDValue();
}
-
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
DebugLoc DL = Op.getDebugLoc();
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
-
// With PIC, the address is actually $g + Offset.
if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
!Subtarget->is64Bit()) {
llvm_unreachable("TLS not implemented for this target.");
}
-
/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
-
SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond;
SDValue Op0 = Op.getOperand(0);
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
return SDValue();
- if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
- return SDValue();
+ if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+ // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+ // pcmpeqd + pshufd + pand.
+ assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+ // First cast everything to the right type,
+ Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+ // Do the compare.
+ SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+ // Make sure the lower and upper halves are both all-ones.
+ const int Mask[] = { 1, 0, 3, 2 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+ Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
}
// Since SSE has no unsigned integer comparisons, we need to flip the sign
Chain, Dest, CC, Cond);
}
-
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca is needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
assert(!getTargetMachine().Options.UseSoftFloat &&
!(DAG.getMachineFunction()
.getFunction()->getFnAttributes()
- .hasAttribute(Attributes::NoImplicitFloat)) &&
+ .hasAttribute(Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ // SSE2/AVX2 sub with unsigned saturation intrinsics
+ case Intrinsic::x86_sse2_psubus_b:
+ case Intrinsic::x86_sse2_psubus_w:
+ case Intrinsic::x86_avx2_psubus_b:
+ case Intrinsic::x86_avx2_psubus_w:
+ return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
// SSE3/AVX horizontal add/sub intrinsics
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
Op.getOperand(1), Op.getOperand(2));
}
+ // SSE2/SSE41/AVX2 integer max/min intrinsics.
+ case Intrinsic::x86_sse2_pmaxu_b:
+ case Intrinsic::x86_sse41_pmaxuw:
+ case Intrinsic::x86_sse41_pmaxud:
+ case Intrinsic::x86_avx2_pmaxu_b:
+ case Intrinsic::x86_avx2_pmaxu_w:
+ case Intrinsic::x86_avx2_pmaxu_d:
+ return DAG.getNode(X86ISD::UMAX, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_pminu_b:
+ case Intrinsic::x86_sse41_pminuw:
+ case Intrinsic::x86_sse41_pminud:
+ case Intrinsic::x86_avx2_pminu_b:
+ case Intrinsic::x86_avx2_pminu_w:
+ case Intrinsic::x86_avx2_pminu_d:
+ return DAG.getNode(X86ISD::UMIN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse41_pmaxsb:
+ case Intrinsic::x86_sse2_pmaxs_w:
+ case Intrinsic::x86_sse41_pmaxsd:
+ case Intrinsic::x86_avx2_pmaxs_b:
+ case Intrinsic::x86_avx2_pmaxs_w:
+ case Intrinsic::x86_avx2_pmaxs_d:
+ return DAG.getNode(X86ISD::SMAX, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse41_pminsb:
+ case Intrinsic::x86_sse2_pmins_w:
+ case Intrinsic::x86_sse41_pminsd:
+ case Intrinsic::x86_avx2_pmins_b:
+ case Intrinsic::x86_avx2_pmins_w:
+ case Intrinsic::x86_avx2_pmins_d:
+ return DAG.getNode(X86ISD::SMIN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
// AVX2 variable shift intrinsics
case Intrinsic::x86_avx2_psllv_d:
case Intrinsic::x86_avx2_psllv_q:
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
- if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
+ if (Attrs.getParamAttributes(Idx).hasAttribute(Attribute::InReg))
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
-
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
MachineMemOperand::MOStore, 2, 2);
DAG.getConstant(1, MVT::i16)),
DAG.getConstant(3, MVT::i16));
-
return DAG.getNode((VT.getSizeInBits() < 16 ?
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget->hasInt256())
return Lower256IntArith(Op, DAG);
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+ if (VT == MVT::v4i32) {
+ assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+ "Should not custom lower when pmuldq is available!");
+
+ // Extract the odd parts.
+ const int UnpackMask[] = { 1, -1, 3, -1 };
+ SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+ SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+ // Multiply the even parts.
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+ // Now multiply odd parts.
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+ Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
+ Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+
+ // Merge the two vectors back together with a shuffle. This expands into 2
+ // shuffles.
+ const int ShufMask[] = { 0, 4, 2, 6 };
+ return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+ }
+
assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
"Only know how to lower V2I64/V4I64 multiply");
- DebugLoc dl = Op.getDebugLoc();
-
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
// AhiBlo = psllqi(AhiBlo, 32);
// return AloBlo + AloBhi + AhiBlo;
- SDValue A = Op.getOperand(0);
- SDValue B = Op.getOperand(1);
-
SDValue ShAmt = DAG.getConstant(32, MVT::i32);
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
}
}
-
static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
-
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
EVT T = Op.getValueType();
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
DebugLoc dl = N->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
return;
}
case ISD::FP_ROUND: {
+ if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+ return;
SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
return;
case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDV: return "X86ISD::BLENDV";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
+ case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
+ case X86ISD::UMAX: return "X86ISD::UMAX";
+ case X86ISD::UMIN: return "X86ISD::UMIN";
+ case X86ISD::SMAX: return "X86ISD::SMAX";
+ case X86ISD::SMIN: return "X86ISD::SMIN";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::ANDN: return "X86ISD::ANDN";
case X86ISD::BLSI: return "X86ISD::BLSI";
case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
case X86ISD::BLSR: return "X86ISD::BLSR";
return true;
}
-
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}
-
/// PerformTruncateCombine - Converts truncate operation to
/// a sequence of vector shuffle operations.
/// It is possible when we truncate 256-bit vector to 128-bit vector
if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+ // On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget->hasInt256()) {
- // AVX2: v4i64 -> v4i32
-
- // VPERMD
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
-
Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
ShufMask);
-
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
DAG.getIntPtrConstant(0));
}
- // AVX: v4i64 -> v4i32
+ // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
DAG.getIntPtrConstant(0));
-
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
DAG.getIntPtrConstant(2));
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
- // PSHUFD
+ // The PSHUFD mask:
static const int ShufMask1[] = {0, 2, 0, 0};
-
SDValue Undef = DAG.getUNDEF(VT);
OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
- // MOVLHPS
+ // The MOVLHPS mask:
static const int ShufMask2[] = {0, 1, 4, 5};
-
return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
}
if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+ // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget->hasInt256()) {
- // AVX2: v8i32 -> v8i16
-
Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
- // PSHUFB
SmallVector<SDValue,32> pshufbMask;
for (unsigned i = 0; i < 2; ++i) {
pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
&pshufbMask[0], 32);
Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
-
Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
static const int ShufMask[] = {0, 2, -1, -1};
Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64),
&ShufMask[0]);
-
Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
DAG.getIntPtrConstant(0));
-
return DAG.getNode(ISD::BITCAST, dl, VT, Op);
}
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
- // PSHUFB
+ // The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
- // MOVLHPS
+ // The MOVLHPS Mask:
static const int ShufMask2[] = {0, 1, 4, 5};
-
SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
}
return SDValue();
}
+/// \brief Matches a VSELECT onto an integer min/max node.
+///
+/// \param Cond      The select condition; operand 2 must be a CondCodeSDNode
+///                  and operands 0/1 are the compared values.
+/// \param VT        The value type of the select.
+/// \param LHS       The value selected when the condition is true.
+/// \param RHS       The value selected when the condition is false.
+/// \param DAG       Used to test operands for equality.
+/// \param Subtarget Queried for SSE2 / SSE4.1 / AVX2 availability.
+/// \returns X86ISD::UMIN, UMAX, SMIN or SMAX when the select has the shape
+///          "x CC y ? x : y" or "x CC y ? y : x" and the subtarget supports
+///          the operation for VT; 0 otherwise.
+static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
+                                   SDValue RHS, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
+  if (!VT.isVector())
+    return 0;
+
+  // Sort the type into the 256-bit or 128-bit group; anything else is
+  // rejected outright.
+  bool Is256Bit = false;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+    Is256Bit = true;
+    break;
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+    break;
+  default:
+    return 0;
+  }
+
+  // The 256-bit types need AVX2, and every accepted type needs SSE2.
+  if (Is256Bit && !Subtarget->hasAVX2())
+    return 0;
+  if (!Subtarget->hasSSE2())
+    return 0;
+
+  // SSE2 has only a small subset of the operations: unsigned on v16i8 and
+  // signed on v8i16. SSE4.1 provides the rest.
+  const bool HasSSE41 = Subtarget->hasSSE41();
+  const bool HasSSE2 = Subtarget->hasSSE2();
+  const bool UnsignedOK = HasSSE41 || (HasSSE2 && VT == MVT::v16i8);
+  const bool SignedOK = HasSSE41 || (HasSSE2 && VT == MVT::v8i16);
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+  // Work out whether the select arms appear in the same order as the compare
+  // operands (x CC y ? x : y) or reversed (x CC y ? y : x).
+  const SDValue CmpLHS = Cond.getOperand(0);
+  const SDValue CmpRHS = Cond.getOperand(1);
+  bool ArmsSwapped;
+  if (DAG.isEqualTo(LHS, CmpLHS) && DAG.isEqualTo(RHS, CmpRHS))
+    ArmsSwapped = false;
+  else if (DAG.isEqualTo(LHS, CmpRHS) && DAG.isEqualTo(RHS, CmpLHS))
+    ArmsSwapped = true;
+  else
+    return 0;
+
+  // A "less" predicate selects the minimum when the arms are in compare
+  // order and the maximum when they are reversed; "greater" is the mirror.
+  switch (CC) {
+  case ISD::SETULT:
+  case ISD::SETULE:
+    if (!UnsignedOK)
+      return 0;
+    return ArmsSwapped ? X86ISD::UMAX : X86ISD::UMIN;
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    if (!UnsignedOK)
+      return 0;
+    return ArmsSwapped ? X86ISD::UMIN : X86ISD::UMAX;
+  case ISD::SETLT:
+  case ISD::SETLE:
+    if (!SignedOK)
+      return 0;
+    return ArmsSwapped ? X86ISD::SMAX : X86ISD::SMIN;
+  case ISD::SETGT:
+  case ISD::SETGE:
+    if (!SignedOK)
+      return 0;
+    return ArmsSwapped ? X86ISD::SMIN : X86ISD::SMAX;
+  default:
+    return 0;
+  }
+}
+
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // Match VSELECTs into subs with unsigned saturation.
+ if (!DCI.isBeforeLegalize() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+ ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+ // left side invert the predicate to simplify logic below.
+ SDValue Other;
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ Other = RHS;
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ Other = LHS;
+ }
+
+ if (Other.getNode() && Other->getNumOperands() == 2 &&
+ DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+ SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+ SDValue CondRHS = Cond->getOperand(1);
+
+ // Look for a general sub with unsigned saturation first.
+ // x >= y ? x-y : 0 --> subus x, y
+ // x > y ? x-y : 0 --> subus x, y
+ if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+ Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+ // If the RHS is a constant we have to reverse the const canonicalization.
+ // x > C-1 ? x+-C : 0 --> subus x, C
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
+ APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+ if (CondRHS.getConstantOperandVal(0) == -A-1) {
+ SmallVector<SDValue, 32> V(VT.getVectorNumElements(),
+ DAG.getConstant(-A, VT.getScalarType()));
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
+ V.data(), V.size()));
+ }
+ }
+
+ // Another special case: If C was a sign bit, the sub has been
+ // canonicalized into a xor.
+ // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+ // it's safe to decanonicalize the xor?
+ // x s< 0 ? x^C : 0 --> subus x, C
+ if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ isSplatVector(OpRHS.getNode())) {
+ APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+ if (A.isSignBit())
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+ }
+ }
+ }
+
+ // Try to match a min/max vector operation.
+ if (!DCI.isBeforeLegalize() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
+ if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
+ return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
return SDValue();
}
-
/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
}
}
-
// Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on sandybridge ADD is faster than
// shl.
}
}
-
// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
// and friends. Likewise for OR -> CMPNEQSS.
EVT VT = N->getValueType(0);
- // Create ANDN, BLSI, and BLSR instructions
+ // Create BLSI and BLSR instructions
// BLSI is X & (-X)
// BLSR is X & (X-1)
if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
SDValue N1 = N->getOperand(1);
DebugLoc DL = N->getDebugLoc();
- // Check LHS for not
- if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
- return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
- // Check RHS for not
- if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
- return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
-
// Check LHS for neg
if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
isZero(N0.getOperand(0)))
ISD::LoadExtType Ext = Ld->getExtensionType();
// If this is a vector EXT Load then attempt to optimize it using a
- // shuffle. We need SSSE3 shuffles.
+ // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+ // expansion is still better than scalar code.
+ // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+ // emit a shuffle and an arithmetic shift.
// TODO: It is possible to support ZExt by zeroing the undef values
// during the shuffle phase or after the shuffle.
- if (RegVT.isVector() && RegVT.isInteger() &&
- Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
+ if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+ (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
+ if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+ return SDValue();
+
// All sizes must be a power of two.
if (!isPowerOf2_32(RegSz * MemSz * NumElems))
return SDValue();
// Calculate the number of scalar loads that we need to perform
// in order to load our vector from memory.
unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+ if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+ return SDValue();
+
+ unsigned loadRegZize = RegSz;
+ if (Ext == ISD::SEXTLOAD && RegSz == 256)
+ loadRegZize /= 2;
// Represent our vector as a sequence of elements which are the
// largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
- RegSz/SclrLoadTy.getSizeInBits());
+ loadRegZize/SclrLoadTy.getSizeInBits());
// Represent the data using the same element type that is stored in
// memory. In practice, we ''widen'' MemVT.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
- RegSz/MemVT.getScalarType().getSizeInBits());
+ EVT WideVecVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ loadRegZize/MemVT.getScalarType().getSizeInBits());
assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
"Invalid vector type");
SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
unsigned SizeRatio = RegSz/MemSz;
+ if (Ext == ISD::SEXTLOAD) {
+ // If we have SSE4.1 we can directly emit a VSEXT node.
+ if (Subtarget->hasSSE41()) {
+ SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ return DCI.CombineTo(N, Sext, TF, true);
+ }
+
+ // Otherwise we'll shuffle the small elements in the high bits of the
+ // larger type and perform an arithmetic shift. If the shift is not legal
+ // it's better to scalarize.
+ if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+ return SDValue();
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+ // Build the arithmetic shift.
+ unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+ MemVT.getVectorElementType().getSizeInBits();
+ SmallVector<SDValue, 8> C(NumElems,
+ DAG.getConstant(Amt, RegVT.getScalarType()));
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size());
+ Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV);
+
+ return DCI.CombineTo(N, Shuff, TF, true);
+ }
+
// Redistribute the loaded elements into the different locations.
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
Chains.size());
}
-
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->getFnAttributes().
- hasAttribute(Attributes::NoImplicitFloat);
+ hasAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
&& Subtarget->hasSSE2();
if ((VT.isVector() ||
N->getOperand(0), N->getOperand(1));
}
-
/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
return false;
}
-
-
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
}
+/// getMemoryOpCost - Estimate the cost of a load or store of type Src.
+///
+/// The cost is the number of legalized operations reported by
+/// getTypeLegalizationCost, one unit each. It is doubled when the legalized
+/// type is wider than 128 bits and the subtarget lacks AVX2. Alignment and
+/// AddressSpace are not consulted by this implementation.
+unsigned
+X86VectorTargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                              unsigned Alignment,
+                                              unsigned AddressSpace) const {
+  // Legalize the type first: .first is the operation count, .second the
+  // legalized type.
+  std::pair<unsigned, MVT> Legalized = getTypeLegalizationCost(Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  const X86Subtarget &Sub =
+    TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  // Each load/store unit costs 1.
+  const unsigned UnitCost = Legalized.first;
+
+  // On Sandybridge 256bit load/stores are double pumped
+  // (but not on Haswell).
+  const bool DoublePumped =
+    Legalized.second.getSizeInBits() > 128 && !Sub.hasAVX2();
+
+  return DoublePumped ? UnitCost * 2 : UnitCost;
+}
+
unsigned
X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index) const {
{ ISD::SETCC, MVT::v32i8, 1 },
};
- if (ST.hasSSE42()) {
- int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+ if (ST.hasAVX2()) {
+ int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
if (Idx != -1)
- return LT.first * SSE42CostTbl[Idx].Cost;
+ return LT.first * AVX2CostTbl[Idx].Cost;
}
if (ST.hasAVX()) {
return LT.first * AVX1CostTbl[Idx].Cost;
}
- if (ST.hasAVX2()) {
- int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+ if (ST.hasSSE42()) {
+ int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
if (Idx != -1)
- return LT.first * AVX2CostTbl[Idx].Cost;
+ return LT.first * SSE42CostTbl[Idx].Cost;
}
return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);