setPrefFunctionAlignment(4); // 2^4 bytes.
- InitIntrinsicTables();
+ verifyIntrinsicTables();
}
// This has so far only been implemented for 64-bit MachO.
int V2Index =
std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
Mask.begin();
+
+ // Check for whether we can use INSERTPS to perform the blend. We only use
+ // INSERTPS when the V1 elements are already in the correct locations
+ // because otherwise we can just always use two SHUFPS instructions which
+ // are much smaller to encode than a SHUFPS and an INSERTPS.
+ if (Subtarget->hasSSE41()) {
+ // When using INSERTPS we can zero any lane of the destination. Collect
+ // the zero inputs into a mask and drop them from the lanes of V1 which
+ // actually need to be present as inputs to the INSERTPS.
+ unsigned ZMask = 0;
+ if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+ ZMask = 0xF ^ (1 << V2Index);
+ } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+ for (int i = 0; i < 4; ++i) {
+ int M = Mask[i];
+ if (M >= 4)
+ continue;
+ if (M > -1) {
+ SDValue Input = V1.getOperand(M);
+ if (Input.getOpcode() != ISD::UNDEF &&
+ !X86::isZeroNode(Input)) {
+ // A non-zero input!
+ ZMask = 0;
+ break;
+ }
+ }
+ ZMask |= 1 << i;
+ }
+ }
+
+ // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+ int InsertShuffleMask[4] = {-1, -1, -1, -1};
+ for (int i = 0; i < 4; ++i)
+ if (i != V2Index && (ZMask & (1 << i)) == 0)
+ InsertShuffleMask[i] = Mask[i];
+
+ if (isNoopShuffleMask(InsertShuffleMask)) {
+ // Replace V1 with undef if nothing from V1 survives the INSERTPS.
+ if ((ZMask | 1 << V2Index) == 0xF)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
+ // Insert the V2 element into the desired position.
+ SDValue InsertPSMask =
+ DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4 | ZMask);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ InsertPSMask);
+ }
+ }
+
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
int V2AdjIndex = V2Index ^ 1;
In, DAG.getUNDEF(SVT)));
}
-static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
- LLVMContext *Context = DAG.getContext();
- SDLoc dl(Op);
- MVT VT = Op.getSimpleValueType();
- MVT EltVT = VT;
- unsigned NumElts = VT == MVT::f64 ? 2 : 4;
- if (VT.isVector()) {
- EltVT = VT.getVectorElementType();
- NumElts = VT.getVectorNumElements();
- }
-
- unsigned EltBits = EltVT.getSizeInBits();
- Constant *C = ConstantInt::get(*Context, APInt::getSignedMaxValue(EltBits));
- C = ConstantVector::getSplat(NumElts, C);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
- unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
- if (VT.isVector()) {
- MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(ISD::AND, dl, ANDVT,
- DAG.getNode(ISD::BITCAST, dl, ANDVT,
- Op.getOperand(0)),
- DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
- }
- return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
-}
+// The only differences between FABS and FNEG are the mask and the logic op.
+static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
+ assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
+ "Wrong opcode for lowering FABS or FNEG.");
-static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) {
- LLVMContext *Context = DAG.getContext();
+ bool IsFABS = (Op.getOpcode() == ISD::FABS);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
+ // Assume scalar op for initialization; update for vector if needed.
+ // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
+ // generate a 16-byte vector constant and logic op even for the scalar case.
+ // Using a 16-byte mask allows folding the load of the mask with
+ // the logic op, so it can save (~4 bytes) on code size.
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+ // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
+ // decide if we should generate a 16-byte constant mask when we only need 4 or
+ // 8 bytes for the scalar case.
if (VT.isVector()) {
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
}
unsigned EltBits = EltVT.getSizeInBits();
- Constant *C = ConstantInt::get(*Context, APInt::getSignBit(EltBits));
+ LLVMContext *Context = DAG.getContext();
+ // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
+ APInt MaskElt =
+ IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
+ Constant *C = ConstantInt::get(*Context, MaskElt);
C = ConstantVector::getSplat(NumElts, C);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, Alignment);
+
if (VT.isVector()) {
- MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
+ // For a vector, cast operands to a vector type, perform the logic op,
+ // and cast the result back to the original value type.
+ MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ SDValue Op0Casted = DAG.getNode(ISD::BITCAST, dl, VecVT, Op.getOperand(0));
+ SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
+ unsigned LogicOp = IsFABS ? ISD::AND : ISD::XOR;
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(ISD::XOR, dl, XORVT,
- DAG.getNode(ISD::BITCAST, dl, XORVT,
- Op.getOperand(0)),
- DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
+ DAG.getNode(LogicOp, dl, VecVT, Op0Casted, MaskCasted));
}
-
- return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+ // If not vector, then scalar.
+ unsigned LogicOp = IsFABS ? X86ISD::FAND : X86ISD::FXOR;
+ return DAG.getNode(LogicOp, dl, VT, Op.getOperand(0), Mask);
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- const IntrinsicData* IntrData = GetIntrinsicWithoutChain(IntNo);
+ const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP:
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- const IntrinsicData* IntrData = GetIntrinsicWithChain(IntNo);
+ const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData)
return SDValue();
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
- case ISD::FABS: return LowerFABS(Op, DAG);
- case ISD::FNEG: return LowerFNEG(Op, DAG);
+ case ISD::FABS:
+ case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);