// Also handle the case where we explicitly require zeros in the top
// elements. This is a vector shuffle from the zero vector.
if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
- N.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+ // Check to see if the top elements are all zeros (or bitcast of zeros).
+ ISD::isBuildVectorAllZeros(N.getOperand(0).Val) &&
N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR &&
N.getOperand(1).Val->hasOneUse() &&
ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
N.getOperand(1).getOperand(0).hasOneUse()) {
- // Check to see if the BUILD_VECTOR is building a zero vector.
- SDOperand BV = N.getOperand(0);
- for (unsigned i = 0, e = BV.getNumOperands(); i != e; ++i)
- if (!isZeroNode(BV.getOperand(i)) &&
- BV.getOperand(i).getOpcode() != ISD::UNDEF)
- return false; // Not a zero/undef vector.
// Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
// from the LHS.
- unsigned VecWidth = BV.getNumOperands();
+ unsigned VecWidth =
+   MVT::getVectorNumElements(N.getOperand(0).getValueType());
SDOperand ShufMask = N.getOperand(2);
assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
return true;
}
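[Note: an illustrative sketch, not part of the patch. It shows roughly what ISD::isBuildVectorAllZeros checks, phrased in terms of the loop deleted above; the helper name looksLikeAllZeros is hypothetical. The practical difference from the deleted loop is that the real helper also looks through a bitcast, so it matches the canonical <4 x i32>/<2 x i32> zero vectors this patch introduces.]

static bool looksLikeAllZeros(SDNode *N) {  // hypothetical sketch
  if (N->getOpcode() == ISD::BIT_CONVERT)   // also accept bitcast of zeros
    N = N->getOperand(0).Val;
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
    if (!isZeroNode(N->getOperand(i)) &&
        N->getOperand(i).getOpcode() != ISD::UNDEF)
      return false;                         // a nonzero, non-undef element
  return true;
}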
-/// CommuteVectorShuffle - Swap vector_shuffle operandsas well as
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as
/// values in their permute mask.
static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1,
SDOperand &V2, SDOperand &Mask,
unsigned NumElems = Mask.getNumOperands();
for (unsigned i = 0; i != NumElems; ++i) {
SDOperand Arg = Mask.getOperand(i);
- if (Arg.getOpcode() != ISD::UNDEF) {
- unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
- if (Idx < NumElems) {
- unsigned Opc = V1.Val->getOpcode();
- if (Opc == ISD::UNDEF)
- continue;
- if (Opc != ISD::BUILD_VECTOR ||
- !isZeroNode(V1.Val->getOperand(Idx)))
- return false;
- } else if (Idx >= NumElems) {
- unsigned Opc = V2.Val->getOpcode();
- if (Opc == ISD::UNDEF)
- continue;
- if (Opc != ISD::BUILD_VECTOR ||
- !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
- return false;
- }
+ if (Arg.getOpcode() == ISD::UNDEF)
+ continue;
+
+ unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
+ if (Idx < NumElems) {
+ unsigned Opc = V1.Val->getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.Val))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR ||
+ !isZeroNode(V1.Val->getOperand(Idx)))
+ return false;
+ } else if (Idx >= NumElems) {
+ unsigned Opc = V2.Val->getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.Val))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR ||
+ !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
+ return false;
}
}
return true;
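[Note: an illustrative trace, with operand values assumed. Every element the mask actually reads must be provably zero for the loop above to fall through to the final return true:]

//   NumElems = 4, Mask = <4,1,2,3>
//   V1 = BUILD_VECTOR 0,0,0,0        V2 = BUILD_VECTOR 0,a,b,c
//   i=0: Idx=4 >= NumElems -> V2 is a BUILD_VECTOR and
//        V2->getOperand(4-4) is zero, so continue
//   i=1..3: Idx < NumElems and isBuildVectorAllZeros(V1), so continue
//   -> every lane of the shuffle result is zero; return true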
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDOperand getZeroVector(MVT::ValueType VT, SelectionDAG &DAG) {
assert(MVT::isVector(VT) && "Expected a vector type");
- unsigned NumElems = MVT::getVectorNumElements(VT);
- MVT::ValueType EVT = MVT::getVectorElementType(VT);
- bool isFP = MVT::isFloatingPoint(EVT);
- SDOperand Zero = isFP ? DAG.getConstantFP(0.0, EVT) : DAG.getConstant(0, EVT);
- SmallVector<SDOperand, 8> ZeroVec(NumElems, Zero);
- return DAG.getNode(ISD::BUILD_VECTOR, VT, &ZeroVec[0], ZeroVec.size());
+
+ // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDOperand Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDOperand Vec;
+ if (MVT::getSizeInBits(VT) == 64) // MMX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
+ else // SSE
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
+ return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDOperand getOnesVector(MVT::ValueType VT, SelectionDAG &DAG) {
+ assert(MVT::isVector(VT) && "Expected a vector type");
+
+ // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDOperand Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDOperand Vec;
+ if (MVT::getSizeInBits(VT) == 64) // MMX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
+ else // SSE
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
+ return DAG.getNode(ISD::BIT_CONVERT, VT, Vec);
}
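[Note: a hedged usage sketch, not from the patch; Z0/Z1 are invented names and a SelectionDAG named DAG is assumed in scope. It shows why funneling every requested type through one canonical BUILD_VECTOR pays off: getNode CSEs structurally identical nodes, so all zero vectors share one node.]

SDOperand Z0 = getZeroVector(MVT::v4f32, DAG); // BIT_CONVERT of a v4i32 zero
SDOperand Z1 = getZeroVector(MVT::v2i64, DAG); // BIT_CONVERT of the same node
// Z0.getOperand(0).Val == Z1.getOperand(0).Val: one zero node, materialized
// once (e.g. a single pxor/xorps) instead of one zero vector per value type.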
+
/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
static SDOperand NormalizeMask(SDOperand Mask, SelectionDAG &DAG) {
}
V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);
- MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
- Mask = getZeroVector(MaskVT, DAG);
+ Mask = getZeroVector(MVT::v4i32, DAG);
SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1,
DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask);
return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
}
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
-/// vector of zero or undef vector.
+/// vector against a zero or undef vector. This produces a shuffle where the
+/// low element of V2 is swizzled into the zero/undef vector, landing at
+/// element Idx. This yields a shuffle mask like 4,1,2,3 (idx=0) or
+/// 0,1,2,4 (idx=3).
static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, MVT::ValueType VT,
unsigned NumElems, unsigned Idx,
bool isZero, SelectionDAG &DAG) {
SDOperand V1 = isZero ? getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT);
MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
- SDOperand Zero = DAG.getConstant(0, EVT);
- SmallVector<SDOperand, 8> MaskVec(NumElems, Zero);
- MaskVec[Idx] = DAG.getConstant(NumElems, EVT);
+ SmallVector<SDOperand, 16> MaskVec;
+ for (unsigned i = 0; i != NumElems; ++i)
+ if (i == Idx) // If this is the insertion idx, put the low elt of V2 here.
+ MaskVec.push_back(DAG.getConstant(NumElems, EVT));
+ else
+ MaskVec.push_back(DAG.getConstant(i, EVT));
SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
&MaskVec[0], MaskVec.size());
return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
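[Note: a worked example with assumed arguments, to make the mask construction above concrete:]

// getShuffleVectorZeroOrUndef(V2, MVT::v4f32, 4, /*Idx=*/2, /*isZero=*/true,
//                             DAG) builds:
//   V1   = getZeroVector(v4f32, DAG)   -> <0,0,0,0>
//   Mask = <0,1,4,3>                   -> slot 2 reads the low elt of V2
//   result: vector_shuffle V1, V2, <0,1,4,3> = <0, 0, V2[0], 0>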
SDOperand
X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
- // All zero's are handled with pxor.
- if (ISD::isBuildVectorAllZeros(Op.Val))
- return Op;
+ // All zeros are handled with pxor; all ones are handled with pcmpeqd.
+ if (ISD::isBuildVectorAllZeros(Op.Val) || ISD::isBuildVectorAllOnes(Op.Val)) {
+ // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
+ // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
+ // eliminated on x86-32 targets.
+ if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
+ return Op;
- // All one's are handled with pcmpeqd.
- if (ISD::isBuildVectorAllOnes(Op.Val))
- return Op;
+ if (ISD::isBuildVectorAllOnes(Op.Val))
+ return getOnesVector(Op.getValueType(), DAG);
+ return getZeroVector(Op.getValueType(), DAG);
+ }
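[Note: a hedged before/after sketch of this canonicalization on a non-canonical type; node shapes abbreviated:]

// Before: (v2i64 BUILD_VECTOR -1, -1)      ; needs i64 scalar constants
// After:  (v2i64 BIT_CONVERT (v4i32 BUILD_VECTOR ~0,~0,~0,~0))
// The inner v4i32 node is what V_SETALLONES (pcmpeqd) now patterns on, so
// no 64-bit scalar immediates have to be legalized on x86-32.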
MVT::ValueType VT = Op.getValueType();
MVT::ValueType EVT = MVT::getVectorElementType(VT);
}
if (NumNonZero == 0) {
- if (NumZero == 0)
- // All undef vector. Return an UNDEF.
- return DAG.getNode(ISD::UNDEF, VT);
- else
- // A mix of zero and undef. Return a zero vector.
- return getZeroVector(VT, DAG);
+ // All undef vector. Return an UNDEF. All zero vectors were handled above.
+ return DAG.getNode(ISD::UNDEF, VT);
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
bool Commuted = false;
+ // FIXME: This should also accept a bitcast of a splat? Be careful: a
+ // v4i32 <1,1,1,1> splat bitcast to v8i16 is <1,0,1,0,...>, which is not
+ // a v8i16 splat.
V1IsSplat = isSplatVector(V1.Val);
V2IsSplat = isSplatVector(V2.Val);
+
+ // Canonicalize the splat or undef, if present, to be on the RHS.
if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
std::swap(V1IsSplat, V2IsSplat);
//===----------------------------------------------------------------------===//
// Alias instructions that map zero vector to pxor.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let isReMaterializable = 1 in {
def MMX_V_SET0 : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins),
"pxor\t$dst, $dst",
- [(set VR64:$dst, (v1i64 immAllZerosV))]>;
+ [(set VR64:$dst, (v2i32 immAllZerosV))]>;
def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (outs VR64:$dst), (ins),
"pcmpeqd\t$dst, $dst",
- [(set VR64:$dst, (v1i64 immAllOnesV))]>;
+ [(set VR64:$dst, (v2i32 immAllOnesV))]>;
}
//===----------------------------------------------------------------------===//
def : Pat<(store (v1i64 VR64:$src), addr:$dst),
(MMX_MOVQ64mr addr:$dst, VR64:$src)>;
-// 64-bit vector all zero's.
-def : Pat<(v8i8 immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v2i32 immAllZerosV), (MMX_V_SET0)>;
-def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
-
-// 64-bit vector all one's.
-def : Pat<(v8i8 immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v4i16 immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v2i32 immAllOnesV), (MMX_V_SETALLONES)>;
-def : Pat<(v1i64 immAllOnesV), (MMX_V_SETALLONES)>;
-
// Bit convert.
def : Pat<(v8i8 (bitconvert (v1i64 VR64:$src))), (v8i8 VR64:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 VR64:$src))), (v8i8 VR64:$src)>;
// Move scalar to XMM zero-extended
// movd to XMM register zero-extends
let AddedComplexity = 15 in {
- def : Pat<(v8i8 (vector_shuffle immAllZerosV,
+ def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
(v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
(MMX_MOVZDI2PDIrr GR32:$src)>;
- def : Pat<(v4i16 (vector_shuffle immAllZerosV,
+ def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
(v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
(MMX_MOVZDI2PDIrr GR32:$src)>;
def : Pat<(v2i32 (vector_shuffle immAllZerosV,
def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
VR64:$src2)),
(MMX_PANDNrr VR64:$src1, VR64:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
VR64:$src2)),
(MMX_PANDNrr VR64:$src1, VR64:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))),
VR64:$src2)),
(MMX_PANDNrr VR64:$src1, VR64:$src2)>;
def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
(load addr:$src2))),
(MMX_PANDNrm VR64:$src1, addr:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
(load addr:$src2))),
(MMX_PANDNrm VR64:$src1, addr:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV))),
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))),
(load addr:$src2))),
(MMX_PANDNrm VR64:$src1, addr:$src2)>;
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
// Alias instructions that map zero vector to pxor / xorp* for sse.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let isReMaterializable = 1 in
def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins),
"xorps\t$dst, $dst",
- [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+ [(set VR128:$dst, (v4i32 immAllZerosV))]>;
// FR32 to 128-bit vector conversion.
def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
let AddedComplexity = 20 in
def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
"movss\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV,
+ [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV_bc,
(v4f32 (scalar_to_vector (loadf32 addr:$src))),
MOVL_shuffle_mask)))]>;
// Alias instructions that map zero vector to pxor / xorp* for sse.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let isReMaterializable = 1 in
def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins),
"pcmpeqd\t$dst, $dst",
- [(set VR128:$dst, (v2f64 immAllOnesV))]>;
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
// FR64 to 128-bit vector conversion.
def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (vector_shuffle immAllZerosV,
+ (v2f64 (vector_shuffle immAllZerosV_bc,
(v2f64 (scalar_to_vector
(loadf64 addr:$src))),
MOVL_shuffle_mask)))]>;
def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
-// 128-bit vector all zero's.
-def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
-
-// 128-bit vector all one's.
-def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>;
-
-
// Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
// 16-bits matter.
def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
// Move scalar to XMM zero-extended
// movd to XMM register zero-extends
let AddedComplexity = 15 in {
-def : Pat<(v8i16 (vector_shuffle immAllZerosV,
+def : Pat<(v8i16 (vector_shuffle immAllZerosV_bc,
(v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
(MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v16i8 (vector_shuffle immAllZerosV,
+def : Pat<(v16i8 (vector_shuffle immAllZerosV_bc,
(v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
(MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
// Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (vector_shuffle immAllZerosV,
+def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc,
(v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
(MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (vector_shuffle immAllZerosV,
+def : Pat<(v4f32 (vector_shuffle immAllZerosV_bc,
(v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
(MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
}
// Set lowest element and zero upper elements.
let AddedComplexity = 20 in
-def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV,
+def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV_bc,
(v2f64 (scalar_to_vector (loadf64 addr:$src))),
MOVL_shuffle_mask)),
(MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>;
--- /dev/null
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep pxor | count 1
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep xorps | count 1
+; RUN: llvm-as < %s | llc -relocation-model=static -mcpu=yonah | grep pcmpeqd | count 2
+
+@M1 = external global <1 x i64>
+@M2 = external global <2 x i32>
+
+@S1 = external global <2 x i64>
+@S2 = external global <4 x i32>
+
+define void @test() {
+ store <1 x i64> zeroinitializer, <1 x i64>* @M1
+ store <2 x i32> zeroinitializer, <2 x i32>* @M2
+ ret void
+}
+
+define void @test2() {
+ store <1 x i64> < i64 -1 >, <1 x i64>* @M1
+ store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
+ ret void
+}
+
+define void @test3() {
+ store <2 x i64> zeroinitializer, <2 x i64>* @S1
+ store <4 x i32> zeroinitializer, <4 x i32>* @S2
+ ret void
+}
+
+define void @test4() {
+ store <2 x i64> < i64 -1, i64 -1>, <2 x i64>* @S1
+ store <4 x i32> < i32 -1, i32 -1, i32 -1, i32 -1 >, <4 x i32>* @S2
+ ret void
+}