  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
+
  int NumV2Elements =
      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
                       getV4X86ShuffleImm8ForMask(Mask, DAG));
  }
-  // Whenever we can lower this as a zext, that instruction is strictly faster
-  // than any alternative.
-  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
-                                                         Mask, Subtarget, DAG))
-    return ZExt;
-
  // Use dedicated unpack instructions for masks that match their pattern.
  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
}
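As an aside on the "fold memory operands" point in the new comment, here is a sketch of the kind of case it is aimed at. It is not taken from this patch's test changes: the function name is made up, the IR uses the older typed-pointer load syntax of this test file's era, and the exact SSE4.1 output would need to be confirmed with a real llc run. The expectation is that the <0,u,1,u> shuffle of a loaded vector becomes a zero/any-extend whose load can be folded, along the lines of pmovzxdq (%rdi), %xmm0 instead of a separate load plus pshufd.

define <4 x i32> @shuffle_v4i32_0u1u_load(<4 x i32>* %p) {
  ; Hypothetical case: the shuffled vector comes straight from memory, so the
  ; zext-style lowering has a load it can try to fold into the extend.
  %a = load <4 x i32>* %p
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
  ret <4 x i32> %shuffle
}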
define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
-; SSE-LABEL: shuffle_v4i32_0u1u:
-; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v4i32_0u1u:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_0u1u:
+; SSE3: # BB#0:
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0u1u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_0u1u:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_0u1u:
; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vpmovzxdq %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
ret <4 x i32> %shuffle