assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+ // If we have a blend of two PACKUS operations an the blend aligns with the
+ // low and half halves, we can just merge the PACKUS operations. This is
+ // particularly important as it lets us merge shuffles that this routine itself
+ // creates.
+ auto GetPackNode = [](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
+ };
+ if (SDValue V1Pack = GetPackNode(V1))
+ if (SDValue V2Pack = GetPackNode(V2))
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+
// Try to use shift instructions.
if (SDValue Shift =
lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> OrigMask = SVOp->getMask();
- assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+ lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return ZExt;
- int MaskStorage[16] = {
- OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
- OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
- OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
- OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
- MutableArrayRef<int> Mask(MaskStorage);
- MutableArrayRef<int> LoMask = Mask.slice(0, 8);
- MutableArrayRef<int> HiMask = Mask.slice(8, 8);
-
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
return Result;
}
- int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ // Handle multi-input cases by blending single-input shuffles.
+ if (NumV2Elements > 0)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
+ Mask, DAG);
- auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
- MutableArrayRef<int> V1HalfBlendMask,
- MutableArrayRef<int> V2HalfBlendMask) {
- for (int i = 0; i < 8; ++i)
- if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
- V1HalfBlendMask[i] = HalfMask[i];
- HalfMask[i] = i;
- } else if (HalfMask[i] >= 16) {
- V2HalfBlendMask[i] = HalfMask[i] - 16;
- HalfMask[i] = i + 8;
- }
- };
- buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
- buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+ // The fallback path for single-input shuffles widens this into two v8i16
+ // vectors with unpacks, shuffles those, and then pulls them back together
+ // with a pack.
+ SDValue V = V1;
- SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+ int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] >= 0)
+ (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
- auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
- MutableArrayRef<int> HiBlendMask) {
- SDValue V1, V2;
- // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
- // them out and avoid using UNPCK{L,H} to extract the elements of V as
- // i16s.
- if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
- [](int M) { return M >= 0 && M % 2 == 1; }) &&
- std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
- [](int M) { return M >= 0 && M % 2 == 1; })) {
- // Use a mask to drop the high bytes.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
- V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
- DAG.getConstant(0x00FF, MVT::v8i16));
-
- // This will be a single vector shuffle instead of a blend so nuke V2.
- V2 = DAG.getUNDEF(MVT::v8i16);
-
- // Squash the masks to point directly into V1.
- for (int &M : LoBlendMask)
- if (M >= 0)
- M /= 2;
- for (int &M : HiBlendMask)
- if (M >= 0)
- M /= 2;
- } else {
- // Otherwise just unpack the low half of V into V1 and the high half into
- // V2 so that we can blend them as i16s.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
- }
+ SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
- SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
- SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
- return std::make_pair(BlendedLo, BlendedHi);
- };
- SDValue V1Lo, V1Hi, V2Lo, V2Hi;
- std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
- std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+ SDValue VLoHalf, VHiHalf;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+ DAG.getConstant(0x00FF, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+ VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into VLoHalf.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+ // Otherwise just unpack the low half of V into VLoHalf and the high half into
+ // VHiHalf so that we can blend them as i16s.
+ VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
; X64-LABEL: t16:
; X64: ## BB#0: ## %entry
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
-; X64-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: pand %xmm0, %xmm1
-; X64-NEXT: pandn %xmm3, %xmm0
-; X64-NEXT: por %xmm1, %xmm0
-; X64-NEXT: packuswb %xmm0, %xmm0
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X64-NEXT: pslld $16, %xmm0
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-NEXT: pand %xmm1, %xmm2
+; X64-NEXT: pandn %xmm0, %xmm1
+; X64-NEXT: por %xmm2, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
entry:
%tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSE2: # BB#0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm4
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSE2: # BB#0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSE2-LABEL: PR20540:
; SSE2: # BB#0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR20540:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,1,4,5,6,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,1,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,7]
; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: