return SDValue();
}
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
+/// SSE2 and AVX2 logical bit-shift instructions. The function matches
+/// elements from one of the input vectors shuffled to the left or right
+/// with zeroable elements 'shifted in'.
+static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ int Size = Mask.size();
+ assert(Size == VT.getVectorNumElements() && "Unexpected mask size");
+
+ // PSRL : (little-endian) right bit shift.
+ // [ 1, zz, 3, zz]
+ // [ -1, -1, 7, zz]
+ // PSHL : (little-endian) left bit shift.
+ // [ zz, 0, zz, 2 ]
+ // [ -1, 4, zz, -1 ]
+ auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+ MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ assert(TLI.isTypeLegal(ShiftVT) && "Illegal integer vector type");
+
+ bool MatchLeft = true, MatchRight = true;
+ for (int i = 0; i != Size; i += Scale) {
+ for (int j = 0; j != Shift; j++) {
+ MatchLeft &= Zeroable[i + j];
+ }
+ for (int j = Scale - Shift; j != Scale; j++) {
+ MatchRight &= Zeroable[i + j];
+ }
+ }
+ if (!(MatchLeft || MatchRight))
+ return SDValue();
+
+ bool MatchV1 = true, MatchV2 = true;
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = MatchLeft ? i + Shift : i;
+ unsigned Low = MatchLeft ? i : i + Shift;
+ unsigned Len = Scale - Shift;
+ MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+ MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
+ }
+ if (!(MatchV1 || MatchV2))
+ return SDValue();
+
+ // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
+ unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
+ int ShiftAmt = Shift * VT.getScalarSizeInBits();
+ SDValue V = MatchV1 ? V1 : V2;
+ V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+ V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+ };
+
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted in elements are all zeroable.
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; Shift++)
+ if (SDValue BitShift = MatchBitShift(Shift, Scale))
+ return BitShift;
+
+ // no match
+ return SDValue();
+}
+
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v4i32, V1, V2, Mask, DAG))
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v8i16, V, V, Mask, DAG))
+ return Shift;
+
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v8i16, V, V, Mask, DAG))
assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
"to be V1-input shuffles.");
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v8i16, V1, V2, Mask, DAG))
ArrayRef<int> OrigMask = SVOp->getMask();
assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+ return Shift;
+
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v16i8, V1, V2, OrigMask, DAG))
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
}
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
}
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v16i16, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
}
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v32i8, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
; shuffle instruction when the shuffle indexes are not compatible.
define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test17:\r
-; CHECK: # BB#0:\r
-; CHECK-NEXT: xorps %xmm2, %xmm2\r
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]\r
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]\r
-; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero\r
-; CHECK-NEXT: orps %xmm2, %xmm0\r
-; CHECK-NEXT: retq\r
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>\r
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>\r
+; CHECK-LABEL: test17:
+; CHECK: # BB#0:
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
ret <4 x i32> %or
}
define <16 x i8> @t5(<16 x i8> %x) nounwind {
; CHECK-LABEL: t5:
; CHECK: # BB#0:
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: psrlw $8, %xmm0
; CHECK-NEXT: retl
%s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
ret <16 x i8> %s
define <16 x i8> @t6(<16 x i8> %x) nounwind {
; CHECK-LABEL: t6:
; CHECK: # BB#0:
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: psrlw $8, %xmm0
; CHECK-NEXT: retl
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %s
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
ret <16 x i8> %s
}
+
+define <16 x i8> @t8(<16 x i8> %x) nounwind {
+; CHECK-LABEL: t8:
+; CHECK: # BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
+ %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
+ ret <16 x i8> %s
+}
+
+define <16 x i8> @t9(<16 x i8> %x) nounwind {
+; CHECK-LABEL: t9:
+; CHECK: # BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
+ %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 undef, i32 undef>
+ ret <16 x i8> %s
+}
; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSSE3-NEXT: pslld $24, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSE41-NEXT: pslld $24, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; AVX-NEXT: vpslld $24, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 3
ret void
}
+;
+; Shuffle to logical bit shifts
+;
+
+define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; SSE: # BB#0:
+; SSE-NEXT: psllw $8, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllw $8, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; SSE: # BB#0:
+; SSE-NEXT: pslld $24, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX: # BB#0:
+; AVX-NEXT: vpslld $24, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
+; SSE: # BB#0:
+; SSE-NEXT: psllq $56, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllq $56, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
+; SSE: # BB#0:
+; SSE-NEXT: psllq $8, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllq $8, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $8, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
+; SSE: # BB#0:
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrld $16, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $56, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $56, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
+ ret <16 x i8> %shuffle
+}
+
define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
; SSE2-LABEL: PR12412:
; SSE2: # BB#0: # %entry
%shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shuffle
}
+
+;
+; Shuffle to logical bit shifts
+;
+
+define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
+; SSE-LABEL: shuffle_v4i32_z0zX:
+; SSE: # BB#0:
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_z0zX:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllq $32, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
+; SSE-LABEL: shuffle_v4i32_1z3z:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_1z3z:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $32, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+ ret <4 x i32> %shuffle
+}
ret <8 x i16> %shuffle
}
+;
+; Shuffle to logical bit shifts
+;
+define <8 x i16> @shuffle_v8i16_z0z2z4z6(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_z0z2z4z6:
+; SSE: # BB#0:
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_z0z2z4z6:
+; AVX: # BB#0:
+; AVX-NEXT: vpslld $16, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzz0zzz4(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_zzz0zzz4:
+; SSE: # BB#0:
+; SSE-NEXT: psllq $48, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zzz0zzz4:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllq $48, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 8, i32 0, i32 8, i32 8, i32 8, i32 4>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zz01zX4X(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_zz01zX4X:
+; SSE: # BB#0:
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zz01zX4X:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllq $32, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 0, i32 1, i32 8, i32 undef, i32 4, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z0X2z456(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_z0X2z456:
+; SSE: # BB#0:
+; SSE-NEXT: psllq $16, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_z0X2z456:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllq $16, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 undef, i32 2, i32 8, i32 4, i32 5, i32 6>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_1z3zXz7z(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_1z3zXz7z:
+; SSE: # BB#0:
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_1z3zXz7z:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrld $16, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 8, i32 undef, i32 8, i32 7, i32 8>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_1X3z567z(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_1X3z567z:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $16, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_1X3z567z:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $16, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 8>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_23zz67zz(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_23zz67zz:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_23zz67zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $32, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 2, i32 3, i32 8, i32 8, i32 6, i32 7, i32 8, i32 8>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_3zXXXzzz(<8 x i16> %a) {
+; SSE-LABEL: shuffle_v8i16_3zXXXzzz:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $48, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_3zXXXzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $48, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 3, i32 8, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8>
+ ret <8 x i16> %shuffle
+}
+
define <8 x i16> @shuffle_v8i16_01u3zzuz(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_01u3zzuz:
; SSE: # BB#0:
ret <16 x i16> %shuffle
}
+;
+; Shuffle to logical bit shifts
+;
+
+define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpslld $16, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllq $48, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrld $16, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,2,3,2,3,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlq $32, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 16, i32 16, i32 10, i32 11, i32 16, i32 16, i32 14, i32 15, i32 16, i32 16>
+ ret <16 x i16> %shuffle
+}
+
define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
; AVX1: # BB#0:
ret <32 x i8> %shuffle
}
+;
+; Shuffle to logical bit shifts
+;
+
+define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllw $8, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 0, i32 32, i32 2, i32 32, i32 4, i32 32, i32 6, i32 32, i32 8, i32 32, i32 10, i32 32, i32 12, i32 32, i32 14, i32 32, i32 16, i32 32, i32 18, i32 32, i32 20, i32 32, i32 22, i32 32, i32 24, i32 32, i32 26, i32 32, i32 28, i32 32, i32 30>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,0,1,128,128,4,5,128,128,8,9,128,128,12,13]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpslld $16, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 12, i32 13, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 20, i32 21, i32 32, i32 32, i32 24, i32 25, i32 32, i32 32, i32 28, i32 29>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,0,1,128,128,128,128,128,128,8,9]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,0,0,0,0,0],zero,zero,xmm3[0,0,0,0,0,0],zero,zero
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllq $48, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlw $8, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 3, i32 32, i32 5, i32 32, i32 7, i32 32, i32 9, i32 32, i32 11, i32 32, i32 13, i32 32, i32 15, i32 32, i32 17, i32 32, i32 19, i32 32, i32 21, i32 32, i32 23, i32 32, i32 25, i32 32, i32 27, i32 32, i32 29, i32 32, i32 31, i32 32>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,128,128,6,7,128,128,10,11,128,128,14,15,128,128]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0],zero,zero,xmm3[0,0]
+; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrld $16, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 2, i32 3, i32 32, i32 32, i32 6, i32 7, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 14, i32 15, i32 32, i32 32, i32 18, i32 19, i32 32, i32 32, i32 22, i32 23, i32 32, i32 32, i32 26, i32 27, i32 32, i32 32, i32 30, i32 31, i32 32, i32 32>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,128,128,128,15,128,128,128,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm3[0,0,0],zero,xmm3[0,0,0,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlq $56, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 23, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ ret <32 x i8> %shuffle
+}
+
define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
; AVX1: # BB#0:
ret <8 x float> %1
}
+;
+; Shuffle to logical bit shifts
+;
+
+define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_z0U2zUz6:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_z0U2zUz6:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllq $32, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 undef, i32 2, i32 8, i32 undef, i32 8, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_1U3z5zUU:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_1U3z5zUU:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlq $32, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 8, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_1:
; ALL: # BB#0: # %entry