From: Elena Demikhovsky Date: Thu, 17 Sep 2015 06:53:12 +0000 (+0000) Subject: AVX-512: shufflevector for i1 vectors <2 x i1> .. <64 x i1> X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=2e5bf5535ce4c9853c0c99c1fa795e747eb2232e;p=oota-llvm.git AVX-512: shufflevector for i1 vectors <2 x i1> .. <64 x i1> AVX-512 does not provide an instruction that shuffles a mask register, so the shuffle is lowered in three steps: mask-to-SIMD extension, shuffle of the SIMD vector, SIMD-to-mask truncation. Differential Revision: http://reviews.llvm.org/D12727 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247876 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 6f337984336..dc5c0d1e352 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1382,6 +1382,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); if (Subtarget->hasDQI()) { setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); @@ -1601,6 +1603,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1646,6 +1650,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v2i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, 
Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -4417,14 +4423,18 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. -static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, - SDLoc dl) { +static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); SDValue Vec; - if (VT.is256BitVector()) { - if (HasInt256) { // AVX2 + if (VT.is512BitVector()) { + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.is256BitVector()) { + if (Subtarget->hasInt256()) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX @@ -6047,7 +6057,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return Op; if (!VT.is512BitVector()) - return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); + return getOnesVector(VT, Subtarget, DAG, dl); } BuildVectorSDNode *BV = cast(Op.getNode()); @@ -10762,6 +10772,61 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } +// Lower vXi1 vector shuffles. +// There is no dedicated instruction on AVX-512 that shuffles the masks. 
+// The only way to shuffle bits is to sign-extend the mask vector to SIMD +// vector, shuffle and then truncate it back. +static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/o basic ISA!"); + EVT ExtVT; + switch (VT.SimpleTy) { + default: + assert(false && "Expected a vector of i1 elements"); + break; + case MVT::v2i1: + ExtVT = MVT::v2i64; + break; + case MVT::v4i1: + ExtVT = MVT::v4i32; + break; + case MVT::v8i1: + ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + break; + case MVT::v16i1: + ExtVT = MVT::v16i32; + break; + case MVT::v32i1: + ExtVT = MVT::v32i16; + break; + case MVT::v64i1: + ExtVT = MVT::v64i8; + break; + } + + if (ISD::isBuildVectorAllZeros(V1.getNode())) + V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V1.getNode())) + V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); + + if (V2.isUndef()) + V2 = DAG.getUNDEF(ExtVT); + else if (ISD::isBuildVectorAllZeros(V2.getNode())) + V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V2.getNode())) + V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); +} /// \brief Top-level lowering for x86 vector shuffles. 
/// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -10778,8 +10843,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc dl(Op); + bool Is1BitVector = (VT.getScalarType() == MVT::i1); - assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + assert((VT.getSizeInBits() != 64 || Is1BitVector) && + "Can't lower MMX shuffles"); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -10817,7 +10884,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector WidenedMask; - if (VT.getScalarSizeInBits() < 64 && + if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? 
MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) @@ -10894,6 +10961,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, if (VT.getSizeInBits() == 512) return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (Is1BitVector) + return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 446d4bce155..d1498bb72bc 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -4807,8 +4807,12 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return true; case X86::KSET0B: case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); + case X86::KSET0D: return Expand2AddrUndef(MIB, get(X86::KXORDrr)); + case X86::KSET0Q: return Expand2AddrUndef(MIB, get(X86::KXORQrr)); case X86::KSET1B: case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); + case X86::KSET1D: return Expand2AddrUndef(MIB, get(X86::KXNORDrr)); + case X86::KSET1Q: return Expand2AddrUndef(MIB, get(X86::KXNORQrr)); case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 509baf9dd82..d4ec23699a0 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -361,39 +361,6 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ret <8 x i1>%d } -; KNL-LABEL: test19 -; KNL: movzbl %dil, %eax -; KNL: kmovw %eax, %k0 -; KNL: kshiftlw $13, %k0, %k0 -; KNL: kshiftrw $15, %k0, %k0 -; KNL: kmovw %k0, %eax -; KNL: andl $1, %eax -; KNL: testb %al, %al - -define <8 x i1> @test19(i8 %a) { - %b = bitcast i8 %a to <8 x i1> - %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> - ret <8 x i1> %c -} - -; KNL-LABEL: test20 -; KNL: movzbl %dil, %eax -; KNL: kmovw %eax, %k0 -; KNL: kshiftlw $13, %k0, %k1 -; KNL: kshiftrw $15, %k1, %k1 -; KNL: 
kshiftlw $12, %k0, %k0 -; KNL: kshiftrw $15, %k0, %k0 -; KNL: kshiftlw $4, %k0, %k0 -; KNL: kshiftlw $1, %k1, %k2 -; KNL: korw %k0, %k2, %k0 -; KNL: kshiftlw $6, %k1, %k1 -; KNL: korw %k1, %k0, %k1 -define <8 x i1> @test20(i8 %a, i16 %y) { - %b = bitcast i8 %a to <8 x i1> - %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> - ret <8 x i1> %c -} - ; KNL-LABEL: test21 ; KNL: vpand %ymm ; KNL: vextracti128 $1, %ymm2 diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll new file mode 100644 index 00000000000..f9af320a665 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -0,0 +1,401 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ + +target triple = "x86_64-unknown-unknown" + +define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) { +; AVX512F-LABEL: shuf2i1_1_0: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf2i1_1_0: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> + ret <2 x i1> %b +} + +define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { +; AVX512F-LABEL: shuf2i1_1_2: +; AVX512F: # BB#0: +; AVX512F-NEXT: movl $1, %eax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf2i1_1_2: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 +; VL_BW_DQ-NEXT: movb $1, %al +; VL_BW_DQ-NEXT: kmovb %eax, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1 +; VL_BW_DQ-NEXT: vpalignr $8, %xmm0, %xmm1, %xmm0 +; 
VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %b = shufflevector <2 x i1> %a, <2 x i1> , <2 x i32> + ret <2 x i1> %b +} + + +define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) { +; AVX512F-LABEL: shuf4i1_3_2_10: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf4i1_3_2_10: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> + ret <4 x i1> %b +} + +define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) { +; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %a2 = icmp eq <8 x i64> %a, %a1 + %b2 = icmp eq <8 x i64> %b, %b1 + %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> + ret <8 x i1> %c +} + +define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) { +; 
AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 +; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm1 {%k2} {z} +; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z} +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1 +; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; VL_BW_DQ-NEXT: vpmovd2m %zmm1, %k0 +; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %a2 = icmp eq <16 x i32> %a, %a1 + %b2 = icmp eq <16 x i32> %b, %b1 + %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> + ret <16 x i1> %c +} + +define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) { +; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: +; AVX512F: # BB#0: +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0] +; AVX512F-NEXT: vpblendvb 
%ymm2, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 +; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0 +; VL_BW_DQ-NEXT: retq + %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> + ret <32 x i1> %b +} + +define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { +; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u: +; AVX512F: # BB#0: +; AVX512F-NEXT: movzbl %dil, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2 +; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> + ret <8 x i1> %c +} + +define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { +; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u: +; AVX512F: # BB#0: +; AVX512F-NEXT: movzbl %dil, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> +; AVX512F-NEXT: 
vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> +; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: retq + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> + %d = bitcast <8 x i1> %c to i8 + ret i8 %d +} + +define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { +; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u: +; AVX512F: # BB#0: +; AVX512F-NEXT: movzbl %dil, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u> +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u> +; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: retq + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> + %d = bitcast <8 x i1> %c to i8 + ret i8 %d +} + +define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { +; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0: +; AVX512F: # BB#0: +; AVX512F-NEXT: movzbl %dil, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: retq + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> + %d = bitcast <8 x i1>%c to i8 + ret i8 %d +} + +define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { +; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0: +; AVX512F: # BB#0: +; AVX512F-NEXT: movzbl %dil, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] +; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] +; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: retq + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> + %d = bitcast <8 x i1>%c to i8 + ret i8 %d +} + +define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { +; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1: +; AVX512F: # 
BB#0: +; AVX512F-NEXT: movzbl %dil, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: movb $51, %al +; AVX512F-NEXT: movzbl %al, %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k2} {z} +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: movb $51, %al +; VL_BW_DQ-NEXT: kmovb %eax, %k1 +; VL_BW_DQ-NEXT: vpmovm2q %k1, %zmm0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1 +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] +; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: retq + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector <8 x i1> , <8 x i1> %b, <8 x i32> + %c1 = bitcast <8 x i1>%c to i8 + ret i8 %c1 +} + +define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { +; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vpandq %zmm3, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 
+; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] +; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 +; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: retq + %c = shufflevector <8 x i1> , <8 x i1> %a, <8 x i32> + %c1 = bitcast <8 x i1>%c to i8 + ret i8 %c1 +} + + +define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { +; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: +; AVX512F: # BB#0: +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 +; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovw %edi, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0 +; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 +; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 +; VL_BW_DQ-NEXT: kmovw %k0, %eax +; VL_BW_DQ-NEXT: retq + %b = bitcast i16 %a to <16 x i1> + %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer + %d = bitcast <16 x i1> %c to i16 + ret i16 %d +} + +define i64 @shuf64i1_zero(i64 %a) { +; VL_BW_DQ-LABEL: shuf64i1_zero: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kxorq %k0, %k0, %k0 +; VL_BW_DQ-NEXT: kmovq %k0, %rax +; VL_BW_DQ-NEXT: retq + %b = bitcast i64 %a to <64 x i1> + %c = shufflevector < 64 x i1> zeroinitializer, <64 x i1> undef, <64 x i32> zeroinitializer + %d = bitcast <64 x i1> %c to i64 + ret i64 %d +}